From 2c03baf04b363c0509e8a3561390b788f06916e9 Mon Sep 17 00:00:00 2001 From: Bo Li Date: Tue, 16 Apr 2024 13:29:46 +0000 Subject: [PATCH] Squashed commits from internal development repo --- .github/issue_template.md | 0 .github/pull_request_template.md | 0 .github/workflows/black.yml | 0 .gitignore | 0 .pre-commit-config.yaml | 0 README.md | 0 docs/README.md | 0 docs/commands.md | 0 docs/model_guide.md | 0 docs/task_guide.md | 0 example_eval.yaml | 11 +- llava_repr_requirements.txt | 17 - lmms_eval/__init__.py | 0 lmms_eval/__main__.py | 0 lmms_eval/api/__init__.py | 0 lmms_eval/api/filter.py | 0 lmms_eval/api/instance.py | 0 lmms_eval/api/metrics.py | 0 lmms_eval/api/model.py | 0 lmms_eval/api/registry.py | 0 lmms_eval/api/samplers.py | 4 +- lmms_eval/api/task.py | 25 ++ lmms_eval/evaluator.py | 0 lmms_eval/filters/__init__.py | 3 +- lmms_eval/filters/decontamination.py | 0 lmms_eval/filters/extraction.py | 186 ++++++++- lmms_eval/filters/selection.py | 0 lmms_eval/filters/transformation.py | 0 lmms_eval/logging_utils.py | 18 +- lmms_eval/models/__init__.py | 11 +- lmms_eval/models/fuyu.py | 2 +- lmms_eval/models/gpt4v.py | 17 +- lmms_eval/models/instructblip.py | 2 + lmms_eval/models/llava.py | 63 +++- lmms_eval/models/llava_vid.py | 356 ++++++++++++++++++ lmms_eval/models/minicpm_v.py | 0 lmms_eval/models/model_utils/__init__.py | 0 .../model_utils/qwen/qwen_generate_utils.py | 0 lmms_eval/models/qwen_vl.py | 5 +- lmms_eval/tasks/__init__.py | 0 lmms_eval/tasks/_task_utils/file_utils.py | 0 lmms_eval/tasks/_task_utils/gpt_eval_utils.py | 0 .../tasks/_task_utils/vqa_eval_metric.py | 0 lmms_eval/tasks/ai2d/ai2d.yaml | 0 lmms_eval/tasks/ai2d/upload_ai2d.py | 0 lmms_eval/tasks/ai2d/utils.py | 0 lmms_eval/tasks/chartqa/chartqa.yaml | 0 lmms_eval/tasks/chartqa/upload_chartqa.py | 0 lmms_eval/tasks/chartqa/utils.py | 0 lmms_eval/tasks/cmmmu/_cmmmu.yaml | 0 .../tasks/cmmmu/_default_template_cmmmu_yaml | 0 lmms_eval/tasks/cmmmu/cmmmu_test.yaml | 0 lmms_eval/tasks/cmmmu/cmmmu_val.yaml | 0 lmms_eval/tasks/cmmmu/utils.py | 0 lmms_eval/tasks/coco_cap/coco2014_cap.yaml | 0 .../tasks/coco_cap/coco2014_cap_test.yaml | 0 .../tasks/coco_cap/coco2014_cap_val.yaml | 0 lmms_eval/tasks/coco_cap/coco2017_cap.yaml | 0 .../tasks/coco_cap/coco2017_cap_test.yaml | 0 .../tasks/coco_cap/coco2017_cap_val.yaml | 0 lmms_eval/tasks/coco_cap/coco_cap.yaml | 0 lmms_eval/tasks/coco_cap/utils.py | 0 .../docvqa/_default_template_docvqa_yaml | 0 lmms_eval/tasks/docvqa/docvqa.yaml | 0 lmms_eval/tasks/docvqa/docvqa_test.yaml | 0 lmms_eval/tasks/docvqa/docvqa_val.yaml | 0 lmms_eval/tasks/docvqa/utils.py | 0 lmms_eval/tasks/ferret/ferret.yaml | 0 lmms_eval/tasks/ferret/rule.json | 0 lmms_eval/tasks/ferret/utils.py | 0 lmms_eval/tasks/flickr30k/flickr30k.yaml | 0 lmms_eval/tasks/flickr30k/flickr30k_test.yaml | 0 lmms_eval/tasks/flickr30k/utils.py | 0 lmms_eval/tasks/gqa/gqa.yaml | 0 lmms_eval/tasks/gqa/utils.py | 0 .../tasks/hallusion_bench/evaluate_hb.py | 0 .../hallusion_bench_image.yaml | 0 lmms_eval/tasks/hallusion_bench/utils.py | 0 .../iconqa/_default_template_docvqa_yaml | 0 lmms_eval/tasks/iconqa/iconqa.yaml | 0 lmms_eval/tasks/iconqa/iconqa_test.yaml | 0 lmms_eval/tasks/iconqa/iconqa_val.yaml | 0 lmms_eval/tasks/iconqa/utils.py | 0 .../infovqa/_default_template_infovqa_yaml | 0 lmms_eval/tasks/infovqa/infovqa.yaml | 0 lmms_eval/tasks/infovqa/infovqa_test.yaml | 0 lmms_eval/tasks/infovqa/infovqa_val.yaml | 0 lmms_eval/tasks/infovqa/utils.py | 0 .../llava-bench-coco/llava-bench-coco.yaml | 0 
lmms_eval/tasks/llava-bench-coco/rule.json | 0 lmms_eval/tasks/llava-bench-coco/utils.py | 0 .../llava-in-the-wild/llava-in-the-wild.yaml | 0 lmms_eval/tasks/llava-in-the-wild/rule.json | 0 lmms_eval/tasks/llava-in-the-wild/utils.py | 0 lmms_eval/tasks/mathvista/mathvista.yaml | 0 lmms_eval/tasks/mathvista/mathvista_evals.py | 2 +- lmms_eval/tasks/mathvista/mathvista_test.yaml | 0 .../tasks/mathvista/mathvista_testmini.yaml | 0 lmms_eval/tasks/mathvista/utils.py | 0 .../tasks/mix_evals/_default_template_yaml | 9 + .../tasks/mix_evals/mix_evals_video2text.yaml | 5 + .../mix_evals_video2text_freeform.yaml | 20 + .../mix_evals/mix_evals_video2text_mc.yaml | 28 ++ .../mix_evals_video2text_openended.yaml | 13 + lmms_eval/tasks/mix_evals/utils.py | 200 ++++++++++ lmms_eval/tasks/mmbench/cc_utils.py | 0 lmms_eval/tasks/mmbench/cn_utils.py | 0 lmms_eval/tasks/mmbench/en_utils.py | 0 lmms_eval/tasks/mmbench/mmbench.yaml | 0 lmms_eval/tasks/mmbench/mmbench_cc.yaml | 0 lmms_eval/tasks/mmbench/mmbench_cn.yaml | 0 lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml | 0 lmms_eval/tasks/mmbench/mmbench_cn_test.yaml | 0 lmms_eval/tasks/mmbench/mmbench_en.yaml | 0 lmms_eval/tasks/mmbench/mmbench_en_dev.yaml | 0 lmms_eval/tasks/mmbench/mmbench_en_test.yaml | 0 lmms_eval/tasks/mmbench/mmbench_evals.py | 0 lmms_eval/tasks/mme/mme.yaml | 0 lmms_eval/tasks/mme/utils.py | 0 lmms_eval/tasks/mmmu/mmmu.yaml | 0 lmms_eval/tasks/mmmu/mmmu_test.yaml | 0 lmms_eval/tasks/mmmu/mmmu_val.yaml | 0 lmms_eval/tasks/mmmu/utils.py | 12 +- lmms_eval/tasks/mmvet/mmvet.yaml | 0 lmms_eval/tasks/mmvet/utils.py | 0 lmms_eval/tasks/multidocvqa/multidocvqa.yaml | 0 .../tasks/multidocvqa/multidocvqa_test.yaml | 0 .../tasks/multidocvqa/multidocvqa_val.yaml | 0 lmms_eval/tasks/multidocvqa/utils.py | 0 .../nocaps/_default_template_nocaps_yaml | 0 lmms_eval/tasks/nocaps/nocaps.yaml | 0 lmms_eval/tasks/nocaps/nocaps_test.yaml | 0 lmms_eval/tasks/nocaps/nocaps_val.yaml | 0 lmms_eval/tasks/nocaps/utils.py | 0 .../tasks/ok_vqa/_default_template_vqa_yaml | 0 lmms_eval/tasks/ok_vqa/_generate_config.py | 0 lmms_eval/tasks/ok_vqa/_ok_vqa.yaml | 0 lmms_eval/tasks/ok_vqa/ok_vqa_val2014.yaml | 0 lmms_eval/tasks/ok_vqa/utils.py | 0 lmms_eval/tasks/olympiadbench/cn_utils.py | 18 +- lmms_eval/tasks/olympiadbench/en_utils.py | 24 +- .../olympiadbench/olympiadbench_evals.py | 72 ++-- lmms_eval/tasks/pope/pope.yaml | 0 lmms_eval/tasks/pope/utils.py | 0 lmms_eval/tasks/realworldqa/realworldqa.yaml | 42 +++ lmms_eval/tasks/realworldqa/utils.py | 117 ++++++ .../refcoco+/_default_template_bbox_yaml | 0 .../tasks/refcoco+/_default_template_seg_yaml | 0 lmms_eval/tasks/refcoco+/_generate_config.py | 0 lmms_eval/tasks/refcoco+/_refcoco.yaml | 0 .../tasks/refcoco+/refcoco+_bbox_testA.yaml | 0 .../tasks/refcoco+/refcoco+_bbox_testB.yaml | 0 .../tasks/refcoco+/refcoco+_bbox_val.yaml | 0 .../tasks/refcoco+/refcoco+_seg_testA.yaml | 0 .../tasks/refcoco+/refcoco+_seg_testB.yaml | 0 .../tasks/refcoco+/refcoco+_seg_val.yaml | 0 lmms_eval/tasks/refcoco+/utils.py | 0 .../tasks/refcoco/_default_template_bbox_yaml | 0 .../tasks/refcoco/_default_template_seg_yaml | 0 lmms_eval/tasks/refcoco/_generate_config.py | 0 lmms_eval/tasks/refcoco/_refcoco.yaml | 0 .../tasks/refcoco/refcoco_bbox_test.yaml | 0 .../tasks/refcoco/refcoco_bbox_testA.yaml | 0 .../tasks/refcoco/refcoco_bbox_testB.yaml | 0 lmms_eval/tasks/refcoco/refcoco_bbox_val.yaml | 0 lmms_eval/tasks/refcoco/refcoco_seg_test.yaml | 0 .../tasks/refcoco/refcoco_seg_testA.yaml | 0 .../tasks/refcoco/refcoco_seg_testB.yaml | 0 
lmms_eval/tasks/refcoco/refcoco_seg_val.yaml | 0 lmms_eval/tasks/refcoco/utils.py | 0 .../refcocog/_default_template_bbox_yaml | 0 .../tasks/refcocog/_default_template_seg_yaml | 0 lmms_eval/tasks/refcocog/_generate_config.py | 0 lmms_eval/tasks/refcocog/_refcoco.yaml | 0 .../tasks/refcocog/refcocog_bbox_test.yaml | 0 .../tasks/refcocog/refcocog_bbox_val.yaml | 0 .../tasks/refcocog/refcocog_seg_test.yaml | 0 .../tasks/refcocog/refcocog_seg_val.yaml | 0 lmms_eval/tasks/refcocog/utils.py | 0 lmms_eval/tasks/scienceqa/scienceqa.yaml | 0 lmms_eval/tasks/scienceqa/scienceqa_full.yaml | 0 lmms_eval/tasks/scienceqa/scienceqa_img.yaml | 0 lmms_eval/tasks/scienceqa/utils.py | 0 lmms_eval/tasks/seedbench/seedbench.yaml | 0 lmms_eval/tasks/seedbench/seedbench_ppl.yaml | 0 lmms_eval/tasks/seedbench/utils.py | 0 lmms_eval/tasks/seedbench_2/seedbench_2.yaml | 0 lmms_eval/tasks/seedbench_2/utils.py | 0 .../_default_sft_eval_ocr_rec_template_yaml | 23 ++ .../_default_sft_eval_rest_template_yaml | 20 + lmms_eval/tasks/sft_eval/_generate_config.py | 94 +++++ lmms_eval/tasks/sft_eval/_sft_eval.yaml | 34 ++ lmms_eval/tasks/sft_eval/sft_activity.yaml | 5 + lmms_eval/tasks/sft_eval/sft_arts.yaml | 5 + lmms_eval/tasks/sft_eval/sft_body.yaml | 5 + lmms_eval/tasks/sft_eval/sft_car.yaml | 5 + lmms_eval/tasks/sft_eval/sft_color.yaml | 5 + lmms_eval/tasks/sft_eval/sft_commodity.yaml | 5 + lmms_eval/tasks/sft_eval/sft_count.yaml | 5 + lmms_eval/tasks/sft_eval/sft_daily.yaml | 5 + lmms_eval/tasks/sft_eval/sft_engineer.yaml | 5 + .../tasks/sft_eval/sft_entertainment.yaml | 5 + lmms_eval/tasks/sft_eval/sft_exist.yaml | 5 + lmms_eval/tasks/sft_eval/sft_face.yaml | 5 + lmms_eval/tasks/sft_eval/sft_food.yaml | 5 + lmms_eval/tasks/sft_eval/sft_healthcare.yaml | 5 + lmms_eval/tasks/sft_eval/sft_landmark.yaml | 5 + lmms_eval/tasks/sft_eval/sft_logo.yaml | 5 + lmms_eval/tasks/sft_eval/sft_natural.yaml | 5 + lmms_eval/tasks/sft_eval/sft_ocr_qa_adv.yaml | 5 + .../tasks/sft_eval/sft_ocr_qa_chart.yaml | 5 + lmms_eval/tasks/sft_eval/sft_ocr_qa_form.yaml | 5 + .../tasks/sft_eval/sft_ocr_qa_scene.yaml | 5 + .../tasks/sft_eval/sft_ocr_qa_screen.yaml | 5 + lmms_eval/tasks/sft_eval/sft_ocr_rec_adv.yaml | 5 + lmms_eval/tasks/sft_eval/sft_ocr_rec_doc.yaml | 5 + .../tasks/sft_eval/sft_ocr_rec_handwrite.yaml | 5 + .../tasks/sft_eval/sft_ocr_rec_markdown.yaml | 5 + .../tasks/sft_eval/sft_ocr_rec_scene.yaml | 5 + .../tasks/sft_eval/sft_ocr_rec_screen.yaml | 5 + lmms_eval/tasks/sft_eval/sft_place.yaml | 5 + lmms_eval/tasks/sft_eval/sft_position.yaml | 5 + lmms_eval/tasks/sft_eval/sft_sport.yaml | 5 + lmms_eval/tasks/sft_eval/sft_status.yaml | 5 + lmms_eval/tasks/sft_eval/utils.py | 109 ++++++ lmms_eval/tasks/stvqa/stvqa.yaml | 0 lmms_eval/tasks/stvqa/utils.py | 0 .../textcaps/_default_template_textcaps_yaml | 0 lmms_eval/tasks/textcaps/textcaps.yaml | 0 lmms_eval/tasks/textcaps/textcaps_test.yaml | 0 lmms_eval/tasks/textcaps/textcaps_train.yaml | 0 lmms_eval/tasks/textcaps/textcaps_val.yaml | 0 lmms_eval/tasks/textcaps/utils.py | 0 .../textvqa/_default_template_textvqa_yaml | 0 lmms_eval/tasks/textvqa/_textvqa.yaml | 0 lmms_eval/tasks/textvqa/textvqa_test.yaml | 0 lmms_eval/tasks/textvqa/textvqa_val.yaml | 0 lmms_eval/tasks/textvqa/utils.py | 0 .../vizwiz_vqa/_default_template_vqa_yaml | 0 .../tasks/vizwiz_vqa/_generate_config.py | 0 lmms_eval/tasks/vizwiz_vqa/_vizwiz_vqa.yaml | 0 lmms_eval/tasks/vizwiz_vqa/utils.py | 0 .../tasks/vizwiz_vqa/vizwiz_vqa_test.yaml | 0 .../tasks/vizwiz_vqa/vizwiz_vqa_val.yaml | 0 
.../tasks/vqav2/_default_template_vqav2_yaml | 0 lmms_eval/tasks/vqav2/_vqav2.yaml | 0 lmms_eval/tasks/vqav2/utils.py | 0 lmms_eval/tasks/vqav2/vqav2_test.yaml | 0 lmms_eval/tasks/vqav2/vqav2_val.yaml | 0 .../tasks/worldqa/_default_template_yaml | 10 + lmms_eval/tasks/worldqa/utils.py | 113 ++++++ .../tasks/worldqa/worldqa_generation.yaml | 13 + lmms_eval/tasks/worldqa/worldqa_mc.yaml | 11 + lmms_eval/tasks/worldqa/worldqa_mcppl.yaml | 11 + lmms_eval/utils.py | 0 miscs/llava_result_check.md | 0 miscs/repr_scripts.sh | 0 miscs/repr_torch_envs.txt | 0 miscs/scienceqa_id.txt | 0 miscs/script.sh | 2 +- miscs/test_llava.py | 0 miscs/test_scienceqa.py | 0 pyproject.toml | 7 +- setup.py | 0 tools/make_hf_dataset.ipynb | 0 265 files changed, 1730 insertions(+), 159 deletions(-) mode change 100644 => 100755 .github/issue_template.md mode change 100644 => 100755 .github/pull_request_template.md mode change 100644 => 100755 .github/workflows/black.yml mode change 100644 => 100755 .gitignore mode change 100644 => 100755 .pre-commit-config.yaml mode change 100644 => 100755 README.md mode change 100644 => 100755 docs/README.md mode change 100644 => 100755 docs/commands.md mode change 100644 => 100755 docs/model_guide.md mode change 100644 => 100755 docs/task_guide.md mode change 100644 => 100755 example_eval.yaml mode change 100644 => 100755 llava_repr_requirements.txt mode change 100644 => 100755 lmms_eval/__init__.py mode change 100644 => 100755 lmms_eval/__main__.py mode change 100644 => 100755 lmms_eval/api/__init__.py mode change 100644 => 100755 lmms_eval/api/filter.py mode change 100644 => 100755 lmms_eval/api/instance.py mode change 100644 => 100755 lmms_eval/api/metrics.py mode change 100644 => 100755 lmms_eval/api/model.py mode change 100644 => 100755 lmms_eval/api/registry.py mode change 100644 => 100755 lmms_eval/api/samplers.py mode change 100644 => 100755 lmms_eval/api/task.py mode change 100644 => 100755 lmms_eval/evaluator.py mode change 100644 => 100755 lmms_eval/filters/__init__.py mode change 100644 => 100755 lmms_eval/filters/decontamination.py mode change 100644 => 100755 lmms_eval/filters/extraction.py mode change 100644 => 100755 lmms_eval/filters/selection.py mode change 100644 => 100755 lmms_eval/filters/transformation.py mode change 100644 => 100755 lmms_eval/logging_utils.py mode change 100644 => 100755 lmms_eval/models/__init__.py mode change 100644 => 100755 lmms_eval/models/fuyu.py mode change 100644 => 100755 lmms_eval/models/gpt4v.py mode change 100644 => 100755 lmms_eval/models/instructblip.py mode change 100644 => 100755 lmms_eval/models/llava.py create mode 100755 lmms_eval/models/llava_vid.py mode change 100644 => 100755 lmms_eval/models/minicpm_v.py mode change 100644 => 100755 lmms_eval/models/model_utils/__init__.py mode change 100644 => 100755 lmms_eval/models/model_utils/qwen/qwen_generate_utils.py mode change 100644 => 100755 lmms_eval/models/qwen_vl.py mode change 100644 => 100755 lmms_eval/tasks/__init__.py mode change 100644 => 100755 lmms_eval/tasks/_task_utils/file_utils.py mode change 100644 => 100755 lmms_eval/tasks/_task_utils/gpt_eval_utils.py mode change 100644 => 100755 lmms_eval/tasks/_task_utils/vqa_eval_metric.py mode change 100644 => 100755 lmms_eval/tasks/ai2d/ai2d.yaml mode change 100644 => 100755 lmms_eval/tasks/ai2d/upload_ai2d.py mode change 100644 => 100755 lmms_eval/tasks/ai2d/utils.py mode change 100644 => 100755 lmms_eval/tasks/chartqa/chartqa.yaml mode change 100644 => 100755 lmms_eval/tasks/chartqa/upload_chartqa.py mode change 100644 
=> 100755 lmms_eval/tasks/chartqa/utils.py mode change 100644 => 100755 lmms_eval/tasks/cmmmu/_cmmmu.yaml mode change 100644 => 100755 lmms_eval/tasks/cmmmu/_default_template_cmmmu_yaml mode change 100644 => 100755 lmms_eval/tasks/cmmmu/cmmmu_test.yaml mode change 100644 => 100755 lmms_eval/tasks/cmmmu/cmmmu_val.yaml mode change 100644 => 100755 lmms_eval/tasks/cmmmu/utils.py mode change 100644 => 100755 lmms_eval/tasks/coco_cap/coco2014_cap.yaml mode change 100644 => 100755 lmms_eval/tasks/coco_cap/coco2014_cap_test.yaml mode change 100644 => 100755 lmms_eval/tasks/coco_cap/coco2014_cap_val.yaml mode change 100644 => 100755 lmms_eval/tasks/coco_cap/coco2017_cap.yaml mode change 100644 => 100755 lmms_eval/tasks/coco_cap/coco2017_cap_test.yaml mode change 100644 => 100755 lmms_eval/tasks/coco_cap/coco2017_cap_val.yaml mode change 100644 => 100755 lmms_eval/tasks/coco_cap/coco_cap.yaml mode change 100644 => 100755 lmms_eval/tasks/coco_cap/utils.py mode change 100644 => 100755 lmms_eval/tasks/docvqa/_default_template_docvqa_yaml mode change 100644 => 100755 lmms_eval/tasks/docvqa/docvqa.yaml mode change 100644 => 100755 lmms_eval/tasks/docvqa/docvqa_test.yaml mode change 100644 => 100755 lmms_eval/tasks/docvqa/docvqa_val.yaml mode change 100644 => 100755 lmms_eval/tasks/docvqa/utils.py mode change 100644 => 100755 lmms_eval/tasks/ferret/ferret.yaml mode change 100644 => 100755 lmms_eval/tasks/ferret/rule.json mode change 100644 => 100755 lmms_eval/tasks/ferret/utils.py mode change 100644 => 100755 lmms_eval/tasks/flickr30k/flickr30k.yaml mode change 100644 => 100755 lmms_eval/tasks/flickr30k/flickr30k_test.yaml mode change 100644 => 100755 lmms_eval/tasks/flickr30k/utils.py mode change 100644 => 100755 lmms_eval/tasks/gqa/gqa.yaml mode change 100644 => 100755 lmms_eval/tasks/gqa/utils.py mode change 100644 => 100755 lmms_eval/tasks/hallusion_bench/evaluate_hb.py mode change 100644 => 100755 lmms_eval/tasks/hallusion_bench/hallusion_bench_image.yaml mode change 100644 => 100755 lmms_eval/tasks/hallusion_bench/utils.py mode change 100644 => 100755 lmms_eval/tasks/iconqa/_default_template_docvqa_yaml mode change 100644 => 100755 lmms_eval/tasks/iconqa/iconqa.yaml mode change 100644 => 100755 lmms_eval/tasks/iconqa/iconqa_test.yaml mode change 100644 => 100755 lmms_eval/tasks/iconqa/iconqa_val.yaml mode change 100644 => 100755 lmms_eval/tasks/iconqa/utils.py mode change 100644 => 100755 lmms_eval/tasks/infovqa/_default_template_infovqa_yaml mode change 100644 => 100755 lmms_eval/tasks/infovqa/infovqa.yaml mode change 100644 => 100755 lmms_eval/tasks/infovqa/infovqa_test.yaml mode change 100644 => 100755 lmms_eval/tasks/infovqa/infovqa_val.yaml mode change 100644 => 100755 lmms_eval/tasks/infovqa/utils.py mode change 100644 => 100755 lmms_eval/tasks/llava-bench-coco/llava-bench-coco.yaml mode change 100644 => 100755 lmms_eval/tasks/llava-bench-coco/rule.json mode change 100644 => 100755 lmms_eval/tasks/llava-bench-coco/utils.py mode change 100644 => 100755 lmms_eval/tasks/llava-in-the-wild/llava-in-the-wild.yaml mode change 100644 => 100755 lmms_eval/tasks/llava-in-the-wild/rule.json mode change 100644 => 100755 lmms_eval/tasks/llava-in-the-wild/utils.py mode change 100644 => 100755 lmms_eval/tasks/mathvista/mathvista.yaml mode change 100644 => 100755 lmms_eval/tasks/mathvista/mathvista_evals.py mode change 100644 => 100755 lmms_eval/tasks/mathvista/mathvista_test.yaml mode change 100644 => 100755 lmms_eval/tasks/mathvista/mathvista_testmini.yaml mode change 100644 => 100755 
lmms_eval/tasks/mathvista/utils.py create mode 100644 lmms_eval/tasks/mix_evals/_default_template_yaml create mode 100755 lmms_eval/tasks/mix_evals/mix_evals_video2text.yaml create mode 100755 lmms_eval/tasks/mix_evals/mix_evals_video2text_freeform.yaml create mode 100755 lmms_eval/tasks/mix_evals/mix_evals_video2text_mc.yaml create mode 100755 lmms_eval/tasks/mix_evals/mix_evals_video2text_openended.yaml create mode 100755 lmms_eval/tasks/mix_evals/utils.py mode change 100644 => 100755 lmms_eval/tasks/mmbench/cc_utils.py mode change 100644 => 100755 lmms_eval/tasks/mmbench/cn_utils.py mode change 100644 => 100755 lmms_eval/tasks/mmbench/en_utils.py mode change 100644 => 100755 lmms_eval/tasks/mmbench/mmbench.yaml mode change 100644 => 100755 lmms_eval/tasks/mmbench/mmbench_cc.yaml mode change 100644 => 100755 lmms_eval/tasks/mmbench/mmbench_cn.yaml mode change 100644 => 100755 lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml mode change 100644 => 100755 lmms_eval/tasks/mmbench/mmbench_cn_test.yaml mode change 100644 => 100755 lmms_eval/tasks/mmbench/mmbench_en.yaml mode change 100644 => 100755 lmms_eval/tasks/mmbench/mmbench_en_dev.yaml mode change 100644 => 100755 lmms_eval/tasks/mmbench/mmbench_en_test.yaml mode change 100644 => 100755 lmms_eval/tasks/mmbench/mmbench_evals.py mode change 100644 => 100755 lmms_eval/tasks/mme/mme.yaml mode change 100644 => 100755 lmms_eval/tasks/mme/utils.py mode change 100644 => 100755 lmms_eval/tasks/mmmu/mmmu.yaml mode change 100644 => 100755 lmms_eval/tasks/mmmu/mmmu_test.yaml mode change 100644 => 100755 lmms_eval/tasks/mmmu/mmmu_val.yaml mode change 100644 => 100755 lmms_eval/tasks/mmmu/utils.py mode change 100644 => 100755 lmms_eval/tasks/mmvet/mmvet.yaml mode change 100644 => 100755 lmms_eval/tasks/mmvet/utils.py mode change 100644 => 100755 lmms_eval/tasks/multidocvqa/multidocvqa.yaml mode change 100644 => 100755 lmms_eval/tasks/multidocvqa/multidocvqa_test.yaml mode change 100644 => 100755 lmms_eval/tasks/multidocvqa/multidocvqa_val.yaml mode change 100644 => 100755 lmms_eval/tasks/multidocvqa/utils.py mode change 100644 => 100755 lmms_eval/tasks/nocaps/_default_template_nocaps_yaml mode change 100644 => 100755 lmms_eval/tasks/nocaps/nocaps.yaml mode change 100644 => 100755 lmms_eval/tasks/nocaps/nocaps_test.yaml mode change 100644 => 100755 lmms_eval/tasks/nocaps/nocaps_val.yaml mode change 100644 => 100755 lmms_eval/tasks/nocaps/utils.py mode change 100644 => 100755 lmms_eval/tasks/ok_vqa/_default_template_vqa_yaml mode change 100644 => 100755 lmms_eval/tasks/ok_vqa/_generate_config.py mode change 100644 => 100755 lmms_eval/tasks/ok_vqa/_ok_vqa.yaml mode change 100644 => 100755 lmms_eval/tasks/ok_vqa/ok_vqa_val2014.yaml mode change 100644 => 100755 lmms_eval/tasks/ok_vqa/utils.py mode change 100644 => 100755 lmms_eval/tasks/pope/pope.yaml mode change 100644 => 100755 lmms_eval/tasks/pope/utils.py create mode 100644 lmms_eval/tasks/realworldqa/realworldqa.yaml create mode 100644 lmms_eval/tasks/realworldqa/utils.py mode change 100644 => 100755 lmms_eval/tasks/refcoco+/_default_template_bbox_yaml mode change 100644 => 100755 lmms_eval/tasks/refcoco+/_default_template_seg_yaml mode change 100644 => 100755 lmms_eval/tasks/refcoco+/_generate_config.py mode change 100644 => 100755 lmms_eval/tasks/refcoco+/_refcoco.yaml mode change 100644 => 100755 lmms_eval/tasks/refcoco+/refcoco+_bbox_testA.yaml mode change 100644 => 100755 lmms_eval/tasks/refcoco+/refcoco+_bbox_testB.yaml mode change 100644 => 100755 lmms_eval/tasks/refcoco+/refcoco+_bbox_val.yaml mode 
change 100644 => 100755 lmms_eval/tasks/refcoco+/refcoco+_seg_testA.yaml mode change 100644 => 100755 lmms_eval/tasks/refcoco+/refcoco+_seg_testB.yaml mode change 100644 => 100755 lmms_eval/tasks/refcoco+/refcoco+_seg_val.yaml mode change 100644 => 100755 lmms_eval/tasks/refcoco+/utils.py mode change 100644 => 100755 lmms_eval/tasks/refcoco/_default_template_bbox_yaml mode change 100644 => 100755 lmms_eval/tasks/refcoco/_default_template_seg_yaml mode change 100644 => 100755 lmms_eval/tasks/refcoco/_generate_config.py mode change 100644 => 100755 lmms_eval/tasks/refcoco/_refcoco.yaml mode change 100644 => 100755 lmms_eval/tasks/refcoco/refcoco_bbox_test.yaml mode change 100644 => 100755 lmms_eval/tasks/refcoco/refcoco_bbox_testA.yaml mode change 100644 => 100755 lmms_eval/tasks/refcoco/refcoco_bbox_testB.yaml mode change 100644 => 100755 lmms_eval/tasks/refcoco/refcoco_bbox_val.yaml mode change 100644 => 100755 lmms_eval/tasks/refcoco/refcoco_seg_test.yaml mode change 100644 => 100755 lmms_eval/tasks/refcoco/refcoco_seg_testA.yaml mode change 100644 => 100755 lmms_eval/tasks/refcoco/refcoco_seg_testB.yaml mode change 100644 => 100755 lmms_eval/tasks/refcoco/refcoco_seg_val.yaml mode change 100644 => 100755 lmms_eval/tasks/refcoco/utils.py mode change 100644 => 100755 lmms_eval/tasks/refcocog/_default_template_bbox_yaml mode change 100644 => 100755 lmms_eval/tasks/refcocog/_default_template_seg_yaml mode change 100644 => 100755 lmms_eval/tasks/refcocog/_generate_config.py mode change 100644 => 100755 lmms_eval/tasks/refcocog/_refcoco.yaml mode change 100644 => 100755 lmms_eval/tasks/refcocog/refcocog_bbox_test.yaml mode change 100644 => 100755 lmms_eval/tasks/refcocog/refcocog_bbox_val.yaml mode change 100644 => 100755 lmms_eval/tasks/refcocog/refcocog_seg_test.yaml mode change 100644 => 100755 lmms_eval/tasks/refcocog/refcocog_seg_val.yaml mode change 100644 => 100755 lmms_eval/tasks/refcocog/utils.py mode change 100644 => 100755 lmms_eval/tasks/scienceqa/scienceqa.yaml mode change 100644 => 100755 lmms_eval/tasks/scienceqa/scienceqa_full.yaml mode change 100644 => 100755 lmms_eval/tasks/scienceqa/scienceqa_img.yaml mode change 100644 => 100755 lmms_eval/tasks/scienceqa/utils.py mode change 100644 => 100755 lmms_eval/tasks/seedbench/seedbench.yaml mode change 100644 => 100755 lmms_eval/tasks/seedbench/seedbench_ppl.yaml mode change 100644 => 100755 lmms_eval/tasks/seedbench/utils.py mode change 100644 => 100755 lmms_eval/tasks/seedbench_2/seedbench_2.yaml mode change 100644 => 100755 lmms_eval/tasks/seedbench_2/utils.py create mode 100644 lmms_eval/tasks/sft_eval/_default_sft_eval_ocr_rec_template_yaml create mode 100644 lmms_eval/tasks/sft_eval/_default_sft_eval_rest_template_yaml create mode 100644 lmms_eval/tasks/sft_eval/_generate_config.py create mode 100644 lmms_eval/tasks/sft_eval/_sft_eval.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_activity.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_arts.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_body.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_car.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_color.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_commodity.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_count.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_daily.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_engineer.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_entertainment.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_exist.yaml create mode 100644 
lmms_eval/tasks/sft_eval/sft_face.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_food.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_healthcare.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_landmark.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_logo.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_natural.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_ocr_qa_adv.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_ocr_qa_chart.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_ocr_qa_form.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_ocr_qa_scene.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_ocr_qa_screen.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_ocr_rec_adv.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_ocr_rec_doc.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_ocr_rec_handwrite.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_ocr_rec_markdown.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_ocr_rec_scene.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_ocr_rec_screen.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_place.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_position.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_sport.yaml create mode 100644 lmms_eval/tasks/sft_eval/sft_status.yaml create mode 100644 lmms_eval/tasks/sft_eval/utils.py mode change 100644 => 100755 lmms_eval/tasks/stvqa/stvqa.yaml mode change 100644 => 100755 lmms_eval/tasks/stvqa/utils.py mode change 100644 => 100755 lmms_eval/tasks/textcaps/_default_template_textcaps_yaml mode change 100644 => 100755 lmms_eval/tasks/textcaps/textcaps.yaml mode change 100644 => 100755 lmms_eval/tasks/textcaps/textcaps_test.yaml mode change 100644 => 100755 lmms_eval/tasks/textcaps/textcaps_train.yaml mode change 100644 => 100755 lmms_eval/tasks/textcaps/textcaps_val.yaml mode change 100644 => 100755 lmms_eval/tasks/textcaps/utils.py mode change 100644 => 100755 lmms_eval/tasks/textvqa/_default_template_textvqa_yaml mode change 100644 => 100755 lmms_eval/tasks/textvqa/_textvqa.yaml mode change 100644 => 100755 lmms_eval/tasks/textvqa/textvqa_test.yaml mode change 100644 => 100755 lmms_eval/tasks/textvqa/textvqa_val.yaml mode change 100644 => 100755 lmms_eval/tasks/textvqa/utils.py mode change 100644 => 100755 lmms_eval/tasks/vizwiz_vqa/_default_template_vqa_yaml mode change 100644 => 100755 lmms_eval/tasks/vizwiz_vqa/_generate_config.py mode change 100644 => 100755 lmms_eval/tasks/vizwiz_vqa/_vizwiz_vqa.yaml mode change 100644 => 100755 lmms_eval/tasks/vizwiz_vqa/utils.py mode change 100644 => 100755 lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_test.yaml mode change 100644 => 100755 lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_val.yaml mode change 100644 => 100755 lmms_eval/tasks/vqav2/_default_template_vqav2_yaml mode change 100644 => 100755 lmms_eval/tasks/vqav2/_vqav2.yaml mode change 100644 => 100755 lmms_eval/tasks/vqav2/utils.py mode change 100644 => 100755 lmms_eval/tasks/vqav2/vqav2_test.yaml mode change 100644 => 100755 lmms_eval/tasks/vqav2/vqav2_val.yaml create mode 100644 lmms_eval/tasks/worldqa/_default_template_yaml create mode 100755 lmms_eval/tasks/worldqa/utils.py create mode 100755 lmms_eval/tasks/worldqa/worldqa_generation.yaml create mode 100755 lmms_eval/tasks/worldqa/worldqa_mc.yaml create mode 100755 lmms_eval/tasks/worldqa/worldqa_mcppl.yaml mode change 100644 => 100755 lmms_eval/utils.py mode change 100644 => 100755 miscs/llava_result_check.md mode change 100644 => 100755 miscs/repr_scripts.sh mode change 100644 => 
100755 miscs/repr_torch_envs.txt mode change 100644 => 100755 miscs/scienceqa_id.txt mode change 100644 => 100755 miscs/script.sh mode change 100644 => 100755 miscs/test_llava.py mode change 100644 => 100755 miscs/test_scienceqa.py mode change 100644 => 100755 pyproject.toml mode change 100644 => 100755 setup.py mode change 100644 => 100755 tools/make_hf_dataset.ipynb diff --git a/.github/issue_template.md b/.github/issue_template.md old mode 100644 new mode 100755 diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md old mode 100644 new mode 100755 diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml old mode 100644 new mode 100755 diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/docs/README.md b/docs/README.md old mode 100644 new mode 100755 diff --git a/docs/commands.md b/docs/commands.md old mode 100644 new mode 100755 diff --git a/docs/model_guide.md b/docs/model_guide.md old mode 100644 new mode 100755 diff --git a/docs/task_guide.md b/docs/task_guide.md old mode 100644 new mode 100755 diff --git a/example_eval.yaml b/example_eval.yaml old mode 100644 new mode 100755 index 40e29a85..6750ee7a --- a/example_eval.yaml +++ b/example_eval.yaml @@ -1,15 +1,8 @@ - model: llava model_args: pretrained=liuhaotian/llava-v1.5-7b - tasks: ai2d + tasks: mmmu_val batch_size: 1 log_samples: true - log_samples_suffix: eval_vizwiz_vqa + log_samples_suffix: eval_mmmu output_path: "./logs/" -- model: llava - model_args: pretrained=liuhaotian/llava-v1.5-13b - tasks: mme - batch_size: 1 - log_samples: true - log_samples_suffix: mme - output_path: "./logs/" diff --git a/llava_repr_requirements.txt b/llava_repr_requirements.txt old mode 100644 new mode 100755 index e3f0f527..c09b6f5b --- a/llava_repr_requirements.txt +++ b/llava_repr_requirements.txt @@ -27,23 +27,6 @@ shortuuid==1.0.12 sqlitedict==2.1.0 tenacity==8.2.3 torch==2.0.1 -openai>=1.0.0 -pycocoevalcap tokenizers==0.15.2 tqdm==4.66.2 -tqdm-multiprocess transformers==4.37.2 -zstandard -pillow -pyyaml -sympy -mpmath -Jinja2 -openpyxl -Levenshtein -hf_transfer -tenacity -wandb>=0.16.0 -transformers-stream-generator -tiktoken -pre-commit \ No newline at end of file diff --git a/lmms_eval/__init__.py b/lmms_eval/__init__.py old mode 100644 new mode 100755 diff --git a/lmms_eval/__main__.py b/lmms_eval/__main__.py old mode 100644 new mode 100755 diff --git a/lmms_eval/api/__init__.py b/lmms_eval/api/__init__.py old mode 100644 new mode 100755 diff --git a/lmms_eval/api/filter.py b/lmms_eval/api/filter.py old mode 100644 new mode 100755 diff --git a/lmms_eval/api/instance.py b/lmms_eval/api/instance.py old mode 100644 new mode 100755 diff --git a/lmms_eval/api/metrics.py b/lmms_eval/api/metrics.py old mode 100644 new mode 100755 diff --git a/lmms_eval/api/model.py b/lmms_eval/api/model.py old mode 100644 new mode 100755 diff --git a/lmms_eval/api/registry.py b/lmms_eval/api/registry.py old mode 100644 new mode 100755 diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py old mode 100644 new mode 100755 index f77065e8..2cecfe22 --- a/lmms_eval/api/samplers.py +++ b/lmms_eval/api/samplers.py @@ -37,7 +37,9 @@ def get_context(self, doc, num_fewshot): + ( str(self.doc_to_target(doc)[0]) if type(self.doc_to_target(doc)) is list - else self.doc_to_target(doc) if (self.config.doc_to_choice is None or 
type(self.doc_to_target(doc)) is str) else str(self.doc_to_choice(doc)[self.doc_to_target(doc)])
+                    else self.doc_to_target(doc)
+                    if (self.config.doc_to_choice is None or type(self.doc_to_target(doc)) is str)
+                    else str(self.doc_to_choice(doc)[self.doc_to_target(doc)])
                 )
                 for doc in selected_docs
             ]
diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py
old mode 100644
new mode 100755
index 3262937c..0d5c0815
--- a/lmms_eval/api/task.py
+++ b/lmms_eval/api/task.py
@@ -7,6 +7,8 @@
 import ast
 import logging
 import random
+from glob import glob
+import shutil
 from tqdm import tqdm
 
 import datasets
@@ -18,6 +20,7 @@
 from typing import Union, List, Any
 from collections.abc import Callable
 from tenacity import retry, stop_after_attempt, wait_fixed
+from huggingface_hub import snapshot_download
 
 from lmms_eval import utils
 from lmms_eval.api import samplers
@@ -678,6 +681,26 @@ def _prepare_metric_and_aggregation(self):
 
     @retry(stop=stop_after_attempt(5), wait=wait_fixed(2))
     def download(self, dataset_kwargs=None) -> None:
+        # If the dataset is a video dataset,
+        # recursively search whether there is a zip and unzip it to the huggingface home
+        if dataset_kwargs is not None and "video" in dataset_kwargs and dataset_kwargs["video"]:
+            hf_home = os.environ["HF_HOME"]
+            cache_dir = dataset_kwargs["cache_dir"]
+
+            cache_dir = os.path.join(hf_home, cache_dir)
+            cache_path = snapshot_download(repo_id=self.DATASET_PATH, repo_type="dataset")
+            zip_files = glob(os.path.join(cache_path, "**/*.zip"), recursive=True)
+            if not os.path.exists(cache_dir):
+                for zip_file in zip_files:
+                    shutil.unpack_archive(zip_file, cache_dir)
+
+            if "builder_script" in dataset_kwargs:
+                builder_script = dataset_kwargs["builder_script"]
+                self.DATASET_PATH = os.path.join(cache_path, builder_script)
+                dataset_kwargs.pop("builder_script")
+
+            dataset_kwargs.pop("cache_dir")
+            dataset_kwargs.pop("video")
         download_config = DownloadConfig()
         download_config.max_retries = dataset_kwargs.get("max_retries", 3) if dataset_kwargs is not None else 3
         download_config.num_proc = dataset_kwargs.get("num_proc", 8) if dataset_kwargs is not None else 8
@@ -973,6 +996,8 @@ def construct_requests(self, doc_id: int, ctx: str, **kwargs) -> Union[List[Inst
         return Instance(request_type=self.OUTPUT_TYPE, arguments=arguments, idx=0, **kwargs)
 
     def process_results(self, doc, results):
+        # if self.OUTPUT_TYPE == "generate_until":
+        #     results[0] = results[0].strip()
         if callable(self.config.process_results):
             return self.config.process_results(doc, results)
 
diff --git a/lmms_eval/evaluator.py b/lmms_eval/evaluator.py
old mode 100644
new mode 100755
diff --git a/lmms_eval/filters/__init__.py b/lmms_eval/filters/__init__.py
old mode 100644
new mode 100755
index a4242343..38ab7d8a
--- a/lmms_eval/filters/__init__.py
+++ b/lmms_eval/filters/__init__.py
@@ -1,4 +1,4 @@
-from lmms_eval.api.filter import FilterEnsemble
+from lmms_eval.api.filter import FilterEnsemble, Filter
 from . import selection
 from . import extraction
 from . import transformation
@@ -13,6 +13,7 @@
     "lowercase": transformation.LowercaseFilter,
     "uppercase": transformation.UppercaseFilter,
     "map": transformation.MapFilter,
+    "multi_choice_regex": extraction.MultiChoiceRegexFilter,
     # TODO: implement this filter. either it should take in an arbitrary "scoring"/reward function
     # that takes an input and returns a scalar and then should select the max reward,
     # or should implement different filters for different ways of handling a reward model's inference.
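The "multi_choice_regex" entry registered above points at the MultiChoiceRegexFilter defined in the lmms_eval/filters/extraction.py hunk that follows. As a rough standalone illustration of the extraction idea only (this is not code from the patch; the helper name extract_choice and the toy choices are invented for the example): first look for an explicit parenthesised letter such as "(B)", then fall back to matching one of the choice strings and mapping it back to its letter.

import re

# Hypothetical helper for illustration only -- not the MultiChoiceRegexFilter class itself.
def extract_choice(response, choices, fallback="[invalid]"):
    # Step 0: an explicit lettered answer such as "(B)" anywhere in the response.
    letters = re.findall(r"\(([A-Z])\)", response)
    if letters:
        return f"({letters[0]})"
    # Step 1: fall back to matching the choice text and mapping it back to a letter.
    for idx, choice in enumerate(choices):
        if choice.lower() in response.lower():
            return f"({chr(ord('A') + idx)})"
    return fallback

print(extract_choice("The answer is (C).", ["cat", "dog", "bird"]))    # -> (C)
print(extract_choice("It is probably a dog.", ["cat", "dog", "bird"]))  # -> (B)
print(extract_choice("No idea.", ["cat", "dog", "bird"]))               # -> [invalid]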
diff --git a/lmms_eval/filters/decontamination.py b/lmms_eval/filters/decontamination.py
old mode 100644
new mode 100755
diff --git a/lmms_eval/filters/extraction.py b/lmms_eval/filters/extraction.py
old mode 100644
new mode 100755
index 3f85899b..329d7540
--- a/lmms_eval/filters/extraction.py
+++ b/lmms_eval/filters/extraction.py
@@ -1,18 +1,47 @@
 import re
-
+import sys
+import unicodedata
 from lmms_eval.api.filter import Filter
 
 
+class WhitespaceFilter(Filter):
+    """ """
+
+    def __init__(self) -> None:
+        pass
+
+    def apply(self, resps, docs):
+        def filter_set(inst):
+            filtered_resp = []
+            for resp in inst:
+                if resp.startswith(" "):
+                    resp = resp[1:]
+
+                filtered_resp.append(resp)
+
+            return filtered_resp
+
+        filtered_resps = [filter_set(resp) for resp in resps]
+
+        return filtered_resps
+
+
 class RegexFilter(Filter):
     """ """
 
-    def __init__(self, regex_pattern: str = r"#### (\-?[0-9\.\,]+)", fallback: str = "[invalid]") -> None:
+    def __init__(
+        self,
+        regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
+        group_select=0,
+        fallback: str = "[invalid]",
+    ) -> None:
         """
         pass a string `regex` to run `re.compile(r"regex")` on.
         `fallback` defines the output returned if no matches for the regex are located.
         """
         self.regex_pattern = regex_pattern
         self.regex = re.compile(regex_pattern)
+        self.group_select = group_select
         self.fallback = fallback
 
     def apply(self, resps, docs):
@@ -23,9 +52,12 @@ def apply(self, resps, docs):
         def filter_set(inst):
             filtered = []
             for resp in inst:
-                match = self.regex.search(resp)
+                match = self.regex.findall(resp)
                 if match:
-                    match = match.group(1).strip()
+                    match = match[self.group_select]
+                    if isinstance(match, tuple):
+                        match = [m for m in match if m][0]
+                    match = match.strip()
                 else:
                     match = self.fallback
                 filtered.append(match)
@@ -38,23 +70,145 @@ def filter_set(inst):
         return filtered_resps
 
 
-class WhitespaceFilter(Filter):
-    """ """
+class MultiChoiceRegexFilter(RegexFilter):
+    """
+    A filter used to extract a model's answer on multiple choice questions with
+    letter answers. Assumes each document has a "choices" field
+    containing the list of answer choices and that the answer label symbols
+    are of the form (A), (B), (C), ... or A, B, C.
+    """
 
-    def __init__(self) -> None:
-        pass
+    def __init__(
+        self,
+        regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
+        group_select=0,
+        fallback: str = "[invalid]",
+        ignore_case=False,
+        ignore_punctuation=False,
+        regexes_to_ignore=None,
+    ) -> None:
+        """
+        regex_pattern: The basic regex pattern to use. If it fails to match, we will use the customized match procedure
+            - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
+            - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
+        group_select: Selects the (group_select)th match from the findall result.
+        ignore_case: Ignores the case during step 1 matching
+        ignore_punctuation: Remove the punctuation during step 1 matching
+        regexes_to_ignore: Remove these regexes during step 1 matching
+        """
+        super().__init__(regex_pattern, group_select, fallback)
+        self.ignore_case = ignore_case
+        self.ignore_punctuation = ignore_punctuation
+        self.regexes_to_ignore = regexes_to_ignore
 
     def apply(self, resps, docs):
-        def filter_set(inst):
-            filtered_resp = []
-            for resp in inst:
-                if resp.startswith(" "):
-                    resp = resp[1:]
+        # here, we assume we have a list, in which each element is
+        # a list of model responses for some particular input/target pair.
+ # so we process each of these (same input/target response sets) + # independently (and keep them a list.) - filtered_resp.append(resp) + def find_match(regex, resp, convert_dict={}): + match = regex.findall(resp) + if match: + match = match[self.group_select] + if isinstance(match, tuple): + match = [m for m in match if m][0] + match = match.strip() + if match and match in convert_dict: + match = convert_dict[match] + return match - return filtered_resp + punct_tbl = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P")) - filtered_resps = [filter_set(resp) for resp in resps] + def filter_ignores(st): + if self.regexes_to_ignore is not None: + for s in self.regexes_to_ignore: + st = re.sub(s, "", st) + + if self.ignore_case: + st = st.lower() + + if self.ignore_punctuation: + # https://stackoverflow.com/a/266162 + st = st.translate(punct_tbl) + return st + + filtered_resps = [] + + for r, doc in zip(resps, docs): + fallback_regexes = [] + choice_to_alpha = {} + next_alpha = "A" + + without_paren_fallback_regexes = [] + without_paren_to_target = {} + + choices = doc["choices"] + for c in choices: + m = filter_ignores(c.strip()) + fallback_regexes.append(f"{re.escape(m)}") + choice_to_alpha[m] = f"({next_alpha})" + + without_paren_fallback_regexes.append(next_alpha) + without_paren_to_target[next_alpha] = f"({next_alpha})" + + next_alpha = chr(ord(next_alpha) + 1) + fallback_regex = re.compile("|".join(fallback_regexes)) + without_paren_fallback_regex = "|".join(without_paren_fallback_regexes) + without_paren_fallback_regex = re.compile(f":[\s]*({without_paren_fallback_regex})") + + filtered = [] + for resp in r: + match = find_match(self.regex, resp) + if not match: + match = find_match(fallback_regex, filter_ignores(resp), choice_to_alpha) + if not match: + match = find_match(without_paren_fallback_regex, resp, without_paren_to_target) + if not match: + match = self.fallback + filtered.append(match) + filtered_resps.append(filtered) return filtered_resps + + +class ExtendedRegexFilter(RegexFilter): + punct_tbl = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P")) + + def __init__( + self, + regex_pattern: str = r"#### (\-?[0-9\.\,]+)", + group_select=0, + fallback: str = "[invalid]", + ignore_case=False, + ignore_punctuation=False, + regexes_to_ignore=None, + ) -> None: + super().__init__(regex_pattern, group_select, fallback) + self.ignore_case = ignore_case + self.ignore_punctuation = ignore_punctuation + self.regexes_to_ignore = regexes_to_ignore + + def filter_ignores(self, st): + if self.regexes_to_ignore is not None: + for s in self.regexes_to_ignore: + st = re.sub(s, "", st) + + if self.ignore_case: + st = st.lower() + + if self.ignore_punctuation: + # https://stackoverflow.com/a/266162 + st = st.translate(self.punct_tbl) + return st + + def find_match(self, regex, resp, convert_dict={}): + match = regex.findall(resp) + if match: + match = match[self.group_select] + if isinstance(match, tuple): + match = [m for m in match if m][0] + match = match.strip() + if match and match in convert_dict: + match = convert_dict[match] + return match diff --git a/lmms_eval/filters/selection.py b/lmms_eval/filters/selection.py old mode 100644 new mode 100755 diff --git a/lmms_eval/filters/transformation.py b/lmms_eval/filters/transformation.py old mode 100644 new mode 100755 diff --git a/lmms_eval/logging_utils.py b/lmms_eval/logging_utils.py old mode 100644 new mode 100755 index 21a2ee04..6107d21b --- 
a/lmms_eval/logging_utils.py +++ b/lmms_eval/logging_utils.py @@ -89,10 +89,10 @@ def finish(self): def init_run(self): if "name" not in self.wandb_args: if "config" in self.all_args_dict and self.all_args_dict["config"] != "": - self.wandb_args["name"] = self.all_args_dict["config"].split("/")[-1].replace(".yaml", "") + "_" + self.args.log_samples_suffix + self.wandb_args["name"] = self.all_args_dict["config"].split("/")[-1].replace(".yaml", "") + "/" + self.args.log_samples_suffix else: task_names = self.args.tasks.replace(",", "/") - self.wandb_args["name"] = f"{self.args.model}_{task_names}_{self.args.log_samples_suffix}" + self.wandb_args["name"] = f"{self.args.model}/<{task_names}>/{self.args.log_samples_suffix}" if self.args.num_fewshot: self.wandb_args["name"] += f"_{self.args.num_fewshot}shot" if "project" not in self.wandb_args: @@ -119,6 +119,7 @@ def _get_config(self) -> Dict[str, Any]: def _sanitize_results_dict(self) -> Tuple[Dict[str, str], Dict[str, Any]]: """Sanitize the results dictionary.""" _results = copy.deepcopy(self.results.get("results", dict())) + _results["model_configs"] = self.results.get("model_configs", dict()) # Remove None from the metric string name tmp_results = copy.deepcopy(_results) @@ -138,15 +139,18 @@ def _sanitize_results_dict(self) -> Tuple[Dict[str, str], Dict[str, Any]]: if isinstance(metric_value, str): wandb_summary[f"{task}/{metric_name}"] = metric_value + wandb_summary["model_configs"] = self.results.get("model_configs", dict()) for summary_metric, summary_value in wandb_summary.items(): - _task, _summary_metric = summary_metric.split("/") - _results[_task].pop(_summary_metric) + if summary_metric != "model_configs": + _task, _summary_metric = summary_metric.split("/") + _results[_task].pop(_summary_metric) tmp_results = copy.deepcopy(_results) for task_name, task_results in tmp_results.items(): - for metric_name, metric_value in task_results.items(): - _results[f"{task_name}/{metric_name}"] = metric_value - _results[task_name].pop(metric_name) + if task_name != "model_configs": + for metric_name, metric_value in task_results.items(): + _results[f"{task_name}/{metric_name}"] = metric_value + _results[task_name].pop(metric_name) for task in self.task_names: _results.pop(task) diff --git a/lmms_eval/models/__init__.py b/lmms_eval/models/__init__.py old mode 100644 new mode 100755 index 9135311c..88c0d37e --- a/lmms_eval/models/__init__.py +++ b/lmms_eval/models/__init__.py @@ -1,14 +1,6 @@ import os -AVAILABLE_MODELS = { - "llava": "Llava", - "llava_hf": "LlavaHf", - "qwen_vl": "Qwen_VL", - "fuyu": "Fuyu", - "gpt4v": "GPT4V", - "instructblip": "InstructBLIP", - "minicpm_v": "MiniCPM_V", -} +AVAILABLE_MODELS = {"llava": "Llava", "llava_hf": "LlavaHf", "qwen_vl": "Qwen_VL", "fuyu": "Fuyu", "gpt4v": "GPT4V", "instructblip": "InstructBLIP", "minicpm_v": "MiniCPM_V", "llava_vid": "LlavaVid"} for model_name, model_class in AVAILABLE_MODELS.items(): try: @@ -16,7 +8,6 @@ except ImportError: pass - import hf_transfer os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" diff --git a/lmms_eval/models/fuyu.py b/lmms_eval/models/fuyu.py old mode 100644 new mode 100755 index 173fa6b9..34290b9f --- a/lmms_eval/models/fuyu.py +++ b/lmms_eval/models/fuyu.py @@ -204,7 +204,7 @@ def _collate(x): # generation_output = self.model.generate( # **model_inputs, temperature=gen_kwargs["temperature"], max_new_tokens=gen_kwargs["max_new_tokens"], top_p=gen_kwargs["top_p"], num_beams=gen_kwargs["num_beams"], pad_token_id=self.tokenizer.eos_token_id # ) - generation_output = 
self.model.generate(**model_inputs, max_new_tokens=gen_kwargs["max_new_tokens"]) + generation_output = self.model.generate(**model_inputs, max_new_tokens=gen_kwargs["max_new_tokens"], pad_token_id=self.tokenizer.eos_token_id) generation_texts = self.processor.batch_decode(generation_output, skip_special_tokens=True) response = [gen_text.split("\x04")[1].strip(" ").strip("\n") for gen_text in generation_texts] res.extend(response) diff --git a/lmms_eval/models/gpt4v.py b/lmms_eval/models/gpt4v.py old mode 100644 new mode 100755 index d2ec2025..33c632a3 --- a/lmms_eval/models/gpt4v.py +++ b/lmms_eval/models/gpt4v.py @@ -35,19 +35,24 @@ } -@register_model("gpt4V") +@register_model("gpt4v") class GPT4V(lmms): - def __init__(self, **kwargs) -> None: + def __init__( + self, + model_version: str = "gpt-4-vision-preview", + **kwargs, + ) -> None: super().__init__() # Manually set a image token for GPT4V so that we can search for it # and split the text and image # Here we just use the same token as llava for convenient + self.model_version = model_version self.image_token = "" # Function to encode the image def encode_image(self, image: Image): output_buffer = BytesIO() - image.save(output_buffer, format="JPEG") + image.save(output_buffer, format="PNG") byte_data = output_buffer.getvalue() base64_str = base64.b64encode(byte_data).decode("utf-8") return base64_str @@ -72,7 +77,7 @@ def generate_until(self, requests) -> List[str]: img = self.encode_image(visual) imgs.append(img) - payload = {"model": "gpt-4-vision-preview", "messages": []} + payload = {"model": self.model_version, "messages": []} response_json = {"role": "user", "content": []} # When there is no image token in the context, append the image to the text if self.image_token not in contexts: @@ -102,8 +107,8 @@ def generate_until(self, requests) -> List[str]: if "num_beams" not in gen_kwargs: gen_kwargs["num_beams"] = 1 - # payload["max_tokens"] = gen_kwargs["max_new_tokens"] - # payload["temperature"] = gen_kwargs["temperature"] + payload["max_tokens"] = gen_kwargs["max_new_tokens"] + payload["temperature"] = gen_kwargs["temperature"] for attempt in range(5): try: diff --git a/lmms_eval/models/instructblip.py b/lmms_eval/models/instructblip.py old mode 100644 new mode 100755 index 6f83d599..a99d20ea --- a/lmms_eval/models/instructblip.py +++ b/lmms_eval/models/instructblip.py @@ -9,6 +9,7 @@ from accelerate import Accelerator, DistributedType from accelerate.state import AcceleratorState from typing import List, Optional, Union, Tuple +import transformers from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration from lmms_eval.utils import stop_sequences_criteria @@ -19,6 +20,7 @@ warnings.filterwarnings("ignore") eval_logger = logging.getLogger("lmms-eval") +transformers.logging.set_verbosity_error() @register_model("instructblip") diff --git a/lmms_eval/models/llava.py b/lmms_eval/models/llava.py old mode 100644 new mode 100755 index a735f82c..0fa3636c --- a/lmms_eval/models/llava.py +++ b/lmms_eval/models/llava.py @@ -1,3 +1,4 @@ +import ast import torch torch.backends.cuda.matmul.allow_tf32 = True @@ -28,7 +29,10 @@ from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX from llava.conversation import conv_templates, SeparatorStyle except ImportError: - eval_logger.error("LLaVA is not installed. Please install LLaVA to use this model.") + import traceback + + traceback.print_exc() + # eval_logger.error("LLaVA is not installed. 
Please install LLaVA to use this model.") from transformers.integrations.deepspeed import ( is_deepspeed_zero3_enabled, @@ -36,6 +40,17 @@ unset_hf_deepspeed_config, ) +from transformers.utils import is_flash_attn_2_available + +# inference implementation for attention, can be "sdpa", "eager", "flash_attention_2". Seems FA2 is not effective during inference: https://discuss.huggingface.co/t/flash-attention-has-no-effect-on-inference/73453/5 +# if is_flash_attn_2_available: +# best_fit_attn_implementation = "flash_attention_2" # flash_attn has a bug that says: ERROR Error query and key must have the same dtype in generating + +if torch.__version__ > "2.1.2": + best_fit_attn_implementation = "sdpa" +else: + best_fit_attn_implementation = "eager" + @register_model("llava") class Llava(lmms): @@ -47,16 +62,17 @@ def __init__( self, pretrained: str = "liuhaotian/llava-v1.5-7b", truncation: Optional[bool] = True, - device: Optional[str] = "cuda", + device: Optional[str] = "cuda:0", dtype: Optional[Union[str, torch.dtype]] = "auto", batch_size: Optional[Union[int, str]] = 1, trust_remote_code: Optional[bool] = False, revision=None, - use_flash_attention_2=True, - device_map="auto", + attn_implementation=best_fit_attn_implementation, + device_map="cuda:0", conv_template="vicuna_v1", use_cache=True, truncate_context=False, # whether to truncate the context in generation, set it False for LLaVA-1.6 + customized_config=None, # ends in json **kwargs, ) -> None: super().__init__() @@ -65,14 +81,21 @@ def __init__( accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52)) accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs]) - if accelerator.num_processes > 1 and device_map == "": + if accelerator.num_processes > 1: self._device = torch.device(f"cuda:{accelerator.local_process_index}") self.device_map = f"cuda:{accelerator.local_process_index}" - else: + elif accelerator.num_processes == 1 and device_map == "auto": self._device = torch.device(device) self.device_map = device_map + else: + self._device = torch.device(f"cuda:{accelerator.local_process_index}") + self.device_map = f"cuda:{accelerator.local_process_index}" - self._tokenizer, self._model, self._image_processor, self._max_length = load_pretrained_model(pretrained, None, get_model_name_from_path(pretrained), device_map=self.device_map, use_flash_attention_2=use_flash_attention_2) + llava_model_args = {} + llava_model_args["attn_implementation"] = attn_implementation + llava_model_args["customized_config"] = customized_config + llava_model_args["use_flash_attention_2"] = False + self._tokenizer, self._model, self._image_processor, self._max_length = load_pretrained_model(pretrained, None, get_model_name_from_path(pretrained), device_map=self.device_map, **llava_model_args) self._config = self._model.config self.model.eval() self.model.tie_weights() @@ -82,7 +105,7 @@ def __init__( self.use_cache = use_cache self.truncate_context = truncate_context # assert self.batch_size_per_gpu == 1, "Llava currently does not support batched generation. See https://github.com/haotian-liu/LLaVA/issues/754. HF Llava also has this issue." - if accelerator.num_processes > 1 and device_map == "": + if accelerator.num_processes > 1: assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." 
# If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model # Also, you have to select zero stage 0 (equivalent to DDP) in order to make the prepare model works @@ -174,7 +197,10 @@ def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=Non return encoding def tok_decode(self, tokens): - return self.tokenizer.decode(tokens) + try: + return self.tokenizer.decode(tokens) + except: + return self.tokenizer.decode([tokens]) def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: # TODO @@ -189,6 +215,7 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: continuation = doc_to_target(self.task_dict[task][split][doc_id]) visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] visuals = self.flatten(visuals) + image_sizes = [[visual.size[0], visual.size[1]] for visual in visuals] if visuals: image = process_images(visuals, self._image_processor, self._config) if type(image) is list: @@ -226,7 +253,7 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: # Context part no need to calculate for loss labels[0, : contxt_id.shape[1]] = -100 with torch.inference_mode(): - outputs = self.model(input_ids=input_ids, labels=labels, images=image, use_cache=True) + outputs = self.model(input_ids=input_ids, labels=labels, images=image, use_cache=True, image_sizes=image_sizes) loss = outputs["loss"] # loss = torch.exp(loss) logits = outputs["logits"] @@ -270,8 +297,8 @@ def _collate(x): contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk) task = task[0] split = split[0] - visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] - visuals = self.flatten(visuals) + batched_visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] # [B, N] + flattened_visuals = self.flatten(batched_visuals) # [B*N] # we assume all gen kwargs in the batch are the same # this is safe to assume because the `grouper` object ensures it. gen_kwargs = all_gen_kwargs[0] @@ -292,8 +319,8 @@ def _collate(x): self._config.image_aspect_ratio = gen_kwargs.pop("image_aspect_ratio") eval_logger.info(f"Setting image aspect ratio: {self._config.image_aspect_ratio}") # encode, pad, and truncate contexts for this batch - if visuals: - image_tensor = process_images(visuals, self._image_processor, self._config) + if flattened_visuals: + image_tensor = process_images(flattened_visuals, self._image_processor, self._config) if type(image_tensor) is list: image_tensor = [_image.to(dtype=torch.float16, device=self.device) for _image in image_tensor] else: @@ -305,7 +332,7 @@ def _collate(x): question_input = [] - for visual, context in zip(visuals, contexts): + for visual, context in zip(batched_visuals, contexts): if image_tensor is not None and len(image_tensor) != 0 and DEFAULT_IMAGE_TOKEN not in context: """ Three senarios: @@ -318,7 +345,6 @@ def _collate(x): question = image_tokens + "\n" + context else: question = context - conv = conv_templates[self.conv_template].copy() conv.append_message(conv.roles[0], question) conv.append_message(conv.roles[1], None) @@ -328,7 +354,7 @@ def _collate(x): # The above for loop has bugs. When there is no visuals, e.g. 
pure text, # there will be no for loop execute resulting in an empty question_input (because no visuals) # Scenario 1 won't even be execute - if len(visuals) == 0: + if len(flattened_visuals) == 0: for context in contexts: question = context conv = conv_templates[self.conv_template].copy() @@ -339,7 +365,7 @@ def _collate(x): # input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device) # preconfigure gen_kwargs with defaults - gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))] + gen_kwargs["image_sizes"] = [flattened_visuals[idx].size for idx in range(len(flattened_visuals))] if "max_new_tokens" not in gen_kwargs: gen_kwargs["max_new_tokens"] = 1024 if "temperature" not in gen_kwargs: @@ -371,6 +397,7 @@ def _collate(x): ) text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True) except Exception as e: + raise e eval_logger.error(f"Error {e} in generating") cont = "" text_outputs = [""] diff --git a/lmms_eval/models/llava_vid.py b/lmms_eval/models/llava_vid.py new file mode 100755 index 00000000..0913dc11 --- /dev/null +++ b/lmms_eval/models/llava_vid.py @@ -0,0 +1,356 @@ +import logging +from multiprocessing import context +from accelerate import Accelerator, DistributedType, InitProcessGroupKwargs +from accelerate.state import AcceleratorState +from typing import List, Optional, Union, Tuple +import torch +from tqdm import tqdm +from decord import VideoReader, cpu +import numpy as np +import math +from datetime import timedelta +from transformers import AutoConfig + +from lmms_eval import utils +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model +from lmms_eval.utils import stop_sequences_criteria + +eval_logger = logging.getLogger("lmms-eval") + +try: + from llavavid.model.builder import load_pretrained_model + from llavavid.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria + from llavavid.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX + from llavavid.conversation import conv_templates, SeparatorStyle +except ImportError: + eval_logger.error("LLaVA-Video is not installed. Please install LLaVA-Video to use this model.") + + +@register_model("llavavid") +class LlavaVid(lmms): + """ + LlavaVid Model + """ + + def __init__( + self, + pretrained: str = "liuhaotian/llava-v1.5-7b", + truncation: Optional[bool] = True, + device: Optional[str] = "cuda:0", + dtype: Optional[Union[str, torch.dtype]] = "auto", + batch_size: Optional[Union[int, str]] = 1, + trust_remote_code: Optional[bool] = False, + revision=None, + attn_implementation=( + "sdpa" if torch.__version__ > "2.1.2" else "eager" + ), # inference implementation for attention, can be "sdpa", "eager", "flash_attention_2". 
Seems FA2 is not effective during inference: https://discuss.huggingface.co/t/flash-attention-has-no-effect-on-inference/73453/5 + device_map="cuda:0", + conv_template="vicuna_v1", + use_cache=True, + truncate_context=False, # whether to truncate the context in generation, set it False for LLaVA-1.6 + for_get_frames_num: int = 3, + mm_resampler_type: str = "spatial_pool", + mm_spatial_pool_stride: int = 2, + mm_spatial_pool_out_channels: int = 1024, + mm_spatial_pool_mode: str = "average", + overwrite: bool = True, + **kwargs, + ) -> None: + super().__init__() + assert kwargs == {}, f"Unexpected kwargs: {kwargs}" + + accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52)) + accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs]) + if accelerator.num_processes > 1: + self._device = torch.device(f"cuda:{accelerator.local_process_index}") + self.device_map = f"cuda:{accelerator.local_process_index}" + elif accelerator.num_processes == 1 and device_map == "auto": + self._device = torch.device(device) + self.device_map = device_map + else: + self._device = torch.device(f"cuda:{accelerator.local_process_index}") + self.device_map = f"cuda:{accelerator.local_process_index}" + + self.pretrained = pretrained + self.model_name = get_model_name_from_path(pretrained) + self._config = AutoConfig.from_pretrained(self.pretrained) + self.overwrite = overwrite + self.mm_resampler_type = mm_resampler_type + self.mm_spatial_pool_stride = int(mm_spatial_pool_stride) + self.mm_spatial_pool_out_channels = int(mm_spatial_pool_out_channels) + self.mm_spatial_pool_mode = mm_spatial_pool_mode + self.for_get_frames_num = int(for_get_frames_num) + if self.overwrite == True: + overwrite_config = {} + overwrite_config["mm_resampler_type"] = self.mm_resampler_type + overwrite_config["mm_spatial_pool_stride"] = self.mm_spatial_pool_stride + overwrite_config["mm_spatial_pool_out_channels"] = self.mm_spatial_pool_out_channels + overwrite_config["mm_spatial_pool_mode"] = self.mm_spatial_pool_mode + overwrite_config["patchify_video_feature"] = False + + cfg_pretrained = AutoConfig.from_pretrained(self.pretrained) + + if "224" in cfg_pretrained.mm_vision_tower: + # suppose the length of text tokens is around 1000, from bo's report + least_token_number = self.for_get_frames_num * (16 // self.mm_spatial_pool_stride) ** 2 + 1000 + else: + least_token_number = self.for_get_frames_num * (24 // self.mm_spatial_pool_stride) ** 2 + 1000 + + scaling_factor = math.ceil(least_token_number / 4096) + if scaling_factor >= 2: + print(float(scaling_factor)) + overwrite_config["rope_scaling"] = {"factor": float(scaling_factor), "type": "linear"} + overwrite_config["max_sequence_length"] = 4096 * scaling_factor + overwrite_config["tokenizer_model_max_length"] = 4096 * scaling_factor + self._tokenizer, self._model, self._image_processor, self._max_length = load_pretrained_model(pretrained, None, self.model_name, device_map=self.device_map, overwrite_config=overwrite_config) + else: + self._tokenizer, self._model, self._image_processor, self._max_length = load_pretrained_model( + pretrained, + None, + self.model_name, + device_map=self.device_map, + ) + + self._config = self._model.config + self.model.eval() + self.model.tie_weights() + self.truncation = truncation + self.batch_size_per_gpu = int(batch_size) + self.conv_template = conv_template + self.use_cache = use_cache + self.truncate_context = truncate_context + # assert self.batch_size_per_gpu == 1, "Llava currently does not support batched generation. 
See https://github.com/haotian-liu/LLaVA/issues/754. HF Llava also has this issue." + if accelerator.num_processes > 1: + assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." + # If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model + # Also, you have to select zero stage 0 (equivalent to DDP) in order to make the model preparation work + # I tried setting different parameters in the kwargs to make the default ZeRO stage 2 work, but it didn't. + if accelerator.distributed_type == DistributedType.DEEPSPEED: + kwargs = { + "train_micro_batch_size_per_gpu": self.batch_size_per_gpu, + "train_batch_size": self.batch_size_per_gpu * accelerator.num_processes, + } + AcceleratorState().deepspeed_plugin.deepspeed_config_process(must_match=True, **kwargs) + eval_logger.info("Detected that you are using DistributedType.DEEPSPEED. Make sure you run `accelerate config` and set zero stage to 0") + if accelerator.distributed_type == DistributedType.FSDP or accelerator.distributed_type == DistributedType.DEEPSPEED: + self._model = accelerator.prepare(self.model) + else: + self._model = accelerator.prepare_model(self.model, evaluation_mode=True) + self.accelerator = accelerator + if self.accelerator.is_local_main_process: + eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + elif accelerator.num_processes == 1 and device_map == "auto": + eval_logger.info(f"Using {accelerator.num_processes} devices with tensor parallelism") + self._rank = 0 + self._world_size = 1 + else: + eval_logger.info(f"Using single device: {self._device}") + self.model.to(self._device) + self._rank = 0 + self._world_size = 1 + + @property + def config(self): + # return the associated transformers.AutoConfig for the given pretrained model.
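For reference, the context-window scaling in the constructor works out like this on illustrative settings (the 24x24 patch grid corresponds to the non-224 vision tower, the 1000-token text allowance is the constructor's own estimate, and the frame count below is hypothetical):

    import math

    for_get_frames_num = 32                                    # hypothetical frame count
    mm_spatial_pool_stride = 2
    tokens_per_frame = (24 // mm_spatial_pool_stride) ** 2     # 144 visual tokens per frame
    least_token_number = for_get_frames_num * tokens_per_frame + 1000   # 5608
    scaling_factor = math.ceil(least_token_number / 4096)      # 2
    # -> rope_scaling = {"factor": 2.0, "type": "linear"}, max_sequence_length = 8192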
+ return self._config + + @property + def tokenizer(self): + return self._tokenizer + + @property + def model(self): + # returns the model, unwrapping it if using Accelerate + if hasattr(self, "accelerator"): + return self.accelerator.unwrap_model(self._model) + else: + return self._model + + @property + def eot_token_id(self): + # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* + return self.tokenizer.eos_token_id + + @property + def max_length(self): + return self._max_length + + def pad_sequence(self, input_ids, batch_first, padding_value): + if self.tokenizer.padding_side == "left": + input_ids = [torch.flip(_input_ids, [0]) for _input_ids in input_ids] + input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=batch_first, padding_value=padding_value) + if self.tokenizer.padding_side == "left": + input_ids = torch.flip(input_ids, [1]) + return input_ids + + @property + def batch_size(self): + return self.batch_size_per_gpu + + @property + def device(self): + return self._device + + @property + def rank(self): + return self._rank + + @property + def world_size(self): + return self._world_size + + def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None) -> List[int]: + """ """ + add_special_tokens = False if add_special_tokens is None else add_special_tokens + encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens) + # left-truncate the encoded context to be at most `left_truncate_len` tokens long + if left_truncate_len: + encoding = encoding[-left_truncate_len:] + return encoding + + def load_video(self, video_path, for_get_frames_num): + vr = VideoReader(video_path, ctx=cpu(0)) + total_frame_num = len(vr) + # fps = round(vr.get_avg_fps()) + # frame_idx = [i for i in range(0, len(vr), fps)] + uniform_sampled_frames = np.linspace(0, total_frame_num - 1, for_get_frames_num, dtype=int) + frame_idx = uniform_sampled_frames.tolist() + spare_frames = vr.get_batch(frame_idx).asnumpy() + return spare_frames + + def tok_decode(self, tokens): + return self.tokenizer.decode(tokens) + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + res = [] + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + + for contexts, doc_to_target, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + # encode, pad, and truncate contexts for this batch + if type(doc_to_target) == str: + continuation = doc_to_target + else: + continuation = doc_to_target(self.task_dict[task][split][doc_id]) + visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = self.flatten(visuals) + videos = [] + for visual in visuals: + video = self.load_video(visual, self.for_get_frames_num) + video = self._image_processor.preprocess(video, return_tensors="pt")["pixel_values"].half().cuda() + videos.append(video) + + qs = contexts + if self.model.config.mm_use_im_start_end: + qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + "\n" + qs + else: + qs = DEFAULT_IMAGE_TOKEN + "\n" + qs + + conv = conv_templates[self.conv_template].copy() + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + + contxt_id = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device) + + conv = conv_templates[self.conv_template].copy() + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], continuation) + 
prompt = conv.get_prompt() + + input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda() + attention_masks = input_ids.ne(self.tokenizer.pad_token_id).long().cuda() + + labels = input_ids.clone() + # Context part no need to calculate for loss + labels[0, : contxt_id.shape[1]] = -100 + + with torch.inference_mode(): + outputs = self.model(input_ids=input_ids, labels=labels, images=videos, modalities="video") + + loss = outputs["loss"] + # loss = torch.exp(loss) + logits = outputs["logits"] + greedy_tokens = logits.argmax(dim=-1) + cont_toks = input_ids[:, contxt_id.shape[1] :] # [1, seq] + greedy_tokens = greedy_tokens[:, contxt_id.shape[1] : input_ids.shape[1]] # [1, seq] + max_equal = (greedy_tokens == cont_toks).all() + res.append((float(loss.item()), bool(max_equal))) + pbar.update(1) + pbar.close() + return res + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + return new_list + + def generate_until(self, requests) -> List[str]: + res = [] + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + + for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + # encode, pad, and truncate contexts for this batch + visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = self.flatten(visuals) + videos = [] + for visual in visuals: + video = self.load_video(visual, self.for_get_frames_num) + video = self._image_processor.preprocess(video, return_tensors="pt")["pixel_values"].half().cuda() + videos.append(video) + + qs = contexts + if self.model.config.mm_use_im_start_end: + qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + "\n" + qs + else: + qs = DEFAULT_IMAGE_TOKEN + "\n" + qs + + conv = conv_templates[self.conv_template].copy() + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + + input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda() + attention_masks = input_ids.ne(self.tokenizer.pad_token_id).long().cuda() + + stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 + keywords = [stop_str] + stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids) + + cur_prompt = contexts + + if "max_new_tokens" not in gen_kwargs: + gen_kwargs["max_new_tokens"] = 1024 + if "temperature" not in gen_kwargs: + gen_kwargs["temperature"] = 0 + if "top_p" not in gen_kwargs: + gen_kwargs["top_p"] = None + if "num_beams" not in gen_kwargs: + gen_kwargs["num_beams"] = 1 + with torch.inference_mode(): + self.model.update_prompt([[cur_prompt]]) + # import pdb;pdb.set_trace() + output_ids = self.model.generate( + inputs=input_ids, + images=videos, + attention_mask=attention_masks, + modalities="video", + use_cache=self.use_cache, + stopping_criteria=[stopping_criteria], + do_sample=True if gen_kwargs["temperature"] > 0 else False, + temperature=gen_kwargs["temperature"], + top_p=gen_kwargs["top_p"], + num_beams=gen_kwargs["num_beams"], + max_new_tokens=gen_kwargs["max_new_tokens"], + ) + # output_ids = model.generate(inputs=input_ids, images=video, attention_mask=attention_masks, modalities="video", do_sample=True, temperature=0.2, use_cache=True, stopping_criteria=[stopping_criteria]) + + outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() + res.append(outputs) + pbar.update(1) + return res diff 
--git a/lmms_eval/models/minicpm_v.py b/lmms_eval/models/minicpm_v.py old mode 100644 new mode 100755 diff --git a/lmms_eval/models/model_utils/__init__.py b/lmms_eval/models/model_utils/__init__.py old mode 100644 new mode 100755 diff --git a/lmms_eval/models/model_utils/qwen/qwen_generate_utils.py b/lmms_eval/models/model_utils/qwen/qwen_generate_utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/models/qwen_vl.py b/lmms_eval/models/qwen_vl.py old mode 100644 new mode 100755 index 2d73e4a5..a8898745 --- a/lmms_eval/models/qwen_vl.py +++ b/lmms_eval/models/qwen_vl.py @@ -228,9 +228,12 @@ def _collate(x): until = [until] elif not isinstance(until, list): raise ValueError(f"Expected `gen_kwargs['until']` to be of type Union[str,list] but got {type(until)}") + # Tuple object does not support item assignment + # Error may occur here + contexts = [con for con in contexts] for i in range(len(contexts)): if "" in contexts[i]: - context[i] = contexts[i].replace("", "") + contexts[i] = contexts[i].replace("", "") questions = [self.prompt.format(visual_path, context) for visual_path, context in zip(visual_paths, contexts)] # Similar to llava, is visual paths has len 0 diff --git a/lmms_eval/tasks/__init__.py b/lmms_eval/tasks/__init__.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/_task_utils/file_utils.py b/lmms_eval/tasks/_task_utils/file_utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/_task_utils/gpt_eval_utils.py b/lmms_eval/tasks/_task_utils/gpt_eval_utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/_task_utils/vqa_eval_metric.py b/lmms_eval/tasks/_task_utils/vqa_eval_metric.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/ai2d/ai2d.yaml b/lmms_eval/tasks/ai2d/ai2d.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/ai2d/upload_ai2d.py b/lmms_eval/tasks/ai2d/upload_ai2d.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/ai2d/utils.py b/lmms_eval/tasks/ai2d/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/chartqa/chartqa.yaml b/lmms_eval/tasks/chartqa/chartqa.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/chartqa/upload_chartqa.py b/lmms_eval/tasks/chartqa/upload_chartqa.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/chartqa/utils.py b/lmms_eval/tasks/chartqa/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/cmmmu/_cmmmu.yaml b/lmms_eval/tasks/cmmmu/_cmmmu.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/cmmmu/_default_template_cmmmu_yaml b/lmms_eval/tasks/cmmmu/_default_template_cmmmu_yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/cmmmu/cmmmu_test.yaml b/lmms_eval/tasks/cmmmu/cmmmu_test.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/cmmmu/cmmmu_val.yaml b/lmms_eval/tasks/cmmmu/cmmmu_val.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/cmmmu/utils.py b/lmms_eval/tasks/cmmmu/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/coco_cap/coco2014_cap.yaml b/lmms_eval/tasks/coco_cap/coco2014_cap.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/coco_cap/coco2014_cap_test.yaml b/lmms_eval/tasks/coco_cap/coco2014_cap_test.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/coco_cap/coco2014_cap_val.yaml b/lmms_eval/tasks/coco_cap/coco2014_cap_val.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/coco_cap/coco2017_cap.yaml b/lmms_eval/tasks/coco_cap/coco2017_cap.yaml old mode 
100644 new mode 100755 diff --git a/lmms_eval/tasks/coco_cap/coco2017_cap_test.yaml b/lmms_eval/tasks/coco_cap/coco2017_cap_test.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/coco_cap/coco2017_cap_val.yaml b/lmms_eval/tasks/coco_cap/coco2017_cap_val.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/coco_cap/coco_cap.yaml b/lmms_eval/tasks/coco_cap/coco_cap.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/coco_cap/utils.py b/lmms_eval/tasks/coco_cap/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/docvqa/_default_template_docvqa_yaml b/lmms_eval/tasks/docvqa/_default_template_docvqa_yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/docvqa/docvqa.yaml b/lmms_eval/tasks/docvqa/docvqa.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/docvqa/docvqa_test.yaml b/lmms_eval/tasks/docvqa/docvqa_test.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/docvqa/docvqa_val.yaml b/lmms_eval/tasks/docvqa/docvqa_val.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/docvqa/utils.py b/lmms_eval/tasks/docvqa/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/ferret/ferret.yaml b/lmms_eval/tasks/ferret/ferret.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/ferret/rule.json b/lmms_eval/tasks/ferret/rule.json old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/ferret/utils.py b/lmms_eval/tasks/ferret/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/flickr30k/flickr30k.yaml b/lmms_eval/tasks/flickr30k/flickr30k.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/flickr30k/flickr30k_test.yaml b/lmms_eval/tasks/flickr30k/flickr30k_test.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/flickr30k/utils.py b/lmms_eval/tasks/flickr30k/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/gqa/gqa.yaml b/lmms_eval/tasks/gqa/gqa.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/gqa/utils.py b/lmms_eval/tasks/gqa/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/hallusion_bench/evaluate_hb.py b/lmms_eval/tasks/hallusion_bench/evaluate_hb.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/hallusion_bench/hallusion_bench_image.yaml b/lmms_eval/tasks/hallusion_bench/hallusion_bench_image.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/hallusion_bench/utils.py b/lmms_eval/tasks/hallusion_bench/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/iconqa/_default_template_docvqa_yaml b/lmms_eval/tasks/iconqa/_default_template_docvqa_yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/iconqa/iconqa.yaml b/lmms_eval/tasks/iconqa/iconqa.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/iconqa/iconqa_test.yaml b/lmms_eval/tasks/iconqa/iconqa_test.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/iconqa/iconqa_val.yaml b/lmms_eval/tasks/iconqa/iconqa_val.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/iconqa/utils.py b/lmms_eval/tasks/iconqa/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/infovqa/_default_template_infovqa_yaml b/lmms_eval/tasks/infovqa/_default_template_infovqa_yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/infovqa/infovqa.yaml b/lmms_eval/tasks/infovqa/infovqa.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/infovqa/infovqa_test.yaml 
b/lmms_eval/tasks/infovqa/infovqa_test.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/infovqa/infovqa_val.yaml b/lmms_eval/tasks/infovqa/infovqa_val.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/infovqa/utils.py b/lmms_eval/tasks/infovqa/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/llava-bench-coco/llava-bench-coco.yaml b/lmms_eval/tasks/llava-bench-coco/llava-bench-coco.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/llava-bench-coco/rule.json b/lmms_eval/tasks/llava-bench-coco/rule.json old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/llava-bench-coco/utils.py b/lmms_eval/tasks/llava-bench-coco/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/llava-in-the-wild/llava-in-the-wild.yaml b/lmms_eval/tasks/llava-in-the-wild/llava-in-the-wild.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/llava-in-the-wild/rule.json b/lmms_eval/tasks/llava-in-the-wild/rule.json old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/llava-in-the-wild/utils.py b/lmms_eval/tasks/llava-in-the-wild/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/mathvista/mathvista.yaml b/lmms_eval/tasks/mathvista/mathvista.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/mathvista/mathvista_evals.py b/lmms_eval/tasks/mathvista/mathvista_evals.py old mode 100644 new mode 100755 index d40f6093..8fcf120e --- a/lmms_eval/tasks/mathvista/mathvista_evals.py +++ b/lmms_eval/tasks/mathvista/mathvista_evals.py @@ -62,7 +62,7 @@ def _post_request(self, payload): response.raise_for_status() return response.json() - def get_chat_response(self, prompt, temperature=0, max_tokens=256, n=1, patience=10000000, sleep_time=0): + def get_chat_response(self, prompt, temperature=0, max_tokens=256, n=1, patience=5, sleep_time=0): messages = [ {"role": "user", "content": prompt}, ] diff --git a/lmms_eval/tasks/mathvista/mathvista_test.yaml b/lmms_eval/tasks/mathvista/mathvista_test.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/mathvista/mathvista_testmini.yaml b/lmms_eval/tasks/mathvista/mathvista_testmini.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/mathvista/utils.py b/lmms_eval/tasks/mathvista/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/mix_evals/_default_template_yaml b/lmms_eval/tasks/mix_evals/_default_template_yaml new file mode 100644 index 00000000..8364226c --- /dev/null +++ b/lmms_eval/tasks/mix_evals/_default_template_yaml @@ -0,0 +1,9 @@ +dataset_path: lmms-lab/MixEvals_Video2Text +dataset_kwargs: + token: True + video: True + cache_dir: mix_evals_video2text +model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "" \ No newline at end of file diff --git a/lmms_eval/tasks/mix_evals/mix_evals_video2text.yaml b/lmms_eval/tasks/mix_evals/mix_evals_video2text.yaml new file mode 100755 index 00000000..bee421c0 --- /dev/null +++ b/lmms_eval/tasks/mix_evals/mix_evals_video2text.yaml @@ -0,0 +1,5 @@ +group: mix_evals_video2text +task: +- mix_evals_video2text_open +- mix_evals_video2text_mc +- mix_evals_video2text_freeform \ No newline at end of file diff --git a/lmms_eval/tasks/mix_evals/mix_evals_video2text_freeform.yaml b/lmms_eval/tasks/mix_evals/mix_evals_video2text_freeform.yaml new file mode 100755 index 00000000..87ca8b1e --- /dev/null +++ b/lmms_eval/tasks/mix_evals/mix_evals_video2text_freeform.yaml @@ -0,0 +1,20 @@ +dataset_name: "video2text_closeended_free-form" +task: 
"mix_evals_video2text_freeform" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.mix_evals_video2text_doc_to_visual +doc_to_text: !function utils.mix_evals_video2text_doc_to_text +doc_to_target: "{{target}}" +# process_results: !function utils.mix_evals_video2text_process_results_freeform +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +include: _default_template_yaml + +model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "Answer the question using a single word or phrase." \ No newline at end of file diff --git a/lmms_eval/tasks/mix_evals/mix_evals_video2text_mc.yaml b/lmms_eval/tasks/mix_evals/mix_evals_video2text_mc.yaml new file mode 100755 index 00000000..ca19aa09 --- /dev/null +++ b/lmms_eval/tasks/mix_evals/mix_evals_video2text_mc.yaml @@ -0,0 +1,28 @@ +include: _default_template_yaml +dataset_name: "video2text_closeended_multiple-choice" +task: "mix_evals_video2text_mc" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.mix_evals_video2text_doc_to_visual +doc_to_text: !function utils.mix_evals_video2text_doc_to_text +doc_to_target: "{{target}}" + +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +filter_list: + - name: "flexible-extract" + filter: + - function: !function utils.MultiChoiceRegexFilter + group_select: 0 + ignore_case: true + ignore_punctuation: true + regex_pattern: "(\\([A-Z]\\))" + +model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "Answer with the option’s letter from the given choices directly." \ No newline at end of file diff --git a/lmms_eval/tasks/mix_evals/mix_evals_video2text_openended.yaml b/lmms_eval/tasks/mix_evals/mix_evals_video2text_openended.yaml new file mode 100755 index 00000000..965cf750 --- /dev/null +++ b/lmms_eval/tasks/mix_evals/mix_evals_video2text_openended.yaml @@ -0,0 +1,13 @@ +dataset_name: "video2text_openended" +task: "mix_evals_video2text_open" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.mix_evals_video2text_doc_to_visual +doc_to_text: !function utils.mix_evals_video2text_doc_to_text_open_convs +doc_to_target: "" +process_results: !function utils.mix_evals_video2text_process_results_open_convs +metric_list: + - metric: submission + aggregation: !function utils.mix_evals_video2text_aggregate_gen + higher_is_better: true +include: _default_template_yaml diff --git a/lmms_eval/tasks/mix_evals/utils.py b/lmms_eval/tasks/mix_evals/utils.py new file mode 100755 index 00000000..2cf175f3 --- /dev/null +++ b/lmms_eval/tasks/mix_evals/utils.py @@ -0,0 +1,200 @@ +import os +import re +import sys +import datetime +import lmms_eval.tasks._task_utils.file_utils as file_utils +from lmms_eval.filters.extraction import ExtendedRegexFilter +import json +import logging +import yaml +from pathlib import Path + +with open(Path(__file__).parent / "_default_template_yaml", "r") as f: + raw_data = f.readlines() + safe_data = [] + for i, line in enumerate(raw_data): + # remove function definition since yaml load cannot handle it + if "!function" not in line: + safe_data.append(line) + + config = yaml.safe_load("".join(safe_data)) + +# A bit ugly here +# But the idea is that we will unzip all the zip files +# To HF HOME cache dir +# And load it here +HF_HOME = os.environ["HF_HOME"] +cache_dir = config["dataset_kwargs"]["cache_dir"] +cache_dir = os.path.join(HF_HOME, 
cache_dir) + cache_dir = os.path.join(cache_dir) + + eval_logger = logging.getLogger("lmms-eval") + + # Pass in video path here + # This only works correctly with video LLMs + def mix_evals_video2text_doc_to_visual(doc): + video_path = doc["video_path"] + video_path = os.path.join(cache_dir, video_path) + if os.path.exists(video_path): + video_path = video_path + elif os.path.exists(video_path.replace("mp4", "MP4")): + video_path = video_path.replace("mp4", "MP4") + else: + sys.exit(f"video path: {video_path} does not exist, please check") + return [video_path] + + # This is the place where you format your question + def mix_evals_video2text_doc_to_text(doc, model_specific_prompt_kwargs=None): + if model_specific_prompt_kwargs is None: + model_specific_prompt_kwargs = {} + pre_prompt = "" + post_prompt = "" + if "pre_prompt" in model_specific_prompt_kwargs: + pre_prompt = model_specific_prompt_kwargs["pre_prompt"] + if "post_prompt" in model_specific_prompt_kwargs: + post_prompt = model_specific_prompt_kwargs["post_prompt"] + + user_prompt = doc["prompt"] + + if "options" in doc: + option_prompt = "Here are the options:\n" + for idx, option in enumerate(doc["options"]): + char_idx = chr(ord("A") + idx) + option = option.strip() + option_prompt += f"{char_idx}. {option}\n" + + option_prompt = option_prompt.rstrip("\n") + user_prompt = f"{user_prompt}\n{option_prompt}" + + if pre_prompt: + user_prompt = f"{pre_prompt}\n{user_prompt}" + + if post_prompt: + user_prompt = f"{user_prompt}\n{post_prompt}" + return user_prompt + + def mix_evals_video2text_doc_to_text_open_convs(doc, model_specific_prompt_kwargs=None): + if model_specific_prompt_kwargs is None: + model_specific_prompt_kwargs = {} + pre_prompt = "" + post_prompt = "" + if "pre_prompt" in model_specific_prompt_kwargs: + pre_prompt = model_specific_prompt_kwargs["pre_prompt"] + if "post_prompt" in model_specific_prompt_kwargs: + post_prompt = model_specific_prompt_kwargs["post_prompt"] + + formatted_prompt = "" + first_turn_user_prompt = doc["first_turn_user_prompt"] + + if pre_prompt: + formatted_prompt = f"{pre_prompt}\n{first_turn_user_prompt}" + else: + formatted_prompt = f"{first_turn_user_prompt}" + + if "round2" in doc and doc["round2"]: + second_turn_user_prompt = doc["second_turn_user_prompt"] + formatted_prompt = f"{formatted_prompt}\n{second_turn_user_prompt}" + if post_prompt: + formatted_prompt = f"{formatted_prompt}\n{post_prompt}" + return formatted_prompt + else: + if post_prompt: + formatted_prompt = f"{formatted_prompt}\n{post_prompt}" + return formatted_prompt + + def mix_evals_video2text_process_results_open_convs(doc, result): + pred = result[0] + return {"submission": {"pred": pred, "question_idx": doc["question_index"], "first_turn_video_caption": doc["first_turn_video_caption"], "target": ""}} + + def mix_evals_video2text_process_results_freeform(doc, result): + pred = result[0] + return {"submission": {"pred": pred, "question_idx": doc["question_index"], "first_turn_video_caption": doc["first_turn_video_caption"], "target": ""}} + + def mix_evals_video2text_aggregate_submissions(results, args, task): + now_date_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + submission_file_name = f"mix_evals_video2text_{task}-{now_date_time}.json" + path = file_utils.generate_submission_file(submission_file_name, args) + with open(path, "w") as f: + json.dump(results, f) + eval_logger.info(f"Submission file saved to {path}") + + # Factory for the different aggregation functions +def
mix_evals_video2text_aggregate_gen(results, args): + mix_evals_video2text_aggregate_submissions(results, args, "OpenConvs") + + +class MultiChoiceRegexFilter(ExtendedRegexFilter): + def __init__(self, *args, **kwargs): + """ + regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure + - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response. + - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices. + group_select: Selects the (group_select)th match from the findall result. + ignore_case: Ignores the case during step 1 matching + ignore_punctuation: Remove the punctuation during step 1 matching + regexes_to_ignore: Remove these regexes during step 1 matching + """ + super().__init__(*args, **kwargs) + + def apply(self, resps, docs): + # here, we assume we have a list, in which each element is + # a list of model responses for some particular input/target pair. + # so we process each of these (same input/target response sets) + # independently (and keep them a list.) + + filtered_resps = [] + + for r, doc in zip(resps, docs): + fallback_regexes = [] + choice_to_alpha = {} + next_alpha = "A" + + without_paren_fallback_regexes = [] + without_paren_to_target = {} + + options_prompt = "" + for idx, option in enumerate(doc["options"]): + char_idx = chr(ord("A") + idx) + option = option.strip() + options_prompt += f"{char_idx}. {option}\n" + options_prompt = options_prompt.rstrip("\n") + # Regex to extract multiple choice options from the question + multiple_choices_regex = re.compile(r"\b([A-Z])\.\s+([^\n]*)") + matches = multiple_choices_regex.findall(options_prompt) + + # Build regex patterns and mappings for each choice + for m in matches: + choice_text = m[1].strip() + fallback_regexes.append(f"{re.escape(choice_text)}") + choice_to_alpha[choice_text] = next_alpha + + next_alpha = chr(ord(next_alpha) + 1) + + # Compile regex to match any of the extracted choices + fallback_regex = re.compile("|".join(fallback_regexes)) + + # Process each response + filtered = [] + for resp in r: + # Remove any punctuation and extra spaces + cleaned_resp = re.sub(r"[^\w\s]", "", resp).strip() + # Try to match cleaned response with the choice text + match = fallback_regex.search(cleaned_resp) + if match and match.group() in choice_to_alpha: + # Map the matched choice text back to its corresponding letter + filtered.append(choice_to_alpha[match.group()]) + else: + # If no match, return the cleaned response + filtered.append(cleaned_resp) + + filtered_resps.append(filtered[0]) + + return filtered_resps diff --git a/lmms_eval/tasks/mmbench/cc_utils.py b/lmms_eval/tasks/mmbench/cc_utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/mmbench/cn_utils.py b/lmms_eval/tasks/mmbench/cn_utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/mmbench/en_utils.py b/lmms_eval/tasks/mmbench/en_utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/mmbench/mmbench.yaml b/lmms_eval/tasks/mmbench/mmbench.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/mmbench/mmbench_cc.yaml b/lmms_eval/tasks/mmbench/mmbench_cc.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/mmbench/mmbench_cn.yaml b/lmms_eval/tasks/mmbench/mmbench_cn.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml b/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml old mode 100644 new mode 100755 diff --git 
a/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml b/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/mmbench/mmbench_en.yaml b/lmms_eval/tasks/mmbench/mmbench_en.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml b/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/mmbench/mmbench_en_test.yaml b/lmms_eval/tasks/mmbench/mmbench_en_test.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/mmbench/mmbench_evals.py b/lmms_eval/tasks/mmbench/mmbench_evals.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/mme/mme.yaml b/lmms_eval/tasks/mme/mme.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/mme/utils.py b/lmms_eval/tasks/mme/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/mmmu/mmmu.yaml b/lmms_eval/tasks/mmmu/mmmu.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/mmmu/mmmu_test.yaml b/lmms_eval/tasks/mmmu/mmmu_test.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/mmmu/mmmu_val.yaml b/lmms_eval/tasks/mmmu/mmmu_val.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/mmmu/utils.py b/lmms_eval/tasks/mmmu/utils.py old mode 100644 new mode 100755 index bc68c00e..302a6dff --- a/lmms_eval/tasks/mmmu/utils.py +++ b/lmms_eval/tasks/mmmu/utils.py @@ -16,11 +16,11 @@ def replace_images_tokens(input_string): - for i in range(1, 8): - question_text = f"" - query_text = "" - if question_text in input_string: - input_string = input_string.replace(question_text, query_text) + # for i in range(1, 8): + # question_text = f"" + # query_text = "" + # if question_text in input_string: + # input_string = input_string.replace(question_text, query_text) return input_string @@ -51,7 +51,7 @@ def mmmu_doc_to_visual(doc): prompt = construct_prompt(doc) image_tokens = re.findall(r"", prompt) # Remove <> and swap space as _ - image_tokens = [image_token.strip("<>").replace(" ", "_") for image_token in image_tokens] + image_tokens = sorted(list(set([image_token.strip("<>").replace(" ", "_") for image_token in image_tokens]))) visual = [doc[image_token].convert("RGB") for image_token in image_tokens] return visual diff --git a/lmms_eval/tasks/mmvet/mmvet.yaml b/lmms_eval/tasks/mmvet/mmvet.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/mmvet/utils.py b/lmms_eval/tasks/mmvet/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/multidocvqa/multidocvqa.yaml b/lmms_eval/tasks/multidocvqa/multidocvqa.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/multidocvqa/multidocvqa_test.yaml b/lmms_eval/tasks/multidocvqa/multidocvqa_test.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/multidocvqa/multidocvqa_val.yaml b/lmms_eval/tasks/multidocvqa/multidocvqa_val.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/multidocvqa/utils.py b/lmms_eval/tasks/multidocvqa/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/nocaps/_default_template_nocaps_yaml b/lmms_eval/tasks/nocaps/_default_template_nocaps_yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/nocaps/nocaps.yaml b/lmms_eval/tasks/nocaps/nocaps.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/nocaps/nocaps_test.yaml b/lmms_eval/tasks/nocaps/nocaps_test.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/nocaps/nocaps_val.yaml b/lmms_eval/tasks/nocaps/nocaps_val.yaml old mode 100644 
new mode 100755 diff --git a/lmms_eval/tasks/nocaps/utils.py b/lmms_eval/tasks/nocaps/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/ok_vqa/_default_template_vqa_yaml b/lmms_eval/tasks/ok_vqa/_default_template_vqa_yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/ok_vqa/_generate_config.py b/lmms_eval/tasks/ok_vqa/_generate_config.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/ok_vqa/_ok_vqa.yaml b/lmms_eval/tasks/ok_vqa/_ok_vqa.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/ok_vqa/ok_vqa_val2014.yaml b/lmms_eval/tasks/ok_vqa/ok_vqa_val2014.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/ok_vqa/utils.py b/lmms_eval/tasks/ok_vqa/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/olympiadbench/cn_utils.py b/lmms_eval/tasks/olympiadbench/cn_utils.py index 34e5ce4d..628d51da 100644 --- a/lmms_eval/tasks/olympiadbench/cn_utils.py +++ b/lmms_eval/tasks/olympiadbench/cn_utils.py @@ -5,14 +5,17 @@ from lmms_eval.tasks._task_utils.file_utils import generate_submission_file import logging + eval_logger = logging.getLogger("lmms-eval") dir_name = os.path.dirname(os.path.abspath(__file__)) olympiadbench_evaluator = OlympiadBenchEvaluator() + def olympiadbench_doc_to_visual(doc): return [image.convert("RGB") for image in doc["images"]] + def olympiadbench_doc_to_text(doc): question = doc["question"] subject = doc["subfield"] @@ -36,28 +39,26 @@ def olympiadbench_doc_to_text(doc): else: post_prompt += '"所以最终答案是\\boxed{用英⽂逗号连接的多个答案}。"\n' - final_question = pre_prompt + question + '\n' + post_prompt + final_question = pre_prompt + question + "\n" + post_prompt return final_question + def olympiadbench_process_results(doc, results): precision = doc["error"] - is_proving = "TP" in doc["source"] + is_proving = "TP" in doc["source"] if precision is None: precision = 0 prediction = results[0].strip() if is_proving: - return { - "submission": prediction - } + return {"submission": prediction} else: prediction = prediction.split("所以最终答案是")[-1] prediction = prediction.replace('"', "").replace("\n", "").replace(" ", "").strip(".").strip("。") accuracy = olympiadbench_evaluator.judge(prediction, doc["final_answer"][0], precision) accuracy = int(accuracy) - return { - "exact_match": accuracy - } + return {"exact_match": accuracy} + def olympiadbench_aggregate_results(results, args): now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S") @@ -66,4 +67,3 @@ def olympiadbench_aggregate_results(results, args): with open(path, "w") as f: json.dump(results, f, ensure_ascii=False) print(f"Submission file saved to {path}") - \ No newline at end of file diff --git a/lmms_eval/tasks/olympiadbench/en_utils.py b/lmms_eval/tasks/olympiadbench/en_utils.py index a21ee159..4b165e38 100644 --- a/lmms_eval/tasks/olympiadbench/en_utils.py +++ b/lmms_eval/tasks/olympiadbench/en_utils.py @@ -5,14 +5,17 @@ from lmms_eval.tasks._task_utils.file_utils import generate_submission_file import logging + eval_logger = logging.getLogger("lmms-eval") dir_name = os.path.dirname(os.path.abspath(__file__)) olympiadbench_evaluator = OlympiadBenchEvaluator() + def olympiadbench_doc_to_visual(doc): return [image.convert("RGB") for image in doc["images"]] + def olympiadbench_doc_to_text(doc): question = doc["question"] subject = doc["subfield"] @@ -30,34 +33,34 @@ def olympiadbench_doc_to_text(doc): post_prompt += f"The answer of the question should be {ans_type}.\n" else: post_prompt += f"The question has multiple answers, each of 
them should be {ans_type}.\n" - post_prompt += "Please calculate the answer according to the given requirements and the information provided. Please use LaTeX format to represent the variables and formulas used in the solution process and results. Please end your solution with " + post_prompt += ( + "Please calculate the answer according to the given requirements and the information provided. Please use LaTeX format to represent the variables and formulas used in the solution process and results. Please end your solution with " + ) if not mul_ans: post_prompt += '"So the final answer is \\boxed{answer}."\n' else: - post_prompt += 'So the final answer is \\boxed{multiple answers connected with commas}.\n' + post_prompt += "So the final answer is \\boxed{multiple answers connected with commas}.\n" - final_question = pre_prompt + question + '\n' + post_prompt + final_question = pre_prompt + question + "\n" + post_prompt return final_question + def olympiadbench_process_results(doc, results): precision = doc["error"] - is_proving = "TP" in doc["source"] + is_proving = "TP" in doc["source"] if precision is None: precision = 0 prediction = results[0].strip() if is_proving: - return { - "submission": prediction - } + return {"submission": prediction} else: prediction = prediction.split("final answer is")[-1] prediction = prediction.replace('"', "").replace("\n", "").replace(" ", "").strip(".").strip("。") accuracy = olympiadbench_evaluator.judge(prediction, doc["final_answer"][0], precision) accuracy = int(accuracy) - return { - "exact_match": accuracy - } + return {"exact_match": accuracy} + def olympiadbench_aggregate_results(results, args): now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S") @@ -66,4 +69,3 @@ def olympiadbench_aggregate_results(results, args): with open(path, "w") as f: json.dump(results, f, ensure_ascii=False) print(f"Submission file saved to {path}") - \ No newline at end of file diff --git a/lmms_eval/tasks/olympiadbench/olympiadbench_evals.py b/lmms_eval/tasks/olympiadbench/olympiadbench_evals.py index dd40f611..709ee988 100644 --- a/lmms_eval/tasks/olympiadbench/olympiadbench_evals.py +++ b/lmms_eval/tasks/olympiadbench/olympiadbench_evals.py @@ -11,6 +11,7 @@ # precision = 1e-4 # res = scorer.judge(exp1, exp2, precision) + class OlympiadBenchEvaluator: def __init__(self): # Map of special symbols to their replacements @@ -46,8 +47,8 @@ def split_by_comma(self, expr: str): start_idx = i + 1 if start_idx < len(expr): - splitted_expr.append(expr[start_idx:].strip()) - + splitted_expr.append(expr[start_idx:].strip()) + return splitted_expr def trans_plus_minus_sign(self, expr_list: list): @@ -59,9 +60,9 @@ def trans_plus_minus_sign(self, expr_list: list): new_expr_list.append(expr.replace("\\pm", "-")) else: new_expr_list.append(expr) - + return new_expr_list - + def judge(self, expression1, expression2, precision=1e-8): # Judge if two expressions are equal (expression1 is considered as the Ground Truth) # Default precision is a list for supporting multiple expressions @@ -74,11 +75,11 @@ def judge(self, expression1, expression2, precision=1e-8): if expression1 == expression2: # print("Exactly equal") return True - + # Remove Chinese characters from the string, as answers like "yes" or "no" in Chinese have been considered - expression1 = re.sub(r'[\u4e00-\u9fff]+', '', expression1) - expression2 = re.sub(r'[\u4e00-\u9fff]+', '', expression2) - + expression1 = re.sub(r"[\u4e00-\u9fff]+", "", expression1) + expression2 = re.sub(r"[\u4e00-\u9fff]+", "", 
expression2) + expression1 = self.split_by_comma(expression1) expression2 = self.split_by_comma(expression2) @@ -88,7 +89,6 @@ def judge(self, expression1, expression2, precision=1e-8): # Set up a list for allowed errors if len(precision) <= 1: precision = precision * len(temp_list1) - if len(temp_list1) != len(temp_list2): return False @@ -112,7 +112,7 @@ def judge(self, expression1, expression2, precision=1e-8): # If all elements are matched, return True return True - + def is_interval(self, expr): # Checks if an expression is an interval return expr.startswith(("(", "[")) and expr.endswith((")", "]")) @@ -120,7 +120,7 @@ def is_interval(self, expr): def sympy_sub_pi(self, expression_sympy): # Replaces the symbol for pi in sympy expressions with its numerical value return expression_sympy.subs(self.pi, math.pi) - + def is_equal(self, expression1, expression2): # Default first expression is ground truth. Check if expressions are equal in different aspects if expression1 == expression2 and expression1 != "" and expression2 != "": @@ -143,7 +143,6 @@ def is_equal(self, expression1, expression2): return True except: pass - # Then check if expressions are mathematically equal try: if self.expression_equal(expression1, expression2) and not ("=" in expression1 and "=" in expression2): @@ -151,7 +150,6 @@ def is_equal(self, expression1, expression2): return True except: pass - # Lastly, check for equation equality try: if self.equation_equal(expression1, expression2): @@ -159,7 +157,6 @@ def is_equal(self, expression1, expression2): return True except: pass - return False def numerical_equal(self, expression1: str, expression2: str, include_percentage: bool = True): @@ -167,17 +164,14 @@ def numerical_equal(self, expression1: str, expression2: str, include_percentage # Includes possible percentage cases reference = float(expression1) prediction = float(expression2) - if include_percentage: gt_result = [reference / 100, reference, reference * 100] else: gt_result = [reference] - for item in gt_result: if abs(item - prediction) <= self.precision * 1.01: return True return False - def expression_equal(self, exp1, exp2): # Check if two expressions are mathematically equivalent @@ -186,7 +180,7 @@ def extract_expression(expression): if "=" in expression: expression = expression.split("=")[1] return expression.strip() - + exp1 = extract_expression(exp1) exp2 = extract_expression(exp2) @@ -204,7 +198,7 @@ def extract_expression(expression): elif not expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol): try: if not (self.can_compute_power(expr1_sym) and self.can_compute_power(expr2_sym)): - print(f"These two numbers cannot be calculated by the current computer for: \"{str(expr1_sym)}\" and \"{str(expr2_sym)}\"") + print(f'These two numbers cannot be calculated by the current computer for: "{str(expr1_sym)}" and "{str(expr2_sym)}"') return False if abs(expr1_sym.evalf() - expr2_sym.evalf()) <= self.precision * 1.01: @@ -218,7 +212,6 @@ def extract_expression(expression): simplified_expr = simplify(expr1_sym - expr2_sym) num_value = simplified_expr.evalf() - return abs(num_value) < 1e-3 except: return False @@ -227,7 +220,7 @@ def equation_equal(self, expression1, expression2): # Check if two equations are mathematically equivalent # Simplify equations and use sympy for equivalence checking def simplify_equation(latex_eq): - lhs, rhs = latex_eq.split('=') + lhs, rhs = latex_eq.split("=") lhs_expr = parse_latex(lhs) rhs_expr = parse_latex(rhs) @@ -254,18 +247,18 @@ def interval_equal(self, 
expression1, expression2): def compare_two_interval(inter1, inter2): if inter1[0] != inter2[0] or inter1[-1] != inter2[-1]: return False - - inter1 = inter1.strip('[]()') - inter2 = inter2.strip('[]()') - items_1 = inter1.split(',') - items_2 = inter2.split(',') + inter1 = inter1.strip("[]()") + inter2 = inter2.strip("[]()") + + items_1 = inter1.split(",") + items_2 = inter2.split(",") for item_1, item_2 in zip(items_1, items_2): if not self.expression_equal(item_1, item_2): return False return True - + interval1 = expression1 interval2 = expression2 @@ -274,7 +267,6 @@ def compare_two_interval(inter1, inter2): else: inter_list1 = interval1.split("\\cup") inter_list2 = interval2.split("\\cup") - if len(inter_list1) != len(inter_list2): return False else: @@ -286,7 +278,7 @@ def compare_two_interval(inter1, inter2): def preprocess(self, expression1, expression2): # Preprocess expressions to extract and replace special symbols def extract_boxed_content(latex_str): - boxed_matches = re.finditer(r'\\boxed{', latex_str) + boxed_matches = re.finditer(r"\\boxed{", latex_str) results = "" for match in boxed_matches: @@ -295,14 +287,14 @@ def extract_boxed_content(latex_str): stack = 1 while stack > 0 and end_index < len(latex_str): - if latex_str[end_index] == '{': + if latex_str[end_index] == "{": stack += 1 - elif latex_str[end_index] == '}': + elif latex_str[end_index] == "}": stack -= 1 end_index += 1 if stack == 0: - content = latex_str[start_index:end_index - 1] + content = latex_str[start_index : end_index - 1] results += content + "," else: raise ValueError("Mismatched braces in LaTeX string.") @@ -317,28 +309,28 @@ def extract_boxed_content(latex_str): results += ans + "," else: results = latex_str - + return results - + def sepcial_symbol_replace(expression): if "\\in " in expression: expression = expression.split("\\in ")[1] - + for signal in self.special_signal_map: expression = expression.replace(signal, self.special_signal_map[signal]) expression = expression.strip("\n$,.:;^_=+`!@#$%^&*~,。") - pattern = r'\\(?:mathrm|mathbf)\{~?([^}]*)\}' - expression = re.sub(pattern, r'\1', expression) + pattern = r"\\(?:mathrm|mathbf)\{~?([^}]*)\}" + expression = re.sub(pattern, r"\1", expression) return expression - + exp1, exp2 = extract_boxed_content(expression1), extract_boxed_content(expression2) exp1, exp2 = sepcial_symbol_replace(exp1), sepcial_symbol_replace(exp2) return exp1, exp2 - + def can_compute_power(self, expr): # Checks if a power expression can be computed if isinstance(expr, Pow): @@ -352,4 +344,4 @@ def can_compute_power(self, expr): else: return False else: - return True # Not a power expression, can compute \ No newline at end of file + return True # Not a power expression, can compute diff --git a/lmms_eval/tasks/pope/pope.yaml b/lmms_eval/tasks/pope/pope.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/pope/utils.py b/lmms_eval/tasks/pope/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/realworldqa/realworldqa.yaml b/lmms_eval/tasks/realworldqa/realworldqa.yaml new file mode 100644 index 00000000..9d7dfcb3 --- /dev/null +++ b/lmms_eval/tasks/realworldqa/realworldqa.yaml @@ -0,0 +1,42 @@ +dataset_path: lmms-lab/RealWorldQA +dataset_kwargs: + token: True +task: "realworldqa" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.realworldqa_doc_to_visual +doc_to_text: !function utils.realworldqa_doc_to_text +doc_to_target: "answer" + +generation_kwargs: + max_new_tokens: 16 + temperature: 0 + top_p: 0 + num_beams: 
1 + do_sample: false + +filter_list: + - name: "flexible-extract" + filter: + - function: !function utils.MultiChoiceRegexFilter + group_select: 0 + ignore_case: true + ignore_punctuation: true + regex_pattern: "(\\([A-Z]\\))" + +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + +model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "" + gpt4v: + pre_prompt: "" + post_prompt: "" +metadata: + - version: 0.0 diff --git a/lmms_eval/tasks/realworldqa/utils.py b/lmms_eval/tasks/realworldqa/utils.py new file mode 100644 index 00000000..9ed645ed --- /dev/null +++ b/lmms_eval/tasks/realworldqa/utils.py @@ -0,0 +1,117 @@ +from lmms_eval.filters.extraction import ExtendedRegexFilter +from lmms_eval.filters.transformation import MapFilter +import re + +REPLACE_PROMPT = "Please answer directly with only the letter of the correct option and nothing else." + + +def realworldqa_doc_to_visual(doc): + return [doc["image"].convert("RGB")] + + +def realworldqa_doc_to_text(doc, model_specific_prompt_kwargs=None): + if model_specific_prompt_kwargs is None: + model_specific_prompt_kwargs = {} + pre_prompt = "" + post_prompt = "" + question = doc["question"].strip() + if "pre_prompt" in model_specific_prompt_kwargs: + pre_prompt = model_specific_prompt_kwargs["pre_prompt"] + if "post_prompt" in model_specific_prompt_kwargs and model_specific_prompt_kwargs["post_prompt"]: + question = question.replace(REPLACE_PROMPT, "") + post_prompt = model_specific_prompt_kwargs["post_prompt"] + return f"{pre_prompt}{question}{post_prompt}" + + +# number_words_to_digits = { +# "zero": "0", "one": "1", "two": "2", "three": "3", "four": "4", +# "five": "5", "six": "6", "seven": "7", "eight": "8", "nine": "9", +# "ten": "10" +# } + + +def realworldqa_process_results(doc, results): + pred = results[0].lower().strip().rstrip(".") + gt_ans = doc["answer"].lower().strip() + + print(f"Prediction: {pred}, Ground Truth: {gt_ans}") + # assert gt_ans in ["a", "b", "c", "d"] + score = 1.0 if pred == gt_ans else 0.0 + return { + "exact_match": score, + } + + +class NumberWordsToDigitsFilter(MapFilter): + def __init__(self) -> None: + mapping_dict = {"zero": "0", "one": "1", "two": "2", "three": "3", "four": "4", "five": "5", "six": "6", "seven": "7", "eight": "8", "nine": "9", "ten": "10"} + super().__init__(mapping_dict, default_value=None) + + def apply(self, resps, docs): + def filter_set(inst): + return [self.mapping_dict.get(resp.lower(), resp) for resp in inst] + + return [filter_set(resp) for resp in resps] + + +class MultiChoiceRegexFilter(ExtendedRegexFilter): + def __init__(self, *args, **kwargs): + """ + regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure + - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response. + - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices. + group_select: Selects the (group_select)th match from the findall result. + ignore_case: Ignores the case during step 1 matching + ignore_punctuation: Remove the punctuation during step 1 matching + regexes_to_ignore: Remove these regexes during step 1 matching + """ + super().__init__(*args, **kwargs) + + def apply(self, resps, docs): + # here, we assume we have a list, in which each element is + # a list of model responses for some particular input/target pair. 
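As a toy walkthrough of the fallback matching described in the docstring above (the question text and response below are invented for the example, not taken from RealWorldQA):

    import re

    question = "What is shown?\nA. a red car\nB. a blue bus"
    matches = re.findall(r"\b([A-Z])\.\s+([^\n]*)", question)     # [('A', 'a red car'), ('B', 'a blue bus')]
    choice_to_alpha = {text.strip(): letter for letter, text in matches}
    response = "I think it is a blue bus."
    cleaned = re.sub(r"[^\w\s]", "", response).strip()            # strip punctuation before matching
    fallback_regex = re.compile("|".join(re.escape(text) for text in choice_to_alpha))
    match = fallback_regex.search(cleaned)
    print(choice_to_alpha[match.group()] if match else cleaned)   # -> B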
+ # so we process each of these (same input/target response sets) + # independently (and keep them a list.) + + filtered_resps = [] + + for r, doc in zip(resps, docs): + fallback_regexes = [] + choice_to_alpha = {} + next_alpha = "A" + + without_paren_fallback_regexes = [] + without_paren_to_target = {} + + # Regex to extract multiple choice options from the question + multiple_choices_regex = re.compile(r"\b([A-Z])\.\s+([^\n]*)") + matches = multiple_choices_regex.findall(doc["question"]) + + # Build regex patterns and mappings for each choice + for m in matches: + choice_text = m[1].strip() + fallback_regexes.append(f"{re.escape(choice_text)}") + choice_to_alpha[choice_text] = next_alpha + + next_alpha = chr(ord(next_alpha) + 1) + + # Compile regex to match any of the extracted choices + fallback_regex = re.compile("|".join(fallback_regexes)) + + # Process each response + filtered = [] + for resp in r: + # Remove any punctuation and extra spaces + cleaned_resp = re.sub(r"[^\w\s]", "", resp).strip() + # Try to match cleaned response with the choice text + match = fallback_regex.search(cleaned_resp) + if match and match.group() in choice_to_alpha: + # Map the matched choice text back to its corresponding letter + filtered.append(choice_to_alpha[match.group()]) + else: + # If no match, return the cleaned response + filtered.append(cleaned_resp) + + filtered_resps.append(filtered[0]) + + return filtered_resps diff --git a/lmms_eval/tasks/refcoco+/_default_template_bbox_yaml b/lmms_eval/tasks/refcoco+/_default_template_bbox_yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcoco+/_default_template_seg_yaml b/lmms_eval/tasks/refcoco+/_default_template_seg_yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcoco+/_generate_config.py b/lmms_eval/tasks/refcoco+/_generate_config.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcoco+/_refcoco.yaml b/lmms_eval/tasks/refcoco+/_refcoco.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcoco+/refcoco+_bbox_testA.yaml b/lmms_eval/tasks/refcoco+/refcoco+_bbox_testA.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcoco+/refcoco+_bbox_testB.yaml b/lmms_eval/tasks/refcoco+/refcoco+_bbox_testB.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcoco+/refcoco+_bbox_val.yaml b/lmms_eval/tasks/refcoco+/refcoco+_bbox_val.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcoco+/refcoco+_seg_testA.yaml b/lmms_eval/tasks/refcoco+/refcoco+_seg_testA.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcoco+/refcoco+_seg_testB.yaml b/lmms_eval/tasks/refcoco+/refcoco+_seg_testB.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcoco+/refcoco+_seg_val.yaml b/lmms_eval/tasks/refcoco+/refcoco+_seg_val.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcoco+/utils.py b/lmms_eval/tasks/refcoco+/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcoco/_default_template_bbox_yaml b/lmms_eval/tasks/refcoco/_default_template_bbox_yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcoco/_default_template_seg_yaml b/lmms_eval/tasks/refcoco/_default_template_seg_yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcoco/_generate_config.py b/lmms_eval/tasks/refcoco/_generate_config.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcoco/_refcoco.yaml b/lmms_eval/tasks/refcoco/_refcoco.yaml old mode 100644 new mode 100755 diff --git 
a/lmms_eval/tasks/refcoco/refcoco_bbox_test.yaml b/lmms_eval/tasks/refcoco/refcoco_bbox_test.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcoco/refcoco_bbox_testA.yaml b/lmms_eval/tasks/refcoco/refcoco_bbox_testA.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcoco/refcoco_bbox_testB.yaml b/lmms_eval/tasks/refcoco/refcoco_bbox_testB.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcoco/refcoco_bbox_val.yaml b/lmms_eval/tasks/refcoco/refcoco_bbox_val.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcoco/refcoco_seg_test.yaml b/lmms_eval/tasks/refcoco/refcoco_seg_test.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcoco/refcoco_seg_testA.yaml b/lmms_eval/tasks/refcoco/refcoco_seg_testA.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcoco/refcoco_seg_testB.yaml b/lmms_eval/tasks/refcoco/refcoco_seg_testB.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcoco/refcoco_seg_val.yaml b/lmms_eval/tasks/refcoco/refcoco_seg_val.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcoco/utils.py b/lmms_eval/tasks/refcoco/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcocog/_default_template_bbox_yaml b/lmms_eval/tasks/refcocog/_default_template_bbox_yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcocog/_default_template_seg_yaml b/lmms_eval/tasks/refcocog/_default_template_seg_yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcocog/_generate_config.py b/lmms_eval/tasks/refcocog/_generate_config.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcocog/_refcoco.yaml b/lmms_eval/tasks/refcocog/_refcoco.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcocog/refcocog_bbox_test.yaml b/lmms_eval/tasks/refcocog/refcocog_bbox_test.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcocog/refcocog_bbox_val.yaml b/lmms_eval/tasks/refcocog/refcocog_bbox_val.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcocog/refcocog_seg_test.yaml b/lmms_eval/tasks/refcocog/refcocog_seg_test.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcocog/refcocog_seg_val.yaml b/lmms_eval/tasks/refcocog/refcocog_seg_val.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/refcocog/utils.py b/lmms_eval/tasks/refcocog/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/scienceqa/scienceqa.yaml b/lmms_eval/tasks/scienceqa/scienceqa.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/scienceqa/scienceqa_full.yaml b/lmms_eval/tasks/scienceqa/scienceqa_full.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/scienceqa/scienceqa_img.yaml b/lmms_eval/tasks/scienceqa/scienceqa_img.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/scienceqa/utils.py b/lmms_eval/tasks/scienceqa/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/seedbench/seedbench.yaml b/lmms_eval/tasks/seedbench/seedbench.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/seedbench/seedbench_ppl.yaml b/lmms_eval/tasks/seedbench/seedbench_ppl.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/seedbench/utils.py b/lmms_eval/tasks/seedbench/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/seedbench_2/seedbench_2.yaml b/lmms_eval/tasks/seedbench_2/seedbench_2.yaml old mode 100644 new mode 100755 diff --git 
a/lmms_eval/tasks/seedbench_2/utils.py b/lmms_eval/tasks/seedbench_2/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/sft_eval/_default_sft_eval_ocr_rec_template_yaml b/lmms_eval/tasks/sft_eval/_default_sft_eval_ocr_rec_template_yaml new file mode 100644 index 00000000..b00317d6 --- /dev/null +++ b/lmms_eval/tasks/sft_eval/_default_sft_eval_ocr_rec_template_yaml @@ -0,0 +1,23 @@ +dataset_path: lmms-lab/sft-eval +output_type: generate_until +doc_to_visual: !function utils.sft_eval_doc_to_visual +doc_to_text: !function utils.sft_eval_doc_to_text +doc_to_target: "answer" +generation_kwargs: + max_new_tokens: 128 + until: + - "ASSISTANT:" +process_results: !function utils.process_result_ocr_rec +metric_list: + - metric: edit_distance + aggregation : !function utils.sft_eval_edit_dist_agg + higher_is_better : true + - metric: edit_acc + aggregation : !function utils.sft_eval_edit_dist_acc_agg + higher_is_better : true +metadata: + version: '0.0' +model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "" \ No newline at end of file diff --git a/lmms_eval/tasks/sft_eval/_default_sft_eval_rest_template_yaml b/lmms_eval/tasks/sft_eval/_default_sft_eval_rest_template_yaml new file mode 100644 index 00000000..b658dfb4 --- /dev/null +++ b/lmms_eval/tasks/sft_eval/_default_sft_eval_rest_template_yaml @@ -0,0 +1,20 @@ +dataset_path: lmms-lab/sft-eval +output_type: generate_until +doc_to_visual: !function utils.sft_eval_doc_to_visual +doc_to_text: !function utils.sft_eval_doc_to_text +doc_to_target: "answer" +generation_kwargs: + max_new_tokens: 128 + until: + - "ASSISTANT:" +process_results: !function utils.process_result_rest +metric_list: + - metric: fuzzy_match + aggregation : !function utils.sft_eval_acc_agg + higher_is_better : true +metadata: + version: '0.0' +model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "" \ No newline at end of file diff --git a/lmms_eval/tasks/sft_eval/_generate_config.py b/lmms_eval/tasks/sft_eval/_generate_config.py new file mode 100644 index 00000000..e1dbcb83 --- /dev/null +++ b/lmms_eval/tasks/sft_eval/_generate_config.py @@ -0,0 +1,94 @@ +import os +import yaml + +splits = [ + "sft-activity", + "sft-arts", + "sft-body", + "sft-car", + "sft-color", + "sft-commodity", + "sft-count", + "sft-daily", + "sft-engineer", + "sft-entertainment", + "sft-exist", + "sft-face", + "sft-food", + "sft-healthcare", + "sft-landmark", + "sft-logo", + "sft-natural", + "sft-ocr_qa_adv", + "sft-ocr_qa_chart", + "sft-ocr_qa_form", + "sft-ocr_qa_scene", + "sft-ocr_qa_screen", + "sft-ocr_rec_adv", + "sft-ocr_rec_doc", + "sft-ocr_rec_handwrite", + "sft-ocr_rec_markdown", + "sft-ocr_rec_scene", + "sft-ocr_rec_screen", + "sft-place", + "sft-position", + "sft-sport", + "sft-status", +] +dir_path = os.path.dirname(os.path.realpath(__file__)) + +local_name2official_name = { + "sft-ocr_rec_scene": "sft_ocr_scene_cn_eval", + "sft-ocr_rec_screen": "sft_ocr_screen_cn_eval", + "sft-ocr_rec_handwrite": "sft_ocr_handwrite_cn_eval", + "sft-ocr_rec_adv": "sft_ocr_adv_cn_eval", + "sft-ocr_rec_doc": "sft_ocr_doc_cn_eval", + "sft-ocr_qa_scene": "sft_ocr_sceneQA_cn_eval", + "sft-ocr_qa_screen": "sft_ocr_screenQA_cn_eval", + "sft-ocr_qa_adv": "sft_ocr_ecommerceQA_cn_eval", + "sft-ocr_rec_markdown": "sft_ocr_markdown_cn_eval", + "sft-ocr_qa_form": "sft_ocr_formQA_cn_eval", + "sft-ocr_qa_chart": "sft_ocr_chartQA_cn_eval", + "sft-face": "sft_celeb_cn_eval", + "sft-body": "sft_body_cn_eval", + "sft-count": "sft_count_cn_eval", + "sft-position": 
"sft_position_cn_eval", + "#": "sft_visualprompt_cn_eval", + "!": "sft_grounding_cn_eval", + "sft-exist": "sft_exist_cn_eval", + "sft-color": "sft_color_cn_eval", + "sft-status": "sft_status_cn_eval", + "sft-activity": "sft_activity_cn_eval", + "sft-place": "sft_place_cn_eval", + "sft-daily": "sft_daily_cn_eval", + "sft-arts": "sft_arts_cn_eval", + "sft-natural": "sft_natural_cn_eval", + "sft-engineer": "sft_engineer_cn_eval", + "sft-healthcare": "sft_healthcare_cn_eval", + "sft-entertainment": "sft_entertainment_cn_eval", + "sft-sport": "sft_sport_cn_eval", + "sft-commodity": "sft_commodity_cn_eval", + "sft-food": "sft_food_cn_eval", + "sft-car": "sft_car_cn_eval", + "sft-landmark": "sft_landmark_cn_eval", +} + +task_name = [local_name2official_name[split] if split in local_name2official_name else split for split in splits] +splits = [split.replace("-", "_") for split in splits] + +if __name__ == "__main__": + for split, task in zip(splits, task_name): + yaml_dict = {"group": f"sft_eval", "task": task, "test_split": split, "dataset_name": split} + save_path = os.path.join(dir_path, f"{split}.yaml") + if "ocr" in split and "rec" in split: + yaml_dict["include"] = "_default_sft_eval_ocr_rec_template_yaml" + else: + yaml_dict["include"] = "_default_sft_eval_rest_template_yaml" + print(f"Saving to {save_path}") + with open(save_path, "w") as f: + yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) + + group_dict = {"group": "sft_eval", "task": splits} + + with open(os.path.join(dir_path, "_sft_eval.yaml"), "w") as f: + yaml.dump(group_dict, f, default_flow_style=False, indent=4) diff --git a/lmms_eval/tasks/sft_eval/_sft_eval.yaml b/lmms_eval/tasks/sft_eval/_sft_eval.yaml new file mode 100644 index 00000000..c336a4dd --- /dev/null +++ b/lmms_eval/tasks/sft_eval/_sft_eval.yaml @@ -0,0 +1,34 @@ +group: sft_eval +task: +- sft_activity +- sft_arts +- sft_body +- sft_car +- sft_color +- sft_commodity +- sft_count +- sft_daily +- sft_engineer +- sft_entertainment +- sft_exist +- sft_face +- sft_food +- sft_healthcare +- sft_landmark +- sft_logo +- sft_natural +- sft_ocr_qa_adv +- sft_ocr_qa_chart +- sft_ocr_qa_form +- sft_ocr_qa_scene +- sft_ocr_qa_screen +- sft_ocr_rec_adv +- sft_ocr_rec_doc +- sft_ocr_rec_handwrite +- sft_ocr_rec_markdown +- sft_ocr_rec_scene +- sft_ocr_rec_screen +- sft_place +- sft_position +- sft_sport +- sft_status diff --git a/lmms_eval/tasks/sft_eval/sft_activity.yaml b/lmms_eval/tasks/sft_eval/sft_activity.yaml new file mode 100644 index 00000000..d8bc91b5 --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_activity.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_activity_cn_eval +test_split: sft_activity +dataset_name: sft_activity +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_arts.yaml b/lmms_eval/tasks/sft_eval/sft_arts.yaml new file mode 100644 index 00000000..592b2892 --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_arts.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_arts_cn_eval +test_split: sft_arts +dataset_name: sft_arts +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_body.yaml b/lmms_eval/tasks/sft_eval/sft_body.yaml new file mode 100644 index 00000000..6e0edb3f --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_body.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_body_cn_eval +test_split: sft_body +dataset_name: sft_body +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_car.yaml b/lmms_eval/tasks/sft_eval/sft_car.yaml new file 
mode 100644 index 00000000..e8216934 --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_car.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_car_cn_eval +test_split: sft_car +dataset_name: sft_car +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_color.yaml b/lmms_eval/tasks/sft_eval/sft_color.yaml new file mode 100644 index 00000000..85f912af --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_color.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_color_cn_eval +test_split: sft_color +dataset_name: sft_color +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_commodity.yaml b/lmms_eval/tasks/sft_eval/sft_commodity.yaml new file mode 100644 index 00000000..3c37c5e0 --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_commodity.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_commodity_cn_eval +test_split: sft_commodity +dataset_name: sft_commodity +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_count.yaml b/lmms_eval/tasks/sft_eval/sft_count.yaml new file mode 100644 index 00000000..b4184448 --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_count.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_count_cn_eval +test_split: sft_count +dataset_name: sft_count +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_daily.yaml b/lmms_eval/tasks/sft_eval/sft_daily.yaml new file mode 100644 index 00000000..540f657e --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_daily.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_daily_cn_eval +test_split: sft_daily +dataset_name: sft_daily +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_engineer.yaml b/lmms_eval/tasks/sft_eval/sft_engineer.yaml new file mode 100644 index 00000000..bd2532b7 --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_engineer.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_engineer_cn_eval +test_split: sft_engineer +dataset_name: sft_engineer +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_entertainment.yaml b/lmms_eval/tasks/sft_eval/sft_entertainment.yaml new file mode 100644 index 00000000..04f84127 --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_entertainment.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_entertainment_cn_eval +test_split: sft_entertainment +dataset_name: sft_entertainment +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_exist.yaml b/lmms_eval/tasks/sft_eval/sft_exist.yaml new file mode 100644 index 00000000..b3ec83f7 --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_exist.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_exist_cn_eval +test_split: sft_exist +dataset_name: sft_exist +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_face.yaml b/lmms_eval/tasks/sft_eval/sft_face.yaml new file mode 100644 index 00000000..5dd6962f --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_face.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_celeb_cn_eval +test_split: sft_face +dataset_name: sft_face +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_food.yaml b/lmms_eval/tasks/sft_eval/sft_food.yaml new file mode 100644 index 00000000..a6a3dad4 --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_food.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_food_cn_eval +test_split: sft_food +dataset_name: sft_food +include: _default_sft_eval_rest_template_yaml diff --git 
a/lmms_eval/tasks/sft_eval/sft_healthcare.yaml b/lmms_eval/tasks/sft_eval/sft_healthcare.yaml new file mode 100644 index 00000000..c63d093a --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_healthcare.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_healthcare_cn_eval +test_split: sft_healthcare +dataset_name: sft_healthcare +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_landmark.yaml b/lmms_eval/tasks/sft_eval/sft_landmark.yaml new file mode 100644 index 00000000..4c4c1209 --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_landmark.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_landmark_cn_eval +test_split: sft_landmark +dataset_name: sft_landmark +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_logo.yaml b/lmms_eval/tasks/sft_eval/sft_logo.yaml new file mode 100644 index 00000000..35116a2d --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_logo.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft-logo +test_split: sft_logo +dataset_name: sft_logo +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_natural.yaml b/lmms_eval/tasks/sft_eval/sft_natural.yaml new file mode 100644 index 00000000..78668e9f --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_natural.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_natural_cn_eval +test_split: sft_natural +dataset_name: sft_natural +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_ocr_qa_adv.yaml b/lmms_eval/tasks/sft_eval/sft_ocr_qa_adv.yaml new file mode 100644 index 00000000..0e7243ff --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_ocr_qa_adv.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_ocr_ecommerceQA_cn_eval +test_split: sft_ocr_qa_adv +dataset_name: sft_ocr_qa_adv +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_ocr_qa_chart.yaml b/lmms_eval/tasks/sft_eval/sft_ocr_qa_chart.yaml new file mode 100644 index 00000000..031eaa72 --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_ocr_qa_chart.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_ocr_chartQA_cn_eval +test_split: sft_ocr_qa_chart +dataset_name: sft_ocr_qa_chart +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_ocr_qa_form.yaml b/lmms_eval/tasks/sft_eval/sft_ocr_qa_form.yaml new file mode 100644 index 00000000..18e059e8 --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_ocr_qa_form.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_ocr_formQA_cn_eval +test_split: sft_ocr_qa_form +dataset_name: sft_ocr_qa_form +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_ocr_qa_scene.yaml b/lmms_eval/tasks/sft_eval/sft_ocr_qa_scene.yaml new file mode 100644 index 00000000..0af342d9 --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_ocr_qa_scene.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_ocr_sceneQA_cn_eval +test_split: sft_ocr_qa_scene +dataset_name: sft_ocr_qa_scene +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_ocr_qa_screen.yaml b/lmms_eval/tasks/sft_eval/sft_ocr_qa_screen.yaml new file mode 100644 index 00000000..586194a9 --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_ocr_qa_screen.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_ocr_screenQA_cn_eval +test_split: sft_ocr_qa_screen +dataset_name: sft_ocr_qa_screen +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_ocr_rec_adv.yaml b/lmms_eval/tasks/sft_eval/sft_ocr_rec_adv.yaml new file mode 100644 index 
00000000..69e116f8 --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_ocr_rec_adv.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_ocr_adv_cn_eval +test_split: sft_ocr_rec_adv +dataset_name: sft_ocr_rec_adv +include: _default_sft_eval_ocr_rec_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_ocr_rec_doc.yaml b/lmms_eval/tasks/sft_eval/sft_ocr_rec_doc.yaml new file mode 100644 index 00000000..0d498c46 --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_ocr_rec_doc.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_ocr_doc_cn_eval +test_split: sft_ocr_rec_doc +dataset_name: sft_ocr_rec_doc +include: _default_sft_eval_ocr_rec_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_ocr_rec_handwrite.yaml b/lmms_eval/tasks/sft_eval/sft_ocr_rec_handwrite.yaml new file mode 100644 index 00000000..5eb39256 --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_ocr_rec_handwrite.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_ocr_handwrite_cn_eval +test_split: sft_ocr_rec_handwrite +dataset_name: sft_ocr_rec_handwrite +include: _default_sft_eval_ocr_rec_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_ocr_rec_markdown.yaml b/lmms_eval/tasks/sft_eval/sft_ocr_rec_markdown.yaml new file mode 100644 index 00000000..7fb34090 --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_ocr_rec_markdown.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_ocr_markdown_cn_eval +test_split: sft_ocr_rec_markdown +dataset_name: sft_ocr_rec_markdown +include: _default_sft_eval_ocr_rec_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_ocr_rec_scene.yaml b/lmms_eval/tasks/sft_eval/sft_ocr_rec_scene.yaml new file mode 100644 index 00000000..8ef84382 --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_ocr_rec_scene.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_ocr_scene_cn_eval +test_split: sft_ocr_rec_scene +dataset_name: sft_ocr_rec_scene +include: _default_sft_eval_ocr_rec_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_ocr_rec_screen.yaml b/lmms_eval/tasks/sft_eval/sft_ocr_rec_screen.yaml new file mode 100644 index 00000000..eba50bdb --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_ocr_rec_screen.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_ocr_screen_cn_eval +test_split: sft_ocr_rec_screen +dataset_name: sft_ocr_rec_screen +include: _default_sft_eval_ocr_rec_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_place.yaml b/lmms_eval/tasks/sft_eval/sft_place.yaml new file mode 100644 index 00000000..1bc615b4 --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_place.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_place_cn_eval +test_split: sft_place +dataset_name: sft_place +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_position.yaml b/lmms_eval/tasks/sft_eval/sft_position.yaml new file mode 100644 index 00000000..ef489818 --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_position.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_position_cn_eval +test_split: sft_position +dataset_name: sft_position +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_sport.yaml b/lmms_eval/tasks/sft_eval/sft_sport.yaml new file mode 100644 index 00000000..94a6140b --- /dev/null +++ b/lmms_eval/tasks/sft_eval/sft_sport.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_sport_cn_eval +test_split: sft_sport +dataset_name: sft_sport +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/sft_status.yaml b/lmms_eval/tasks/sft_eval/sft_status.yaml new file mode 100644 index 00000000..9f28ba51 --- /dev/null +++ 
b/lmms_eval/tasks/sft_eval/sft_status.yaml @@ -0,0 +1,5 @@ +group: sft_eval +task: sft_status_cn_eval +test_split: sft_status +dataset_name: sft_status +include: _default_sft_eval_rest_template_yaml diff --git a/lmms_eval/tasks/sft_eval/utils.py b/lmms_eval/tasks/sft_eval/utils.py new file mode 100644 index 00000000..8ed2ef0c --- /dev/null +++ b/lmms_eval/tasks/sft_eval/utils.py @@ -0,0 +1,109 @@ +from rapidfuzz.distance import Levenshtein + + +def _normalize_text(text): + import string + + # Keep only ASCII digits and letters so that punctuation and whitespace are ignored + return "".join(filter(lambda x: x in (string.digits + string.ascii_letters), text)) + + +def process_result_ocr_rec(doc, result): + return {"edit_distance": {"gts": [gt.replace("\n", "") for gt in doc["answer"]], "predictions": result}, "edit_acc": {"gts": [gt.replace("\n", "") for gt in doc["answer"]], "predictions": result}} + + +def process_result_rest(doc, result): + return {"fuzzy_match": {"gts": [gt.replace("\n", "") for gt in doc["answer"]], "predictions": result, "question_id": doc["question_id"]}} + + +def run_editdistance(gts, predictions, ignore_space=True, is_filter=False): + eps = 1e-6 + correct_num = 0 + all_num = 0 + norm_edit_dis = 0.0 + edit_norm_score_list = list() + for idx in range(len(predictions)): + target, pred = gts[idx][0], predictions[idx] + if ignore_space: + pred = pred.replace(" ", "") + target = target.replace(" ", "") + if is_filter: + pred = _normalize_text(pred) + target = _normalize_text(target) + + ned = Levenshtein.normalized_distance(pred, target) + norm_edit_dis += ned + edit_norm_score_list.append(1 - ned) + if pred == target: + correct_num += 1 + all_num += 1 + metric = {"acc": correct_num / (all_num + eps), "norm_edit_dis": 1 - norm_edit_dis / (all_num + eps)} + return metric, edit_norm_score_list + + +def fuzzy_match_multi_answers(results, gt_dict): + acc = [] + for result in results: + question_id = result["question_id"] + gt_ans = gt_dict[question_id] + pred = result["text"] + # Short Chinese yes-type answers only count when their negated form is absent; + # all other answers fall back to a case-insensitive substring match. + for gt in gt_ans: + vqa_acc = 1 + if not ( + (gt == "是" and gt in pred and "不是" not in pred) + or (gt == "对" and gt in pred and "不对" not in pred) + or (gt == "相同" and gt in pred and "不相同" not in pred) + or (gt == "有" and gt in pred and "没有" not in pred) + or (gt == "在" and gt in pred and "不在" not in pred) + or (gt == "一样" and gt in pred and "不一样" not in pred) + or (gt not in ["是", "在", "对", "有", "一样", "相同"] and gt.lower() in pred.lower()) + ): + vqa_acc = 0 + if vqa_acc == 1: + break + acc.append(vqa_acc) + accuracy = sum(acc) / len(acc) * 100 + return {"Acc": accuracy} + + +def sft_eval_doc_to_visual(doc): + return [doc["image"].convert("RGB")] + + +def sft_eval_doc_to_text(doc, model_specific_prompt_kwargs=None): + if model_specific_prompt_kwargs is None: + model_specific_prompt_kwargs = {} + pre_prompt = model_specific_prompt_kwargs.get("pre_prompt", "") + post_prompt = model_specific_prompt_kwargs.get("post_prompt", "") + question = f"{pre_prompt}{doc['question']}{post_prompt}" + return question + + +def sft_eval_edit_dist_acc_agg(results): + predictions = [result["predictions"][0] for result in results] + gts = [result["gts"] for result in results] + acc, _ = run_editdistance(gts, predictions) + return acc["acc"] + + +def sft_eval_edit_dist_agg(results): + predictions = [result["predictions"][0] for result in results] + gts = [result["gts"] for result in results] + acc, _ = run_editdistance(gts, predictions) + return acc["norm_edit_dis"] + + 
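+# Illustrative sanity check for the two edit-distance aggregators above; the sample
+# strings are made up, and the __main__ guard keeps this from running during evaluation.
+if __name__ == "__main__":
+    _demo_results = [
+        {"gts": ["helloworld"], "predictions": ["hello world"]},
+        {"gts": ["foo baz"], "predictions": ["foobar"]},
+    ]
+    print(sft_eval_edit_dist_acc_agg(_demo_results))  # exact-match accuracy after space removal: ~0.5
+    print(sft_eval_edit_dist_agg(_demo_results))  # mean (1 - normalized edit distance): ~0.92
+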
+def sft_eval_acc_agg(results): + gts_dict = {result["question_id"]: result["gts"] for result in results} + predictions = [{"question_id": result["question_id"], "text": result["predictions"][0]} for result in results] + acc = fuzzy_match_multi_answers(predictions, gts_dict) + return acc["Acc"] diff --git a/lmms_eval/tasks/stvqa/stvqa.yaml b/lmms_eval/tasks/stvqa/stvqa.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/stvqa/utils.py b/lmms_eval/tasks/stvqa/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/textcaps/_default_template_textcaps_yaml b/lmms_eval/tasks/textcaps/_default_template_textcaps_yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/textcaps/textcaps.yaml b/lmms_eval/tasks/textcaps/textcaps.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/textcaps/textcaps_test.yaml b/lmms_eval/tasks/textcaps/textcaps_test.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/textcaps/textcaps_train.yaml b/lmms_eval/tasks/textcaps/textcaps_train.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/textcaps/textcaps_val.yaml b/lmms_eval/tasks/textcaps/textcaps_val.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/textcaps/utils.py b/lmms_eval/tasks/textcaps/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/textvqa/_default_template_textvqa_yaml b/lmms_eval/tasks/textvqa/_default_template_textvqa_yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/textvqa/_textvqa.yaml b/lmms_eval/tasks/textvqa/_textvqa.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/textvqa/textvqa_test.yaml b/lmms_eval/tasks/textvqa/textvqa_test.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/textvqa/textvqa_val.yaml b/lmms_eval/tasks/textvqa/textvqa_val.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/textvqa/utils.py b/lmms_eval/tasks/textvqa/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/vizwiz_vqa/_default_template_vqa_yaml b/lmms_eval/tasks/vizwiz_vqa/_default_template_vqa_yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/vizwiz_vqa/_generate_config.py b/lmms_eval/tasks/vizwiz_vqa/_generate_config.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/vizwiz_vqa/_vizwiz_vqa.yaml b/lmms_eval/tasks/vizwiz_vqa/_vizwiz_vqa.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/vizwiz_vqa/utils.py b/lmms_eval/tasks/vizwiz_vqa/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_test.yaml b/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_test.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_val.yaml b/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_val.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/vqav2/_default_template_vqav2_yaml b/lmms_eval/tasks/vqav2/_default_template_vqav2_yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/vqav2/_vqav2.yaml b/lmms_eval/tasks/vqav2/_vqav2.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/vqav2/utils.py b/lmms_eval/tasks/vqav2/utils.py old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/vqav2/vqav2_test.yaml b/lmms_eval/tasks/vqav2/vqav2_test.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/vqav2/vqav2_val.yaml b/lmms_eval/tasks/vqav2/vqav2_val.yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/worldqa/_default_template_yaml b/lmms_eval/tasks/worldqa/_default_template_yaml new file mode 
100644 index 00000000..7ce14a25 --- /dev/null +++ b/lmms_eval/tasks/worldqa/_default_template_yaml @@ -0,0 +1,10 @@ +dataset_path: lmms-lab/worldqa +dataset_kwargs: + token: True + video: True + builder_script: multi-hop-reasoning.py + cache_dir: multi-hop-reasoning +model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "" \ No newline at end of file diff --git a/lmms_eval/tasks/worldqa/utils.py b/lmms_eval/tasks/worldqa/utils.py new file mode 100755 index 00000000..642dea1e --- /dev/null +++ b/lmms_eval/tasks/worldqa/utils.py @@ -0,0 +1,113 @@ +from decord import VideoReader, cpu +import numpy as np +import os +import sys +import datetime +import lmms_eval.tasks._task_utils.file_utils as file_utils +import json +import logging +import yaml +from pathlib import Path + +with open(Path(__file__).parent / "_default_template_yaml", "r") as f: + raw_data = f.readlines() + safe_data = [] + for i, line in enumerate(raw_data): + # remove function definition since yaml load cannot handle it + if "!function" not in line: + safe_data.append(line) + + config = yaml.safe_load("".join(safe_data)) + +# A bit ugly here +# But the idea is that we will unzip all the zip files +# To HF HOME cache dir +# And load it here +HF_HOME = os.environ["HF_HOME"] +cache_dir = config["dataset_kwargs"]["cache_dir"] +cache_dir = os.path.join(HF_HOME, cache_dir) +cache_dir = os.path.join(cache_dir, "videos") + + +eval_logger = logging.getLogger("lmms-eval") + + +# Pass in video path here +# Can only work correctly with video llm +def worldqa_doc_to_visual(doc): + video_path = doc["video_idx"] + ".mp4" + video_path = os.path.join(cache_dir, video_path) + if os.path.exists(video_path): + video_path = video_path + elif os.path.exists(video_path.replace("mp4", "MP4")): + video_path = video_path.replace("mp4", "MP4") + else: + sys.exit(f"video path:{video_path} does not exist, please check") + return [video_path] + + +# This is the place where you format your question +def worldqa_doc_to_text(doc, model_specific_prompt_kwargs=None): + if model_specific_prompt_kwargs is None: + model_specific_prompt_kwargs = {} + pre_prompt = "" + post_prompt = "" + if "pre_prompt" in model_specific_prompt_kwargs: + pre_prompt = model_specific_prompt_kwargs["pre_prompt"] + if "post_prompt" in model_specific_prompt_kwargs: + post_prompt = model_specific_prompt_kwargs["post_prompt"] + + question = doc["question"] + if "option" in doc: + for op in doc["option"]: + question += "\n" + op + post_prompt = "\nAnswer with the option's letter from the given choices directly." 
+ + return f"{pre_prompt}{question}{post_prompt}" + + +def worldqa_doc_to_answer(doc): + return doc["answer"] + + +# For MC, keep the option letter for exact matching +def worldqa_doc_to_answer_mc(doc): + return doc["answer"].split(".")[0].strip() + + +# For MC PPL, keep the option string for perplexity-based matching +def worldqa_doc_to_answer_mc_ppl(doc): + return doc["answer"].split(".")[1].strip() + + +# An example of how to define a custom metric: +# the metric name must match the key used in the returned dict +def worldqa_process_results(doc, result): + pred = result[0] + return {"submission": {"pred": pred, "question_idx": doc["question_idx"], "object_description": doc["object_description"], "answer": doc["answer"]}} + + +def worldqa_aggregate_submissions(results, args, task): + now_date_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + submission_file_name = f"worldqa-{task}-{now_date_time}.json" + path = file_utils.generate_submission_file(submission_file_name, args) + with open(path, "w") as f: + json.dump(results, f) + eval_logger.info(f"Submission file saved to {path}") + + +# Thin wrappers that route each task variant to the shared submission aggregator +def worldqa_aggregate_gen(results, args): + worldqa_aggregate_submissions(results, args, "Generation") + + +def worldqa_aggregate_mc(results, args): + worldqa_aggregate_submissions(results, args, "MC") + + +def worldqa_aggregate_mc_ppl(results, args): + worldqa_aggregate_submissions(results, args, "MC_PPL") + + +def worldqa_doc_to_choice(doc): + return [op.split(".")[1].strip() for op in doc["option"]] diff --git a/lmms_eval/tasks/worldqa/worldqa_generation.yaml b/lmms_eval/tasks/worldqa/worldqa_generation.yaml new file mode 100755 index 00000000..68be5412 --- /dev/null +++ b/lmms_eval/tasks/worldqa/worldqa_generation.yaml @@ -0,0 +1,13 @@ +dataset_name: "Generation" +task: "worldqa_gen" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.worldqa_doc_to_visual +doc_to_text: !function utils.worldqa_doc_to_text +doc_to_target: !function utils.worldqa_doc_to_answer +process_results: !function utils.worldqa_process_results +metric_list: + - metric: submission + aggregation: !function utils.worldqa_aggregate_gen + higher_is_better: true +include: _default_template_yaml diff --git a/lmms_eval/tasks/worldqa/worldqa_mc.yaml b/lmms_eval/tasks/worldqa/worldqa_mc.yaml new file mode 100755 index 00000000..e9681ad1 --- /dev/null +++ b/lmms_eval/tasks/worldqa/worldqa_mc.yaml @@ -0,0 +1,11 @@ +dataset_name: "MC" +task: "worldqa_mc" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.worldqa_doc_to_visual +doc_to_text: !function utils.worldqa_doc_to_text +doc_to_target: !function utils.worldqa_doc_to_answer_mc +metric_list: + - metric: exact_match + higher_is_better: true +include: _default_template_yaml diff --git a/lmms_eval/tasks/worldqa/worldqa_mcppl.yaml b/lmms_eval/tasks/worldqa/worldqa_mcppl.yaml new file mode 100755 index 00000000..bf0f31f2 --- /dev/null +++ b/lmms_eval/tasks/worldqa/worldqa_mcppl.yaml @@ -0,0 +1,11 @@ +dataset_name: "MC_PPL" +task: "worldqa_mc_ppl" +test_split: test +output_type: multiple_choice +doc_to_visual: !function utils.worldqa_doc_to_visual +doc_to_text: "question" +doc_to_target: !function utils.worldqa_doc_to_answer_mc_ppl +doc_to_choice: !function utils.worldqa_doc_to_choice +metric_list: + - metric: acc +include: _default_template_yaml diff --git a/lmms_eval/utils.py b/lmms_eval/utils.py old mode 100644 new mode 100755 diff --git a/miscs/llava_result_check.md 
b/miscs/llava_result_check.md old mode 100644 new mode 100755 diff --git a/miscs/repr_scripts.sh b/miscs/repr_scripts.sh old mode 100644 new mode 100755 diff --git a/miscs/repr_torch_envs.txt b/miscs/repr_torch_envs.txt old mode 100644 new mode 100755 diff --git a/miscs/scienceqa_id.txt b/miscs/scienceqa_id.txt old mode 100644 new mode 100755 diff --git a/miscs/script.sh b/miscs/script.sh old mode 100644 new mode 100755 index b5d6cc6c..f0e63716 --- a/miscs/script.sh +++ b/miscs/script.sh @@ -18,4 +18,4 @@ gpu = 4 bs 1 use_flash_attention_2=True: -accelerate launch --num_processes=8 -m lmms_eval --model llava --model_args pretrained="liuhaotian/llava-v1.5-13b" --tasks scienceqa --batch_size 1 --log_samples --log_samples_sufix debug --output_path ./logs/ +accelerate launch --num_processes=8 --main_process_port 12345 -m lmms_eval --model qwen_vl --model_args pretrained="Qwen/Qwen-VL" --tasks mme --batch_size 1 --log_samples --log_samples_suffix debug --output_path ./logs/ diff --git a/miscs/test_llava.py b/miscs/test_llava.py old mode 100644 new mode 100755 diff --git a/miscs/test_scienceqa.py b/miscs/test_scienceqa.py old mode 100644 new mode 100755 diff --git a/pyproject.toml b/pyproject.toml old mode 100644 new mode 100755 index c50c4e76..29dac10f --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ classifiers = [ requires-python = ">=3.8" license = { text = "MIT" } dependencies = [ - "accelerate>=0.21.0", + "accelerate>=0.29.1", "black==24.1.0", "datasets==2.16.1", "evaluate>=0.4.0", @@ -34,11 +34,11 @@ dependencies = [ "sacrebleu>=1.5.0", "scikit-learn>=0.24.1", "sqlitedict", - "torch>=1.8", + "torch>=2.1.0", # to enable sdpa mode for running 34B model on one 80GB GPU "openai>=1.0.0", "pycocoevalcap", "tqdm-multiprocess", - "transformers", + "transformers>=4.37.2", "zstandard", "pillow", "pyyaml", @@ -64,7 +64,6 @@ lmms_eval = ["**/*.yaml", "tasks/**/*"] [project.scripts] lmms-eval = "lmms_eval.__main__:cli_evaluate" -lmms_eval = "lmms_eval.__main__:cli_evaluate" [project.urls] Homepage = "https://lmms-lab.github.io/lmms-eval-blog/" diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 diff --git a/tools/make_hf_dataset.ipynb b/tools/make_hf_dataset.ipynb old mode 100644 new mode 100755
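
For quick reference, below is a minimal standalone sketch of the matching approach used by the realworldqa MultiChoiceRegexFilter added above; the question and responses are invented for illustration and are not part of the patch.

import re

# Same pattern the filter uses to pull "A. choice text" pairs out of the question.
multiple_choices_regex = re.compile(r"\b([A-Z])\.\s+([^\n]*)")
question = "What is parked by the curb?\nA. a bicycle\nB. a motorcycle\nC. a truck"

# Map each extracted choice text to its letter, then build a fallback regex over the texts.
choice_to_alpha = {text.strip(): letter for letter, text in multiple_choices_regex.findall(question)}
fallback_regex = re.compile("|".join(re.escape(c) for c in choice_to_alpha))

for resp in ["I think it is a motorcycle.", "B", "There is a truck."]:
    cleaned = re.sub(r"[^\w\s]", "", resp).strip()  # strip punctuation, as the filter does
    match = fallback_regex.search(cleaned)
    print(choice_to_alpha[match.group()] if match and match.group() in choice_to_alpha else cleaned)
# Prints: B, B, C  (the bare "B" passes through unchanged because no choice text matches)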