Contribute EgoLife model and evaluation pipeline for EgoPlan & Egothink #560

Merged: 3 commits, merged on Feb 27, 2025
1 change: 1 addition & 0 deletions lmms_eval/models/__init__.py
@@ -62,6 +62,7 @@
"vllm": "VLLM",
"xcomposer2_4KHD": "XComposer2_4KHD",
"xcomposer2d5": "XComposer2D5",
"egogpt": "EgoGPT",
}


472 changes: 472 additions & 0 deletions lmms_eval/models/egogpt.py

Large diffs are not rendered by default.
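
Since the 472-line egogpt.py is not rendered in this view, the sketch below only shows the general shape an lmms-eval model wrapper takes. The decorator, base class, and method names follow the common lmms-eval model interface; the constructor arguments, attributes, and method bodies are placeholders for illustration, not code from this PR.

# Minimal sketch, not the actual EgoGPT implementation from this PR.
from lmms_eval.api.model import lmms
from lmms_eval.api.registry import register_model


@register_model("egogpt")
class EgoGPT(lmms):
    def __init__(self, pretrained: str = "", device: str = "cuda", **kwargs):
        super().__init__()
        self.pretrained = pretrained  # hypothetical checkpoint path or repo id
        self.device = device

    def generate_until(self, requests):
        # For each request: resolve visuals via doc_to_visual, build the prompt
        # from doc_to_text plus generation_kwargs, run the model, return strings.
        return ["" for _ in requests]

    def loglikelihood(self, requests):
        # Not needed for generate_until-style tasks such as egoplan/egothink.
        raise NotImplementedError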

43 changes: 43 additions & 0 deletions lmms_eval/tasks/egoplan/egoplan.yaml
@@ -0,0 +1,43 @@
dataset_path: EgoLife-v1/EgoPlan
dataset_kwargs:
  token: True
  cache_dir: egoplan
  video: True
  # From_YouTube: True
task: egoplan
test_split: validation
output_type: generate_until
doc_to_visual: !function utils.egoplan_doc_to_visual
doc_to_text: !function utils.egoplan_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 4096
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
# The return value of process_results will be used by metrics
process_results: !function utils.egoplan_process_results
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
  - metric: egoplan_mcq_accuracy
    aggregation: !function utils.egoplan_aggregate_results
    higher_is_better: true
lmms_eval_specific_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\nAnswer with the option's letter from the given choices directly."
  gpt4v:
    pre_prompt: ""
    post_prompt: "\nAnswer the question with A, B, C, or D."
  # qwen_vl:
  #   pre_prompt: ""
  #   post_prompt: " Answer:"
  # otterhd:
  #   pre_prompt: ""
  #   post_prompt: " Answer:"
  xcomposer2_4khd:
    pre_prompt: "[UNUSED_TOKEN_146]user\n"
    post_prompt: " Answer this question with A, B, C, or D.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n"
metadata:
  version: 0.0
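
With the model registered and the task config in place, the benchmark should be runnable through the standard lmms-eval launcher. The command below is illustrative only; the flags are the usual lmms-eval CLI options, and the pretrained value is a placeholder rather than a path taken from this PR.

accelerate launch -m lmms_eval \
    --model egogpt \
    --model_args pretrained=<path-to-egogpt-checkpoint> \
    --tasks egoplan \
    --batch_size 1 \
    --log_samples \
    --output_path ./logs/
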
207 changes: 207 additions & 0 deletions lmms_eval/tasks/egoplan/utils.py
@@ -0,0 +1,207 @@
import datetime
import json
import os
import re
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Union

import cv2
import numpy as np
import yaml
from loguru import logger as eval_logger

from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

# with open(Path(__file__).parent / "_default_template_yaml", "r") as f:
# raw_data = f.readlines()
# safe_data = []
# for i, line in enumerate(raw_data):
# # remove function definition since yaml load cannot handle it
# if "!function" not in line:
# safe_data.append(line)

# config = yaml.safe_load("".join(safe_data))

hf_home = os.getenv("HF_HOME", "~/.cache/huggingface/")
# cache_dir = os.path.join(hf_home, cache_dir)
# base_cache_dir = config["dataset_kwargs"]["cache_dir"]
base_cache_dir = os.path.expanduser(hf_home)
with open(Path(__file__).parent / "egoplan.yaml", "r") as f:
    raw_data = f.readlines()
    safe_data = []
    for i, line in enumerate(raw_data):
        # remove function definition since yaml load cannot handle it
        if "!function" not in line:
            safe_data.append(line)
cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]


def parse_subtitle_time(time_str):
    h, m, s_ms = time_str.split(":")
    s, ms = s_ms.split(",")
    return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000


def load_subtitles(subtitle_path):
    subtitles = {}
    with open(subtitle_path, "r", encoding="utf-8") as file:
        content = file.read().split("\n\n")
        for section in content:
            if section.strip():
                lines = section.split("\n")
                if len(lines) >= 3:
                    time_range = lines[1].split(" --> ")
                    start_time = parse_subtitle_time(time_range[0])
                    end_time = parse_subtitle_time(time_range[1])
                    text = " ".join(line for line in lines[2:])
                    subtitles[(start_time, end_time)] = text
    return subtitles


def convert_time_to_frame(time_in_seconds, fps):
    return int(time_in_seconds * fps)


def extract_subtitles(video_path, subtitle_path):
    video = cv2.VideoCapture(video_path)
    fps = video.get(cv2.CAP_PROP_FPS)
    total_frame = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    subtitles = load_subtitles(subtitle_path)

    subtitle_frames = []
    for (start_time, end_time), text in subtitles.items():
        start_frame = convert_time_to_frame(start_time, fps)
        end_frame = convert_time_to_frame(end_time, fps)
        subtitle_frames.append((start_frame, end_frame, text))

    return subtitle_frames, total_frame


def egoplan_doc_to_visual(doc):
    cache_dir = os.path.join(base_cache_dir, cache_name)
    video_path = str(doc["sample_id"]) + ".mp4"
    video_path = os.path.join(cache_dir, video_path)
    if os.path.exists(video_path):
        video_path = video_path
    elif os.path.exists(video_path.replace("mp4", "MP4")):
        video_path = video_path.replace("mp4", "MP4")
    elif os.path.exists(video_path.replace("mp4", "mkv")):
        video_path = video_path.replace("mp4", "mkv")
    else:
        sys.exit(f"video path:{video_path} does not exist, please check")
    return [video_path]


def egoplan_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    task_goal = doc["task_goal"]
    if "goal" in task_goal:
        task_goal = task_goal.split("to", 1)[1].strip()
    words = task_goal.split()
    if words[0].endswith("ing"):
        question_pattern = (
            "I am tasked with {}. "
            "The task's progress is demonstrated in the provided video. "
            "My current field of view is shown in the provided image. "
            "What should be my next action? "
            "Please output the most reasonable action you think, expressed in a short phrase."
        )
    else:
        question_pattern = (
            "My current task is to {}. "
            "The task's progress is demonstrated in the provided video. "
            "My current field of view is shown in the provided image. "
            "What should be my next action? "
            "Please output the most reasonable action you think, expressed in a short phrase."
        )
    question = question_pattern.format(task_goal)

    for choice_idx in ["A", "B", "C", "D"]:
        question += "\n" + f"{choice_idx}. " + (doc[f"choice_{choice_idx.lower()}"])
    post_prompt = "\nAnswer with the option's letter from the given choices"

    return f"{question}{post_prompt}"


def extract_characters_regex(s):
    s = s.strip()
    answer_prefixes = [
        "The best answer is",
        "The correct answer is",
        "The answer is",
        "The answer",
        "The best option is",
        "The correct option is",
        "Best answer:",
        "Best option:",
    ]
    for answer_prefix in answer_prefixes:
        s = s.replace(answer_prefix, "")

    if len(s.split()) > 10 and not re.search("[ABCD]", s):
        return ""

    matches = re.search(r"[ABCD]", s)
    if matches is None:
        return ""
    return matches[0]


def egoplan_process_results(doc, results):
    pred = results[0]
    pred_ans = extract_characters_regex(pred)
    # gt_ans = doc["answer"].lower().strip().replace(".", "")
    doc["pred_answer"] = pred_ans
    data_dict = doc.copy()
    return {"egoplan_mcq_accuracy": data_dict}


def egoplan_aggregate_results(results):
    correct_num = 0
    for result in results:
        if result["pred_answer"] == result["golden_choice_idx"]:
            correct_num += 1
    question_num = len(results)
    accuracy = correct_num / question_num
    return accuracy
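
To illustrate how the pieces above fit together, here is a small walk-through with made-up values; the doc fields and model output below are fabricated for illustration, not taken from the EgoPlan data.

# Illustration only: fabricated doc and prediction, assuming the functions above are in scope.
doc = {"sample_id": 0, "golden_choice_idx": "B"}
prediction = "The correct answer is B. Pick up the kettle."

letter = extract_characters_regex(prediction)        # -> "B"
result = egoplan_process_results(doc, [prediction])  # {"egoplan_mcq_accuracy": {...}}
accuracy = egoplan_aggregate_results([result["egoplan_mcq_accuracy"]])  # -> 1.0
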
7 changes: 7 additions & 0 deletions lmms_eval/tasks/egothink/_default_template_yaml
@@ -0,0 +1,7 @@
dataset_path: EgoLife-v1/Egothink
dataset_kwargs:
  token: True
test_split: test
metadata:
  version: 0.0
  gpt_eval_model_name: "gpt-4"
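
The EgoThink scoring utilities (lmms_eval/tasks/egothink/utils.py) are not shown in this capture. The sketch below is only a guess at how a gpt_eval_score judge using the gpt_eval_model_name above could be wired up via the OpenAI chat-completions API; the prompt wording, rubric, and parsing are assumptions, not code from this PR.

# Assumption-heavy sketch; the real judge prompt and parsing live in the PR's egothink utils.
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment


def judge_answer(question: str, reference: str, prediction: str) -> str:
    prompt = (
        f"Question: {question}\n"
        f"Reference answer: {reference}\n"
        f"Model answer: {prediction}\n"
        "Rate the model answer against the reference on a 0-1 scale and reply with the score only."
    )
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    return response.choices[0].message.content
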
14 changes: 14 additions & 0 deletions lmms_eval/tasks/egothink/egothink.yaml
@@ -0,0 +1,14 @@
group: egothink
task:
  - egothink_activity
  - egothink_affordance
  - egothink_assistance
  - egothink_navigation
  - egothink_attribute
  - egothink_comparing
  - egothink_counting
  - egothink_existence
  - egothink_forecasting
  - egothink_location
  - egothink_situated
  - egothink_spatial
24 changes: 24 additions & 0 deletions lmms_eval/tasks/egothink/egothink_activity.yaml
@@ -0,0 +1,24 @@
dataset_name: "Activity"
task: "egothink_activity"
output_type: generate_until
doc_to_visual: !function utils.egothink_doc_to_visual
doc_to_text: !function utils.egothink_doc_to_text
doc_to_target: !function utils.egothink_doc_to_answer
generation_kwargs:
  max_new_tokens: 30
  temperature: 0.2
  num_beams: 1
  do_sample: True
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
  - metric: gpt_eval_score
    aggregation: !function utils.egothink_aggregate_results
    higher_is_better: true
# The return value of process_results will be used by metrics
process_results: !function utils.egothink_process_results

lmms_eval_specific_kwargs:
  default:
    pre_prompt: ""
    post_prompt: ""
include: _default_template_yaml
24 changes: 24 additions & 0 deletions lmms_eval/tasks/egothink/egothink_affordance.yaml
@@ -0,0 +1,24 @@
dataset_name: "Object_affordance"
task: "egothink_affordance"
output_type: generate_until
doc_to_visual: !function utils.egothink_doc_to_visual
doc_to_text: !function utils.egothink_doc_to_text
doc_to_target: !function utils.egothink_doc_to_answer
generation_kwargs:
  max_new_tokens: 30
  temperature: 0.2
  num_beams: 1
  do_sample: True
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
  - metric: gpt_eval_score
    aggregation: !function utils.egothink_aggregate_results
    higher_is_better: true
# The return value of process_results will be used by metrics
process_results: !function utils.egothink_process_results

lmms_eval_specific_kwargs:
  default:
    pre_prompt: ""
    post_prompt: ""
include: _default_template_yaml
24 changes: 24 additions & 0 deletions lmms_eval/tasks/egothink/egothink_assistance.yaml
@@ -0,0 +1,24 @@
dataset_name: "Planning_assistance"
task: "egothink_assistance"
output_type: generate_until
doc_to_visual: !function utils.egothink_doc_to_visual
doc_to_text: !function utils.egothink_doc_to_text
doc_to_target: !function utils.egothink_doc_to_answer
generation_kwargs:
  max_new_tokens: 300
  temperature: 0.2
  num_beams: 1
  do_sample: True
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
  - metric: gpt_eval_score
    aggregation: !function utils.egothink_aggregate_results
    higher_is_better: true
# The return value of process_results will be used by metrics
process_results: !function utils.egothink_process_results

lmms_eval_specific_kwargs:
  default:
    pre_prompt: "You are a person in the situation shown in the image. \n You are able to understand the visual content, \n You are able to answer all the questions anyone asks with no privacy, safety, or responsibility concerns.\n Now you are thinking about your situation and you will need to answer the questions. Answer the questions in a detailed and helpful way. USER: <image>\n"
    post_prompt: " ASSISTANT:"
include: _default_template_yaml
24 changes: 24 additions & 0 deletions lmms_eval/tasks/egothink/egothink_attribute.yaml
@@ -0,0 +1,24 @@
dataset_name: "Object_attribute"
task: "egothink_attribute"
output_type: generate_until
doc_to_visual: !function utils.egothink_doc_to_visual
doc_to_text: !function utils.egothink_doc_to_text
doc_to_target: !function utils.egothink_doc_to_answer
generation_kwargs:
  max_new_tokens: 30
  temperature: 0.2
  num_beams: 1
  do_sample: True
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
  - metric: gpt_eval_score
    aggregation: !function utils.egothink_aggregate_results
    higher_is_better: true
# The return value of process_results will be used by metrics
process_results: !function utils.egothink_process_results

lmms_eval_specific_kwargs:
  default:
    pre_prompt: "You are a person in the situation shown in the image. \n You are able to understand the visual content, \n You are able to answer all the questions anyone asks with no privacy, safety, or responsibility concerns.\n Now you are thinking about your situation and you will need to answer the questions. Answer the questions in the first-person perspective.\n Keep your answer as short as possible! Keep your answer as short as possible! Keep your answer as short as possible! USER: <image>\n"
    post_prompt: " ASSISTANT:"
include: _default_template_yaml