diff --git a/lmms_eval/models/minimonkey.py b/lmms_eval/models/minimonkey.py index a4037ac9c..1ecd857c8 100644 --- a/lmms_eval/models/minimonkey.py +++ b/lmms_eval/models/minimonkey.py @@ -70,7 +70,7 @@ def __init__( self._rank = self.accelerator.local_process_index self._world_size = self.accelerator.num_processes else: - #self.model.to(self._device) + # self.model.to(self._device) self._rank = 0 self._word_size = 1 @@ -200,12 +200,9 @@ def _collate(x): pixel_values2 = load_image2(image, min_num=3, max_num=7, target_aspect_ratio=target_aspect_ratio) pixel_values = torch.cat([pixel_values2[:-1], pixel_values[:-1], pixel_values2[-1:]], 0).to(self._device).to(self.dtype) - response, history = self.model.chat(self.tokenizer, pixel_values, - target_aspect_ratio, prompt, gen_kwargs, - history=None, return_history=True) + response, history = self.model.chat(self.tokenizer, pixel_values, target_aspect_ratio, prompt, gen_kwargs, history=None, return_history=True) - context = [{"role": "user", "content": prompt}, - {"role": "assistant", "content": response}] + context = [{"role": "user", "content": prompt}, {"role": "assistant", "content": response}] except Exception as e: eval_logger.error(f"Error {e} in generating") cont = "" @@ -220,8 +217,8 @@ def _collate(x): import numpy as np -from PIL import Image import torchvision.transforms as T +from PIL import Image from torchvision.transforms.functional import InterpolationMode IMAGENET_MEAN = (0.485, 0.456, 0.406) @@ -230,17 +227,12 @@ def _collate(x): def build_transform(input_size): MEAN, STD = IMAGENET_MEAN, IMAGENET_STD - transform = T.Compose([ - T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), - T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), - T.ToTensor(), - T.Normalize(mean=MEAN, std=STD) - ]) + transform = T.Compose([T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img), T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), T.ToTensor(), T.Normalize(mean=MEAN, std=STD)]) return transform def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size): - best_ratio_diff = float('inf') + best_ratio_diff = float("inf") best_ratio = (1, 1) area = width * height for ratio in target_ratios: @@ -260,14 +252,11 @@ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbna aspect_ratio = orig_width / orig_height # calculate the existing image aspect ratio - target_ratios = set( - (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if - i * j <= max_num and i * j >= min_num) + target_ratios = set((i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if i * j <= max_num and i * j >= min_num) target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) # find the closest aspect ratio to the target - target_aspect_ratio = find_closest_aspect_ratio( - aspect_ratio, target_ratios, orig_width, orig_height, image_size) + target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height, image_size) # calculate the target width and height target_width = image_size * target_aspect_ratio[0] @@ -278,12 +267,7 @@ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbna resized_img = image.resize((target_width, target_height)) processed_images = [] for i in range(blocks): - box = ( - (i % (target_width // image_size)) * image_size, - (i // (target_width // image_size)) * image_size, - ((i % (target_width // image_size)) + 1) * image_size, - ((i // (target_width // image_size)) + 1) * image_size - ) + box = ((i % (target_width // image_size)) * image_size, (i // (target_width // image_size)) * image_size, ((i % (target_width // image_size)) + 1) * image_size, ((i // (target_width // image_size)) + 1) * image_size) # split the image split_img = resized_img.crop(box) processed_images.append(split_img) @@ -299,9 +283,7 @@ def dynamic_preprocess2(image, min_num=1, max_num=12, prior_aspect_ratio=None, i aspect_ratio = orig_width / orig_height # calculate the existing image aspect ratio - target_ratios = set( - (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if - i * j <= max_num and i * j >= min_num) + target_ratios = set((i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if i * j <= max_num and i * j >= min_num) target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) new_target_ratios = [] for i in target_ratios: @@ -310,8 +292,7 @@ def dynamic_preprocess2(image, min_num=1, max_num=12, prior_aspect_ratio=None, i else: continue # find the closest aspect ratio to the target - target_aspect_ratio = find_closest_aspect_ratio( - aspect_ratio, new_target_ratios, orig_width, orig_height, image_size) + target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, new_target_ratios, orig_width, orig_height, image_size) # calculate the target width and height target_width = image_size * target_aspect_ratio[0] target_height = image_size * target_aspect_ratio[1] @@ -321,12 +302,7 @@ def dynamic_preprocess2(image, min_num=1, max_num=12, prior_aspect_ratio=None, i resized_img = image.resize((target_width, target_height)) processed_images = [] for i in range(blocks): - box = ( - (i % (target_width // image_size)) * image_size, - (i // (target_width // image_size)) * image_size, - ((i % (target_width // image_size)) + 1) * image_size, - ((i // (target_width // image_size)) + 1) * image_size - ) + box = ((i % (target_width // image_size)) * image_size, (i // (target_width // image_size)) * image_size, ((i % (target_width // image_size)) + 1) * image_size, ((i // (target_width // image_size)) + 1) * image_size) # split the image split_img = resized_img.crop(box) processed_images.append(split_img) @@ -338,20 +314,18 @@ def dynamic_preprocess2(image, min_num=1, max_num=12, prior_aspect_ratio=None, i def load_image(image, input_size=448, min_num=1, max_num=12): - image = image.convert('RGB') + image = image.convert("RGB") transform = build_transform(input_size=input_size) - images, target_aspect_ratio = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, - min_num=min_num, max_num=max_num) + images, target_aspect_ratio = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, min_num=min_num, max_num=max_num) pixel_values = [transform(image) for image in images] pixel_values = torch.stack(pixel_values) return pixel_values, target_aspect_ratio def load_image2(image, input_size=448, min_num=1, max_num=12, target_aspect_ratio=None): - image = image.convert('RGB') + image = image.convert("RGB") transform = build_transform(input_size=input_size) - images = dynamic_preprocess2(image, image_size=input_size, use_thumbnail=True, min_num=min_num, - max_num=max_num, prior_aspect_ratio=target_aspect_ratio) + images = dynamic_preprocess2(image, image_size=input_size, use_thumbnail=True, min_num=min_num, max_num=max_num, prior_aspect_ratio=target_aspect_ratio) pixel_values = [transform(image) for image in images] pixel_values = torch.stack(pixel_values) return pixel_values