[Model] add vllm compatible models (#544)
* Add VLLM model integration and update configurations
  - Introduce the VLLM model in the model registry.
  - Update AVAILABLE_MODELS to include new models:
    - models/__init__.py: Added "aria", "internvideo2", "llama_vision", "oryx", "ross", "slime", "videochat2", "vllm", "xcomposer2_4KHD", "xcomposer2d5".
  - Create vllm.py for the VLLM model implementation:
    - Implemented encoding for images and videos.
    - Added methods for generating responses and handling multi-round generation.
  - Update MMMU tasks with new prompt formats and evaluation metrics:
    - mmmu_val.yaml: Added specific kwargs for prompt types.
    - mmmu_val_reasoning.yaml: Enhanced prompts for reasoning tasks.
    - utils.py: Adjusted evaluation rules and scoring for predictions.
  - Added a script for easy model execution:
    - vllm_qwen2vl.sh: Script to run VLLM with the specified parameters.

* Set environment variables for the VLLM script
  - Configure the environment for better performance and debugging.
  - Added variables to control multiprocessing and NCCL behavior.
  - miscs/vllm_qwen2vl.sh:
    - Set `VLLM_WORKER_MULTIPROC_METHOD` to `spawn` for compatibility.
    - Enabled `NCCL_BLOCKING_WAIT` to avoid hangs.
    - Increased `NCCL_TIMEOUT` to 18000000 for long-running processes.
    - Set `NCCL_DEBUG` to `DEBUG` for detailed logs.

* Rename scripts and update paths
  - Renamed representation scripts for clarity:
    - miscs/repr_scripts.sh -> miscs/model_dryruns/llava_1_5.sh
    - miscs/cicd_qwen2vl.sh -> miscs/model_dryruns/qwen2vl.sh
    - miscs/tinyllava_repr_scripts.sh -> miscs/model_dryruns/tinyllava.sh
    - miscs/vllm_qwen2vl.sh -> miscs/model_dryruns/vllm_qwen2vl.sh
  - Updated parameters in the vllm_qwen2vl.sh script:
    - miscs/model_dryruns/vllm_qwen2vl.sh: Added `--limit=64` to the output path command.

* Optimize image handling in the VLLM model
  - Simplify image conversion in the `to_base64` method:
    - vllm.py: Directly convert the input image to RGB format instead of copying it.
  - Remove unnecessary base64 encoding for images:
    - vllm.py: Return the PIL image directly instead of converting it to base64.
  - Update video frame processing to return PIL images:
    - vllm.py: Replace base64 encoding of frames with returning the PIL frames directly.

* Revert "Optimize image handling in VLLM model"
  - This reverts commit 469e1fc.

* Use threads to encode visuals

---------

Co-authored-by: kcz358 <[email protected]>
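For reference, the NCCL and multiprocessing settings listed above can be reproduced programmatically. This is a minimal sketch in Python rather than the commit's actual shell script (miscs/model_dryruns/vllm_qwen2vl.sh); the `"1"` used to enable `NCCL_BLOCKING_WAIT` is an assumed value, since the commit message only says the flag was enabled:

```python
import os

# Mirror the environment configuration described in the commit message (sketch only;
# the shell script in the commit is the authoritative source for these settings).
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"  # spawn workers for compatibility
os.environ["NCCL_BLOCKING_WAIT"] = "1"                # assumed value; "enabled" per the commit message
os.environ["NCCL_TIMEOUT"] = "18000000"               # long timeout for long-running jobs
os.environ["NCCL_DEBUG"] = "DEBUG"                    # verbose NCCL logging
```

Exporting these in the launching shell, as the dry-run script does, is the safer choice, since NCCL reads them when the distributed backend initializes.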
Showing 9 changed files with 266 additions and 28 deletions.
vllm.py (new file)
@@ -0,0 +1,194 @@
import asyncio
import base64
import json
import os
import time
from concurrent.futures import ThreadPoolExecutor
from copy import deepcopy
from io import BytesIO
from multiprocessing import cpu_count
from typing import List, Optional, Tuple, Union

import numpy as np
from accelerate import Accelerator, DistributedType
from decord import VideoReader, cpu
from loguru import logger as eval_logger
from PIL import Image
from tqdm import tqdm

from lmms_eval.api.instance import Instance
from lmms_eval.api.model import lmms
from lmms_eval.api.registry import register_model

NUM_SECONDS_TO_SLEEP = 5

try:
    from vllm import LLM, SamplingParams
except ImportError:
    # vllm is an optional dependency; keep placeholders so this module can still be
    # imported, and report a clear error only when the model is actually instantiated.
    LLM = None
    SamplingParams = None


@register_model("vllm")
class VLLM(lmms):
    def __init__(
        self,
        model_version: str = "Qwen/Qwen2.5-VL-3B-Instruct",
        tensor_parallel_size: int = 1,
        gpu_memory_utilization: float = 0.8,
        batch_size: int = 1,
        timeout: int = 60,
        max_images: int = 32,
        max_videos: int = 8,
        max_audios: int = 8,
        max_frame_num: int = 32,
        threads: int = 16,  # Threads to use for decoding visuals
        trust_remote_code: Optional[bool] = True,
        **kwargs,
    ) -> None:
        super().__init__()
        if LLM is None:
            raise ImportError("vllm is not installed; install it to use the `vllm` model (e.g. `pip install vllm`).")
        self.model_version = model_version
        self.max_images = max_images
        self.max_frame_num = max_frame_num
        self.threads = threads

        accelerator = Accelerator()
        self.client = LLM(
            model=self.model_version,
            tensor_parallel_size=tensor_parallel_size,
            gpu_memory_utilization=gpu_memory_utilization,
            limit_mm_per_prompt={"image": max_images, "video": max_videos, "audio": max_audios},
            trust_remote_code=trust_remote_code,
        )
        if accelerator.num_processes > 1:
            assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported."
            self.accelerator = accelerator
            if self.accelerator.is_local_main_process:
                eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism")
            self._rank = self.accelerator.local_process_index
            self._world_size = self.accelerator.num_processes
        else:
            self.accelerator = accelerator
            self._rank = self.accelerator.local_process_index
            self._world_size = self.accelerator.num_processes

        self.device = self.accelerator.device
        self.batch_size_per_gpu = int(batch_size)

    # Function to encode the image
    def encode_image(self, image: Union[Image.Image, str]):
        if isinstance(image, str):
            img = Image.open(image).convert("RGB")
        else:
            img = image.copy()

        output_buffer = BytesIO()
        img.save(output_buffer, format="PNG")
        byte_data = output_buffer.getvalue()

        base64_str = base64.b64encode(byte_data).decode("utf-8")
        return base64_str

    # Function to encode the video
    def encode_video(self, video_path):
        vr = VideoReader(video_path, ctx=cpu(0))
        total_frame_num = len(vr)
        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, self.max_frame_num, dtype=int)

        # Ensure the last frame is included
        if total_frame_num - 1 not in uniform_sampled_frames:
            uniform_sampled_frames = np.append(uniform_sampled_frames, total_frame_num - 1)

        frame_idx = uniform_sampled_frames.tolist()
        frames = vr.get_batch(frame_idx).asnumpy()

        base64_frames = []
        for frame in frames:
            img = Image.fromarray(frame)
            output_buffer = BytesIO()
            img.save(output_buffer, format="PNG")
            byte_data = output_buffer.getvalue()
            base64_str = base64.b64encode(byte_data).decode("utf-8")
            base64_frames.append(base64_str)

        return base64_frames

    def flatten(self, input):
        new_list = []
        for i in input:
            for j in i:
                new_list.append(j)
        return new_list

    def generate_until(self, requests) -> List[str]:
        res = []
        pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")

        batch_size = self.batch_size_per_gpu
        batched_requests = [requests[i : i + batch_size] for i in range(0, len(requests), batch_size)]
        for batch_requests in batched_requests:
            batched_messages = []
            for idx in range(len(batch_requests)):
                contexts, gen_kwargs, doc_to_visual, doc_id, task, split = batch_requests[idx].arguments
                if "max_new_tokens" not in gen_kwargs:
                    gen_kwargs["max_new_tokens"] = 1024
                if gen_kwargs["max_new_tokens"] > 4096:
                    gen_kwargs["max_new_tokens"] = 4096
                if "temperature" not in gen_kwargs:
                    gen_kwargs["temperature"] = 0
                if "top_p" not in gen_kwargs:
                    gen_kwargs["top_p"] = 0.95

                params = {
                    "temperature": gen_kwargs["temperature"],
                    "max_tokens": gen_kwargs["max_new_tokens"],
                    "top_p": gen_kwargs["top_p"],
                }
                sampling_params = SamplingParams(**params)

                visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
                if None in visuals:
                    visuals = []
                    imgs = []
                else:
                    visuals = self.flatten(visuals)
                    imgs = []  # multiple images or frames for video
                    all_tasks = []
                    # Encode visuals in parallel threads; videos are sampled into frames.
                    with ThreadPoolExecutor(max_workers=self.threads) as executor:
                        for visual in visuals:
                            if isinstance(visual, str) and (".mp4" in visual or ".avi" in visual or ".mov" in visual or ".flv" in visual or ".wmv" in visual):
                                all_tasks.append(executor.submit(self.encode_video, visual))
                            elif isinstance(visual, str) and (".jpg" in visual or ".jpeg" in visual or ".png" in visual or ".gif" in visual or ".bmp" in visual or ".tiff" in visual or ".webp" in visual):
                                all_tasks.append(executor.submit(self.encode_image, visual))
                            elif isinstance(visual, Image.Image):
                                all_tasks.append(executor.submit(self.encode_image, visual))

                    for future in all_tasks:
                        result = future.result()
                        # encode_video returns a list of frame strings; flatten it into imgs
                        if isinstance(result, list):
                            imgs.extend(result)
                        else:
                            imgs.append(result)

                messages = [{"role": "user", "content": []}]
                # When there is no image token in the context, append the image to the text
                messages[0]["content"].append({"type": "text", "text": contexts})
                for img in imgs:
                    messages[0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}})

                batched_messages.append(messages)

            # The sampling params from the last request in the batch are applied to the whole batch.
            response = self.client.chat(sampling_params=sampling_params, messages=batched_messages)
            response_text = [o.outputs[0].text for o in response]

            assert len(response_text) == len(batch_requests)
            res.extend(response_text)
            pbar.update(len(batch_requests))

        pbar.close()
        return res

    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
        # TODO
        raise NotImplementedError("Loglikelihood is not implemented for the vllm model")

    def generate_until_multi_round(self, requests) -> List[str]:
        raise NotImplementedError("TODO: Implement multi-round generation")
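For orientation, the `generate_until` path above boils down to vLLM's offline chat API with data-URL images. The following standalone sketch shows that call pattern; it is not part of the commit, the prompt text and `example.png` path are placeholder assumptions, and the model name is simply the wrapper's default `model_version`:

```python
import base64

from vllm import LLM, SamplingParams

# Placeholder model and image; any vision-language checkpoint supported by vLLM would do.
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", limit_mm_per_prompt={"image": 32})
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1024)

with open("example.png", "rb") as f:
    img_b64 = base64.b64encode(f.read()).decode("utf-8")

# Same message layout the wrapper builds: the text part first, then data-URL image parts.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}},
        ],
    }
]

outputs = llm.chat(messages=messages, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
```

The wrapper batches one such conversation per request and reads `outputs[i].outputs[0].text` for each, exactly as `generate_until` does above.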
File renamed without changes.
File renamed without changes.
File renamed without changes.