Commit 59a640c

Update API usage and add environment configuration
- Adjust API client initialization for OpenAI and Azure.
- Modify image encoding to handle size limits and maintain aspect ratio.
- Add new `OpenAICompatible` model class.
- Introduce retry mechanism for API requests with configurable retries.
- Update `.gitignore` to exclude `.env` and `scripts/`.

.gitignore:
- Exclude the `.env` file so API keys and other secrets stay out of version control (a sample `.env` is sketched below).
- Ensure the `scripts/` directory is not tracked.
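
For reference, a minimal `.env` covering the variables read in `lmms_eval/models/gpt4v.py` below might look like the following; the values are placeholders, and only the keys matching your `API_TYPE` are needed:

API_TYPE=openai
OPENAI_API_KEY=sk-your-key-here
# For Azure deployments instead:
# API_TYPE=azure
# AZURE_API_KEY=your-azure-key
# AZURE_ENDPOINT=https://your-resource.openai.azure.com/
# AZURE_API_VERSION=2023-07-01-preview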

lmms_eval/models/gpt4v.py:
- Refactor API client initialization to use the `OpenAI` and `AzureOpenAI` client classes from the `openai` SDK.
- Update image encoding to handle size limits with resizing logic.
- Adjust retry logic for API calls, reducing sleep time.

lmms_eval/models/openai_compatible.py:
- Create new `OpenAICompatible` model class mirroring the structure of the `gpt4v` model (a rough skeleton is sketched below).
- Implement encoding functions for images and videos.
- Integrate environment variable loading and persistent response caching.
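
Since the diff for this file is not rendered below, the following is a rough, hypothetical skeleton of how the class plausibly fits together, modeled on the `gpt4v.py` changes in this commit; the environment variable names, constructor parameters, and cache layout are assumptions, not the actual file contents:

# Hypothetical sketch only -- the real lmms_eval/models/openai_compatible.py may differ.
import json
import os
from typing import List

from dotenv import load_dotenv
from openai import OpenAI

from lmms_eval.api.model import lmms
from lmms_eval.api.registry import register_model

load_dotenv()  # pull credentials from the newly git-ignored .env file


@register_model("openai_compatible")
class OpenAICompatible(lmms):
    def __init__(
        self,
        model_version: str = "gpt-4o-mini",
        timeout: int = 120,
        continual_mode: bool = False,
        response_persistent_folder: str = None,
        **kwargs,
    ) -> None:
        super().__init__()
        self.model_version = model_version
        self.timeout = timeout
        # A base_url override lets the same class talk to any OpenAI-compatible endpoint.
        self.client = OpenAI(
            api_key=os.getenv("OPENAI_API_KEY"),
            base_url=os.getenv("OPENAI_API_BASE", "https://api.openai.com/v1"),
        )
        # Persistent response cache so interrupted runs can resume without re-querying the API.
        self.continual_mode = continual_mode
        self.response_cache = {}
        if self.continual_mode and response_persistent_folder is not None:
            os.makedirs(response_persistent_folder, exist_ok=True)
            self.response_persistent_file = os.path.join(
                response_persistent_folder, f"{self.model_version}_response.json"
            )
            if os.path.isfile(self.response_persistent_file):
                with open(self.response_persistent_file) as f:
                    self.response_cache = json.load(f)

    # encode_image / encode_video helpers would mirror the resizing and
    # frame-sampling logic shown in gpt4v.py below.

    def generate_until(self, requests) -> List[str]:
        raise NotImplementedError("Sketch only; see the actual file in the repository.")

    def loglikelihood(self, requests):
        raise NotImplementedError("Sketch only; see the actual file in the repository.")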

miscs/model_dryruns/openai_compatible.sh:
- Add sample script for running the new model (an illustrative invocation is sketched below).
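
The script itself is not rendered on this page; a typical lmms-eval dry run might look like the following, where the model name, task, and `model_args` values are illustrative guesses rather than the script's actual contents:

python3 -m lmms_eval \
    --model openai_compatible \
    --model_args model_version=gpt-4o-mini \
    --tasks mme \
    --batch_size 1 \
    --log_samples \
    --output_path ./logs/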
Luodian committed Feb 19, 2025
1 parent 1fda05b commit 59a640c
Showing 4 changed files with 305 additions and 60 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -42,4 +42,5 @@ VATEX/
lmms_eval/tasks/vatex/__pycache__/utils.cpython-310.pyc
lmms_eval/tasks/mlvu/__pycache__/utils.cpython-310.pyc

scripts/
scripts/
.env
124 changes: 65 additions & 59 deletions lmms_eval/models/gpt4v.py
@@ -4,11 +4,12 @@
import time
from copy import deepcopy
from io import BytesIO
from typing import List, Tuple
from typing import List, Tuple, Union

import numpy as np
import requests as url_requests
from accelerate import Accelerator, DistributedType
from openai import AzureOpenAI, OpenAI
from tqdm import tqdm

from lmms_eval.api.instance import Instance
@@ -20,26 +21,19 @@
except ImportError:
pass

from loguru import logger as eval_logger
from PIL import Image

API_TYPE = os.getenv("API_TYPE", "openai")
NUM_SECONDS_TO_SLEEP = 30
from loguru import logger as eval_logger

NUM_SECONDS_TO_SLEEP = 10
if API_TYPE == "openai":
API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
headers = {
"Authorization": f"Bearer {API_KEY}",
"Content-Type": "application/json",
}

elif API_TYPE == "azure":
API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
headers = {
"api-key": API_KEY,
"Content-Type": "application/json",
}
API_VERSION = os.getenv("AZURE_API_VERSION", "2023-07-01-preview")


@register_model("gpt4v")
@@ -52,6 +46,7 @@ def __init__(
timeout: int = 120,
continual_mode: bool = False,
response_persistent_folder: str = None,
max_size_in_mb: int = 20,
**kwargs,
) -> None:
super().__init__()
@@ -80,6 +75,11 @@ def __init__(
self.response_cache = {}
self.cache_mode = "start"

if API_TYPE == "openai":
self.client = OpenAI(api_key=API_KEY)
elif API_TYPE == "azure":
self.client = AzureOpenAI(api_key=API_KEY, azure_endpoint=API_URL, api_version=API_VERSION)

accelerator = Accelerator()
# assert self.batch_size_per_gpu == 1, "Llava currently does not support batched generation. See https://github.com/haotian-liu/LLaVA/issues/754. HF Llava also has this issue."
if accelerator.num_processes > 1:
@@ -94,13 +94,30 @@ def __init__(
self._rank = self.accelerator.local_process_index
self._world_size = self.accelerator.num_processes

self.max_size_in_mb = max_size_in_mb
self.device = self.accelerator.device

# Function to encode the image
def encode_image(self, image: Image):
def encode_image(self, image: Union[Image.Image, str]):
max_size = self.max_size_in_mb * 1024 * 1024  # convert the configurable limit (default 20 MB) to bytes
if isinstance(image, str):
img = Image.open(image).convert("RGB")
else:
img = image.copy()

output_buffer = BytesIO()
image.save(output_buffer, format="PNG")
img.save(output_buffer, format="PNG")
byte_data = output_buffer.getvalue()

# If image is too large, resize it while maintaining aspect ratio
while len(byte_data) > max_size and img.size[0] > 100 and img.size[1] > 100:
new_size = (int(img.size[0] * 0.75), int(img.size[1] * 0.75))
img = img.resize(new_size, Image.Resampling.LANCZOS)

output_buffer = BytesIO()
img.save(output_buffer, format="PNG")
byte_data = output_buffer.getvalue()

base64_str = base64.b64encode(byte_data).decode("utf-8")
return base64_str

@@ -150,39 +167,30 @@ def generate_until(self, requests) -> List[str]:
continue

visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
visuals = self.flatten(visuals)
imgs = [] # multiple images or frames for video
for visual in visuals:
if self.modality == "image":
img = self.encode_image(visual)
imgs.append(img)
elif self.modality == "video":
frames = self.encode_video(visual, self.max_frames_num)
imgs.extend(frames)
if None in visuals:
visuals = []
imgs = []
else:
visuals = self.flatten(visuals)
imgs = [] # multiple images or frames for video
for visual in visuals:
if isinstance(visual, str) and (".mp4" in visual or ".avi" in visual or ".mov" in visual or ".flv" in visual or ".wmv" in visual):
frames = self.encode_video(visual, self.max_frames_num)
imgs.extend(frames)
elif isinstance(visual, str) and (".jpg" in visual or ".jpeg" in visual or ".png" in visual or ".gif" in visual or ".bmp" in visual or ".tiff" in visual or ".webp" in visual):
img = self.encode_image(visual)
imgs.append(img)
elif isinstance(visual, Image.Image):
img = self.encode_image(visual)
imgs.append(img)

payload = {"messages": []}
if API_TYPE == "openai":
payload["model"] = self.model_version

response_json = {"role": "user", "content": []}
# When there is no image token in the context, append the image to the text
if self.image_token not in contexts:
payload["messages"].append(deepcopy(response_json))
payload["messages"][0]["content"].append({"type": "text", "text": contexts})
for img in imgs:
payload["messages"][0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}})
else:
contexts = contexts.split(self.image_token)
for idx, img in enumerate(imgs):
payload["messages"].append(deepcopy(response_json))
payload["messages"][idx]["content"].append({"type": "text", "text": contexts[idx]})
payload["messages"][idx]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}})

# If n image tokens are in the contexts
# contexts will be splitted into n+1 chunks
# Manually add it into the payload
payload["messages"].append(deepcopy(response_json))
payload["messages"][-1]["content"].append({"type": "text", "text": contexts[-1]})
payload["model"] = self.model_version

payload["messages"].append({"role": "user", "content": []})
payload["messages"][0]["content"].append({"type": "text", "text": contexts})
for img in imgs:
payload["messages"][0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}})

if "max_new_tokens" not in gen_kwargs:
gen_kwargs["max_new_tokens"] = 1024
@@ -198,26 +206,24 @@ def generate_until(self, requests) -> List[str]:
payload["max_tokens"] = gen_kwargs["max_new_tokens"]
payload["temperature"] = gen_kwargs["temperature"]

for attempt in range(5):
MAX_RETRIES = 5
for attempt in range(MAX_RETRIES):
try:
response = url_requests.post(API_URL, headers=headers, json=payload, timeout=self.timeout)
response_data = response.json()

response_text = response_data["choices"][0]["message"]["content"].strip()
response = self.client.chat.completions.create(**payload)
response_text = response.choices[0].message.content
break # If successful, break out of the loop

except Exception as e:
try:
error_msg = response.json()
except:
error_msg = ""
error_msg = str(e)
eval_logger.info(f"Attempt {attempt + 1}/{MAX_RETRIES} failed with error: {error_msg}")

eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}.\nReponse: {error_msg}")
if attempt <= 5:
time.sleep(NUM_SECONDS_TO_SLEEP)
else: # If this was the last attempt, log and return empty string
eval_logger.error(f"All 5 attempts failed. Last error message: {str(e)}.\nResponse: {response.json()}")
# On last attempt, log error and set empty response
if attempt == MAX_RETRIES - 1:
eval_logger.error(f"All {MAX_RETRIES} attempts failed. Last error: {error_msg}")
response_text = ""
else:
time.sleep(NUM_SECONDS_TO_SLEEP)

res.append(response_text)
pbar.update(1)

[Diffs for lmms_eval/models/openai_compatible.py and miscs/model_dryruns/openai_compatible.sh are not rendered on this page.]
