diff --git a/.gitignore b/.gitignore
index ea20fe52..a254579f 100755
--- a/.gitignore
+++ b/.gitignore
@@ -42,4 +42,5 @@
 VATEX/
 lmms_eval/tasks/vatex/__pycache__/utils.cpython-310.pyc
 lmms_eval/tasks/mlvu/__pycache__/utils.cpython-310.pyc
-scripts/
\ No newline at end of file
+scripts/
+.env
\ No newline at end of file
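The newly ignored `.env` file is where the `openai_compatible` model added below reads its credentials, via `python-dotenv`. A minimal sketch of such a file; the variable names match the `os.getenv` calls in `lmms_eval/models/openai_compatible.py`, while the values (including the xAI endpoint) are placeholders:

    # .env -- kept out of version control by the rule above
    OPENAI_COMPATIBLE_API_KEY=YOUR_API_KEY
    OPENAI_COMPATIBLE_API_URL=https://api.x.ai/v1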
diff --git a/lmms_eval/models/__init__.py b/lmms_eval/models/__init__.py
index 1a3dbd61..3927b294 100644
--- a/lmms_eval/models/__init__.py
+++ b/lmms_eval/models/__init__.py
@@ -39,6 +39,7 @@
     "minimonkey": "MiniMonkey",
     "moviechat": "MovieChat",
     "mplug_owl_video": "mplug_Owl",
+    "openai_compatible": "OpenAICompatible",
     "oryx": "Oryx",
     "phi3v": "Phi3v",
     "qwen2_5_vl": "Qwen2_5_VL",
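For context, this mapping is how `--model openai_compatible` resolves to the `OpenAICompatible` class added below. A rough sketch of the lookup, assuming the importlib-based loading lmms-eval uses; the helper name here is illustrative and the real loader lives outside this diff:

    import importlib

    # Excerpt of the registry above: CLI model name -> implementing class name
    AVAILABLE_MODELS = {"openai_compatible": "OpenAICompatible"}

    def get_model(model_name: str):
        # Import lmms_eval.models.<name> and pull out the mapped class
        class_name = AVAILABLE_MODELS[model_name]
        module = importlib.import_module(f"lmms_eval.models.{model_name}")
        return getattr(module, class_name)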
payload["messages"][0]["content"].append({"type": "text", "text": contexts}) + for img in imgs: + payload["messages"][0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}}) if "max_new_tokens" not in gen_kwargs: gen_kwargs["max_new_tokens"] = 1024 @@ -198,26 +206,24 @@ def generate_until(self, requests) -> List[str]: payload["max_tokens"] = gen_kwargs["max_new_tokens"] payload["temperature"] = gen_kwargs["temperature"] - for attempt in range(5): + MAX_RETRIES = 5 + for attempt in range(MAX_RETRIES): try: - response = url_requests.post(API_URL, headers=headers, json=payload, timeout=self.timeout) - response_data = response.json() - - response_text = response_data["choices"][0]["message"]["content"].strip() + response = self.client.chat.completions.create(**payload) + response_text = response.choices[0].message.content break # If successful, break out of the loop except Exception as e: - try: - error_msg = response.json() - except: - error_msg = "" + error_msg = str(e) + eval_logger.info(f"Attempt {attempt + 1}/{MAX_RETRIES} failed with error: {error_msg}") - eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}.\nReponse: {error_msg}") - if attempt <= 5: - time.sleep(NUM_SECONDS_TO_SLEEP) - else: # If this was the last attempt, log and return empty string - eval_logger.error(f"All 5 attempts failed. Last error message: {str(e)}.\nResponse: {response.json()}") + # On last attempt, log error and set empty response + if attempt == MAX_RETRIES - 1: + eval_logger.error(f"All {MAX_RETRIES} attempts failed. Last error: {error_msg}") response_text = "" + else: + time.sleep(NUM_SECONDS_TO_SLEEP) + res.append(response_text) pbar.update(1) diff --git a/lmms_eval/models/openai_compatible.py b/lmms_eval/models/openai_compatible.py new file mode 100644 index 00000000..7be1cd91 --- /dev/null +++ b/lmms_eval/models/openai_compatible.py @@ -0,0 +1,225 @@ +import base64 +import json +import os +import time +from io import BytesIO +from typing import List, Tuple, Union + +import numpy as np +import requests as url_requests +from accelerate import Accelerator, DistributedType +from tqdm import tqdm + +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model + +try: + from decord import VideoReader, cpu +except ImportError: + pass + +from dotenv import find_dotenv, load_dotenv +from loguru import logger as eval_logger +from openai import OpenAI +from PIL import Image + +load_dotenv(verbose=True) + + +@register_model("openai_compatible") +class OpenAICompatible(lmms): + def __init__( + self, + model_version: str = "grok-2-latest", + timeout: int = 120, + max_retries: int = 5, + max_size_in_mb: int = 20, + continual_mode: bool = False, + response_persistent_folder: str = None, + **kwargs, + ) -> None: + super().__init__() + self.model_version = model_version + self.timeout = timeout + self.max_retries = max_retries + self.max_size_in_mb = max_size_in_mb # some models have a limit on the size of the image + self.continual_mode = continual_mode + if self.continual_mode: + if response_persistent_folder is None: + raise ValueError("Continual mode requires a persistent path for the response. 
diff --git a/lmms_eval/models/openai_compatible.py b/lmms_eval/models/openai_compatible.py
new file mode 100644
index 00000000..7be1cd91
--- /dev/null
+++ b/lmms_eval/models/openai_compatible.py
@@ -0,0 +1,225 @@
+import base64
+import json
+import os
+import time
+from io import BytesIO
+from typing import List, Tuple, Union
+
+import numpy as np
+import requests as url_requests
+from accelerate import Accelerator, DistributedType
+from tqdm import tqdm
+
+from lmms_eval.api.instance import Instance
+from lmms_eval.api.model import lmms
+from lmms_eval.api.registry import register_model
+
+try:
+    from decord import VideoReader, cpu
+except ImportError:
+    pass
+
+from dotenv import find_dotenv, load_dotenv
+from loguru import logger as eval_logger
+from openai import OpenAI
+from PIL import Image
+
+load_dotenv(verbose=True)
+
+
+@register_model("openai_compatible")
+class OpenAICompatible(lmms):
+    def __init__(
+        self,
+        model_version: str = "grok-2-latest",
+        timeout: int = 120,
+        max_retries: int = 5,
+        max_size_in_mb: int = 20,
+        max_frames_num: int = 10,
+        continual_mode: bool = False,
+        response_persistent_folder: str = None,
+        **kwargs,
+    ) -> None:
+        super().__init__()
+        self.model_version = model_version
+        self.timeout = timeout
+        self.max_retries = max_retries
+        self.max_size_in_mb = max_size_in_mb  # some models have a limit on the size of the image
+        self.max_frames_num = max_frames_num  # used by generate_until when sampling video frames
+        self.continual_mode = continual_mode
+        if self.continual_mode:
+            if response_persistent_folder is None:
+                raise ValueError("Continual mode requires a persistent path for the response. Please provide a valid path.")
+
+            os.makedirs(response_persistent_folder, exist_ok=True)
+            self.response_persistent_folder = response_persistent_folder
+            self.response_persistent_file = os.path.join(self.response_persistent_folder, f"{self.model_version}_response.json")
+
+            if os.path.exists(self.response_persistent_file):
+                with open(self.response_persistent_file, "r") as f:
+                    self.response_cache = json.load(f)
+                self.cache_mode = "resume"
+            else:
+                self.response_cache = {}
+                self.cache_mode = "start"
+
+        self.client = OpenAI(api_key=os.getenv("OPENAI_COMPATIBLE_API_KEY"), base_url=os.getenv("OPENAI_COMPATIBLE_API_URL"))
+
+        accelerator = Accelerator()
+        if accelerator.num_processes > 1:
+            assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported."
+            self.accelerator = accelerator
+            if self.accelerator.is_local_main_process:
+                eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism")
+            self._rank = self.accelerator.local_process_index
+            self._world_size = self.accelerator.num_processes
+        else:
+            self.accelerator = accelerator
+            self._rank = self.accelerator.local_process_index
+            self._world_size = self.accelerator.num_processes
+
+        self.device = self.accelerator.device
+
+    # Function to encode the image
+    def encode_image(self, image: Union[Image.Image, str]):
+        max_size = self.max_size_in_mb * 1024 * 1024  # convert the configured limit to bytes
+        if isinstance(image, str):
+            img = Image.open(image).convert("RGB")
+        else:
+            img = image.copy()
+
+        output_buffer = BytesIO()
+        img.save(output_buffer, format="PNG")
+        byte_data = output_buffer.getvalue()
+
+        # If the image is too large, downscale it while maintaining the aspect ratio
+        while len(byte_data) > max_size and img.size[0] > 100 and img.size[1] > 100:
+            new_size = (int(img.size[0] * 0.75), int(img.size[1] * 0.75))
+            img = img.resize(new_size, Image.Resampling.LANCZOS)
+
+            output_buffer = BytesIO()
+            img.save(output_buffer, format="PNG")
+            byte_data = output_buffer.getvalue()
+
+        base64_str = base64.b64encode(byte_data).decode("utf-8")
+        return base64_str
+
+    # Function to encode the video
+    def encode_video(self, video_path, for_get_frames_num):
+        vr = VideoReader(video_path, ctx=cpu(0))
+        total_frame_num = len(vr)
+        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, for_get_frames_num, dtype=int)
+
+        # Ensure the last frame is included
+        if total_frame_num - 1 not in uniform_sampled_frames:
+            uniform_sampled_frames = np.append(uniform_sampled_frames, total_frame_num - 1)
+
+        frame_idx = uniform_sampled_frames.tolist()
+        frames = vr.get_batch(frame_idx).asnumpy()
+
+        base64_frames = []
+        for frame in frames:
+            img = Image.fromarray(frame)
+            output_buffer = BytesIO()
+            img.save(output_buffer, format="PNG")
+            byte_data = output_buffer.getvalue()
+            base64_str = base64.b64encode(byte_data).decode("utf-8")
+            base64_frames.append(base64_str)
+
+        return base64_frames
+
+    def flatten(self, input):
+        new_list = []
+        for i in input:
+            for j in i:
+                new_list.append(j)
+        return new_list
+    def generate_until(self, requests) -> List[str]:
+        res = []
+        pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")
+
+        for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
+            if self.continual_mode is True and self.cache_mode == "resume":
+                doc_uuid = f"{task}___{split}___{doc_id}"
+                if doc_uuid in self.response_cache:
+                    response_text = self.response_cache[doc_uuid]
+                    if response_text:
+                        res.append(response_text)
+                        pbar.update(1)
+                        continue
+
+            visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
+            if None in visuals:
+                visuals = []
+                imgs = []
+            else:
+                visuals = self.flatten(visuals)
+                imgs = []  # multiple images or frames for video
+                for visual in visuals:
+                    if isinstance(visual, str) and (".mp4" in visual or ".avi" in visual or ".mov" in visual or ".flv" in visual or ".wmv" in visual):
+                        frames = self.encode_video(visual, self.max_frames_num)
+                        imgs.extend(frames)
+                    elif isinstance(visual, str) and (".jpg" in visual or ".jpeg" in visual or ".png" in visual or ".gif" in visual or ".bmp" in visual or ".tiff" in visual or ".webp" in visual):
+                        img = self.encode_image(visual)
+                        imgs.append(img)
+                    elif isinstance(visual, Image.Image):
+                        img = self.encode_image(visual)
+                        imgs.append(img)
+
+            payload = {"messages": []}
+            payload["model"] = self.model_version
+
+            # Build a single user message: the text context first, then any images
+            payload["messages"].append({"role": "user", "content": []})
+            payload["messages"][0]["content"].append({"type": "text", "text": contexts})
+            for img in imgs:
+                payload["messages"][0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}})
+
+            if "max_new_tokens" not in gen_kwargs:
+                gen_kwargs["max_new_tokens"] = 1024
+            if gen_kwargs["max_new_tokens"] > 4096:
+                gen_kwargs["max_new_tokens"] = 4096
+            if "temperature" not in gen_kwargs:
+                gen_kwargs["temperature"] = 0
+            if "top_p" not in gen_kwargs:
+                gen_kwargs["top_p"] = None
+            if "num_beams" not in gen_kwargs:
+                gen_kwargs["num_beams"] = 1
+
+            payload["max_tokens"] = gen_kwargs["max_new_tokens"]
+            payload["temperature"] = gen_kwargs["temperature"]
+
+            for attempt in range(self.max_retries):
+                try:
+                    response = self.client.chat.completions.create(**payload)
+                    response_text = response.choices[0].message.content
+                    break  # If successful, break out of the loop
+
+                except Exception as e:
+                    error_msg = str(e)
+                    eval_logger.info(f"Attempt {attempt + 1}/{self.max_retries} failed with error: {error_msg}")
+
+                    # On the last attempt, log the error and fall back to an empty response
+                    if attempt == self.max_retries - 1:
+                        eval_logger.error(f"All {self.max_retries} attempts failed. Last error: {error_msg}")
+                        response_text = ""
+                    else:
+                        time.sleep(self.timeout)
+
+            res.append(response_text)
+            pbar.update(1)
+
+            if self.continual_mode is True:  # Cache the response
+                doc_uuid = f"{task}___{split}___{doc_id}"
+                self.response_cache[doc_uuid] = response_text
+                with open(self.response_persistent_file, "w") as f:
+                    json.dump(self.response_cache, f)
+
+        pbar.close()
+        return res
+
+    def generate_until_multi_round(self, requests) -> List[str]:
+        raise NotImplementedError("TODO: Implement multi-round generation for OpenAI compatible models")
+
+    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
+        raise NotImplementedError("TODO: Implement loglikelihood for OpenAI compatible models")
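The uniform sampling in `encode_video` can be checked in isolation. A small sketch of just the index selection (a hypothetical helper mirroring the `np.linspace` logic above, not part of the diff):

    import numpy as np

    def sample_frame_indices(total_frames: int, num_frames: int) -> list:
        # Evenly spaced indices over [0, total_frames - 1], truncated to ints
        idx = np.linspace(0, total_frames - 1, num_frames, dtype=int)
        # Ensure the last frame is included (may yield num_frames + 1 indices)
        if total_frames - 1 not in idx:
            idx = np.append(idx, total_frames - 1)
        return idx.tolist()

    # e.g. sample_frame_indices(100, 8) -> [0, 14, 28, 42, 56, 70, 84, 99]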
diff --git a/miscs/model_dryruns/openai_compatible.sh b/miscs/model_dryruns/openai_compatible.sh
new file mode 100644
index 00000000..c93ec29f
--- /dev/null
+++ b/miscs/model_dryruns/openai_compatible.sh
@@ -0,0 +1,13 @@
+# cd ~/prod/lmms-eval-public
+# pip3 install -e .
+# pip3 install openai
+
+python3 -m lmms_eval \
+    --model openai_compatible \
+    --model_args model_version=grok-2-vision-1212 \
+    --tasks mme,mmmu_val \
+    --batch_size 1 \
+    --log_samples \
+    --log_samples_suffix openai_compatible \
+    --output_path ./logs \
+    --limit=8
\ No newline at end of file
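The dry run assumes the two environment variables consumed by the model are already set, either exported or placed in a `.env` as sketched earlier. For example, with placeholder values (the xAI endpoint shown is only one OpenAI-compatible provider serving the grok model named in the script):

    export OPENAI_COMPATIBLE_API_KEY=YOUR_API_KEY
    export OPENAI_COMPATIBLE_API_URL=https://api.x.ai/v1
    bash miscs/model_dryruns/openai_compatible.sh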