From ad9ad8309a0c96a13cbdeb71049dcfd4fad98c27 Mon Sep 17 00:00:00 2001
From: Ihar Hrachyshka <ihrachys@redhat.com>
Date: Tue, 23 Jul 2024 22:33:04 +0000
Subject: [PATCH 1/4] Use `python3` in README instructions

The `python` symlink may be missing on a system; or even point to py2.
We should use `python3` to be sure. (It's ok to use `python` inside a
virtualenv though.)

Signed-off-by: Ihar Hrachyshka <ihrachys@redhat.com>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7c8d667..51023ba 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ scripts/infra/cloud-instance.sh ec2 ssh
 # Regardless of how you setup your instance
 git clone https://github.com/instructlab/taxonomy.git && pushd taxonomy && git branch rc && popd
 git clone --bare https://github.com/instructlab/eval.git && git clone eval.git/ && cd eval && git remote add syncrepo ../eval.git
-python -m venv venv
+python3 -m venv venv
 source venv/bin/activate
 pip install -r requirements.txt
 pip install -r requirements-dev.txt

From 568dffb8d99e944b2792ac6edbd11d5a44ba3da8 Mon Sep 17 00:00:00 2001
From: Ihar Hrachyshka <ihrachys@redhat.com>
Date: Wed, 31 Jul 2024 17:50:15 +0000
Subject: [PATCH 2/4] Introduce hard-fail case for openai error handling

Before the patch, all errors from openai were handled by retrying up to
API_MAX_RETRY times and returning $ERROR$ message at the last attempt.

With this patch, if all attempts result in APIConnectionError, we raise
a new EvalError exception. (If at least one of the previous attempts
result in a different error, then we return $ERROR$ as usual.)

Also, several errors are not expected to recover with a retry (400-404,
422). This patch makes them return $ERROR$ immediately without retrying.

Closes: #77

Signed-off-by: Ihar Hrachyshka <ihrachys@redhat.com>
---
 src/instructlab/eval/exceptions.py      | 12 ++++++++
 src/instructlab/eval/mt_bench_common.py | 40 +++++++++++++++++++++++--
 2 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/src/instructlab/eval/exceptions.py b/src/instructlab/eval/exceptions.py
index 1037d90..aa31151 100644
--- a/src/instructlab/eval/exceptions.py
+++ b/src/instructlab/eval/exceptions.py
@@ -107,3 +107,15 @@ def __init__(self, tasks_dir) -> None:
         super().__init__()
         self.tasks_dir = tasks_dir
         self.message = f"Invalid Tasks Dir: {tasks_dir}"
+
+
+class OpenAIError(EvalError):
+    """
+    Error raised when reply retrieval from OpenAI API fails.
+    Attributes
+        message              error message to be printed on raise
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.message = "Failed to receive a reply from API."
diff --git a/src/instructlab/eval/mt_bench_common.py b/src/instructlab/eval/mt_bench_common.py
index 86f9920..eb76279 100644
--- a/src/instructlab/eval/mt_bench_common.py
+++ b/src/instructlab/eval/mt_bench_common.py
@@ -17,6 +17,9 @@
 from fastchat.model.model_adapter import get_conversation_template  # type: ignore
 import openai
 
+# First Party
+from instructlab.eval import exceptions
+
 # Local
 from .logger_config import setup_logger
 
@@ -247,9 +250,15 @@ def play_a_match_single(
     return result
 
 
+def _is_fatal_openai_error(e: openai.OpenAIError) -> bool:
+    return isinstance(e, openai.APIConnectionError)
+
+
 def chat_completion_openai(
     openai_client, model, conv, temperature, max_tokens, merge_system_user_message=False
 ) -> str:
+    output = None
+
     for i in range(API_MAX_RETRY):
         try:
             messages = conv.to_openai_api_messages()
@@ -269,15 +278,40 @@ def chat_completion_openai(
                 temperature=temperature,
                 max_tokens=max_tokens,
             )
-            return response.choices[0].message.content
-        except openai.OpenAIError as e:
+            output = response.choices[0].message.content
+            break
+        except (
+            # retry may help with these errors
+            openai.APIConnectionError,
+            openai.RateLimitError,  # 429
+            openai.InternalServerError,  # >=500
+            # NOTE: Errors listed below may need a revisit: we are not sure if
+            # it's ever helpful to retry them. Leaving them intact for now.
+            openai.AuthenticationError,  # 401
+            openai.PermissionDeniedError,  # 403
+            openai.NotFoundError,  # 404
+        ) as e:
+            if not _is_fatal_openai_error(e):
+                output = API_ERROR_OUTPUT  # disable hard fail (never raise!)
+            # still, retry in the hope we'll get a successful reply
             if i == API_MAX_RETRY - 1:
                 logger.error(e)
                 break
             logger.debug(e)
             time.sleep(API_RETRY_SLEEP)
+        except (
+            # retry won't fix these errors
+            openai.BadRequestError,  # 400
+            openai.UnprocessableEntityError,  # 422
+        ) as e:
+            logger.debug(e)
+            return API_ERROR_OUTPUT  # immediately soft fail
 
-    return API_ERROR_OUTPUT
+    if output is None:
+        # not a single attempt was non-fatal; this is indicative of
+        # basic connectivity or server issue -> hard fail
+        raise exceptions.OpenAIError
+    return output
 
 
 def check_data(questions, model_answers, ref_answers, models, judges):

From abf0f4115e5e7e9026a3407fa15b32e7b20b2f85 Mon Sep 17 00:00:00 2001
From: Ihar Hrachyshka <ihrachys@redhat.com>
Date: Wed, 31 Jul 2024 21:04:05 +0000
Subject: [PATCH 3/4] Calculate messages for openai completion once

Before the patch, we were calculating them on every retry attempt. The
function is pure, so there is no good reason to repeat the calculation.

This also simplifies the function a bit.

Signed-off-by: Ihar Hrachyshka <ihrachys@redhat.com>
---
 pyproject.toml                          |  3 ++
 src/instructlab/eval/mt_bench_common.py | 46 ++++++++++++++++++-------
 2 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index b11c7bd..03faef9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -96,3 +96,6 @@ from-first = true
 # import-heading-firstparty=First Party
 # import-heading-localfolder=Local
 known-local-folder = ["tuning"]
+
+[tool.mypy]
+ignore_missing_imports = true
diff --git a/src/instructlab/eval/mt_bench_common.py b/src/instructlab/eval/mt_bench_common.py
index eb76279..89dc58a 100644
--- a/src/instructlab/eval/mt_bench_common.py
+++ b/src/instructlab/eval/mt_bench_common.py
@@ -4,7 +4,7 @@
 """
 
 # Standard
-from typing import Optional
+from typing import Optional, TypedDict
 import ast
 import dataclasses
 import glob
@@ -14,6 +14,7 @@
 import time
 
 # Third Party
+from fastchat import conversation
 from fastchat.model.model_adapter import get_conversation_template  # type: ignore
 import openai
 
@@ -254,23 +255,44 @@ def _is_fatal_openai_error(e: openai.OpenAIError) -> bool:
     return isinstance(e, openai.APIConnectionError)
 
 
+# TODO: copied from instructlab (cli) utils module; consolidate somewhere?
+class Message(TypedDict):
+    """
+    Represents a message within an AI conversation.
+    """
+
+    content: str
+    # one of: "user", "assistant", or "system"
+    role: str
+
+
+def _get_messages(
+    conv: conversation.Conversation, merge_system_user_message: bool
+) -> list[Message]:
+    messages = conv.to_openai_api_messages()
+    if (
+        (merge_system_user_message or conv.name == "mistral")
+        and messages[0]["role"] == "system"
+        and messages[1]["role"] == "user"
+    ):
+        messages[1]["content"] = messages[0]["content"] + "\n" + messages[1]["content"]
+        return messages[1:]
+    return messages
+
+
 def chat_completion_openai(
-    openai_client, model, conv, temperature, max_tokens, merge_system_user_message=False
+    openai_client,
+    model,
+    conv: conversation.Conversation,
+    temperature,
+    max_tokens,
+    merge_system_user_message: bool = False,
 ) -> str:
     output = None
+    messages = _get_messages(conv, merge_system_user_message)
 
     for i in range(API_MAX_RETRY):
         try:
-            messages = conv.to_openai_api_messages()
-            if (
-                (merge_system_user_message or conv.name == "mistral")
-                and messages[0]["role"] == "system"
-                and messages[1]["role"] == "user"
-            ):
-                messages[1]["content"] = (
-                    messages[0]["content"] + "\n" + messages[1]["content"]
-                )
-                messages = messages[1:]
             response = openai_client.chat.completions.create(
                 model=model,
                 messages=messages,

From 7fbd87e969ead41c466705203bed278d307d0be2 Mon Sep 17 00:00:00 2001
From: Dan McPherson <dmcphers@redhat.com>
Date: Thu, 15 Aug 2024 22:15:15 -0400
Subject: [PATCH 4/4] Add to fatal exceptions and handle OpenAIError

Signed-off-by: Dan McPherson <dmcphers@redhat.com>
---
 src/instructlab/eval/exceptions.py      |  6 +++---
 src/instructlab/eval/mt_bench_common.py | 28 +++++++++++++++++--------
 2 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/src/instructlab/eval/exceptions.py b/src/instructlab/eval/exceptions.py
index aa31151..b3c7ca5 100644
--- a/src/instructlab/eval/exceptions.py
+++ b/src/instructlab/eval/exceptions.py
@@ -109,13 +109,13 @@ def __init__(self, tasks_dir) -> None:
         self.message = f"Invalid Tasks Dir: {tasks_dir}"
 
 
-class OpenAIError(EvalError):
+class ModelServingAPIError(EvalError):
     """
-    Error raised when reply retrieval from OpenAI API fails.
+    Error raised when reply retrieval from model serving fails.
     Attributes
         message              error message to be printed on raise
     """
 
     def __init__(self) -> None:
         super().__init__()
-        self.message = "Failed to receive a reply from API."
+        self.message = "Failed to receive a reply from model serving API."
diff --git a/src/instructlab/eval/mt_bench_common.py b/src/instructlab/eval/mt_bench_common.py
index 89dc58a..ee23546 100644
--- a/src/instructlab/eval/mt_bench_common.py
+++ b/src/instructlab/eval/mt_bench_common.py
@@ -252,7 +252,15 @@ def play_a_match_single(
 
 
 def _is_fatal_openai_error(e: openai.OpenAIError) -> bool:
-    return isinstance(e, openai.APIConnectionError)
+    return isinstance(
+        e,
+        (
+            openai.APIConnectionError,
+            openai.AuthenticationError,
+            openai.PermissionDeniedError,
+            openai.NotFoundError,
+        ),
+    )
 
 
 # TODO: copied from instructlab (cli) utils module; consolidate somewhere?
@@ -302,6 +310,13 @@ def chat_completion_openai(
             )
             output = response.choices[0].message.content
             break
+        except (
+            # retry won't fix these errors
+            openai.BadRequestError,  # 400
+            openai.UnprocessableEntityError,  # 422
+        ) as e:
+            logger.debug(e)
+            return API_ERROR_OUTPUT  # immediately soft fail
         except (
             # retry may help with these errors
             openai.APIConnectionError,
@@ -312,6 +327,8 @@ def chat_completion_openai(
             openai.AuthenticationError,  # 401
             openai.PermissionDeniedError,  # 403
             openai.NotFoundError,  # 404
+            # General catch-all
+            openai.OpenAIError,
         ) as e:
             if not _is_fatal_openai_error(e):
                 output = API_ERROR_OUTPUT  # disable hard fail (never raise!)
@@ -321,18 +338,11 @@ def chat_completion_openai(
                 break
             logger.debug(e)
             time.sleep(API_RETRY_SLEEP)
-        except (
-            # retry won't fix these errors
-            openai.BadRequestError,  # 400
-            openai.UnprocessableEntityError,  # 422
-        ) as e:
-            logger.debug(e)
-            return API_ERROR_OUTPUT  # immediately soft fail
 
     if output is None:
         # not a single attempt was non-fatal; this is indicative of
         # basic connectivity or server issue -> hard fail
-        raise exceptions.OpenAIError
+        raise exceptions.ModelServingAPIError
     return output