Clean up RagasEvaluator interfaces
Refactor the `RagasEvaluator` class for use with the `ilab` interface.

Signed-off-by: Ali Maredia <[email protected]>
alimaredia committed Jan 14, 2025
1 parent 8034f7e commit ad55824
Showing 1 changed file with 38 additions and 58 deletions.
96 changes: 38 additions & 58 deletions src/instructlab/eval/ragas.py
@@ -83,7 +83,7 @@ class ModelConfig(BaseModel):
     max_tokens: int = 768
 
     # Random seed for reproducibility. Caution: this isn't supported by all model serving runtimes.
-    seed: int = DEFAULT_SEED
+    seed: int = 42
 
 
 class RagasEvaluator(Evaluator):
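
A quick note on the seed change above: the `DEFAULT_SEED` constant is replaced by a literal default of 42 on the config itself, and the chat-completion call later in this diff switches to `model_config.seed`. Below is a minimal sketch of constructing the config under the new default. The field names come from this diff; the specific values, and the assumption that `model_name` and `system_prompt` must be supplied, are illustrative.

from instructlab.eval.ragas import ModelConfig

# seed now defaults to 42; pass seed=... to override.
# Caution (per the comment above): not every serving runtime honors the seed.
config = ModelConfig(
    model_name="granite-7b-lab",                   # hypothetical model name
    system_prompt="You are a helpful assistant.",  # hypothetical prompt
    temperature=0.0,
    max_tokens=768,
)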
@@ -105,29 +105,25 @@ def __init__(
         self.judge_openai_api_key = judge_openai_api_key
 
     @staticmethod
-    def _validate_dataset(df: DataFrame):
+    def validate_dataset(df: DataFrame):
         """
         Validates whether or not the given `df` is a valid dataset of `Sample` objects.
         Args:
-            df (DataFrame): DataFrame containing the dataset to be evaluated.
+            df (DataFrame): DataFrame containing the dataset to be evaluated.
         """
         # We have to hardcode these fields because the automated way of resolving the required fields from a TypedDict
         # is only included by default in Python3.11+. For earlier versions, the `typing_extensions` package is required.
         # See: https://docs.python.org/3/whatsnew/3.11.html#pep-655-marking-individual-typeddict-items-as-required-or-not-required
-        required_keys = {"user_input", "reference"}
-        missing_keys = required_keys - set(df.columns)
-        if missing_keys:
+        required_keys = {"user_input", "reference", "response"}
+
+        columns_list = set(df.columns)
+        if not columns_list.issubset(required_keys):
             raise ValueError(
-                f"invalid dataset provided, missing the following keys: {', '.join(missing_keys)}"
+                f"Dataset can only have the following keys: {', '.join(required_keys)}. Keys provided were: {', '.join(columns_list)}"
             )
 
     def run(
         self,
-        dataset: List[Sample] | Path,
-        student_model: ModelConfig | None = None,
+        dataset: List[Sample] | DataFrame,
         run_config: RunConfig | None = None,
-        student_openai_client: OpenAIClient | None = None,
         judge_model_name: str | None = None,
         judge_openai_api_key: str | None = None,
     ) -> EvaluationResult:
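
The renamed `validate_dataset` also flips the check: rather than requiring certain keys to be present, it rejects any column outside the allowed set, so a partial dataset passes while an unknown column fails. A quick sketch of that behavior (assuming the import path matches this file's location):

import pandas as pd
from instructlab.eval.ragas import RagasEvaluator

# Subset of {"user_input", "reference", "response"} -> passes silently.
RagasEvaluator.validate_dataset(
    pd.DataFrame([{"user_input": "What is 2 + 2?", "reference": "4"}])
)

# Unknown column -> raises ValueError listing the allowed keys.
try:
    RagasEvaluator.validate_dataset(
        pd.DataFrame([{"user_input": "What is 2 + 2?", "notes": "oops"}])
    )
except ValueError as err:
    print(err)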
@@ -141,17 +137,12 @@ def run(
             dataset (List[Sample] | DataFrame):
                 Can be either a list of `Sample` objects or a DataFrame containing
                 records matching `Sample`.
-            student_model: (StudentModelConfig):
-                When this parameter is provided, we'll attempt to use the described model in order to
-                generate the responses from the given list of questions.
             run_config (RunConfig | None, optional):
                 Configuration to use when running evaluations. If none is provided, then
                 a default one is created containing extremely permissive settings when handling
                 timeouts. This is because by default, OpenAI tier-1 usage accounts have very low
                 rate limits, resulting in heavy throttling during evaluations.
-            student_openai_client (openai.Client | None, optional):
-                The client to use when generating questions from the student model; must be compatible with the OpenAI API.
-                This field is required when `student_model` is provided.
             judge_model_name (str | None, optional):
                 Name of the OpenAI model to use as the judge model. Defaults to "gpt-4o" when none is specified.
             judge_openai_api_key (str | None, optional):
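
The permissive defaults described for `run_config` come from ragas' `RunConfig`, which callers can override. The sketch below assumes `RunConfig` is importable from `ragas` and accepts these timeout and retry fields; that is an assumption about the ragas API, not something shown in this diff.

from ragas import RunConfig  # assumed import path

# Hypothetical override: tighter limits than the permissive defaults,
# suitable for accounts with generous rate limits.
run_config = RunConfig(timeout=60, max_retries=3, max_wait=30, max_workers=2)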
@@ -167,50 +158,29 @@ def run(
         judge_openai_api_key = (
             judge_openai_api_key if judge_openai_api_key else self.judge_openai_api_key
         )
-        student_model = student_model if student_model else self.student_model
         run_config = run_config if run_config else self.run_config
-        student_openai_client = (
-            student_openai_client
-            if student_openai_client
-            else self.student_openai_client
-        )
 
         # ensure we are in the dataframe format
-        input_df = None
+        input_df = dataset
         if isinstance(dataset, list):
             input_df = DataFrame(dataset)
-        elif isinstance(dataset, Path):
-            input_df = read_json(dataset, orient="records", lines=True)
-        else:
+        elif not isinstance(dataset, DataFrame):
             raise TypeError(f"invalid type of dataset: {type(dataset)}")
 
-        # this should never happen, but pylint is not smart enough to detect it
-        if TYPE_CHECKING:
-            assert input_df is not None
-
-        # ensure the dataset is in the format we expect it
-        self._validate_dataset(input_df)
-
-        need_to_generate_questions = "response" not in input_df.columns
-        if need_to_generate_questions:
-            logger.debug(
-                "`response` is missing in the input dataframe columns, generating questions from the model is required."
-            )
-            if not student_model or not student_openai_client:
-                raise ValueError(
-                    "provided dataset doesn't contain the model `response`, but either `student_model` or `student_openai_client` wasn't provided for inference"
-                )
-
-        # if the student model was provided then we always generate regardless
-        if student_model:
-            if not student_openai_client:
-                raise ValueError(
-                    "`student_model` was specified but `student_openai_client` was not provided"
-                )
-            input_df = self._generate_answers_from_model(
-                input_df, student_model, student_openai_client
+        # this looks similar to validate_dataset but here we want an exact match, not a subset
+        required_keys = {"user_input", "reference", "response"}
+        columns = set(input_df.columns)
+        if columns != required_keys:
+            raise ValueError(
+                f"Input Dataset can only have the following keys: {', '.join(required_keys)}. Keys provided were: {', '.join(columns)}"
             )
 
         if not run_config:
             # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits
             # are horrible and will result in half of our evaluation results being NaN or 0
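
With response generation removed from `run`, the caller now supplies all three columns up front. A usage sketch under the new interface follows; the no-argument constructor and the placeholder API key are assumptions for illustration.

from pandas import DataFrame
from instructlab.eval.ragas import RagasEvaluator

dataset = DataFrame(
    [
        {
            "user_input": "What is the capital of France?",
            "reference": "Paris",
            "response": "The capital of France is Paris.",
        }
    ]
)

evaluator = RagasEvaluator()  # assumed: judge settings may also be set here
result = evaluator.run(
    dataset,
    judge_model_name="gpt-4o",      # the documented default judge
    judge_openai_api_key="sk-...",  # placeholder; use a real key
)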
@@ -238,15 +208,25 @@ def run(
         )
         return results
 
-    def _generate_answers_from_model(
-        self,
+    @staticmethod
+    def generate_answers_from_model(
         questions: DataFrame,
-        student_model: ModelConfig,
-        student_openai_client: OpenAIClient,
+        model_config: ModelConfig,
+        openai_client: OpenAIClient,
     ) -> DataFrame:
         """
         Given a DataFrame containing `user_input` columns, generates responses from the given model
         and returns a new DataFrame containing its answers in the `response` column.
+        Args:
+            questions (DataFrame):
+                Questions and reference answers to be returned with the responses from the model.
+            model_config (ModelConfig):
+                Configuration settings for the model when getting responses.
+            openai_client (openai.Client):
+                The client to use when generating responses from the model; must be compatible with the OpenAI API.
+        Returns:
+            DataFrame with `user_input`, `reference`, and `response` columns, where the responses were generated by the model.
         """
         # initialize response to write into
         updated_df = questions.copy()
@@ -256,17 +236,17 @@ def _generate_answers_from_model(
             messages: List[ChatCompletionMessageParam] = [
                 {
                     "role": "system",
-                    "content": student_model.system_prompt,
+                    "content": model_config.system_prompt,
                 },
                 {"role": "user", "content": qna["user_input"]},
             ]
-            response = student_openai_client.chat.completions.create(
+            response = openai_client.chat.completions.create(
                 messages=messages,
-                model=student_model.model_name,
+                model=model_config.model_name,
                 # specify the seed so we can at least try to have some reproducibility when the clients support it
-                seed=42,
-                max_tokens=student_model.max_tokens,
-                temperature=student_model.temperature,
+                seed=model_config.seed,
+                max_tokens=model_config.max_tokens,
+                temperature=model_config.temperature,
             )
             updated_df.at[i, "response"] = response.choices[0].message.content
         return updated_df
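
Since `generate_answers_from_model` is now a static method taking a generic `ModelConfig` and client, responses can be generated separately (for example, against a locally served OpenAI-compatible model) and the result handed to `run`. A sketch assuming such an endpoint; the base URL, model name, and prompt are placeholders.

from openai import OpenAI
from pandas import DataFrame
from instructlab.eval.ragas import ModelConfig, RagasEvaluator

questions = DataFrame([{"user_input": "What is 2 + 2?", "reference": "4"}])
config = ModelConfig(
    model_name="granite-7b-lab",                   # hypothetical model
    system_prompt="You are a helpful assistant.",  # hypothetical prompt
)
client = OpenAI(base_url="http://localhost:8000/v1", api_key="none")  # placeholder endpoint

answered = RagasEvaluator.generate_answers_from_model(questions, config, client)
print(answered["response"].iloc[0])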
