From f8277b156367f880298f062a9598aa7abd3cf36c Mon Sep 17 00:00:00 2001
From: Ali Maredia
Date: Tue, 14 Jan 2025 11:07:28 -0500
Subject: [PATCH] Clean up RagasEvaluator interfaces

Refactor the RagasEvaluator class for use with the `ilab` interface.

Signed-off-by: Ali Maredia
---
 src/instructlab/eval/ragas.py | 96 ++++++++++++++----------------------
 1 file changed, 38 insertions(+), 58 deletions(-)

diff --git a/src/instructlab/eval/ragas.py b/src/instructlab/eval/ragas.py
index ca1d5c9..72567a6 100644
--- a/src/instructlab/eval/ragas.py
+++ b/src/instructlab/eval/ragas.py
@@ -74,7 +74,7 @@ class ModelConfig(BaseModel):
     max_tokens: int = 768
 
     # Random seed for reproducibility. Caution: this isn't supported by all model serving runtimes.
-    seed: int = DEFAULT_SEED
+    seed: int = 42
 
 
 class RagasEvaluator(Evaluator):
@@ -96,29 +96,25 @@ def __init__(
         self.judge_openai_api_key = judge_openai_api_key
 
     @staticmethod
-    def _validate_dataset(df: DataFrame):
+    def validate_dataset(df: DataFrame):
         """
         Validates whether or not the given `df` is a valid dataset of `Sample` objects.
 
         Args:
-        df (DataFrame): DataFrame containing the dataset to be evaluated.
+            df (DataFrame): DataFrame containing the dataset to be evaluated.
         """
-        # We have to hardcode these fields because the automated way of resolving the required fields from a TypedDict
-        # is only included by default in Python3.11+. For earlier versions, the `typing_extensions` package is required.
-        # See: https://docs.python.org/3/whatsnew/3.11.html#pep-655-marking-individual-typeddict-items-as-required-or-not-required
-        required_keys = {"user_input", "reference"}
-        missing_keys = required_keys - set(df.columns)
-        if missing_keys:
+        required_keys = {"user_input", "reference", "response"}
+
+        columns_list = set(df.columns)
+        if not columns_list.issubset(required_keys):
             raise ValueError(
-                f"invalid dataset provided, missing the following keys: {', '.join(missing_keys)}"
+                f"Dataset can only have the following keys: {', '.join(required_keys)}. Keys provided were: {', '.join(columns_list)}"
             )
 
     def run(
         self,
-        dataset: List[Sample] | Path,
-        student_model: ModelConfig | None = None,
+        dataset: List[Sample] | DataFrame,
         run_config: RunConfig | None = None,
-        student_openai_client: OpenAIClient | None = None,
         judge_model_name: str | None = None,
         judge_openai_api_key: str | None = None,
     ) -> EvaluationResult:
@@ -132,17 +128,12 @@ def run(
-            dataset (List[Sample] | Path):
-                Can be either a list of `Sample` objects or a path to a jsonl file containing
-                records matching `Sample`.
-            student_model: (StudentModelConfig):
-                When this parameter is provided, we'll attempt to use the described model in order to generate the responses from the given list of questions.
+            dataset (List[Sample] | DataFrame):
+                Can be either a list of `Sample` objects or a DataFrame containing
+                records matching `Sample`.
             run_config (RunConfig | None, optional):
                 Configuration to use when running evaluations. If none is provided, then
                 a default one is created containing extremely permissive settings when handling
                 timeouts. This is because by default, OpenAI tier-1 usage accounts have very low
                 rate limits, resulting in heavy throttling during evaluations.
-            student_openai_client (openai.Client | None, optional):
-                The client to use when generating questions from the student model, must be compatible with the OpenAI API.
-                This field is required when `student_model` is provided.
             judge_model_name (str | None, optional):
                 Name of the OpenAI model to use as the judge model. Defaults to "gpt-4o"
                 when none is specified.
             judge_openai_api_key (str | None, optional):
@@ -158,21 +149,13 @@ def run(
         judge_openai_api_key = (
             judge_openai_api_key if judge_openai_api_key else self.judge_openai_api_key
         )
-        student_model = student_model if student_model else self.student_model
         run_config = run_config if run_config else self.run_config
-        student_openai_client = (
-            student_openai_client
-            if student_openai_client
-            else self.student_openai_client
-        )
 
         # ensure we are in the dataframe format
-        input_df = None
+        input_df = dataset
         if isinstance(dataset, list):
             input_df = DataFrame(dataset)
-        elif isinstance(dataset, Path):
-            input_df = read_json(dataset, orient="records", lines=True)
-        else:
+        elif not isinstance(dataset, DataFrame):
             raise TypeError(f"invalid type of dataset: {type(dataset)}")
 
         # this should never happen, but pylint is not smart enough to detect it
@@ -180,28 +163,15 @@ def run(
         assert input_df is not None
 
         # ensure the dataset is in the format we expect it
-        self._validate_dataset(input_df)
-
-        need_to_generate_questions = "response" not in input_df.columns
-        if need_to_generate_questions:
-            logger.debug(
-                "`response` is missing in the input dataframe columns, generating questions from the model is required."
-            )
-            if not student_model or not student_openai_client:
-                raise ValueError(
-                    "provided dataset doesn't contain the model `response`, but either `student_model` or `student_openai_client` wasn't provided for inference"
-                )
-
-        # if the student model was provided then we always generate regardless
-        if student_model:
-            if not student_openai_client:
-                raise ValueError(
-                    "`student_model` was specified but `student_openai_client` was not provided"
-                )
-            input_df = self._generate_answers_from_model(
-                input_df, student_model, student_openai_client
+        # this looks similar to validate_dataset but here we want an exact match, not a subset
+        required_keys = {"user_input", "reference", "response"}
+        columns = set(input_df.columns)
+        if columns != required_keys:
+            raise ValueError(
+                f"Input dataset can only have the following keys: {', '.join(required_keys)}. Keys provided were: {', '.join(columns)}"
             )
+
         if not run_config:
             # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits
             # are horrible and will result in half of our evaluation results being NaN or 0
@@ -229,15 +199,25 @@ def run(
         )
         return results
 
-    def _generate_answers_from_model(
-        self,
+    @staticmethod
+    def generate_answers_from_model(
         questions: DataFrame,
-        student_model: ModelConfig,
-        student_openai_client: OpenAIClient,
+        model_config: ModelConfig,
+        openai_client: OpenAIClient,
     ) -> DataFrame:
         """
         Given a DataFrame containing `user_input` columns, generates responses from the given model
        and returns a new DataFrame containing its answers in the `response` column.
+
+        Args:
+            questions (DataFrame):
+                Questions and reference answers to be returned with the responses from the model.
+            model_config (ModelConfig):
+                Configuration settings for the model when getting responses.
+            openai_client (openai.Client):
+                The client to use when generating responses from the model; must be compatible with the OpenAI API.
+        Returns:
+            DataFrame with user_input, reference, and response columns, where response holds the model's answer for each user_input.
         """
         # initialize response to write into
         updated_df = questions.copy()
@@ -247,17 +227,17 @@ def _generate_answers_from_model(
             messages: List[ChatCompletionMessageParam] = [
                 {
                     "role": "system",
-                    "content": student_model.system_prompt,
+                    "content": model_config.system_prompt,
                 },
                 {"role": "user", "content": qna["user_input"]},
             ]
-            response = student_openai_client.chat.completions.create(
+            response = openai_client.chat.completions.create(
                 messages=messages,
-                model=student_model.model_name,
+                model=model_config.model_name,
                 # specify the seed so we can at least try to have some reproducibility when the clients support it
-                seed=42,
-                max_tokens=student_model.max_tokens,
-                temperature=student_model.temperature,
+                seed=model_config.seed,
+                max_tokens=model_config.max_tokens,
+                temperature=model_config.temperature,
             )
             updated_df.at[i, "response"] = response.choices[0].message.content
         return updated_df
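
Usage sketch for the refactored interface. This is a minimal illustration, not part of the patch: the endpoint URL, API key, model name, and sample rows are hypothetical, and it assumes `RagasEvaluator()` is constructible with its defaults; only `ModelConfig`, `RagasEvaluator.generate_answers_from_model`, and `RagasEvaluator.run` come from the code above.

    from pandas import DataFrame
    from openai import OpenAI

    from instructlab.eval.ragas import ModelConfig, RagasEvaluator

    # Rows follow the Sample schema; run() later requires exactly the
    # user_input, reference, and response columns.
    qa = DataFrame(
        [{"user_input": "What is the capital of France?", "reference": "Paris"}]
    )

    # generate_answers_from_model is now a static helper: the caller supplies an
    # OpenAI-compatible client and a ModelConfig (the seed is read from the config).
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="none")  # hypothetical server
    config = ModelConfig(
        model_name="student-model",  # hypothetical model name
        system_prompt="You are a helpful assistant.",
    )
    qa = RagasEvaluator.generate_answers_from_model(qa, config, client)

    # The judge model defaults to "gpt-4o"; the judge API key can be passed to the
    # evaluator or to run(), or (assumed) picked up from the environment.
    evaluator = RagasEvaluator()
    result = evaluator.run(dataset=qa)
    print(result)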