diff --git a/contrib/hamilton/contrib/user/skrawcz/customize_embeddings/__init__.py b/contrib/hamilton/contrib/user/skrawcz/customize_embeddings/__init__.py
index 6e414d790..1404ecd6b 100644
--- a/contrib/hamilton/contrib/user/skrawcz/customize_embeddings/__init__.py
+++ b/contrib/hamilton/contrib/user/skrawcz/customize_embeddings/__init__.py
@@ -32,12 +32,15 @@
 import numpy as np  # for manipulating arrays
 import openai
 import pandas as pd  # for manipulating data in dataframes
-import plotly.express as px  # for plots
+
+# import plotly.express as px  # for plots
 import torch  # for matrix optimization
 from sklearn.model_selection import train_test_split  # for splitting train & test data
 from tenacity import retry, stop_after_attempt, wait_random_exponential
 
-from hamilton.function_modifiers import extract_columns, extract_fields, parameterize, value
+from hamilton.function_modifiers import extract_fields, parameterize, value
+
+client = openai.OpenAI()
 
 # import litellm
 
@@ -48,7 +51,7 @@ def _get_embedding(text: str, model="text-similarity-davinci-001", **kwargs) ->
     # replace newlines, which can negatively affect performance.
     text = text.replace("\n", " ")
 
-    response = openai.embeddings.create(input=[text], model=model, **kwargs)
+    response = client.embeddings.create(input=[text], model=model, **kwargs)
     return response.data[0].embedding
 
 
@@ -58,20 +61,25 @@ def _cosine_similarity(a, b):
 
 
 # input parameters
-embedding_cache_path = "data/snli_embedding_cache.pkl"  # embeddings will be saved/loaded here
-default_embedding_engine = "babbage-similarity"  # text-embedding-ada-002 is recommended
-num_pairs_to_embed = 1000  # 1000 is arbitrary
-local_dataset_path = (
-    "data/snli_1.0_train_2k.csv"  # download from: https://nlp.stanford.edu/projects/snli/
-)
+def embedding_cache_path() -> str:
+    return "data/snli_embedding_cache.pkl"  # embeddings will be saved/loaded here
 
 
-def local_dataset(local_dataset_path: str) -> pd.DataFrame:
-    return pd.read_csv(local_dataset_path)
+def default_embedding_engine() -> str:
+    return "babbage-similarity"  # text-embedding-ada-002 is recommended
+
+
+def local_dataset(
+    local_dataset_path: str = "data/snli_1.0_train.csv",  # download from: https://nlp.stanford.edu/projects/snli/
+) -> pd.DataFrame:
+    return pd.read_csv(local_dataset_path, delimiter="\t")
 
 
 # TODO: add pandera schema check
-def processed_local_dataset(local_dataset: pd.DataFrame) -> pd.DataFrame:
+def processed_local_dataset(
+    local_dataset: pd.DataFrame, num_pairs_to_embed: int = 1000  # 1000 is arbitrary
+) -> pd.DataFrame:
+    # you can customize this to preprocess your own dataset
     # output should be a dataframe with 3 columns: text_1, text_2, label (1 for similar, -1 for dissimilar)
     local_dataset["label"] = local_dataset["gold_label"]
@@ -84,13 +92,11 @@ def processed_local_dataset(local_dataset: pd.DataFrame) -> pd.DataFrame:
 
 
 # split data into train and test sets
-test_fraction = 0.5  # 0.5 is fairly arbitrary
-random_seed = 123  # random seed is arbitrary, but is helpful in reproducibility
-
-
 @extract_fields({"base_train_df": pd.DataFrame, "base_test_df": pd.DataFrame})
 def split_data(
-    processed_local_dataset: pd.DataFrame, test_fraction: float = 0.5, random_seed: int = 123
+    processed_local_dataset: pd.DataFrame,
+    test_fraction: float = 0.5,  # 0.5 is fairly arbitrary
+    random_seed: int = 123,  # random seed is arbitrary, but is helpful in reproducibility
 ) -> dict:
     train_df, test_df = train_test_split(
         processed_local_dataset,
@@ -141,41 +147,49 @@ def test_df_negatives(base_test_df: pd.DataFrame) -> pd.DataFrame:
 
 
 def train_df(
-    base_train_df: pd.DataFrame, train_df_negatives: pd.DataFrame, negatives_per_positive: int
+    base_train_df: pd.DataFrame,
+    train_df_negatives: pd.DataFrame,
+    negatives_per_positive: int = 1,
+    random_seed: int = 123,
 ) -> pd.DataFrame:
     return pd.concat(
         [
             base_train_df,
             train_df_negatives.sample(
-                n=len(train_df) * negatives_per_positive, random_state=random_seed
+                n=len(base_train_df) * negatives_per_positive, random_state=random_seed
             ),
         ]
     )
 
 
 def test_df(
-    base_test_df: pd.DataFrame, test_df_negatives: pd.DataFrame, negatives_per_positive: int
+    base_test_df: pd.DataFrame,
+    test_df_negatives: pd.DataFrame,
+    negatives_per_positive: int = 1,
+    random_seed: int = 123,
 ) -> pd.DataFrame:
     return pd.concat(
         [
             base_test_df,
             test_df_negatives.sample(
-                n=len(test_df) * negatives_per_positive, random_state=random_seed
+                n=len(base_test_df) * negatives_per_positive, random_state=random_seed
            ),
         ]
     )
 
 
 def data_set(train_df: pd.DataFrame, test_df: pd.DataFrame) -> pd.DataFrame:
-    return pd.concat([train_df, test_df])
+    _df = pd.concat([train_df, test_df])
+    _df.reset_index(inplace=True)
+    return _df
 
 
 # this function will get embeddings from the cache and save them there afterward
 def _get_embedding_with_cache(
     text: str,
-    engine: str = default_embedding_engine,
+    engine: str = "babbage-similarity",
     embedding_cache: dict = None,
-    embedding_cache_path: str = embedding_cache_path,
+    embedding_cache_path: str = None,
 ) -> list:
     if embedding_cache is None:
         embedding_cache = {}
@@ -205,31 +219,53 @@ def embedding_cache(embedding_cache_path: str) -> dict:
 def text1_embedding(
     data_set: pd.DataFrame, embedding_cache_path: str, embedding_cache: dict
 ) -> pd.Series:
-    return data_set["text_1"].apply(
-        lambda x: _get_embedding_with_cache,
+    _col = data_set["text_1"].apply(
+        _get_embedding_with_cache,
         embedding_cache_path=embedding_cache_path,
         embedding_cache=embedding_cache,
     )
+    _col.name = "text_1_embedding"
+    return _col
 
 
 def text2_embedding(
     data_set: pd.DataFrame, embedding_cache_path: str, embedding_cache: dict
 ) -> pd.Series:
-    return data_set["text_2"].apply(
-        lambda x: _get_embedding_with_cache,
+    _col = data_set["text_2"].apply(
+        _get_embedding_with_cache,
         embedding_cache_path=embedding_cache_path,
         embedding_cache=embedding_cache,
     )
+    _col.name = "text_2_embedding"
+    return _col
 
 
 def cosine_similarity(text1_embedding: pd.Series, text2_embedding: pd.Series) -> pd.Series:
+    # def func(x1, x2):
+    #     if isinstance(x1, list) and isinstance(x2, list):
+    #         return 1 - _cosine_similarity(x1, x2)
+    #     else:
+    #         print(x1)
+    #         print(x2)
+    #         raise ValueError("x1 and x2 must be lists, got {} and {}".format(type(x1), type(x2)))
     similarity_scores = text1_embedding.combine(
         text2_embedding, lambda x1, x2: 1 - _cosine_similarity(x1, x2)
     )
+    similarity_scores.name = "cosine_similarity"
     return similarity_scores
 
 
+def embedded_data_set(
+    data_set: pd.DataFrame,
+    text1_embedding: pd.Series,
+    text2_embedding: pd.Series,
+    cosine_similarity: pd.Series,
+) -> pd.DataFrame:
+    _df = pd.concat([data_set, text1_embedding, text2_embedding, cosine_similarity], axis=1)
+    return _df
+
+
 # calculate accuracy (and its standard error) of predicting label=1 if similarity>x
 # x is optimized by sweeping from -1 to 1 in steps of 0.01
 def _accuracy_and_se(cosine_similarity: float, labeled_similarity: int) -> Tuple[float]:
@@ -260,11 +296,9 @@ def _accuracy_and_se(cosine_similarity: float, labeled_similarity: int) -> Tuple
         "test_accuracy": {"dataset_value": value("test")},
     }
 )
-def accuracy_computation(
-    dataset_value: str, data_set: pd.DataFrame, cosine_similarity: pd.Series
-) -> tuple:
-    data = data_set[data_set["dataset"] == dataset_value]
-    a, se = _accuracy_and_se(cosine_similarity[data.index], data["label"])
+def accuracy_computation(dataset_value: str, embedded_data_set: pd.DataFrame) -> tuple:
+    data = embedded_data_set[embedded_data_set["dataset"] == dataset_value]
+    a, se = _accuracy_and_se(data["cosine_similarity"], data["label"])
     print(f"{dataset_value} accuracy: {a:0.1%} ± {1.96 * se:0.1%}")
     return a, se
 
@@ -306,7 +340,7 @@ def _apply_matrix_to_embeddings_dataframe(matrix: torch.tensor, df: pd.DataFrame
     }
 )
 def optimize_matrix(
-    data_set: pd.DataFrame,
+    embedded_data_set: pd.DataFrame,
     modified_embedding_length: int = 2048,  # in my brief experimentation, bigger was better (2048 is length of babbage encoding)
     batch_size: int = 100,
     max_epochs: int = 10,  # set to this while initially exploring
@@ -337,10 +371,16 @@ def tensors_from_dataframe(
         return e1, e2, s
 
     e1_train, e2_train, s_train = tensors_from_dataframe(
-        data_set[data_set["dataset"] == "train"], "text_1_embedding", "text_2_embedding", "label"
+        embedded_data_set[embedded_data_set["dataset"] == "train"],
+        "text_1_embedding",
+        "text_2_embedding",
+        "label",
     )
     e1_test, e2_test, s_test = tensors_from_dataframe(
-        data_set[data_set["dataset"] == "test"], "text_1_embedding", "text_2_embedding", "label"
+        embedded_data_set[embedded_data_set["dataset"] == "test"],
+        "text_1_embedding",
+        "text_2_embedding",
+        "label",
     )
 
     # create dataset and loader
@@ -364,7 +404,7 @@ def mse_loss(predictions, targets):
         return torch.sum(difference * difference) / difference.numel()
 
     # initialize projection matrix
-    embedding_length = len(data_set["text_1_embedding"].values[0])
+    embedding_length = len(embedded_data_set["text_1_embedding"].values[0])
     matrix = torch.randn(embedding_length, modified_embedding_length, requires_grad=True)
 
     epochs, types, losses, accuracies, matrices = [], [], [], [], []
@@ -386,11 +426,11 @@ def mse_loss(predictions, targets):
         test_loss = mse_loss(test_predictions, s_test)
 
         # compute custom embeddings and new cosine similarities
-        _apply_matrix_to_embeddings_dataframe(matrix, data_set)
+        _apply_matrix_to_embeddings_dataframe(matrix, embedded_data_set)
 
         # calculate test accuracy
         for dataset in ["train", "test"]:
-            data = data_set[data_set["dataset"] == dataset]
+            data = embedded_data_set[embedded_data_set["dataset"] == dataset]
             a, se = _accuracy_and_se(data["cosine_similarity_custom"], data["label"])
 
             # record results of each epoch
@@ -426,46 +466,46 @@ def best_matrix(
 ) -> pd.DataFrame:
     runs_df = pd.concat([matrix_b10_l10, matrix_b100_l100, matrix_b1000_l1000])
     # plot training loss and test loss over time
-    px.line(
-        runs_df,
-        line_group="run_id",
-        x="epoch",
-        y="loss",
-        color="type",
-        hover_data=["batch_size", "learning_rate", "dropout_fraction"],
-        facet_row="learning_rate",
-        facet_col="batch_size",
-        width=500,
-    ).show()
+    # px.line(
+    #     runs_df,
+    #     line_group="run_id",
+    #     x="epoch",
+    #     y="loss",
+    #     color="type",
+    #     hover_data=["batch_size", "learning_rate", "dropout_fraction"],
+    #     facet_row="learning_rate",
+    #     facet_col="batch_size",
+    #     width=500,
+    # ).show()
 
     # plot accuracy over time
-    px.line(
-        runs_df,
-        line_group="run_id",
-        x="epoch",
-        y="accuracy",
-        color="type",
-        hover_data=["batch_size", "learning_rate", "dropout_fraction"],
-        facet_row="learning_rate",
-        facet_col="batch_size",
-        width=500,
-    ).show()
+    # px.line(
+    #     runs_df,
+    #     line_group="run_id",
+    #     x="epoch",
+    #     y="accuracy",
+    #     color="type",
+    #     hover_data=["batch_size", "learning_rate", "dropout_fraction"],
+    #     facet_row="learning_rate",
+    #     facet_col="batch_size",
+    #     width=500,
+    # ).show()
 
     best_run = runs_df.sort_values(by="accuracy", ascending=False).iloc[0]
     best_matrix = best_run["matrix"]
     return best_matrix
 
 
-@extract_columns("text_1_embedding_custom", "text_2_embedding_custom", "cosine_similarity_custom")
-def modified_embeddings(data_set: pd.DataFrame, best_matrix: pd.DataFrame) -> pd.DataFrame:
-    _apply_matrix_to_embeddings_dataframe(best_matrix, data_set)
-    return data_set
+# @extract_columns("text_1_embedding_custom", "text_2_embedding_custom", "cosine_similarity_custom")
+def modified_embeddings(embedded_data_set: pd.DataFrame, best_matrix: pd.DataFrame) -> pd.DataFrame:
+    _apply_matrix_to_embeddings_dataframe(best_matrix, embedded_data_set)
+    return embedded_data_set
 
 
 def test_accuracy_post_optimization(
-    data_set: pd.DataFrame, cosine_similarity_custom: pd.Series
+    modified_embeddings: pd.DataFrame,  # , cosine_similarity_custom: pd.Series
 ) -> tuple:
-    data = data_set[data_set["dataset"] == "test"]
-    a, se = _accuracy_and_se(cosine_similarity_custom[data.index], data["label"])
+    data = modified_embeddings[modified_embeddings["dataset"] == "test"]
+    a, se = _accuracy_and_se(data["cosine_similarity_custom"], data["label"])
     print(f"test accuracy after optimization: {a:0.1%} ± {1.96 * se:0.1%}")
     return a, se
 
@@ -476,4 +516,5 @@ def test_accuracy_post_optimization(
     from hamilton import driver
 
     dr = driver.Driver({}, customize_embeddings)
-    dr.display_all_functions("customize_embeddings", render_kwargs={"format": "png"})
+    # dr.display_all_functions("customize_embeddings", render_kwargs={"format": "png"})
+    dr.execute(["test_accuracy_post_optimization"], inputs={})
diff --git a/contrib/hamilton/contrib/user/skrawcz/customize_embeddings/requirements.txt b/contrib/hamilton/contrib/user/skrawcz/customize_embeddings/requirements.txt
index d44a95b86..105d1b5b4 100644
--- a/contrib/hamilton/contrib/user/skrawcz/customize_embeddings/requirements.txt
+++ b/contrib/hamilton/contrib/user/skrawcz/customize_embeddings/requirements.txt
@@ -1,3 +1,5 @@
 openai
+plotly
 scikit-learn
 sf-hamilton
+torch