WIP - things run!

DAGWorks-Inc · Nov 29, 2023
commit 148a4a9 (1 parent: 76dd683)

Showing 2 changed files with 112 additions and 69 deletions.
diff --git a/contrib/hamilton/contrib/user/skrawcz/customize_embeddings/__init__.py b/contrib/hamilton/contrib/user/skrawcz/customize_embeddings/__init__.py
@@ -32,12 +32,15 @@
 import numpy as np  # for manipulating arrays
 import openai
 import pandas as pd  # for manipulating data in dataframes
-import plotly.express as px  # for plots
+
+# import plotly.express as px  # for plots
 import torch  # for matrix optimization
 from sklearn.model_selection import train_test_split  # for splitting train & test data
 from tenacity import retry, stop_after_attempt, wait_random_exponential
 
-from hamilton.function_modifiers import extract_columns, extract_fields, parameterize, value
+from hamilton.function_modifiers import extract_fields, parameterize, value
+
+client = openai.OpenAI()
 
 # import litellm
 
@@ -48,7 +51,7 @@ def _get_embedding(text: str, model="text-similarity-davinci-001", **kwargs) ->
     # replace newlines, which can negatively affect performance.
     text = text.replace("\n", " ")
 
-    response = openai.embeddings.create(input=[text], model=model, **kwargs)
+    response = client.embeddings.create(input=[text], model=model, **kwargs)
 
     return response.data[0].embedding
 
@@ -58,20 +61,25 @@ def _cosine_similarity(a, b):
 
 
 # input parameters
-embedding_cache_path = "data/snli_embedding_cache.pkl"  # embeddings will be saved/loaded here
-default_embedding_engine = "babbage-similarity"  # text-embedding-ada-002 is recommended
-num_pairs_to_embed = 1000  # 1000 is arbitrary
-local_dataset_path = (
-    "data/snli_1.0_train_2k.csv"  # download from: https://nlp.stanford.edu/projects/snli/
-)
+def embedding_cache_path() -> str:
+    return "data/snli_embedding_cache.pkl"  # embeddings will be saved/loaded here
 
 
-def local_dataset(local_dataset_path: str) -> pd.DataFrame:
-    return pd.read_csv(local_dataset_path)
+def default_embedding_engine() -> str:
+    return "babbage-similarity"  # text-embedding-ada-002 is recommended
+
+
+def local_dataset(
+    local_dataset_path: str = "data/snli_1.0_train.csv",  # download from: https://nlp.stanford.edu/projects/snli/
+) -> pd.DataFrame:
+    return pd.read_csv(local_dataset_path, delimiter="\t")
 
 
 # TODO: add pandera schema check
-def processed_local_dataset(local_dataset: pd.DataFrame) -> pd.DataFrame:
+def processed_local_dataset(
+    local_dataset: pd.DataFrame, num_pairs_to_embed: int = 1000  # 1000 is arbitrary
+) -> pd.DataFrame:
+
     # you can customize this to preprocess your own dataset
     # output should be a dataframe with 3 columns: text_1, text_2, label (1 for similar, -1 for dissimilar)
     local_dataset["label"] = local_dataset["gold_label"]
@@ -84,13 +92,11 @@ def processed_local_dataset(local_dataset: pd.DataFrame) -> pd.DataFrame:
 
 
 # split data into train and test sets
-test_fraction = 0.5  # 0.5 is fairly arbitrary
-random_seed = 123  # random seed is arbitrary, but is helpful in reproducibility
-
-
 @extract_fields({"base_train_df": pd.DataFrame, "base_test_df": pd.DataFrame})
 def split_data(
-    processed_local_dataset: pd.DataFrame, test_fraction: float = 0.5, random_seed: int = 123
+    processed_local_dataset: pd.DataFrame,
+    test_fraction: float = 0.5,  # 0.5 is fairly arbitrary
+    random_seed: int = 123,  # random seed is arbitrary, but is helpful in reproducibility
 ) -> dict:
     train_df, test_df = train_test_split(
         processed_local_dataset,
@@ -141,41 +147,49 @@ def test_df_negatives(base_test_df: pd.DataFrame) -> pd.DataFrame:
 
 
 def train_df(
-    base_train_df: pd.DataFrame, train_df_negatives: pd.DataFrame, negatives_per_positive: int
+    base_train_df: pd.DataFrame,
+    train_df_negatives: pd.DataFrame,
+    negatives_per_positive: int = 1,
+    random_seed: int = 123,
 ) -> pd.DataFrame:
     return pd.concat(
         [
             base_train_df,
             train_df_negatives.sample(
-                n=len(train_df) * negatives_per_positive, random_state=random_seed
+                n=len(base_train_df) * negatives_per_positive, random_state=random_seed
             ),
         ]
     )
 
 
 def test_df(
-    base_test_df: pd.DataFrame, test_df_negatives: pd.DataFrame, negatives_per_positive: int
+    base_test_df: pd.DataFrame,
+    test_df_negatives: pd.DataFrame,
+    negatives_per_positive: int = 1,
+    random_seed: int = 123,
 ) -> pd.DataFrame:
     return pd.concat(
         [
             base_test_df,
             test_df_negatives.sample(
-                n=len(test_df) * negatives_per_positive, random_state=random_seed
+                n=len(base_test_df) * negatives_per_positive, random_state=random_seed
             ),
         ]
     )
 
 
 def data_set(train_df: pd.DataFrame, test_df: pd.DataFrame) -> pd.DataFrame:
-    return pd.concat([train_df, test_df])
+    _df = pd.concat([train_df, test_df])
+    _df.reset_index(inplace=True)
+    return _df
 
 
 # this function will get embeddings from the cache and save them there afterward
 def _get_embedding_with_cache(
     text: str,
-    engine: str = default_embedding_engine,
+    engine: str = "babbage-similarity",
     embedding_cache: dict = None,
-    embedding_cache_path: str = embedding_cache_path,
+    embedding_cache_path: str = None,
 ) -> list:
     if embedding_cache is None:
         embedding_cache = {}
@@ -205,31 +219,53 @@ def embedding_cache(embedding_cache_path: str) -> dict:
 def text1_embedding(
     data_set: pd.DataFrame, embedding_cache_path: str, embedding_cache: dict
 ) -> pd.Series:
-    return data_set["text_1"].apply(
-        lambda x: _get_embedding_with_cache,
+    _col = data_set["text_1"].apply(
+        _get_embedding_with_cache,
         embedding_cache_path=embedding_cache_path,
         embedding_cache=embedding_cache,
     )
+    _col.name = "text_1_embedding"
+    return _col
 
 
 def text2_embedding(
     data_set: pd.DataFrame, embedding_cache_path: str, embedding_cache: dict
 ) -> pd.Series:
-    return data_set["text_2"].apply(
-        lambda x: _get_embedding_with_cache,
+    _col = data_set["text_2"].apply(
+        _get_embedding_with_cache,
         embedding_cache_path=embedding_cache_path,
         embedding_cache=embedding_cache,
     )
+    _col.name = "text_2_embedding"
+    return _col
 
 
 def cosine_similarity(text1_embedding: pd.Series, text2_embedding: pd.Series) -> pd.Series:
+    # def func(x1, x2):
+    #     if isinstance(x1, list) and isinstance(x2, list):
+    #         return 1 - _cosine_similarity(x1, x2)
+    #     else:
+    #         print(x1)
+    #         print(x2)
+    #         raise ValueError("x1 and x2 must be lists, got {} and {}".format(type(x1), type(x2)))
     similarity_scores = text1_embedding.combine(
         text2_embedding, lambda x1, x2: 1 - _cosine_similarity(x1, x2)
     )
+    similarity_scores.name = "cosine_similarity"
 
     return similarity_scores
 
 
+def embedded_data_set(
+    data_set: pd.DataFrame,
+    text1_embedding: pd.Series,
+    text2_embedding: pd.Series,
+    cosine_similarity: pd.Series,
+) -> pd.DataFrame:
+    _df = pd.concat([data_set, text1_embedding, text2_embedding, cosine_similarity], axis=1)
+    return _df
+
+
 # calculate accuracy (and its standard error) of predicting label=1 if similarity>x
 # x is optimized by sweeping from -1 to 1 in steps of 0.01
 def _accuracy_and_se(cosine_similarity: float, labeled_similarity: int) -> Tuple[float]:
@@ -260,11 +296,9 @@ def _accuracy_and_se(cosine_similarity: float, labeled_similarity: int) -> Tuple
         "test_accuracy": {"dataset_value": value("test")},
     }
 )
-def accuracy_computation(
-    dataset_value: str, data_set: pd.DataFrame, cosine_similarity: pd.Series
-) -> tuple:
-    data = data_set[data_set["dataset"] == dataset_value]
-    a, se = _accuracy_and_se(cosine_similarity[data.index], data["label"])
+def accuracy_computation(dataset_value: str, embedded_data_set: pd.DataFrame) -> tuple:
+    data = embedded_data_set[embedded_data_set["dataset"] == dataset_value]
+    a, se = _accuracy_and_se(data["cosine_similarity"], data["label"])
     print(f"{dataset_value} accuracy: {a:0.1%} ± {1.96 * se:0.1%}")
     return a, se
 
@@ -306,7 +340,7 @@ def _apply_matrix_to_embeddings_dataframe(matrix: torch.tensor, df: pd.DataFrame
     }
 )
 def optimize_matrix(
-    data_set: pd.DataFrame,
+    embedded_data_set: pd.DataFrame,
     modified_embedding_length: int = 2048,  # in my brief experimentation, bigger was better (2048 is length of babbage encoding)
     batch_size: int = 100,
     max_epochs: int = 10,  # set to this while initially exploring
@@ -337,10 +371,16 @@ def tensors_from_dataframe(
         return e1, e2, s
 
     e1_train, e2_train, s_train = tensors_from_dataframe(
-        data_set[data_set["dataset"] == "train"], "text_1_embedding", "text_2_embedding", "label"
+        embedded_data_set[embedded_data_set["dataset"] == "train"],
+        "text_1_embedding",
+        "text_2_embedding",
+        "label",
     )
     e1_test, e2_test, s_test = tensors_from_dataframe(
-        data_set[data_set["dataset"] == "test"], "text_1_embedding", "text_2_embedding", "label"
+        embedded_data_set[embedded_data_set["dataset"] == "test"],
+        "text_1_embedding",
+        "text_2_embedding",
+        "label",
     )
 
     # create dataset and loader
@@ -364,7 +404,7 @@ def mse_loss(predictions, targets):
         return torch.sum(difference * difference) / difference.numel()
 
     # initialize projection matrix
-    embedding_length = len(data_set["text_1_embedding"].values[0])
+    embedding_length = len(embedded_data_set["text_1_embedding"].values[0])
     matrix = torch.randn(embedding_length, modified_embedding_length, requires_grad=True)
 
     epochs, types, losses, accuracies, matrices = [], [], [], [], []
@@ -386,11 +426,11 @@ def mse_loss(predictions, targets):
         test_loss = mse_loss(test_predictions, s_test)
 
         # compute custom embeddings and new cosine similarities
-        _apply_matrix_to_embeddings_dataframe(matrix, data_set)
+        _apply_matrix_to_embeddings_dataframe(matrix, embedded_data_set)
 
         # calculate test accuracy
         for dataset in ["train", "test"]:
-            data = data_set[data_set["dataset"] == dataset]
+            data = embedded_data_set[embedded_data_set["dataset"] == dataset]
             a, se = _accuracy_and_se(data["cosine_similarity_custom"], data["label"])
 
             # record results of each epoch
@@ -426,46 +466,46 @@ def best_matrix(
 ) -> pd.DataFrame:
     runs_df = pd.concat([matrix_b10_l10, matrix_b100_l100, matrix_b1000_l1000])
     # plot training loss and test loss over time
-    px.line(
-        runs_df,
-        line_group="run_id",
-        x="epoch",
-        y="loss",
-        color="type",
-        hover_data=["batch_size", "learning_rate", "dropout_fraction"],
-        facet_row="learning_rate",
-        facet_col="batch_size",
-        width=500,
-    ).show()
+    # px.line(
+    #     runs_df,
+    #     line_group="run_id",
+    #     x="epoch",
+    #     y="loss",
+    #     color="type",
+    #     hover_data=["batch_size", "learning_rate", "dropout_fraction"],
+    #     facet_row="learning_rate",
+    #     facet_col="batch_size",
+    #     width=500,
+    # ).show()
 
     # plot accuracy over time
-    px.line(
-        runs_df,
-        line_group="run_id",
-        x="epoch",
-        y="accuracy",
-        color="type",
-        hover_data=["batch_size", "learning_rate", "dropout_fraction"],
-        facet_row="learning_rate",
-        facet_col="batch_size",
-        width=500,
-    ).show()
+    # px.line(
+    #     runs_df,
+    #     line_group="run_id",
+    #     x="epoch",
+    #     y="accuracy",
+    #     color="type",
+    #     hover_data=["batch_size", "learning_rate", "dropout_fraction"],
+    #     facet_row="learning_rate",
+    #     facet_col="batch_size",
+    #     width=500,
+    # ).show()
     best_run = runs_df.sort_values(by="accuracy", ascending=False).iloc[0]
     best_matrix = best_run["matrix"]
     return best_matrix
 
 
-@extract_columns("text_1_embedding_custom", "text_2_embedding_custom", "cosine_similarity_custom")
-def modified_embeddings(data_set: pd.DataFrame, best_matrix: pd.DataFrame) -> pd.DataFrame:
-    _apply_matrix_to_embeddings_dataframe(best_matrix, data_set)
-    return data_set
+# @extract_columns("text_1_embedding_custom", "text_2_embedding_custom", "cosine_similarity_custom")
+def modified_embeddings(embedded_data_set: pd.DataFrame, best_matrix: pd.DataFrame) -> pd.DataFrame:
+    _apply_matrix_to_embeddings_dataframe(best_matrix, embedded_data_set)
+    return embedded_data_set
 
 
 def test_accuracy_post_optimization(
-    data_set: pd.DataFrame, cosine_similarity_custom: pd.Series
+    modified_embeddings: pd.DataFrame,  # , cosine_similarity_custom: pd.Series
 ) -> tuple:
-    data = data_set[data_set["dataset"] == "test"]
-    a, se = _accuracy_and_se(cosine_similarity_custom[data.index], data["label"])
+    data = modified_embeddings[modified_embeddings["dataset"] == "test"]
+    a, se = _accuracy_and_se(data["cosine_similarity_custom"], data["label"])
     print(f"test accuracy after optimization: {a:0.1%} ± {1.96 * se:0.1%}")
     return a, se
 
@@ -476,4 +516,5 @@ def test_accuracy_post_optimization(
     from hamilton import driver
 
     dr = driver.Driver({}, customize_embeddings)
-    dr.display_all_functions("customize_embeddings", render_kwargs={"format": "png"})
+    # dr.display_all_functions("customize_embeddings", render_kwargs={"format": "png"})
+    dr.execute(["test_accuracy_post_optimization"], inputs={})
diff --git a/contrib/hamilton/contrib/user/skrawcz/customize_embeddings/requirements.txt b/contrib/hamilton/contrib/user/skrawcz/customize_embeddings/requirements.txt
@@ -1,3 +1,5 @@
 openai
+plotly
 scikit-learn
 sf-hamilton
+torch