Skip to content

Commit

Permalink
WIP - things run!
Browse files Browse the repository at this point in the history
  • Loading branch information
skrawcz committed Nov 29, 2023
1 parent 76dd683 commit 148a4a9
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 69 deletions.
179 changes: 110 additions & 69 deletions contrib/hamilton/contrib/user/skrawcz/customize_embeddings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,15 @@
import numpy as np # for manipulating arrays
import openai
import pandas as pd # for manipulating data in dataframes
import plotly.express as px # for plots

# import plotly.express as px # for plots
import torch # for matrix optimization
from sklearn.model_selection import train_test_split # for splitting train & test data
from tenacity import retry, stop_after_attempt, wait_random_exponential

from hamilton.function_modifiers import extract_columns, extract_fields, parameterize, value
from hamilton.function_modifiers import extract_fields, parameterize, value

client = openai.OpenAI()

# import litellm

Expand All @@ -48,7 +51,7 @@ def _get_embedding(text: str, model="text-similarity-davinci-001", **kwargs) ->
# replace newlines, which can negatively affect performance.
text = text.replace("\n", " ")

response = openai.embeddings.create(input=[text], model=model, **kwargs)
response = client.embeddings.create(input=[text], model=model, **kwargs)

return response.data[0].embedding

Expand All @@ -58,20 +61,25 @@ def _cosine_similarity(a, b):


# input parameters
embedding_cache_path = "data/snli_embedding_cache.pkl" # embeddings will be saved/loaded here
default_embedding_engine = "babbage-similarity" # text-embedding-ada-002 is recommended
num_pairs_to_embed = 1000 # 1000 is arbitrary
local_dataset_path = (
"data/snli_1.0_train_2k.csv" # download from: https://nlp.stanford.edu/projects/snli/
)
def embedding_cache_path() -> str:
    """Return the pickle file path that embeddings are saved to and loaded from."""
    return "data/snli_embedding_cache.pkl"


def local_dataset(local_dataset_path: str) -> pd.DataFrame:
return pd.read_csv(local_dataset_path)
def default_embedding_engine() -> str:
    """Return the name of the embedding engine used by default."""
    return "babbage-similarity"  # text-embedding-ada-002 is recommended


def local_dataset(
    local_dataset_path: str = "data/snli_1.0_train.csv",  # download from: https://nlp.stanford.edu/projects/snli/
) -> pd.DataFrame:
    """Load the raw dataset from a local, tab-delimited text file.

    :param local_dataset_path: path of the file to read.
    :return: the file contents as a dataframe, one row per line of the file.
    """
    # The file is read as tab-separated despite the ".csv" default file name.
    return pd.read_csv(local_dataset_path, delimiter="\t")


# TODO: add pandera schema check
def processed_local_dataset(local_dataset: pd.DataFrame) -> pd.DataFrame:
def processed_local_dataset(
local_dataset: pd.DataFrame, num_pairs_to_embed: int = 1000 # 1000 is arbitrary
) -> pd.DataFrame:

# you can customize this to preprocess your own dataset
# output should be a dataframe with 3 columns: text_1, text_2, label (1 for similar, -1 for dissimilar)
local_dataset["label"] = local_dataset["gold_label"]
Expand All @@ -84,13 +92,11 @@ def processed_local_dataset(local_dataset: pd.DataFrame) -> pd.DataFrame:


# split data into train and test sets
test_fraction = 0.5 # 0.5 is fairly arbitrary
random_seed = 123 # random seed is arbitrary, but is helpful in reproducibility


@extract_fields({"base_train_df": pd.DataFrame, "base_test_df": pd.DataFrame})
def split_data(
processed_local_dataset: pd.DataFrame, test_fraction: float = 0.5, random_seed: int = 123
processed_local_dataset: pd.DataFrame,
test_fraction: float = 0.5, # 0.5 is fairly arbitrary
random_seed: int = 123, # random seed is arbitrary, but is helpful in reproducibility
) -> dict:
train_df, test_df = train_test_split(
processed_local_dataset,
Expand Down Expand Up @@ -141,41 +147,49 @@ def test_df_negatives(base_test_df: pd.DataFrame) -> pd.DataFrame:


def train_df(
    base_train_df: pd.DataFrame,
    train_df_negatives: pd.DataFrame,
    negatives_per_positive: int = 1,
    random_seed: int = 123,
) -> pd.DataFrame:
    """Concatenate the base training split with a random sample of negative pairs.

    :param base_train_df: the base training rows; kept in full.
    :param train_df_negatives: candidate negative rows to sample from.
    :param negatives_per_positive: how many negatives to sample per base row.
    :param random_seed: seed passed to the sampler so the result is reproducible.
    :return: base rows followed by the sampled negatives (original index labels kept).
    """
    # Size the sample from the *input* frame; `train_df` inside this body is the
    # function itself, so `len(train_df)` would raise a TypeError.
    n_negatives = len(base_train_df) * negatives_per_positive
    sampled_negatives = train_df_negatives.sample(n=n_negatives, random_state=random_seed)
    return pd.concat([base_train_df, sampled_negatives])


def test_df(
    base_test_df: pd.DataFrame,
    test_df_negatives: pd.DataFrame,
    negatives_per_positive: int = 1,
    random_seed: int = 123,
) -> pd.DataFrame:
    """Concatenate the base test split with a random sample of negative pairs.

    :param base_test_df: the base test rows; kept in full.
    :param test_df_negatives: candidate negative rows to sample from.
    :param negatives_per_positive: how many negatives to sample per base row.
    :param random_seed: seed passed to the sampler so the result is reproducible.
    :return: base rows followed by the sampled negatives (original index labels kept).
    """
    # Size the sample from the *input* frame; `test_df` inside this body is the
    # function itself, so `len(test_df)` would raise a TypeError.
    n_negatives = len(base_test_df) * negatives_per_positive
    sampled_negatives = test_df_negatives.sample(n=n_negatives, random_state=random_seed)
    return pd.concat([base_test_df, sampled_negatives])


def data_set(train_df: pd.DataFrame, test_df: pd.DataFrame) -> pd.DataFrame:
    """Stack the train and test frames into one frame with a fresh 0..n-1 index.

    :param train_df: training rows.
    :param test_df: test rows.
    :return: train rows followed by test rows. The previous index labels are
        preserved in a new ``index`` column rather than dropped.
    """
    combined = pd.concat([train_df, test_df])
    # The two inputs can carry overlapping index labels; renumber the rows so
    # each one has a unique label.
    return combined.reset_index()


# this function will get embeddings from the cache and save them there afterward
def _get_embedding_with_cache(
text: str,
engine: str = default_embedding_engine,
engine: str = "babbage-similarity",
embedding_cache: dict = None,
embedding_cache_path: str = embedding_cache_path,
embedding_cache_path: str = None,
) -> list:
if embedding_cache is None:
embedding_cache = {}
Expand Down Expand Up @@ -205,31 +219,53 @@ def embedding_cache(embedding_cache_path: str) -> dict:
def text1_embedding(
    data_set: pd.DataFrame, embedding_cache_path: str, embedding_cache: dict
) -> pd.Series:
    """Compute the embedding of every ``text_1`` value via the cached lookup helper.

    :param data_set: frame holding a ``text_1`` column.
    :param embedding_cache_path: forwarded to ``_get_embedding_with_cache``.
    :param embedding_cache: forwarded to ``_get_embedding_with_cache``.
    :return: series named ``text_1_embedding``, aligned with ``data_set``'s index.
    """
    # Pass the helper itself (not a lambda that merely returns it) so that it is
    # actually invoked per value; the extra keyword arguments are forwarded by
    # ``Series.apply`` on every call.
    embeddings = data_set["text_1"].apply(
        _get_embedding_with_cache,
        embedding_cache_path=embedding_cache_path,
        embedding_cache=embedding_cache,
    )
    return embeddings.rename("text_1_embedding")


def text2_embedding(
    data_set: pd.DataFrame, embedding_cache_path: str, embedding_cache: dict
) -> pd.Series:
    """Compute the embedding of every ``text_2`` value via the cached lookup helper.

    :param data_set: frame holding a ``text_2`` column.
    :param embedding_cache_path: forwarded to ``_get_embedding_with_cache``.
    :param embedding_cache: forwarded to ``_get_embedding_with_cache``.
    :return: series named ``text_2_embedding``, aligned with ``data_set``'s index.
    """
    # Pass the helper itself (not a lambda that merely returns it) so that it is
    # actually invoked per value; the extra keyword arguments are forwarded by
    # ``Series.apply`` on every call.
    embeddings = data_set["text_2"].apply(
        _get_embedding_with_cache,
        embedding_cache_path=embedding_cache_path,
        embedding_cache=embedding_cache,
    )
    return embeddings.rename("text_2_embedding")


def cosine_similarity(text1_embedding: pd.Series, text2_embedding: pd.Series) -> pd.Series:
    """Score each pair of embeddings element-wise.

    :param text1_embedding: one embedding per row for the first text.
    :param text2_embedding: one embedding per row for the second text.
    :return: series named ``cosine_similarity`` holding
        ``1 - _cosine_similarity(x1, x2)`` for each aligned pair.
    """
    # NOTE(review): the score is ``1 - _cosine_similarity(...)``. If the helper
    # returns a similarity, this is a distance despite the column name — confirm
    # against the helper's definition before relying on the sign.
    similarity_scores = text1_embedding.combine(
        text2_embedding, lambda x1, x2: 1 - _cosine_similarity(x1, x2)
    )
    similarity_scores.name = "cosine_similarity"
    return similarity_scores


def embedded_data_set(
    data_set: pd.DataFrame,
    text1_embedding: pd.Series,
    text2_embedding: pd.Series,
    cosine_similarity: pd.Series,
) -> pd.DataFrame:
    """Attach the embedding and similarity columns to the data set.

    :param data_set: the base frame.
    :param text1_embedding: column to append; its series name becomes the column name.
    :param text2_embedding: column to append; its series name becomes the column name.
    :param cosine_similarity: column to append; its series name becomes the column name.
    :return: a new frame with the three series appended as columns, aligned on index.
    """
    columns = [data_set, text1_embedding, text2_embedding, cosine_similarity]
    return pd.concat(columns, axis=1)


# calculate accuracy (and its standard error) of predicting label=1 if similarity>x
# x is optimized by sweeping from -1 to 1 in steps of 0.01
def _accuracy_and_se(cosine_similarity: float, labeled_similarity: int) -> Tuple[float]:
Expand Down Expand Up @@ -260,11 +296,9 @@ def _accuracy_and_se(cosine_similarity: float, labeled_similarity: int) -> Tuple
"test_accuracy": {"dataset_value": value("test")},
}
)
def accuracy_computation(
dataset_value: str, data_set: pd.DataFrame, cosine_similarity: pd.Series
) -> tuple:
data = data_set[data_set["dataset"] == dataset_value]
a, se = _accuracy_and_se(cosine_similarity[data.index], data["label"])
def accuracy_computation(dataset_value: str, embedded_data_set: pd.DataFrame) -> tuple:
data = embedded_data_set[embedded_data_set["dataset"] == dataset_value]
a, se = _accuracy_and_se(data["cosine_similarity"], data["label"])
print(f"{dataset_value} accuracy: {a:0.1%} ± {1.96 * se:0.1%}")
return a, se

Expand Down Expand Up @@ -306,7 +340,7 @@ def _apply_matrix_to_embeddings_dataframe(matrix: torch.tensor, df: pd.DataFrame
}
)
def optimize_matrix(
data_set: pd.DataFrame,
embedded_data_set: pd.DataFrame,
modified_embedding_length: int = 2048, # in my brief experimentation, bigger was better (2048 is length of babbage encoding)
batch_size: int = 100,
max_epochs: int = 10, # set to this while initially exploring
Expand Down Expand Up @@ -337,10 +371,16 @@ def tensors_from_dataframe(
return e1, e2, s

e1_train, e2_train, s_train = tensors_from_dataframe(
data_set[data_set["dataset"] == "train"], "text_1_embedding", "text_2_embedding", "label"
embedded_data_set[embedded_data_set["dataset"] == "train"],
"text_1_embedding",
"text_2_embedding",
"label",
)
e1_test, e2_test, s_test = tensors_from_dataframe(
data_set[data_set["dataset"] == "test"], "text_1_embedding", "text_2_embedding", "label"
embedded_data_set[embedded_data_set["dataset"] == "test"],
"text_1_embedding",
"text_2_embedding",
"label",
)

# create dataset and loader
Expand All @@ -364,7 +404,7 @@ def mse_loss(predictions, targets):
return torch.sum(difference * difference) / difference.numel()

# initialize projection matrix
embedding_length = len(data_set["text_1_embedding"].values[0])
embedding_length = len(embedded_data_set["text_1_embedding"].values[0])
matrix = torch.randn(embedding_length, modified_embedding_length, requires_grad=True)

epochs, types, losses, accuracies, matrices = [], [], [], [], []
Expand All @@ -386,11 +426,11 @@ def mse_loss(predictions, targets):
test_loss = mse_loss(test_predictions, s_test)

# compute custom embeddings and new cosine similarities
_apply_matrix_to_embeddings_dataframe(matrix, data_set)
_apply_matrix_to_embeddings_dataframe(matrix, embedded_data_set)

# calculate test accuracy
for dataset in ["train", "test"]:
data = data_set[data_set["dataset"] == dataset]
data = embedded_data_set[embedded_data_set["dataset"] == dataset]
a, se = _accuracy_and_se(data["cosine_similarity_custom"], data["label"])

# record results of each epoch
Expand Down Expand Up @@ -426,46 +466,46 @@ def best_matrix(
) -> pd.DataFrame:
runs_df = pd.concat([matrix_b10_l10, matrix_b100_l100, matrix_b1000_l1000])
# plot training loss and test loss over time
px.line(
runs_df,
line_group="run_id",
x="epoch",
y="loss",
color="type",
hover_data=["batch_size", "learning_rate", "dropout_fraction"],
facet_row="learning_rate",
facet_col="batch_size",
width=500,
).show()
# px.line(
# runs_df,
# line_group="run_id",
# x="epoch",
# y="loss",
# color="type",
# hover_data=["batch_size", "learning_rate", "dropout_fraction"],
# facet_row="learning_rate",
# facet_col="batch_size",
# width=500,
# ).show()

# plot accuracy over time
px.line(
runs_df,
line_group="run_id",
x="epoch",
y="accuracy",
color="type",
hover_data=["batch_size", "learning_rate", "dropout_fraction"],
facet_row="learning_rate",
facet_col="batch_size",
width=500,
).show()
# px.line(
# runs_df,
# line_group="run_id",
# x="epoch",
# y="accuracy",
# color="type",
# hover_data=["batch_size", "learning_rate", "dropout_fraction"],
# facet_row="learning_rate",
# facet_col="batch_size",
# width=500,
# ).show()
best_run = runs_df.sort_values(by="accuracy", ascending=False).iloc[0]
best_matrix = best_run["matrix"]
return best_matrix


def modified_embeddings(embedded_data_set: pd.DataFrame, best_matrix: pd.DataFrame) -> pd.DataFrame:
    """Apply the selected matrix to the embedded data set and return that frame.

    :param embedded_data_set: frame carrying the embedding columns; it is passed
        to ``_apply_matrix_to_embeddings_dataframe`` and returned as the same object.
    :param best_matrix: the matrix chosen by the ``best_matrix`` step.
        NOTE(review): annotated as a DataFrame, but the helper's signature takes a
        torch tensor — confirm and align with ``best_matrix``'s return annotation.
    :return: ``embedded_data_set`` after the helper has been applied to it.
    """
    # The helper's return value is not used; the same frame object is returned.
    _apply_matrix_to_embeddings_dataframe(best_matrix, embedded_data_set)
    return embedded_data_set


def test_accuracy_post_optimization(
    modified_embeddings: pd.DataFrame,
) -> tuple:
    """Report accuracy on the test rows using the custom cosine-similarity column.

    :param modified_embeddings: frame with ``dataset``, ``label`` and
        ``cosine_similarity_custom`` columns.
    :return: ``(accuracy, standard_error)`` as produced by ``_accuracy_and_se``.
    """
    is_test_row = modified_embeddings["dataset"] == "test"
    data = modified_embeddings[is_test_row]
    a, se = _accuracy_and_se(data["cosine_similarity_custom"], data["label"])
    # 1.96 * se is printed as the ± margin next to the accuracy.
    print(f"test accuracy after optimization: {a:0.1%} ± {1.96 * se:0.1%}")
    return a, se

Expand All @@ -476,4 +516,5 @@ def test_accuracy_post_optimization(
from hamilton import driver

dr = driver.Driver({}, customize_embeddings)
dr.display_all_functions("customize_embeddings", render_kwargs={"format": "png"})
# dr.display_all_functions("customize_embeddings", render_kwargs={"format": "png"})
dr.execute(["test_accuracy_post_optimization"], inputs={})
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
openai
plotly
scikit-learn
sf-hamilton
torch

0 comments on commit 148a4a9

Please sign in to comment.