Skip to content

Commit

Permalink
Responding to review comments.
Browse files Browse the repository at this point in the history
  • Loading branch information
skrawcz committed Dec 3, 2023
1 parent 09431a9 commit 3146b15
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 32 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Purpose of this module

This module is used to customize embeddings for text data. It is based on MIT licensed code from the OpenAI cookbook.
This module is used to customize embeddings for text data. It is based on MIT licensed code from
this [OpenAI cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/Customizing_embeddings.ipynb).

The output is a matrix that you can use to multiply your embeddings. The product of this multiplication is a
'custom embedding' that will better emphasize aspects of the text relevant to your use case.
Expand All @@ -25,7 +26,7 @@ If you pass in `{"source":"local"}` as configuration to the driver, the module w
path to. The dataset should be a csv with columns "text_1", "text_2", and "label". The label should be +1 if the text
pairs are similar and -1 if the text pairs are dissimilar.

Otherwise if you pass in `{}` as configuration to the driver, the module will require you to pass in a dataframe as
Otherwise if you pass in `{}` as configuration to the driver, the module will require you to pass in a dataframe as
`processed_local_dataset` as an input. The dataframe should have columns "text_1", "text_2", and "label". The label should be +1 if the
text pairs are similar and -1 if the text pairs are dissimilar.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
extract_fields,
group,
inject,
load_from,
parameterize,
source,
value,
Expand Down Expand Up @@ -155,22 +156,24 @@ def processed_local_dataset__snli(

@config.when(source="local")
@check_output(schema=processed_dataset_schema, importance="fail")
@load_from.csv(
    path=source("local_dataset_path")
    # See data loader documentation and the PandasCSVReader for values you can pass in:
    # - https://hamilton.dagworks.io/en/latest/reference/io/available-data-adapters/#data-loaders
    # - https://github.com/dagworks-inc/hamilton/blob/main/hamilton/plugins/pandas_extensions.py#L89-L255
)
def processed_local_dataset__local(
    local_dataset: pd.DataFrame,
) -> pd.DataFrame:
    """Uses a local dataset that the Hamilton CSV data loader has read for us.

    Only active when the driver is configured with `{"source": "local"}`; the
    `load_from.csv` decorator reads the file at `local_dataset_path` and injects
    the result as `local_dataset`. Override this with a dataframe
    (`overrides={"processed_local_dataset": my_df}`) if you have your own dataset
    that's ready in a notebook, or modify it to match your use case.

    :param local_dataset: dataframe loaded by the Pandas CSV Reader.
    :return: dataframe of text pairs with labels. Schema: text_1, text_2, label
        (1 for similar, -1 for dissimilar) — enforced by `check_output` above.
    """
    return local_dataset


# split data into train and test sets
Expand Down Expand Up @@ -239,38 +242,40 @@ def test_df_negatives(base_test_df: pd.DataFrame) -> pd.DataFrame:
return _df


@parameterize(
    train_df={"base_df": source("base_train_df"), "df_negatives": source("train_df_negatives")},
    test_df={"base_df": source("base_test_df"), "df_negatives": source("test_df_negatives")},
)
def construct_df(
    base_df: pd.DataFrame,
    df_negatives: pd.DataFrame,
    negatives_per_positive: int = 1,
    random_seed: int = 123,
) -> pd.DataFrame:
    """Return a dataframe of text pairs with sampled negatives appended.

    Parameterized into two nodes: `train_df` (from `base_train_df` +
    `train_df_negatives`) and `test_df` (from `base_test_df` + `test_df_negatives`).

    NOTE: the original body used an f-string in docstring position — f-strings are
    never bound to `__doc__` (they evaluate at call time), so the function had no
    docstring at all; fixed to a plain docstring here.

    :param base_df: positive pairs.
    :param df_negatives: pool of negative (dissimilar) pairs to sample from.
    :param negatives_per_positive: how many negatives to draw per positive row.
    :param random_seed: seed for reproducible sampling.
    :return: concatenation of `base_df` and the sampled negatives.
    """
    return pd.concat(
        [
            base_df,
            df_negatives.sample(n=len(base_df) * negatives_per_positive, random_state=random_seed),
        ]
    )


def test_df(
base_test_df: pd.DataFrame,
test_df_negatives: pd.DataFrame,
negatives_per_positive: int = 1,
random_seed: int = 123,
) -> pd.DataFrame:
"""Return dataframe of testing pairs, with negatives added."""
return pd.concat(
[
base_test_df,
test_df_negatives.sample(
n=len(base_test_df) * negatives_per_positive, random_state=random_seed
),
]
)
# def test_df(
# base_test_df: pd.DataFrame,
# test_df_negatives: pd.DataFrame,
# negatives_per_positive: int = 1,
# random_seed: int = 123,
# ) -> pd.DataFrame:
# """Return dataframe of testing pairs, with negatives added."""
# return pd.concat(
# [
# base_test_df,
# test_df_negatives.sample(
# n=len(base_test_df) * negatives_per_positive, random_state=random_seed
# ),
# ]
# )


# Expose text_1 and text_2 columns from train and test dataframes
Expand Down Expand Up @@ -731,3 +736,7 @@ def test_accuracy_post_optimization(
],
inputs={},
)
# Report accuracies before and after the embedding-matrix optimization.
# Fixed copy-paste bug: "train_accuracy" was printed twice; the third line
# should report the post-optimization train accuracy to mirror line four.
print(result["train_accuracy"])
print(result["test_accuracy"])
print(result["train_accuracy_post_optimization"])
print(result["test_accuracy_post_optimization"])
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 3146b15

Please sign in to comment.