Skip to content

Commit

Permalink
Responding to review comments.
Browse files Browse the repository at this point in the history
  • Loading branch information
skrawcz committed Dec 3, 2023
1 parent 09431a9 commit 3146b15
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 32 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Purpose of this module

This module is used to customize embeddings for text data. It is based on MIT licensed code from the OpenAI cookbook.
This module is used to customize embeddings for text data. It is based on MIT licensed code from
this [OpenAI cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/Customizing_embeddings.ipynb).

The output is a matrix that you can use to multiply your embeddings. The product of this multiplication is a
'custom embedding' that will better emphasize aspects of the text relevant to your use case.
Expand All @@ -25,7 +26,7 @@ If you pass in `{"source":"local"}` as configuration to the driver, the module w
path to. The dataset should be a csv with columns "text_1", "text_2", and "label". The label should be +1 if the text
pairs are similar and -1 if the text pairs are dissimilar.

Otherwise if you pass in `{}` as configuration to the driver, the module will require you to pass in a dataframe as
Otherwise if you pass in `{}` as configuration to the driver, the module will require you to pass in a dataframe as
`processed_local_dataset` as an input. The dataframe should have columns "text_1", "text_2", and "label". The label should be +1 if the
text pairs are similar and -1 if the text pairs are dissimilar.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
extract_fields,
group,
inject,
load_from,
parameterize,
source,
value,
Expand Down Expand Up @@ -155,22 +156,24 @@ def processed_local_dataset__snli(

@config.when(source="local")
@check_output(schema=processed_dataset_schema, importance="fail")
@load_from.csv(
    path=source("local_dataset_path")
    # See data loader documentation and the PandasCSVReader for values you can pass in:
    # - https://hamilton.dagworks.io/en/latest/reference/io/available-data-adapters/#data-loaders
    # - https://github.com/dagworks-inc/hamilton/blob/main/hamilton/plugins/pandas_extensions.py#L89-L255
)
def processed_local_dataset__local(
    local_dataset: pd.DataFrame,
) -> pd.DataFrame:
    """Uses a local dataset that the Hamilton CSV data loader has read for us.

    Only active when the driver is configured with `{"source": "local"}`; the
    `load_from.csv` decorator reads the file at `local_dataset_path` and injects
    the result as `local_dataset`. Override this with a dataframe
    (`overrides={"processed_local_dataset": my_df}`) if you have your own dataset
    that's ready in a notebook, or modify it to match your use case.

    :param local_dataset: dataframe loaded by the Pandas CSV Reader.
    :return: dataframe of text pairs with labels. Schema: text_1, text_2, label
        (1 for similar, -1 for dissimilar) — enforced by `check_output` above.
    """
    return local_dataset


# split data into train and test sets
Expand Down Expand Up @@ -239,38 +242,40 @@ def test_df_negatives(base_test_df: pd.DataFrame) -> pd.DataFrame:
return _df


@parameterize(
    train_df={"base_df": source("base_train_df"), "df_negatives": source("train_df_negatives")},
    test_df={"base_df": source("base_test_df"), "df_negatives": source("test_df_negatives")},
)
def construct_df(
    base_df: pd.DataFrame,
    df_negatives: pd.DataFrame,
    negatives_per_positive: int = 1,
    random_seed: int = 123,
) -> pd.DataFrame:
    """Return a dataframe of text pairs with sampled negatives appended.

    Parameterized into two nodes: `train_df` (from `base_train_df` +
    `train_df_negatives`) and `test_df` (from `base_test_df` + `test_df_negatives`).

    NOTE: the original body used an f-string in docstring position — f-strings are
    never bound to `__doc__` (they evaluate at call time), so the function had no
    docstring at all; fixed to a plain docstring here.

    :param base_df: positive pairs.
    :param df_negatives: pool of negative (dissimilar) pairs to sample from.
    :param negatives_per_positive: how many negatives to draw per positive row.
    :param random_seed: seed for reproducible sampling.
    :return: concatenation of `base_df` and the sampled negatives.
    """
    return pd.concat(
        [
            base_df,
            df_negatives.sample(n=len(base_df) * negatives_per_positive, random_state=random_seed),
        ]
    )


def test_df(
base_test_df: pd.DataFrame,
test_df_negatives: pd.DataFrame,
negatives_per_positive: int = 1,
random_seed: int = 123,
) -> pd.DataFrame:
"""Return dataframe of testing pairs, with negatives added."""
return pd.concat(
[
base_test_df,
test_df_negatives.sample(
n=len(base_test_df) * negatives_per_positive, random_state=random_seed
),
]
)
# def test_df(
# base_test_df: pd.DataFrame,
# test_df_negatives: pd.DataFrame,
# negatives_per_positive: int = 1,
# random_seed: int = 123,
# ) -> pd.DataFrame:
# """Return dataframe of testing pairs, with negatives added."""
# return pd.concat(
# [
# base_test_df,
# test_df_negatives.sample(
# n=len(base_test_df) * negatives_per_positive, random_state=random_seed
# ),
# ]
# )


# Expose text_1 and text_2 columns from train and test dataframes
Expand Down Expand Up @@ -731,3 +736,7 @@ def test_accuracy_post_optimization(
],
inputs={},
)
# Report accuracies before and after the embedding-matrix optimization.
# Fixed copy-paste bug: "train_accuracy" was printed twice; the third line
# should report the post-optimization train accuracy to mirror line four.
print(result["train_accuracy"])
print(result["test_accuracy"])
print(result["train_accuracy_post_optimization"])
print(result["test_accuracy_post_optimization"])
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 3146b15

Please sign in to comment.