TF-IDF Embeddings (#25)
Implements TF-IDF text embeddings. Performs classification on these embeddings using Logistic Regression, but not the other models, due to slow training time.

There are ~70K terms (columns) in the document-term matrix, which makes training very slow. It also makes the resulting embeddings file hard to save: it comes out to 5GB as an HD5 file 😮

So we should consider limiting the number of terms, or applying dimensionality reduction.
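One way to limit the number of terms (a sketch of the general idea, not part of this commit) is to cap the vocabulary with scikit-learn's `TfidfVectorizer(max_features=...)`, which keeps only the highest-frequency terms:

```python
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["example tweet text one", "another example tweet", "more example text"]  # placeholder documents

# hypothetical cap: keep only the 1,500 highest-frequency terms,
# instead of the full ~70K-term vocabulary
vectorizer = TfidfVectorizer(max_features=1_500)
tfidf_matrix = vectorizer.fit_transform(corpus)
print(tfidf_matrix.shape)  # (n_documents, up to 1,500 terms)
```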
s2t2 authored Dec 1, 2023
1 parent f530f60 commit de1e096
Showing 148 changed files with 512,881 additions and 14 deletions.
13 changes: 9 additions & 4 deletions .gitignore
@@ -15,13 +15,17 @@ results/*/*.csv.gz
results/*/*.json
results/*/*.json

results/word2vec_embeddings/*.model
results/word2vec_embeddings/*.kv
results/word2vec_embeddings/*.csv
# embedding models:
results/*_embeddings/*.model
results/*_embeddings/*.kv
results/*_embeddings/*.csv
results/*_embeddings/*.hd5
results/*_embeddings/*.csv.gz
!results/*_embeddings/*.json
results/*_embeddings/*/*.csv

results/word2vec_classification/*/*/model.joblib
results/word2vec_classification/*/*/*.csv

#results/*/*/*.png
#results/*/*/*.html
results/*/*/*.json
@@ -30,6 +34,7 @@ results/classification/*/*/model.joblib

# ignore these files b/c they contains user ids:
results/classification/*/*/predictions.csv
results/*classification/*/*/predictions.csv

# keep this:
!results/reduced_classification/all_results.csv
8 changes: 7 additions & 1 deletion app/__init__.py
@@ -1,7 +1,13 @@


import os

import json

DATA_DIRPATH = os.path.join(os.path.dirname(__file__), "..", "data")
RESULTS_DIRPATH = os.path.join(os.path.dirname(__file__), "..", "results")



def save_results_json(results, json_filepath):
    with open(json_filepath, "w") as json_file:
        json.dump(results, json_file, indent=4)
6 changes: 0 additions & 6 deletions app/classification/__init__.py
@@ -1,6 +1,5 @@

import os
import json

from app import RESULTS_DIRPATH

@@ -41,8 +40,3 @@ def class_labels(y_col, class_names):
    classes_map = CLASSES_MAP[y_col]
    class_names = [classes_map[val] for val in class_names]
    return class_names


def save_results_json(results, json_filepath):
    with open(json_filepath, "w") as json_file:
        json.dump(results, json_file, indent=4)
1 change: 0 additions & 1 deletion app/classification/logistic_regression.py
@@ -4,7 +4,6 @@
from sklearn.linear_model import LogisticRegression
from pandas import Series

from app.classification import save_results_json
from app.classification.pipeline import ClassificationPipeline


3 changes: 2 additions & 1 deletion app/classification/pipeline.py
@@ -14,10 +14,11 @@
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, auc

from app import save_results_json
from app.colors import ORANGES
from app.dataset import Dataset
from app.model_storage import ModelStorage
from app.classification import CLASSIFICATION_RESULTS_DIRPATH, save_results_json, class_labels
from app.classification import CLASSIFICATION_RESULTS_DIRPATH, class_labels
from app.classification.results import ClassificationResults


86 changes: 86 additions & 0 deletions app/nlp.py
@@ -0,0 +1,86 @@


import re




def convert_non_ascii(txt):
    """Map known non-ASCII tokens to ASCII equivalents (to keep their meaning),
    then strip any remaining non-ASCII characters."""

    # we see tokens like:
    # 'état', 'être',
    # 'últimahora', 'μολωνλαβε', 'قاسم_سليماني', '𝐔𝐍𝐇𝐈𝐍𝐆𝐄𝐃', '𝐜𝐨𝐮𝐧𝐭𝐫𝐲',
    # '𝐝𝐚𝐲𝐬', '𝐨𝐮𝐫', '𝐩𝐨𝐥𝐢𝐭𝐢𝐜𝐬', '𝐮𝐬', '𝑤𝑒𝑎𝑘𝑒𝑠𝑡', '𝑱𝑶𝑯𝑵', '𝑹𝑶𝑩𝑬𝑹𝑻𝑺',
    # '𝔽𝕆ℝ𝔼𝕍𝔼ℝ', '𝕋𝕙𝕒𝕥', '𝕖𝕞𝕓𝕒𝕣𝕣𝕒𝕤𝕤𝕚𝕟𝕘', '𝕛𝕦𝕤𝕥', '𝗖𝗿𝗼𝗽𝘀', '𝗗𝗡𝗖', '𝗗𝗮𝗶𝗹𝘆',
    # '𝗘𝗡𝗗𝗢𝗥𝗦𝗘𝗗', '𝗘𝗡𝗙𝗢𝗥𝗖𝗘𝗦', '𝗝𝗢𝗬', '𝗝𝗢𝗬𝗦', '𝗟𝗲𝗮𝗱𝗶𝗻𝗴', '𝗡𝗢', '𝗢𝗙',
    # '𝗣𝗹𝗮𝘆𝗯𝗼𝗼𝗸', '𝗥𝗲𝗺𝗶𝗻𝗱𝗲𝗿', '𝗦𝘁𝗮𝘁𝗲𝘀', '𝗩𝗲𝗴𝗲𝘁𝗮𝗯𝗹𝗲𝘀', '𝗰𝗿𝗲𝗱𝗶𝗯𝗶𝗹𝗶𝘁𝘆',
    # '𝗳𝗼𝗿𝗲𝘃𝗲𝗿', '𝗶𝗺𝗽𝗲𝗮𝗰𝗵𝗲𝗱', '𝗶𝗻', '𝗶𝗻𝗲𝘃𝗶𝘁𝗮𝗯𝗹𝗲', '𝗻𝗲𝘃𝗲𝗿', '𝗻𝗼',
    # '𝙀𝙢𝙗𝙤𝙡𝙙𝙚𝙣', '𝙛𝙖𝙢𝙞𝙡𝙮', '𝙛𝙚𝙚𝙡', '𝙜𝙧𝙤𝙪𝙥', '𝙝𝙞𝙨', '𝙞𝙣', '𝙠𝙞𝙙𝙨', '𝙨𝙖𝙙'

    # so we'll convert them, to keep their meaning:

    terms_map = {
        'état': 'etat',
        'être': 'etre',
        'últimahora': 'ultimahora',
        'μολωνλαβε': 'molonlabe',
        'قاسم_سليماني': 'Qasem_Soleimani',
        '𝐔𝐍𝐇𝐈𝐍𝐆𝐄𝐃': 'UNHINGED',
        '𝐜𝐨𝐮𝐧𝐭𝐫𝐲': 'country',
        '𝐝𝐚𝐲𝐬': 'days',
        '𝐨𝐮𝐫': 'our',
        '𝐩𝐨𝐥𝐢𝐭𝐢𝐜𝐬': 'politics',
        '𝐮𝐬': 'us',
        '𝑤𝑒𝑎𝑘𝑒𝑠𝑡': 'weakest',
        '𝑱𝑶𝑯𝑵': 'JOHN',
        '𝑹𝑶𝑩𝑬𝑹𝑻𝑺': 'ROBERTS',
        '𝔽𝕆ℝ𝔼𝕍𝔼ℝ': 'FOREVER',
        '𝕋𝕙𝕒𝕥': 'That',
        '𝕖𝕞𝕓𝕒𝕣𝕣𝕒𝕤𝕤𝕚𝕟𝕘': 'embarrassing',
        '𝕛𝕦𝕤𝕥': 'just',
        '𝗖𝗿𝗼𝗽𝘀': 'Crops',
        '𝗗𝗡𝗖': 'DNC',
        '𝗗𝗮𝗶𝗹𝘆': 'Daily',
        '𝗘𝗡𝗗𝗢𝗥𝗦𝗘𝗗': 'ENDORSED',
        '𝗘𝗡𝗙𝗢𝗥𝗖𝗘𝗦': 'ENFORCES',
        '𝗝𝗢𝗬': 'JOY',
        '𝗝𝗢𝗬𝗦': 'JOYS',
        '𝗟𝗲𝗮𝗱𝗶𝗻𝗴': 'Leading',
        '𝗡𝗢': 'NO',
        '𝗢𝗙': 'OF',
        '𝗣𝗹𝗮𝘆𝗯𝗼𝗼𝗸': 'Playbook',
        '𝗥𝗲𝗺𝗶𝗻𝗱𝗲𝗿': 'Reminder',
        '𝗦𝘁𝗮𝘁𝗲𝘀': 'States',
        '𝗩𝗲𝗴𝗲𝘁𝗮𝗯𝗹𝗲𝘀': 'Vegetables',
        '𝗰𝗿𝗲𝗱𝗶𝗯𝗶𝗹𝗶𝘁𝘆': 'credibility',
        '𝗳𝗼𝗿𝗲𝘃𝗲𝗿': 'forever',
        '𝗶𝗺𝗽𝗲𝗮𝗰𝗵𝗲𝗱': 'impeached',
        '𝗶𝗻': 'in',
        '𝗶𝗻𝗲𝘃𝗶𝘁𝗮𝗯𝗹𝗲': 'inevitable',
        '𝗻𝗲𝘃𝗲𝗿': 'never',
        '𝙀𝙢𝙗𝙤𝙡𝙙𝙚𝙣': 'Embolden',
        '𝙛𝙖𝙢𝙞𝙡𝙮': 'family',
        '𝙛𝙚𝙚𝙡': 'feel',
        '𝙜𝙧𝙤𝙪𝙥': 'group',
        '𝙝𝙞𝙨': 'his',
        '𝙞𝙣': 'in',
        '𝙠𝙞𝙙𝙨': 'kids',
        '𝙨𝙖𝙙': 'sad',
        '𝗻𝗼': 'no',
        '𝙩𝙚𝙭𝙩': 'text',

        # these don't work ?:
        'zürich': 'zurich',
        'últimahora': 'ultimahora',
        'μολωνλαβε': 'molonlabe', # come and take them

    }

    for k, v in terms_map.items():
        txt = txt.replace(k, v)

    # despite best efforts, remove any remaining non-ascii:

    non_ascii_pattern = re.compile(r'[^\x00-\x7F]+')
    txt = non_ascii_pattern.sub('', txt)

    return txt
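For reference, a quick usage sketch of this helper (hypothetical input strings, not part of the commit):

```python
from app.nlp import convert_non_ascii

# mapped tokens get transliterated; any other non-ASCII characters are stripped
print(convert_non_ascii("état of the 𝗗𝗡𝗖"))  # expected: "etat of the DNC"
print(convert_non_ascii("hello 😮 world"))    # expected: "hello  world"
```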
30 changes: 30 additions & 0 deletions app/tfidf_embeddings/README.md
@@ -0,0 +1,30 @@


## TF-IDF

A simple text embedding method.


### Text Embeddings

Run the pipeline. It saves the embeddings to an HD5 file, because CSV export was taking too long.

```sh
python -m app.tfidf_embeddings.pipeline
```
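As a rough sketch of what the HD5 export step looks like (assuming the embeddings sit in a pandas DataFrame and the optional PyTables `tables` dependency is installed; the filename here is hypothetical):

```python
import pandas as pd

# hypothetical embeddings frame: one row per document, one column per term
embeddings_df = pd.DataFrame({"term_a": [0.1, 0.0], "term_b": [0.0, 0.3]})

# HDF5 writes much faster than CSV for a very wide (e.g. ~70K-column) matrix
embeddings_df.to_hdf("document_embeddings.hd5", key="embeddings", mode="w")

restored = pd.read_hdf("document_embeddings.hd5", key="embeddings")
```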

### Dimensionality Reduction

Perform dimensionality reduction on the resulting word and document embeddings, respectively:

```sh
FIG_SAVE=true FIG_SHOW=false python -m app.tfidf_embeddings.reduction
```
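The reduction code itself isn't included in this excerpt; as a generic sketch of reducing TF-IDF vectors, scikit-learn's TruncatedSVD (LSA) works directly on sparse matrices:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

corpus = ["example tweet text one", "another example tweet", "more example text"]  # placeholder documents
tfidf_matrix = TfidfVectorizer().fit_transform(corpus)  # sparse (n_docs, n_terms)

# project the sparse TF-IDF matrix down to a handful of components
# (2 here, e.g. for plotting; a larger value could feed downstream models)
svd = TruncatedSVD(n_components=2, random_state=99)
reduced = svd.fit_transform(tfidf_matrix)  # dense (n_docs, 2)
print(reduced.shape)
```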

### Classification Job

```sh
FIG_SAVE=true FIG_SHOW=false python -m app.tfidf_embeddings.classification
```

This takes a while, because there are so many columns. We should consider using fewer features, perhaps 1,500 max, to be in line with the dimensionality of OpenAI text embeddings.
91 changes: 91 additions & 0 deletions app/tfidf_embeddings/classification.py
@@ -0,0 +1,91 @@

import os
from functools import cached_property

from pandas import read_csv

from app import RESULTS_DIRPATH
from app.classification import Y_COLS, Y_COLS_BINARY, Y_COLS_MULTICLASS
from app.classification.logistic_regression import LogisticRegressionPipeline
from app.classification.random_forest import RandomForestPipeline
from app.classification.xgboost import XGBoostPipeline

from app.tfidf_embeddings.pipeline import TextEmbeddingPipeline

CLASSIFICATION_RESULTS_DIRPATH = os.path.join(RESULTS_DIRPATH, "tfidf_classification")

class TextDataset():
    """The original dataset interface assumes a CSV file and that's too opinionated"""

    def __init__(self, df, x):
        #self.csv_filepath = None
        #self.label_cols = None
        #self.labels_df = None

        self.df = df
        self.x = x



if __name__ == "__main__":

    from app.dataset import Dataset

    ds = Dataset()
    df = ds.df
    df.index = df["user_id"]

    pipeline = TextEmbeddingPipeline(corpus=df["tweet_texts"])
    pipeline.perform()

    # USE TFIDF EMBEDDINGS

    x = pipeline.embeddings_df
    print(x.shape)

    # dataset api on the fly:
    text_ds = TextDataset(df=df, x=x)

    will_upload = False
    for y_col in Y_COLS:
        results_dirpath = os.path.join(CLASSIFICATION_RESULTS_DIRPATH, y_col, "logistic_regression")
        #pipeline = LogisticRegressionPipeline(ds=text_ds, y_col=y_col, results_dirpath=results_dirpath, will_upload=will_upload, param_grid={
        #
        #    # C (float), default=1.0
        #    # Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.
        #    "classifier__C": [
        #        1, #2, 5,
        #        10, #25, 50,
        #        #100
        #    ],
        #
        #    # default max_iter is 100
        #    "classifier__max_iter": [#10, 25,
        #        50,
        #        100,
        #        #250,
        #        500,
        #        #1_000, #5_000, 10_000
        #    ],
        #})
        #pipeline.perform()

        #continue

        results_dirpath = os.path.join(CLASSIFICATION_RESULTS_DIRPATH, y_col, "xgboost")
        pipeline = XGBoostPipeline(ds=text_ds, y_col=y_col, results_dirpath=results_dirpath, will_upload=will_upload, param_grid={
            'classifier__n_estimators': [50, 100, 150, 250]
        })
        pipeline.perform()

        # the slowest can go last:
        results_dirpath = os.path.join(CLASSIFICATION_RESULTS_DIRPATH, y_col, "random_forest")
        pipeline = RandomForestPipeline(ds=text_ds, y_col=y_col, results_dirpath=results_dirpath, will_upload=will_upload, param_grid={
            'classifier__n_estimators': [50, 100, 150, 250]
        })
        pipeline.perform()