TF-IDF Embeddings (#25)
Implements TF-IDF text embeddings. Performs classification on these embeddings using Logistic Regression, but not the other models, due to slow training time.

There are ~70K terms (columns) in the document-term matrix, which makes training very slow. It also makes the resulting embeddings file hard to save: it comes out to 5GB as an HD5 file 😮

So we should consider limiting the number of terms, or applying dimensionality reduction.
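One way to limit the number of terms (a sketch of the general idea, not part of this commit) is to cap the vocabulary with scikit-learn's `TfidfVectorizer(max_features=...)`, which keeps only the highest-frequency terms:

```python
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["example tweet text one", "another example tweet", "more example text"]  # placeholder documents

# hypothetical cap: keep only the 1,500 highest-frequency terms,
# instead of the full ~70K-term vocabulary
vectorizer = TfidfVectorizer(max_features=1_500)
tfidf_matrix = vectorizer.fit_transform(corpus)
print(tfidf_matrix.shape)  # (n_documents, up to 1,500 terms)
```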
s2t2 authored Dec 1, 2023
1 parent f530f60 commit de1e096
Showing 148 changed files with 512,881 additions and 14 deletions.
13 changes: 9 additions & 4 deletions .gitignore
@@ -15,13 +15,17 @@ results/*/*.csv.gz
results/*/*.json
results/*/*.json

results/word2vec_embeddings/*.model
results/word2vec_embeddings/*.kv
results/word2vec_embeddings/*.csv
# embedding models:
results/*_embeddings/*.model
results/*_embeddings/*.kv
results/*_embeddings/*.csv
results/*_embeddings/*.hd5
results/*_embeddings/*.csv.gz
!results/*_embeddings/*.json
results/*_embeddings/*/*.csv

results/word2vec_classification/*/*/model.joblib
results/word2vec_classification/*/*/*.csv

#results/*/*/*.png
#results/*/*/*.html
results/*/*/*.json
@@ -30,6 +34,7 @@ results/classification/*/*/model.joblib

# ignore these files b/c they contains user ids:
results/classification/*/*/predictions.csv
results/*classification/*/*/predictions.csv

# keep this:
!results/reduced_classification/all_results.csv
8 changes: 7 additions & 1 deletion app/__init__.py
@@ -1,7 +1,13 @@


import os

import json

DATA_DIRPATH = os.path.join(os.path.dirname(__file__), "..", "data")
RESULTS_DIRPATH = os.path.join(os.path.dirname(__file__), "..", "results")



def save_results_json(results, json_filepath):
    with open(json_filepath, "w") as json_file:
        json.dump(results, json_file, indent=4)
6 changes: 0 additions & 6 deletions app/classification/__init__.py
@@ -1,6 +1,5 @@

import os
import json

from app import RESULTS_DIRPATH

@@ -41,8 +40,3 @@ def class_labels(y_col, class_names):
    classes_map = CLASSES_MAP[y_col]
    class_names = [classes_map[val] for val in class_names]
    return class_names


def save_results_json(results, json_filepath):
    with open(json_filepath, "w") as json_file:
        json.dump(results, json_file, indent=4)
1 change: 0 additions & 1 deletion app/classification/logistic_regression.py
@@ -4,7 +4,6 @@
from sklearn.linear_model import LogisticRegression
from pandas import Series

from app.classification import save_results_json
from app.classification.pipeline import ClassificationPipeline


3 changes: 2 additions & 1 deletion app/classification/pipeline.py
@@ -14,10 +14,11 @@
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, auc

from app import save_results_json
from app.colors import ORANGES
from app.dataset import Dataset
from app.model_storage import ModelStorage
from app.classification import CLASSIFICATION_RESULTS_DIRPATH, save_results_json, class_labels
from app.classification import CLASSIFICATION_RESULTS_DIRPATH, class_labels
from app.classification.results import ClassificationResults


86 changes: 86 additions & 0 deletions app/nlp.py
@@ -0,0 +1,86 @@


import re




def convert_non_ascii(txt):
    """Map known non-ASCII tokens to ASCII equivalents (to keep their meaning),
    then strip any remaining non-ASCII characters."""

    # we see tokens like:
    # 'état', 'être',
    # 'últimahora', 'μολωνλαβε', 'قاسم_سليماني', '𝐔𝐍𝐇𝐈𝐍𝐆𝐄𝐃', '𝐜𝐨𝐮𝐧𝐭𝐫𝐲',
    # '𝐝𝐚𝐲𝐬', '𝐨𝐮𝐫', '𝐩𝐨𝐥𝐢𝐭𝐢𝐜𝐬', '𝐮𝐬', '𝑤𝑒𝑎𝑘𝑒𝑠𝑡', '𝑱𝑶𝑯𝑵', '𝑹𝑶𝑩𝑬𝑹𝑻𝑺',
    # '𝔽𝕆ℝ𝔼𝕍𝔼ℝ', '𝕋𝕙𝕒𝕥', '𝕖𝕞𝕓𝕒𝕣𝕣𝕒𝕤𝕤𝕚𝕟𝕘', '𝕛𝕦𝕤𝕥', '𝗖𝗿𝗼𝗽𝘀', '𝗗𝗡𝗖', '𝗗𝗮𝗶𝗹𝘆',
    # '𝗘𝗡𝗗𝗢𝗥𝗦𝗘𝗗', '𝗘𝗡𝗙𝗢𝗥𝗖𝗘𝗦', '𝗝𝗢𝗬', '𝗝𝗢𝗬𝗦', '𝗟𝗲𝗮𝗱𝗶𝗻𝗴', '𝗡𝗢', '𝗢𝗙',
    # '𝗣𝗹𝗮𝘆𝗯𝗼𝗼𝗸', '𝗥𝗲𝗺𝗶𝗻𝗱𝗲𝗿', '𝗦𝘁𝗮𝘁𝗲𝘀', '𝗩𝗲𝗴𝗲𝘁𝗮𝗯𝗹𝗲𝘀', '𝗰𝗿𝗲𝗱𝗶𝗯𝗶𝗹𝗶𝘁𝘆',
    # '𝗳𝗼𝗿𝗲𝘃𝗲𝗿', '𝗶𝗺𝗽𝗲𝗮𝗰𝗵𝗲𝗱', '𝗶𝗻', '𝗶𝗻𝗲𝘃𝗶𝘁𝗮𝗯𝗹𝗲', '𝗻𝗲𝘃𝗲𝗿', '𝗻𝗼',
    # '𝙀𝙢𝙗𝙤𝙡𝙙𝙚𝙣', '𝙛𝙖𝙢𝙞𝙡𝙮', '𝙛𝙚𝙚𝙡', '𝙜𝙧𝙤𝙪𝙥', '𝙝𝙞𝙨', '𝙞𝙣', '𝙠𝙞𝙙𝙨', '𝙨𝙖𝙙'

    # so we'll convert them, to keep their meaning:

    terms_map = {
        'état': 'etat',
        'être': 'etre',
        'últimahora': 'ultimahora',
        'μολωνλαβε': 'molonlabe',
        'قاسم_سليماني': 'Qasem_Soleimani',
        '𝐔𝐍𝐇𝐈𝐍𝐆𝐄𝐃': 'UNHINGED',
        '𝐜𝐨𝐮𝐧𝐭𝐫𝐲': 'country',
        '𝐝𝐚𝐲𝐬': 'days',
        '𝐨𝐮𝐫': 'our',
        '𝐩𝐨𝐥𝐢𝐭𝐢𝐜𝐬': 'politics',
        '𝐮𝐬': 'us',
        '𝑤𝑒𝑎𝑘𝑒𝑠𝑡': 'weakest',
        '𝑱𝑶𝑯𝑵': 'JOHN',
        '𝑹𝑶𝑩𝑬𝑹𝑻𝑺': 'ROBERTS',
        '𝔽𝕆ℝ𝔼𝕍𝔼ℝ': 'FOREVER',
        '𝕋𝕙𝕒𝕥': 'That',
        '𝕖𝕞𝕓𝕒𝕣𝕣𝕒𝕤𝕤𝕚𝕟𝕘': 'embarrassing',
        '𝕛𝕦𝕤𝕥': 'just',
        '𝗖𝗿𝗼𝗽𝘀': 'Crops',
        '𝗗𝗡𝗖': 'DNC',
        '𝗗𝗮𝗶𝗹𝘆': 'Daily',
        '𝗘𝗡𝗗𝗢𝗥𝗦𝗘𝗗': 'ENDORSED',
        '𝗘𝗡𝗙𝗢𝗥𝗖𝗘𝗦': 'ENFORCES',
        '𝗝𝗢𝗬': 'JOY',
        '𝗝𝗢𝗬𝗦': 'JOYS',
        '𝗟𝗲𝗮𝗱𝗶𝗻𝗴': 'Leading',
        '𝗡𝗢': 'NO',
        '𝗢𝗙': 'OF',
        '𝗣𝗹𝗮𝘆𝗯𝗼𝗼𝗸': 'Playbook',
        '𝗥𝗲𝗺𝗶𝗻𝗱𝗲𝗿': 'Reminder',
        '𝗦𝘁𝗮𝘁𝗲𝘀': 'States',
        '𝗩𝗲𝗴𝗲𝘁𝗮𝗯𝗹𝗲𝘀': 'Vegetables',
        '𝗰𝗿𝗲𝗱𝗶𝗯𝗶𝗹𝗶𝘁𝘆': 'credibility',
        '𝗳𝗼𝗿𝗲𝘃𝗲𝗿': 'forever',
        '𝗶𝗺𝗽𝗲𝗮𝗰𝗵𝗲𝗱': 'impeached',
        '𝗶𝗻': 'in',
        '𝗶𝗻𝗲𝘃𝗶𝘁𝗮𝗯𝗹𝗲': 'inevitable',
        '𝗻𝗲𝘃𝗲𝗿': 'never',
        '𝙀𝙢𝙗𝙤𝙡𝙙𝙚𝙣': 'Embolden',
        '𝙛𝙖𝙢𝙞𝙡𝙮': 'family',
        '𝙛𝙚𝙚𝙡': 'feel',
        '𝙜𝙧𝙤𝙪𝙥': 'group',
        '𝙝𝙞𝙨': 'his',
        '𝙞𝙣': 'in',
        '𝙠𝙞𝙙𝙨': 'kids',
        '𝙨𝙖𝙙': 'sad',
        '𝗻𝗼': 'no',
        '𝙩𝙚𝙭𝙩': 'text',

        # these don't work ?:
        'zürich': 'zurich',
        'últimahora': 'ultimahora',
        'μολωνλαβε': 'molonlabe', # come and take them

    }

    for k, v in terms_map.items():
        txt = txt.replace(k, v)

    # despite best efforts, remove any remaining non-ascii:

    non_ascii_pattern = re.compile(r'[^\x00-\x7F]+')
    txt = non_ascii_pattern.sub('', txt)

    return txt
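For reference, a quick usage sketch of this helper (hypothetical input strings, not part of the commit):

```python
from app.nlp import convert_non_ascii

# mapped tokens get transliterated; any other non-ASCII characters are stripped
print(convert_non_ascii("état of the 𝗗𝗡𝗖"))  # expected: "etat of the DNC"
print(convert_non_ascii("hello 😮 world"))    # expected: "hello  world"
```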
30 changes: 30 additions & 0 deletions app/tfidf_embeddings/README.md
@@ -0,0 +1,30 @@


## TF-IDF

A simple text embedding method.


### Text Embeddings

Run the pipeline. It saves the embeddings to an HD5 file, because CSV export was taking too long.

```sh
python -m app.tfidf_embeddings.pipeline
```
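As a rough sketch of what the HD5 export step looks like (assuming the embeddings sit in a pandas DataFrame and the optional PyTables `tables` dependency is installed; the filename here is hypothetical):

```python
import pandas as pd

# hypothetical embeddings frame: one row per document, one column per term
embeddings_df = pd.DataFrame({"term_a": [0.1, 0.0], "term_b": [0.0, 0.3]})

# HDF5 writes much faster than CSV for a very wide (e.g. ~70K-column) matrix
embeddings_df.to_hdf("document_embeddings.hd5", key="embeddings", mode="w")

restored = pd.read_hdf("document_embeddings.hd5", key="embeddings")
```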

### Dimensionality Reduction

Perform dimensionality reduction on the resulting word and document embeddings, respectively:

```sh
FIG_SAVE=true FIG_SHOW=false python -m app.tfidf_embeddings.reduction
```
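The reduction code itself isn't included in this excerpt; as a generic sketch of reducing TF-IDF vectors, scikit-learn's TruncatedSVD (LSA) works directly on sparse matrices:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

corpus = ["example tweet text one", "another example tweet", "more example text"]  # placeholder documents
tfidf_matrix = TfidfVectorizer().fit_transform(corpus)  # sparse (n_docs, n_terms)

# project the sparse TF-IDF matrix down to a handful of components
# (2 here, e.g. for plotting; a larger value could feed downstream models)
svd = TruncatedSVD(n_components=2, random_state=99)
reduced = svd.fit_transform(tfidf_matrix)  # dense (n_docs, 2)
print(reduced.shape)
```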

### Classification Job

```sh
FIG_SAVE=true FIG_SHOW=false python -m app.tfidf_embeddings.classification
```

This takes a while, because there are so many columns. We should consider using fewer features, perhaps 1,500 max, to be in line with the dimensionality of OpenAI text embeddings.
91 changes: 91 additions & 0 deletions app/tfidf_embeddings/classification.py
@@ -0,0 +1,91 @@

import os
from functools import cached_property

from pandas import read_csv

from app import RESULTS_DIRPATH
from app.classification import Y_COLS, Y_COLS_BINARY, Y_COLS_MULTICLASS
from app.classification.logistic_regression import LogisticRegressionPipeline
from app.classification.random_forest import RandomForestPipeline
from app.classification.xgboost import XGBoostPipeline

from app.tfidf_embeddings.pipeline import TextEmbeddingPipeline

CLASSIFICATION_RESULTS_DIRPATH = os.path.join(RESULTS_DIRPATH, "tfidf_classification")

class TextDataset():
    """The original dataset interface assumes a CSV file and that's too opinionated"""

    def __init__(self, df, x):
        #self.csv_filepath = None
        #self.label_cols = None
        #self.labels_df = None

        self.df = df
        self.x = x



if __name__ == "__main__":

    from app.dataset import Dataset

    ds = Dataset()
    df = ds.df
    df.index = df["user_id"]

    pipeline = TextEmbeddingPipeline(corpus=df["tweet_texts"])
    pipeline.perform()

    # USE TFIDF EMBEDDINGS

    x = pipeline.embeddings_df
    print(x.shape)

    # dataset api on the fly:
    text_ds = TextDataset(df=df, x=x)

    will_upload = False
    for y_col in Y_COLS:
        results_dirpath = os.path.join(CLASSIFICATION_RESULTS_DIRPATH, y_col, "logistic_regression")
        #pipeline = LogisticRegressionPipeline(ds=text_ds, y_col=y_col, results_dirpath=results_dirpath, will_upload=will_upload, param_grid={
        #
        #    # C (float), default=1.0
        #    # Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.
        #    "classifier__C": [
        #        1, #2, 5,
        #        10, #25, 50,
        #        #100
        #    ],
        #
        #    # default max_iter is 100
        #    "classifier__max_iter": [#10, 25,
        #        50,
        #        100,
        #        #250,
        #        500,
        #        #1_000, #5_000, 10_000
        #    ],
        #})
        #pipeline.perform()

        #continue

        results_dirpath = os.path.join(CLASSIFICATION_RESULTS_DIRPATH, y_col, "xgboost")
        pipeline = XGBoostPipeline(ds=text_ds, y_col=y_col, results_dirpath=results_dirpath, will_upload=will_upload, param_grid={
            'classifier__n_estimators': [50, 100, 150, 250]
        })
        pipeline.perform()

        # the slowest can go last:
        results_dirpath = os.path.join(CLASSIFICATION_RESULTS_DIRPATH, y_col, "random_forest")
        pipeline = RandomForestPipeline(ds=text_ds, y_col=y_col, results_dirpath=results_dirpath, will_upload=will_upload, param_grid={
            'classifier__n_estimators': [50, 100, 150, 250]
        })
        pipeline.perform()