Implements TF-IDF text embeddings. Performs classification on these embeddings using Logistic Regression, but not the other models, due to slow training time. There are ~70K terms / columns in the document-term matrix, which makes training very slow. It also makes the resulting embeddings file hard to save: it is 5GB as an HDF5 file 😮 So we should consider limiting the number of terms, or doing dimensionality reduction, or something.
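For context, here is a hedged sketch of the two mitigation options mentioned above, assuming scikit-learn (the corpus and counts are illustrative, not the project's actual ones): capping the vocabulary at vectorization time, versus reducing the full matrix afterwards with truncated SVD (a.k.a. latent semantic analysis).

```python
# a minimal sketch, assuming scikit-learn; `corpus` and the counts are illustrative
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

corpus = ["the first example document", "the second example document"]

# option 1: cap the vocabulary, keeping only the top terms by corpus frequency:
capped = TfidfVectorizer(max_features=1_500).fit_transform(corpus)

# option 2: keep the full vocabulary, then project the sparse matrix
# down to a small dense space:
full = TfidfVectorizer().fit_transform(corpus)
reduced = TruncatedSVD(n_components=2).fit_transform(full)  # e.g. 100-1,500 on a real corpus
```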
Showing 148 changed files with 512,881 additions and 14 deletions.
```python
import os
import json

DATA_DIRPATH = os.path.join(os.path.dirname(__file__), "..", "data")
RESULTS_DIRPATH = os.path.join(os.path.dirname(__file__), "..", "results")


def save_results_json(results, json_filepath):
    with open(json_filepath, "w") as json_file:
        json.dump(results, json_file, indent=4)
```
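A quick usage sketch for the helper above (the metrics dict and filename are made up for illustration):

```python
# hypothetical usage; the metrics and filename are illustrative only
results = {"model": "logistic_regression", "accuracy": 0.95}
save_results_json(results, os.path.join(RESULTS_DIRPATH, "example_results.json"))
```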
```python
import re


def convert_non_ascii(txt):
    # we see tokens like:
    # 'état', 'être',
    # 'últimahora', 'μολωνλαβε', 'قاسم_سليماني', '𝐔𝐍𝐇𝐈𝐍𝐆𝐄𝐃', '𝐜𝐨𝐮𝐧𝐭𝐫𝐲',
    # '𝐝𝐚𝐲𝐬', '𝐨𝐮𝐫', '𝐩𝐨𝐥𝐢𝐭𝐢𝐜𝐬', '𝐮𝐬', '𝑤𝑒𝑎𝑘𝑒𝑠𝑡', '𝑱𝑶𝑯𝑵', '𝑹𝑶𝑩𝑬𝑹𝑻𝑺',
    # '𝔽𝕆ℝ𝔼𝕍𝔼ℝ', '𝕋𝕙𝕒𝕥', '𝕖𝕞𝕓𝕒𝕣𝕣𝕒𝕤𝕤𝕚𝕟𝕘', '𝕛𝕦𝕤𝕥', '𝗖𝗿𝗼𝗽𝘀', '𝗗𝗡𝗖', '𝗗𝗮𝗶𝗹𝘆',
    # '𝗘𝗡𝗗𝗢𝗥𝗦𝗘𝗗', '𝗘𝗡𝗙𝗢𝗥𝗖𝗘𝗦', '𝗝𝗢𝗬', '𝗝𝗢𝗬𝗦', '𝗟𝗲𝗮𝗱𝗶𝗻𝗴', '𝗡𝗢', '𝗢𝗙',
    # '𝗣𝗹𝗮𝘆𝗯𝗼𝗼𝗸', '𝗥𝗲𝗺𝗶𝗻𝗱𝗲𝗿', '𝗦𝘁𝗮𝘁𝗲𝘀', '𝗩𝗲𝗴𝗲𝘁𝗮𝗯𝗹𝗲𝘀', '𝗰𝗿𝗲𝗱𝗶𝗯𝗶𝗹𝗶𝘁𝘆',
    # '𝗳𝗼𝗿𝗲𝘃𝗲𝗿', '𝗶𝗺𝗽𝗲𝗮𝗰𝗵𝗲𝗱', '𝗶𝗻', '𝗶𝗻𝗲𝘃𝗶𝘁𝗮𝗯𝗹𝗲', '𝗻𝗲𝘃𝗲𝗿', '𝗻𝗼',
    # '𝙀𝙢𝙗𝙤𝙡𝙙𝙚𝙣', '𝙛𝙖𝙢𝙞𝙡𝙮', '𝙛𝙚𝙚𝙡', '𝙜𝙧𝙤𝙪𝙥', '𝙝𝙞𝙨', '𝙞𝙣', '𝙠𝙞𝙙𝙨', '𝙨𝙖𝙙'

    # so we'll convert them to ASCII equivalents, to keep their meaning:
    terms_map = {
        'état': 'etat',
        'être': 'etre',
        'últimahora': 'ultimahora',
        'μολωνλαβε': 'molonlabe',  # come and take them
        'قاسم_سليماني': 'Qasem_Soleimani',
        '𝐔𝐍𝐇𝐈𝐍𝐆𝐄𝐃': 'UNHINGED',
        '𝐜𝐨𝐮𝐧𝐭𝐫𝐲': 'country',
        '𝐝𝐚𝐲𝐬': 'days',
        '𝐨𝐮𝐫': 'our',
        '𝐩𝐨𝐥𝐢𝐭𝐢𝐜𝐬': 'politics',
        '𝐮𝐬': 'us',
        '𝑤𝑒𝑎𝑘𝑒𝑠𝑡': 'weakest',
        '𝑱𝑶𝑯𝑵': 'JOHN',
        '𝑹𝑶𝑩𝑬𝑹𝑻𝑺': 'ROBERTS',
        '𝔽𝕆ℝ𝔼𝕍𝔼ℝ': 'FOREVER',
        '𝕋𝕙𝕒𝕥': 'That',
        '𝕖𝕞𝕓𝕒𝕣𝕣𝕒𝕤𝕤𝕚𝕟𝕘': 'embarrassing',
        '𝕛𝕦𝕤𝕥': 'just',
        '𝗖𝗿𝗼𝗽𝘀': 'Crops',
        '𝗗𝗡𝗖': 'DNC',
        '𝗗𝗮𝗶𝗹𝘆': 'Daily',
        '𝗘𝗡𝗗𝗢𝗥𝗦𝗘𝗗': 'ENDORSED',
        '𝗘𝗡𝗙𝗢𝗥𝗖𝗘𝗦': 'ENFORCES',
        '𝗝𝗢𝗬': 'JOY',
        '𝗝𝗢𝗬𝗦': 'JOYS',
        '𝗟𝗲𝗮𝗱𝗶𝗻𝗴': 'Leading',
        '𝗡𝗢': 'NO',
        '𝗢𝗙': 'OF',
        '𝗣𝗹𝗮𝘆𝗯𝗼𝗼𝗸': 'Playbook',
        '𝗥𝗲𝗺𝗶𝗻𝗱𝗲𝗿': 'Reminder',
        '𝗦𝘁𝗮𝘁𝗲𝘀': 'States',
        '𝗩𝗲𝗴𝗲𝘁𝗮𝗯𝗹𝗲𝘀': 'Vegetables',
        '𝗰𝗿𝗲𝗱𝗶𝗯𝗶𝗹𝗶𝘁𝘆': 'credibility',
        '𝗳𝗼𝗿𝗲𝘃𝗲𝗿': 'forever',
        '𝗶𝗺𝗽𝗲𝗮𝗰𝗵𝗲𝗱': 'impeached',
        '𝗶𝗻': 'in',
        '𝗶𝗻𝗲𝘃𝗶𝘁𝗮𝗯𝗹𝗲': 'inevitable',
        '𝗻𝗲𝘃𝗲𝗿': 'never',
        '𝗻𝗼': 'no',
        '𝙀𝙢𝙗𝙤𝙡𝙙𝙚𝙣': 'Embolden',
        '𝙛𝙖𝙢𝙞𝙡𝙮': 'family',
        '𝙛𝙚𝙚𝙡': 'feel',
        '𝙜𝙧𝙤𝙪𝙥': 'group',
        '𝙝𝙞𝙨': 'his',
        '𝙞𝙣': 'in',
        '𝙠𝙞𝙙𝙨': 'kids',
        '𝙨𝙖𝙙': 'sad',
        '𝙩𝙚𝙭𝙩': 'text',

        # this one doesn't seem to work? (possibly the source text uses a
        # decomposed form of the accented character):
        'zürich': 'zurich',
    }

    for k, v in terms_map.items():
        txt = txt.replace(k, v)

    # despite best efforts, remove any remaining non-ASCII characters:
    non_ascii_pattern = re.compile(r'[^\x00-\x7F]+')
    txt = non_ascii_pattern.sub('', txt)

    return txt
```
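As an aside, a hand-maintained map like this could potentially be backstopped by Unicode compatibility normalization from the standard library. This is a sketch, not what the module above actually does: NFKD decomposes the styled "mathematical alphanumeric" characters into plain letters and splits accents from base letters, so an ASCII encode can then drop the leftovers. It would not transliterate the Greek or Arabic tokens, though, so those entries would still need the map.

```python
# a sketch of an alternative using the standard library (not this module's approach):
import unicodedata

def fold_to_ascii(txt):
    # NFKD turns '𝐩𝐨𝐥𝐢𝐭𝐢𝐜𝐬' into 'politics' and 'é' into 'e' + combining accent;
    # the ASCII encode with errors="ignore" then drops whatever non-ASCII remains
    decomposed = unicodedata.normalize("NFKD", txt)
    return decomposed.encode("ascii", errors="ignore").decode("ascii")

print(fold_to_ascii("zürich état 𝐔𝐍𝐇𝐈𝐍𝐆𝐄𝐃"))  # prints: zurich etat UNHINGED
```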
## TF-IDF

A simple embeddings method.

### Text Embeddings

Run the pipeline. This saves the embeddings to an HDF5 file, because CSV export was taking too long:

```sh
python -m app.tfidf_embeddings.pipeline
```

### Dimensionality Reduction

Perform dimensionality reduction on the resulting word and document embeddings, respectively:

```sh
FIG_SAVE=true FIG_SHOW=false python -m app.tfidf_embeddings.reduction
```

### Classification Job

```sh
FIG_SAVE=true FIG_SHOW=false python -m app.tfidf_embeddings.classification
```

This takes a while, since there are so many columns. We should consider using fewer features, perhaps 1500 max, to be in line with OpenAI text embeddings.
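For reference, a sketch of the HDF5 export approach mentioned above, assuming a pandas DataFrame of embeddings (the filename and key are illustrative, and pandas' HDF5 support requires the PyTables package):

```python
# illustrative only; assumes `pip install tables` for pandas' HDF5 support
from pandas import DataFrame, read_hdf

embeddings_df = DataFrame({"term_a": [0.1, 0.0], "term_b": [0.0, 0.3]})
embeddings_df.to_hdf("embeddings.h5", key="embeddings", mode="w")

loaded_df = read_hdf("embeddings.h5", key="embeddings")  # reading back is a single call
```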
```python
import os
from functools import cached_property

from pandas import read_csv

from app import RESULTS_DIRPATH
from app.classification import Y_COLS, Y_COLS_BINARY, Y_COLS_MULTICLASS
from app.classification.logistic_regression import LogisticRegressionPipeline
from app.classification.random_forest import RandomForestPipeline
from app.classification.xgboost import XGBoostPipeline

from app.tfidf_embeddings.pipeline import TextEmbeddingPipeline

CLASSIFICATION_RESULTS_DIRPATH = os.path.join(RESULTS_DIRPATH, "tfidf_classification")


class TextDataset():
    """The original dataset interface assumes a CSV file, and that's too opinionated."""

    def __init__(self, df, x):
        #self.csv_filepath = None
        #self.label_cols = None
        #self.labels_df = None

        self.df = df
        self.x = x


if __name__ == "__main__":

    from app.dataset import Dataset

    ds = Dataset()
    df = ds.df
    df.index = df["user_id"]

    pipeline = TextEmbeddingPipeline(corpus=df["tweet_texts"])
    pipeline.perform()

    # use the TF-IDF embeddings as features:
    x = pipeline.embeddings_df
    print(x.shape)

    # construct a dataset API on the fly:
    text_ds = TextDataset(df=df, x=x)

    will_upload = False
    for y_col in Y_COLS:
        results_dirpath = os.path.join(CLASSIFICATION_RESULTS_DIRPATH, y_col, "logistic_regression")
        #pipeline = LogisticRegressionPipeline(ds=text_ds, y_col=y_col, results_dirpath=results_dirpath, will_upload=will_upload, param_grid={
        #
        #    # C (float), default=1.0
        #    # Inverse of regularization strength; must be a positive float.
        #    # Like in support vector machines, smaller values specify stronger regularization.
        #    "classifier__C": [
        #        1, #2, 5,
        #        10, #25, 50,
        #        #100
        #    ],
        #
        #    # default max_iter is 100
        #    "classifier__max_iter": [
        #        #10, 25,
        #        50,
        #        100,
        #        #250,
        #        500,
        #        #1_000, #5_000, 10_000
        #    ],
        #})
        #pipeline.perform()

        #continue

        results_dirpath = os.path.join(CLASSIFICATION_RESULTS_DIRPATH, y_col, "xgboost")
        pipeline = XGBoostPipeline(ds=text_ds, y_col=y_col, results_dirpath=results_dirpath, will_upload=will_upload, param_grid={
            "classifier__n_estimators": [50, 100, 150, 250]
        })
        pipeline.perform()

        # the slowest can go last:
        results_dirpath = os.path.join(CLASSIFICATION_RESULTS_DIRPATH, y_col, "random_forest")
        pipeline = RandomForestPipeline(ds=text_ds, y_col=y_col, results_dirpath=results_dirpath, will_upload=will_upload, param_grid={
            "classifier__n_estimators": [50, 100, 150, 250]
        })
        pipeline.perform()
```
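The pipeline classes' internals aren't shown in this diff, but the `classifier__` prefixes in the param grids imply a scikit-learn Pipeline with a step named "classifier", wrapped in a grid search. A minimal sketch of that presumed shape, on synthetic data:

```python
# a sketch of the presumed grid-search shape; synthetic data, illustrative params
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

X, y = make_classification(n_samples=200, n_features=50, random_state=99)

pipe = Pipeline([("classifier", LogisticRegression())])
grid = GridSearchCV(pipe, cv=5, param_grid={
    "classifier__C": [1, 10],  # the "classifier__" prefix targets the named step
    "classifier__max_iter": [50, 100, 500],
})
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)
```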