diff --git a/Dockerfile b/Dockerfile
index b26c1fb..66389b8 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -14,8 +14,7 @@ RUN pip3.10 install --no-cache-dir \
     'git+https://github.com/exorde-labs/exorde_data.git' \
     'git+https://github.com/exorde-labs/exorde-client.git'\
     selenium==4.2.0 \
-    wtpsplit==1.3.0 \
-    && pip3.10 install --no-cache-dir --upgrade 'git+https://github.com/JustAnotherArchivist/snscrape.git'
+    wtpsplit==1.3.0

 # Clean cache now that we have installed everything
 RUN rm -rf /root/.cache/* \
diff --git a/exorde/lab_initialization.py b/exorde/lab_initialization.py
index df69b57..2f167ff 100644
--- a/exorde/lab_initialization.py
+++ b/exorde/lab_initialization.py
@@ -4,18 +4,13 @@ import torch
 from transformers import pipeline
 from argostranslate import translate as _translate
+from exorde.tag import initialize_models

 def lab_initialization():
     device = torch.cuda.current_device() if torch.cuda.is_available() else -1
-    classifier = pipeline(
-        "zero-shot-classification",
-        model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
-        device=device,
-        batch_size=16,
-        top_k=None,
-        max_length=64,
-    )
+    # initialize models
+    models = initialize_models(device)
     labels = requests.get(
         "https://raw.githubusercontent.com/exorde-labs/TestnetProtocol/main/targets/class_names.json"
     ).json()
@@ -34,7 +29,7 @@ def lab_initialization():
     installed_languages = _translate.get_installed_languages()
     return {
         "device": device,
-        "classifier": classifier,
+        "models": models,
         "labeldict": labels,
         "mappings": mappings,
         "nlp": nlp,
diff --git a/exorde/models.py b/exorde/models.py
index d1cb171..938bac0 100644
--- a/exorde/models.py
+++ b/exorde/models.py
@@ -155,6 +155,7 @@ class Age(dict, metaclass=MadType):
 class Analysis(dict, metaclass=MadType):
     language_score: LanguageScore
     sentiment: Sentiment
+    classification: Classification
     embedding: Embedding
     gender: Gender
     text_type: TextType
diff --git a/exorde/pre_install.py b/exorde/pre_install.py
index 4de152d..0521fff 100755
--- a/exorde/pre_install.py
+++ b/exorde/pre_install.py
@@ -16,12 +16,10 @@ test_lang_detect = ft_test_detect("test")

 models = [
-    "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
+    "MoritzLaurer/deberta-v3-xsmall-zeroshot-v1.1-all-33",
     "SamLowe/roberta-base-go_emotions",
     "cardiffnlp/twitter-roberta-base-irony",
-    "salesken/query_wellformedness_score",
     "marieke93/MiniLM-evidence-types",
-    "alimazhar-110/website_classification",
     "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis",
     "lxyuan/distilbert-base-multilingual-cased-sentiments-student",
     "bert-large-uncased"
 ]
diff --git a/exorde/process_batch.py b/exorde/process_batch.py
index 9747264..18cecd7 100644
--- a/exorde/process_batch.py
+++ b/exorde/process_batch.py
@@ -258,7 +258,7 @@ async def process_batch(
         completed: ProcessedItem = ProcessedItem(
             item=prot_item,
             analysis=ProtocolAnalysis(
-                classification=processed.classification,
+                classification=analysis.classification,
                 top_keywords=processed.top_keywords,
                 language_score=analysis.language_score,
                 gender=analysis.gender,
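Taken together, the models.py and process_batch.py hunks above move classification onto the Analysis object: tag() now computes it during batch tagging, and process_batch reads analysis.classification instead of a separate per-item zero-shot pass. A minimal sketch of the value that now travels on Analysis (label and score are hypothetical, for illustration only):

    from exorde.models import Classification

    # Hypothetical values; tag() builds this from the top zero-shot label/score.
    classification = Classification(label="Finance", score=0.8712)
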
diff --git a/exorde/tag.py b/exorde/tag.py
index 20da7ee..4fc631b 100644
--- a/exorde/tag.py
+++ b/exorde/tag.py
@@ -8,13 +8,13 @@ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 import tensorflow as tf
 import swifter
+import logging

 from exorde.models import (
-    Translation,
+    Classification,
     LanguageScore,
     Sentiment,
     Embedding,
-    SourceType,
-    TextType,
+    TextType,
     Emotion,
     Irony,
     Age,
@@ -22,110 +22,29 @@
     Analysis,
 )

+logging.basicConfig(level=logging.INFO)
+
-class TokenAndPositionEmbedding(tf.keras.layers.Layer):
-    def __init__(self, maxlen, vocab_size, embed_dim, **__kwargs__):
-        super().__init__()
-        self.token_emb = tf.keras.layers.Embedding(
-            input_dim=vocab_size, output_dim=embed_dim
-        )
-        self.pos_emb = tf.keras.layers.Embedding(
-            input_dim=maxlen, output_dim=embed_dim
-        )
-
-    def call(self, x):
-        maxlen = tf.shape(x)[-1]
-        positions = tf.range(start=0, limit=maxlen, delta=1)
-        positions = self.pos_emb(positions)
-        x = self.token_emb(x)
-        return x + positions
-
-
-class TransformerBlock(tf.keras.layers.Layer):
-    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **__kwargs__):
-        super().__init__()
-        self.att = tf.keras.layers.MultiHeadAttention(
-            num_heads=num_heads, key_dim=embed_dim
-        )
-        self.ffn = tf.keras.Sequential(
-            [
-                tf.keras.layers.Dense(ff_dim, activation="relu"),
-                tf.keras.layers.Dense(embed_dim),
-            ]
-        )
-        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
-        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
-        self.dropout1 = tf.keras.layers.Dropout(rate)
-        self.dropout2 = tf.keras.layers.Dropout(rate)
-
-    def call(self, inputs, training):
-        attn_output = self.att(inputs, inputs)
-        attn_output = self.dropout1(attn_output, training=training)
-        out1 = self.layernorm1(inputs + attn_output)
-        ffn_output = self.ffn(out1)
-        ffn_output = self.dropout2(ffn_output, training=training)
-        return self.layernorm2(out1 + ffn_output)
-
-
-def tag(documents: list[str], lab_configuration):
-    """
-    Analyzes and tags a list of text documents using various NLP models and techniques.
-
-    The function processes the input documents using pre-trained models for tasks such as
-    sentence embeddings, text classification, sentiment analysis, and custom models for age,
-    gender, and hate speech detection. It returns a list of dictionaries containing the
-    processed data for each input document.
-
-    Args:
-        documents (list): A list of text documents (strings) to be analyzed and tagged.
-        nlp: model
-        device: device
-        mappings: labels
-
-    Returns:
-        list: A list of dictionaries, where each dictionary represents a single input text and
-        contains various processed data like embeddings, text classifications, sentiment, etc.,
-        as key-value pairs.
- """ - nlp = lab_configuration["nlp"] - device = lab_configuration["device"] - mappings = lab_configuration["mappings"] - - def predict(text, pipe, tag, mappings): - preds = pipe.predict(text, verbose=0)[0] - result = [] - for i in range(len(preds)): - result.append((mappings[tag][i], float(preds[i]))) - return result - - # get text content attribute from all items - for doc in documents: - assert isinstance(doc, str) - - # Create an empty DataFrame - tmp = pd.DataFrame() - - # Add the original text documents - tmp["Translation"] = documents - - assert tmp["Translation"] is not None - assert len(tmp["Translation"]) > 0 - - # Compute sentence embeddings - model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") - tmp["Embedding"] = tmp["Translation"].swifter.apply( - lambda x: list(model.encode(x).astype(float)) +def initialize_models(device): + logging.info("[TAGGING] Initializing models to be pre-ready for batch processing:") + models = {} + + logging.info("[TAGGING] Loading model: MoritzLaurer/deberta-v3-xsmall-zeroshot-v1.1-all-33") + models['zs_pipe'] = pipeline( + "zero-shot-classification", + model="MoritzLaurer/deberta-v3-xsmall-zeroshot-v1.1-all-33", + device=device ) - - # Text classification pipelines + logging.info("[TAGGING] Loading model: sentence-transformers/all-MiniLM-L6-v2") + models['sentence_transformer'] = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") + text_classification_models = [ ("Emotion", "SamLowe/roberta-base-go_emotions"), ("Irony", "cardiffnlp/twitter-roberta-base-irony"), - ("LanguageScore", "salesken/query_wellformedness_score"), ("TextType", "marieke93/MiniLM-evidence-types"), ] for col_name, model_name in text_classification_models: - pipe = pipeline( + logging.info(f"[TAGGING] Loading model: {model_name}") + models[col_name] = pipeline( "text-classification", model=model_name, top_k=None, @@ -133,30 +52,12 @@ def predict(text, pipe, tag, mappings): max_length=512, padding=True, ) - tmp[col_name] = tmp["Translation"].swifter.apply( - lambda x: [(y["label"], float(y["score"])) for y in pipe(x)[0]] - ) - del pipe # free ram for latest pipe - - # Tokenization for custom models - tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased") - tmp["Embedded"] = tmp["Translation"].swifter.apply( - lambda x: np.array( - tokenizer.encode_plus( - x, - add_special_tokens=True, - max_length=512, - truncation=True, - padding="max_length", - return_attention_mask=False, - return_tensors="tf", - )["input_ids"][0] - ).reshape(1, -1) - ) - - sentiment_analyzer = SentimentIntensityAnalyzer() + + logging.info("[TAGGING] Loading model: bert-large-uncased") + models['bert_tokenizer'] = AutoTokenizer.from_pretrained("bert-large-uncased") + logging.info("[TAGGING] Loading model: vaderSentiment") + models['sentiment_analyzer'] = SentimentIntensityAnalyzer() try: - # Sentiment analysis using VADER emoji_lexicon = hf_hub_download( repo_id="ExordeLabs/SentimentDetection", filename="emoji_unic_lexicon.json", @@ -164,126 +65,153 @@ def predict(text, pipe, tag, mappings): loughran_dict = hf_hub_download( repo_id="ExordeLabs/SentimentDetection", filename="loughran_dict.json" ) + logging.info("[TAGGING] Loading Loughran_dict & unic_emoji_dict for sentiment_analyzer.") with open(emoji_lexicon) as f: unic_emoji_dict = json.load(f) with open(loughran_dict) as f: Loughran_dict = json.load(f) - sentiment_analyzer.lexicon.update(Loughran_dict) - sentiment_analyzer.lexicon.update(unic_emoji_dict) + models['sentiment_analyzer'].lexicon.update(Loughran_dict) + 
+def tag(documents: list[str], lab_configuration):
+    # loading from lab configuration, previously initialized
+    models = lab_configuration["models"]
+
+    for doc in documents:
+        assert isinstance(doc, str)
+
+    tmp = pd.DataFrame()
+    tmp["Translation"] = documents
+
+    assert tmp["Translation"] is not None
+    assert len(tmp["Translation"]) > 0
+
+    logging.info("Starting Tagging Batch pipeline...")
+    model = models['sentence_transformer']
+    tmp["Embedding"] = tmp["Translation"].swifter.apply(
+        lambda x: list(model.encode(x).astype(float))
+    )
+
+    zs_pipe = models['zs_pipe']
+    classification_labels = list(lab_configuration["labeldict"].keys())
+    tmp["Classification"] = tmp["Translation"].swifter.apply(
+        lambda x: zs_pipe(x, candidate_labels=classification_labels)
+    )
+
+    text_classification_models = ["Emotion", "Irony", "TextType"]
+    for col_name in text_classification_models:
+        pipe = models[col_name]
+        tmp[col_name] = tmp["Translation"].swifter.apply(
+            lambda x: [(y["label"], float(y["score"])) for y in pipe(x)[0]]
+        )
+
+    tokenizer = models['bert_tokenizer']
+    tmp["Embedded"] = tmp["Translation"].swifter.apply(
+        lambda x: np.array(
+            tokenizer.encode_plus(
+                x,
+                add_special_tokens=True,
+                max_length=512,
+                truncation=True,
+                padding="max_length",
+                return_attention_mask=False,
+                return_tensors="tf",
+            )["input_ids"][0]
+        ).reshape(1, -1)
+    )
+
+    sentiment_analyzer = models['sentiment_analyzer']
+    fdb_pipe = models['fdb_pipe']
+    gdb_pipe = models['gdb_pipe']

     def vader_sentiment(text):
-        # predict financial sentiment
-        return round(sentiment_analyzer.polarity_scores(text)["compound"],2)
+        return round(sentiment_analyzer.polarity_scores(text)["compound"], 2)

     def fin_vader_sentiment(text):
-        # predict general sentiment
-        return round(finvader(text,
-                    use_sentibignomics = True,
-                    use_henry = True,
-                    indicator = 'compound' ),2)
+        return round(finvader(text, use_sentibignomics=True, use_henry=True, indicator='compound'), 2)

     def fdb_sentiment(text):
         prediction = fdb_pipe(text)
-        fdb_sentiment_dict = {}
-        for e in prediction[0]:
-            if e["label"] == "negative":
-                fdb_sentiment_dict["negative"] = round(e["score"],3)
-            elif e["label"] == "neutral":
-                fdb_sentiment_dict["neutral"] = round(e["score"],3)
-            elif e["label"] == "positive":
-                fdb_sentiment_dict["positive"] = round(e["score"],3)
-        # compounded score
-        fdb_compounded_score = round((fdb_sentiment_dict["positive"] - fdb_sentiment_dict["negative"]),3)
-        return fdb_compounded_score
+        fdb_sentiment_dict = {e["label"]: round(e["score"], 3) for e in prediction[0]}
+        return round(fdb_sentiment_dict["positive"] - fdb_sentiment_dict["negative"], 3)

     def gdb_sentiment(text):
-        # predict general sentiment
         prediction = gdb_pipe(text)
-        gen_distilbert_sent = {}
-        for e in prediction[0]:
-            if e["label"] == "negative":
-                gen_distilbert_sent["negative"] = round(e["score"],3)
-            elif e["label"] == "neutral":
-                gen_distilbert_sent["neutral"] = round(e["score"],3)
-            elif e["label"] == "positive":
-                gen_distilbert_sent["positive"] = round(e["score"],3)
-        # compounded score
-        gdb_score = round((gen_distilbert_sent["positive"] - gen_distilbert_sent["negative"]),3)
-        return gdb_score
+        gen_distilbert_sent = {e["label"]: round(e["score"], 3) for e in prediction[0]}
+        return round(gen_distilbert_sent["positive"] - gen_distilbert_sent["negative"], 3)

     def compounded_financial_sentiment(text):
-        # 65% financial distil roberta model + 35% fin_vader_score
         fin_vader_sent = fin_vader_sentiment(text)
         fin_distil_score = fdb_sentiment(text)
-        fin_compounded_score = round((0.70 * fin_distil_score + 0.30 * fin_vader_sent),2)
-        return fin_compounded_score
+        return round((0.70 * fin_distil_score + 0.30 * fin_vader_sent), 2)

     def compounded_sentiment(text):
-        # compounded_total_score: gen_distilbert_sentiment * 60% + vader_sentiment * 20% + compounded_fin_sentiment * 20%
         gen_distilbert_sentiment = gdb_sentiment(text)
         vader_sent = vader_sentiment(text)
         compounded_fin_sentiment = compounded_financial_sentiment(text)
         if abs(compounded_fin_sentiment) >= 0.6:
-            compounded_total_score = round((0.30 * gen_distilbert_sentiment + 0.10 * vader_sent + 0.60 * compounded_fin_sentiment),2)
+            return round((0.30 * gen_distilbert_sentiment + 0.10 * vader_sent + 0.60 * compounded_fin_sentiment), 2)
         elif abs(compounded_fin_sentiment) >= 0.4:
-            compounded_total_score = round((0.40 * gen_distilbert_sentiment + 0.20 * vader_sent + 0.40 * compounded_fin_sentiment),2)
+            return round((0.40 * gen_distilbert_sentiment + 0.20 * vader_sent + 0.40 * compounded_fin_sentiment), 2)
         elif abs(compounded_fin_sentiment) >= 0.1:
-            compounded_total_score = round((0.60 * gen_distilbert_sentiment + 0.25 * vader_sent + 0.15 * compounded_fin_sentiment),2)
-        else: # if abs(compounded_fin_sentiment) < 0.1, so no apparent financial component
-            compounded_total_score = round((0.60 * gen_distilbert_sentiment + 0.40 * vader_sent),2)
-        return compounded_total_score
+            return round((0.60 * gen_distilbert_sentiment + 0.25 * vader_sent + 0.15 * compounded_fin_sentiment), 2)
+        else:
+            return round((0.60 * gen_distilbert_sentiment + 0.40 * vader_sent), 2)
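For intuition, a worked example of the tiered weighting above, with hypothetical model outputs (not from the patch). A strong financial signal (|fin| >= 0.6) shifts most of the weight onto the financial models:

    fdb, fin_vader = 0.80, 0.50                    # hypothetical fdb_sentiment / fin_vader_sentiment outputs
    fin = round(0.70 * fdb + 0.30 * fin_vader, 2)  # 0.56 + 0.15 -> 0.71, so the >= 0.6 tier applies
    gdb, vader = 0.60, 0.40
    total = round(0.30 * gdb + 0.10 * vader + 0.60 * fin, 2)  # 0.18 + 0.04 + 0.426 -> 0.65
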
-    # sentiment swifter apply compounded_sentiment
     tmp["Sentiment"] = tmp["Translation"].swifter.apply(compounded_sentiment)
-
-    # financial sentiment swifter apply compounded_financial_sentiment
     tmp["FinancialSentiment"] = tmp["Translation"].swifter.apply(compounded_financial_sentiment)

     del tmp["Embedded"]

-    # The output is a list of dictionaries, where each dictionary represents a single input text and contains
-    # various processed data like embeddings, text classifications, sentiment, etc., as key-value pairs.
-    # Update the items with processed data
     tmp = tmp.to_dict(orient="records")
     _out = []
     for i in range(len(tmp)):
-        language_score = LanguageScore(tmp[i]["LanguageScore"][0][1])
-
+        # add Sentiment
         sentiment = Sentiment(tmp[i]["Sentiment"])
+        # add Embedding
         embedding = Embedding(tmp[i]["Embedding"])
+        # Classification results are of the form
+        # {'sequence': 'text', 'labels': ['label1', 'label2', ...], 'scores': [score1, score2, ...]}
+        # we keep only the top label and score into a Classification object (tuple)
+        top_label = tmp[i]["Classification"]["labels"][0]
+        top_score = round(tmp[i]["Classification"]["scores"][0], 4)
+        classification = Classification(label=top_label, score=top_score)
+        # mock gender
         gender = Gender(male=0.5, female=0.5)

         types = {item[0]: item[1] for item in tmp[i]["TextType"]}
@@ -297,7 +225,10 @@ def compounded_sentiment(text):
             study=types["Statistics/Study"],
         )

+        # Emotions
         emotions = {item[0]: item[1] for item in tmp[i]["Emotion"]}
+        # round all values to 4 decimal places
+        emotions = {k: round(v, 4) for k, v in emotions.items()}
         emotion = Emotion(
             love=emotions["love"],
             admiration=emotions["admiration"],
@@ -328,22 +259,16 @@
             nervousness=emotions["nervousness"],
         )

+        # Irony
         ironies = {item[0]: item[1] for item in tmp[i]["Irony"]}
         irony = Irony(irony=ironies["irony"], non_irony=ironies["non_irony"])

-        # ages = {item[0]: item[1] for item in tmp[i]["Age"]}
-
-        # age = Age(
-        #     below_twenty=ages["<20"],
-        #     twenty_thirty=ages["20<30"],
-        #     thirty_forty=ages["30<40"],
-        #     forty_more=ages[">=40"],
-        # )
-        # hardcode age / unused
+        # Age (untrained model)
         age = Age(below_twenty=0.0, twenty_thirty=0.0, thirty_forty=0.0, forty_more=0.0)
-
+        # Language score (untrained model)
+        language_score = LanguageScore(1.0)  # default value
+        # Add the analysis to the output list
         analysis = Analysis(
+            classification=classification,
             language_score=language_score,
             sentiment=sentiment,
             embedding=embedding,
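The Classification extraction above relies on the zero-shot pipeline returning candidate labels sorted by descending score. A sketch of one result and its reduction to the (label, score) pair, with hypothetical values:

    result = {
        "sequence": "some translated text",
        "labels": ["Finance", "Sports", "Politics"],  # sorted by descending score
        "scores": [0.8712, 0.0803, 0.0485],
    }
    top_label = result["labels"][0]            # "Finance"
    top_score = round(result["scores"][0], 4)  # 0.8712
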
""" - # If max_depth is 0, return immediately with empty label and score 0 - if max_depth == 0: - return Classification(label=str(""), score=float(0)) - - labeldict = lab_configuration["labeldict"] - classifier = lab_configuration["classifier"] - text_ = item.translation - texts = [text_] - keys = list(labeldict.keys()) - - # Perform first level of classification - output = classifier(texts, keys, multi_label=False, max_length=32) - - # If max_depth is 1, return after first level of classification - if max_depth == 1: - return Classification( - label=output[0]["labels"][0], score=output[0]["scores"][0] - ) - - labels_list = list() - - for i in range(len(texts)): - labels = [ - output[i]["labels"][x] - for x in range(len(output[i]["labels"])) - if output[i]["scores"][x] > 0.1 - ] - labels_list.append(labels) - - # If max_depth not specified or larger than 1, perform second level of classification - keys = list(labeldict[labels_list[0][0]].keys()) - output = classifier(texts, keys, multi_label=False, max_length=32) - return Classification( - label=output[0]["labels"][0], score=output[0]["scores"][0] - ) + return Classification(label=str(""), score=float(0)) # not used anymore