Skip to content

Commit

Permalink
Batch processing optimization (#66)
Browse files Browse the repository at this point in the history
* Update lab_initialization.py
* Update pre_install.py
* Update zero_shot.py
* Update tag.py
* Update Dockerfile
* Update models.py


Replaces the large zero-shot model used until now with a smaller, more efficient model of similar accuracy.
We are now able to add it to the batch-processing data pipeline and remove it from inter-item processing, accelerating the process a lot and reducing RAM usage over time.

The image will now be roughly 1.2 GB smaller and will have a very fast item processing time, with all models loaded in RAM beforehand.
  • Loading branch information
MathiasExorde authored Oct 14, 2024
1 parent 4df9c4a commit 6dd1b2d
Show file tree
Hide file tree
Showing 7 changed files with 136 additions and 252 deletions.
3 changes: 1 addition & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@ RUN pip3.10 install --no-cache-dir \
'git+https://github.com/exorde-labs/exorde_data.git' \
'git+https://github.com/exorde-labs/exorde-client.git'\
selenium==4.2.0 \
wtpsplit==1.3.0 \
&& pip3.10 install --no-cache-dir --upgrade 'git+https://github.com/JustAnotherArchivist/snscrape.git'
wtpsplit==1.3.0

# Clean cache now that we have installed everything
RUN rm -rf /root/.cache/* \
Expand Down
13 changes: 4 additions & 9 deletions exorde/lab_initialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,13 @@
import torch
from transformers import pipeline
from argostranslate import translate as _translate
from exorde.tag import initialize_models


def lab_initialization():
device = torch.cuda.current_device() if torch.cuda.is_available() else -1
classifier = pipeline(
"zero-shot-classification",
model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
device=device,
batch_size=16,
top_k=None,
max_length=64,
)
# initialize models
models = initialize_models(device)
labels = requests.get(
"https://raw.githubusercontent.com/exorde-labs/TestnetProtocol/main/targets/class_names.json"
).json()
Expand All @@ -34,7 +29,7 @@ def lab_initialization():
installed_languages = _translate.get_installed_languages()
return {
"device": device,
"classifier": classifier,
"models": models,
"labeldict": labels,
"mappings": mappings,
"nlp": nlp,
Expand Down
1 change: 1 addition & 0 deletions exorde/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ class Age(dict, metaclass=MadType):
class Analysis(dict, metaclass=MadType):
language_score: LanguageScore
sentiment: Sentiment
classification: Classification
embedding: Embedding
gender: Gender
text_type: TextType
Expand Down
4 changes: 1 addition & 3 deletions exorde/pre_install.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,10 @@
test_lang_detect = ft_test_detect("test")

models = [
"MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
"MoritzLaurer/deberta-v3-xsmall-zeroshot-v1.1-all-33",
"SamLowe/roberta-base-go_emotions",
"cardiffnlp/twitter-roberta-base-irony",
"salesken/query_wellformedness_score",
"marieke93/MiniLM-evidence-types",
"alimazhar-110/website_classification",
"mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis",
"lxyuan/distilbert-base-multilingual-cased-sentiments-student",
"bert-large-uncased"
Expand Down
2 changes: 1 addition & 1 deletion exorde/process_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ async def process_batch(
completed: ProcessedItem = ProcessedItem(
item=prot_item,
analysis=ProtocolAnalysis(
classification=processed.classification,
classification=analysis.classification,
top_keywords=processed.top_keywords,
language_score=analysis.language_score,
gender=analysis.gender,
Expand Down
Loading

0 comments on commit 6dd1b2d

Please sign in to comment.