
Commit

implemented memory fixes
dfoshidero committed Aug 8, 2024
1 parent bd3ed01 commit 71abfef
Showing 5 changed files with 122 additions and 32 deletions.
Binary file modified src/__pycache__/feature_extractor.cpython-310.pyc
Binary file modified src/__pycache__/model_predictor.cpython-310.pyc
47 changes: 39 additions & 8 deletions src/feature_extractor.py
@@ -5,7 +5,6 @@
import spacy
import nltk
import random
import multiprocessing
import numpy as np

from word2number import w2n
@@ -14,14 +13,44 @@
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer, util

# Initialize variables for models and other resources
_nlp_model = None
_sentence_transformer_model = None
_stop_words = None
_lemmatizer = None


# Load pre-trained NER model (spaCy example)
nlp = spacy.load("en_core_web_trf")
def get_nlp_model():
    global _nlp_model
    if _nlp_model is None:
        _nlp_model = spacy.load("en_core_web_trf")
    return _nlp_model


# Load pre-trained sentence transformer model for semantic similarity
model = SentenceTransformer("all-mpnet-base-v2")
def get_sentence_transformer_model():
    global _sentence_transformer_model
    if _sentence_transformer_model is None:
        _sentence_transformer_model = SentenceTransformer("all-mpnet-base-v2")
    return _sentence_transformer_model


# Lazy loading for stop words
def get_stop_words():
    global _stop_words
    if _stop_words is None:
        _stop_words = set(stopwords.words("english"))
    return _stop_words


# Lazy loading for lemmatizer
def get_lemmatizer():
    global _lemmatizer
    if _lemmatizer is None:
        _lemmatizer = WordNetLemmatizer()
    return _lemmatizer

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
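
Editor's note: the four getters above hand-roll the same memoized-singleton pattern. An equivalent, more compact sketch using functools.lru_cache — an alternative, not what this commit ships:

import functools

import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer


@functools.lru_cache(maxsize=None)
def get_nlp_model():
    # Loaded on first call only; the cached object is returned on every call after that.
    return spacy.load("en_core_web_trf")


@functools.lru_cache(maxsize=None)
def get_sentence_transformer_model():
    return SentenceTransformer("all-mpnet-base-v2")


@functools.lru_cache(maxsize=None)
def get_stop_words():
    return frozenset(stopwords.words("english"))


@functools.lru_cache(maxsize=None)
def get_lemmatizer():
    return WordNetLemmatizer()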

numerical_features = [
"Gross Internal Area (m2)",
@@ -59,6 +88,9 @@ def get_related_terms(word, synonym_dict):


def preprocess_text(text, synonym_dict):
    stop_words = get_stop_words()
    lemmatizer = get_lemmatizer()

    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
@@ -167,6 +199,8 @@ def extract_feature_values(
    synonym_dict,
    threshold=SIMILARITY_THRESHOLD,
):
    nlp = get_nlp_model()
    model = get_sentence_transformer_model()
    doc = nlp(input_text)
    explicit_features, filtered_text = extract_explicit_features(
        input_text, unique_values, synonym_dict, model, numerical_features
@@ -352,7 +386,4 @@ def extract(input_text):
    )
    feature_values[feature] = numerical_values[feature]

    # DEBUG
    for feature, value in feature_values.items():
        print(f"{feature}: {value}")
    return feature_values
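
Editor's note: the commit drops the per-feature debug print. If that trace is still useful, a logging-based variant (not in this commit) keeps it switchable without printing in production:

import logging

logger = logging.getLogger(__name__)

# Emitted only when the logging level is DEBUG; silent otherwise.
for feature, value in feature_values.items():
    logger.debug("%s: %s", feature, value)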
37 changes: 32 additions & 5 deletions src/main.py
@@ -1,6 +1,13 @@
from model_predictor import predictor
from model_predictor import load_resources, predict as model_predict
from feature_extractor import extract
import time
import numpy as np
import os
import psutil
import gc

# Load resources
model, features, label_encoders, unique_values = load_resources()


def predict(
@@ -40,9 +47,9 @@ def predict(
    SERVICES,
):
    """
    Get user input for the new columns.
    Get user input for the new columns and make a prediction.
    :return: dictionary with user input values for the new columns
    :return: list with prediction values
    """
    user_input = {
        "Sector": [None if SECTOR == "None" else SECTOR],
@@ -97,12 +104,32 @@ def predict(
"Services": [None if SERVICES == "None" else SERVICES],
}

prediction = predictor(user_input)
prediction = model_predict(user_input, model, features, label_encoders)
prediction_list = (
prediction.tolist() if isinstance(prediction, np.ndarray) else prediction
)

log_memory_usage("During Prediction")

return prediction
return prediction_list
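
Editor's note: the isinstance guard converts numpy output into a plain list, e.g. for JSON serialization. A quick illustration:

import numpy as np

prediction = np.array([1234.5])
prediction_list = prediction.tolist() if isinstance(prediction, np.ndarray) else prediction
print(prediction_list)  # [1234.5] — plain Python floats, JSON-serializable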


def log_memory_usage(phase):
    process = psutil.Process(os.getpid())
    # Collect garbage before sampling so the report reflects post-cleanup usage.
    gc.collect()
    memory_info = process.memory_info()
    print(
        f"[{phase}] Memory Usage: RSS={memory_info.rss / (1024 * 1024):.2f} MB, VMS={memory_info.vms / (1024 * 1024):.2f} MB"
    )
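
Editor's note: a usage sketch; the numbers are illustrative, not from a real run:

log_memory_usage("After Resource Load")
# [After Resource Load] Memory Usage: RSS=512.34 MB, VMS=1024.56 MB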


def get_natural_language_input(text):
    """
    Extracts structured data from a given natural language input.
    :param text: The natural language input text.
    :return: tuple of extracted values
    """
    value_list = extract(text)
    SECTOR = value_list.get("Sector")
    SUBSECTOR = value_list.get("Sub-Sector")
70 changes: 51 additions & 19 deletions src/model_predictor.py
@@ -3,21 +3,32 @@
import pandas as pd
import numpy as np

# Define the base directory and model paths
current_dir = os.path.dirname(os.path.abspath(__file__))
model_dir = os.path.join(current_dir, "model")

# Updated model file paths based on new files
features_filepath = os.path.join(model_dir, "features.pkl")
label_encoders_filepath = os.path.join(model_dir, "label_encoders.pkl")
synthetic_model_filepath = os.path.join(model_dir, "synthetic_HistGradientBoosting.pkl")
unique_values_filepath = os.path.join(model_dir, "unique_values.pkl")
def load_resources():
    """
    Load the necessary resources.
    :return: tuple of loaded resources
    """
    current_dir = os.path.dirname(os.path.abspath(__file__))
    model_dir = os.path.join(current_dir, "model")

    features_filepath = os.path.join(model_dir, "features.pkl")
    label_encoders_filepath = os.path.join(model_dir, "label_encoders.pkl")
    synthetic_model_filepath = os.path.join(
        model_dir, "synthetic_HistGradientBoosting.pkl"
    )
    unique_values_filepath = os.path.join(model_dir, "unique_values.pkl")

    with open(synthetic_model_filepath, "rb") as f:
        model = joblib.load(f)
    with open(features_filepath, "rb") as f:
        features = joblib.load(f)
    with open(label_encoders_filepath, "rb") as f:
        label_encoders = joblib.load(f)
    with open(unique_values_filepath, "rb") as f:
        unique_values = joblib.load(f)

# Load pre-trained models, label encoders, and unique values
model = joblib.load(synthetic_model_filepath)
features = joblib.load(features_filepath)
label_encoders = joblib.load(label_encoders_filepath)
unique_values = joblib.load(unique_values_filepath)

    return model, features, label_encoders, unique_values
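
Editor's note: if the pickled model embeds large numpy arrays, joblib can memory-map them from disk instead of copying them into RAM. A sketch under that assumption, not part of this commit:

# mmap_mode="r" maps array payloads read-only from disk; it only helps when
# the pickle actually contains large numpy arrays.
model = joblib.load(synthetic_model_filepath, mmap_mode="r")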


def apply_label_encoding(user_input, label_encoders):
@@ -39,7 +50,6 @@ def apply_label_encoding(user_input, label_encoders):
elif "Other" in encoder.classes_:
encoded_values.append(encoder.transform(["Other"])[0])
else:
# Create a new category "Unknown" if it doesn't exist
new_classes = np.append(encoder.classes_, "Unknown")
encoder.classes_ = new_classes
encoded_values.append(encoder.transform(["Unknown"])[0])
@@ -66,18 +76,29 @@ def preprocess_input(user_input, features, label_encoders):
"Aligned DataFrame is empty. Check if input features match training features."
)

# Clear input DataFrame to free memory
del input_df

return aligned_df
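
Editor's note: `del` removes the local name, not the object; memory is only reclaimed once no other reference remains. A minimal illustration:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
alias = df
del df              # unbinds the name `df` only
print(alias.shape)  # (3, 1) — the DataFrame is still alive via `alias`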


def predict(user_input, model, features, label_encoders):
    """
    Predict using the model.
    :param user_input: dictionary with user inputs
    :param model: trained model
    :param features: list of feature names used during training
    :param label_encoders: dictionary with label encoders
    :return: prediction result
    """
    preprocessed_input = preprocess_input(user_input, features, label_encoders)
    return model.predict(preprocessed_input)
    prediction = model.predict(preprocessed_input)

    # Clear intermediate data
    del preprocessed_input

    return prediction


def predictor(user_input):
@@ -87,9 +108,13 @@ def predictor(user_input):
    :param user_input: dictionary with user inputs
    :return: combined prediction result
    """
    pred = predict(user_input)
    final_prediction = pred  # Adjust as necessary if you have multiple models
    return final_prediction
    model, features, label_encoders, unique_values = load_resources()
    pred = predict(user_input, model, features, label_encoders)

    # Clear loaded resources to free memory
    del model, features, label_encoders, unique_values

    return pred
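
Editor's note: predictor() reloads every pickle from disk on each call, trading latency for the memory freed by the del above. The commit's main.py takes the other path — call load_resources() once at import and reuse the objects. A sketch of that call pattern (predict_many is hypothetical):

model, features, label_encoders, unique_values = load_resources()  # once, at import time

def predict_many(inputs):
    # Reuse the already-loaded resources instead of reloading per call.
    return [predict(user_input, model, features, label_encoders) for user_input in inputs]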


def align_features(input_df, training_columns):
@@ -106,6 +131,10 @@ def align_features(input_df, training_columns):
            aligned_df[col] = input_df[col]
        else:
            aligned_df[col] = np.nan  # Keep missing values as NaN

    # Clear input DataFrame to free memory
    del input_df

    return aligned_df
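
Editor's note: a quick illustration of the alignment behaviour (data hypothetical; the output follows from the loop over training_columns shown above — present columns are copied, missing ones become NaN, extras are dropped):

import pandas as pd

input_df = pd.DataFrame({"Sector": ["Office"], "Stray Column": [1]})
training_columns = ["Sector", "Sub-Sector", "Gross Internal Area (m2)"]

aligned = align_features(input_df, training_columns)
print(aligned)
#    Sector  Sub-Sector  Gross Internal Area (m2)
# 0  Office         NaN                       NaN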


@@ -124,3 +153,6 @@ def validate_user_input(user_input, unique_values):
        raise ValueError(
            f"Value for {feature} can only be {unique_values[feature]}."
        )

    # Clear user_input and unique_values to free memory
    del user_input, unique_values
