Latest code push
dfoshidero committed Aug 31, 2024
1 parent 9224fa5 commit 74ee46f
Showing 4 changed files with 25 additions and 40 deletions.
3 changes: 3 additions & 0 deletions src/feature_extractor.py
@@ -252,6 +252,7 @@ def extract_numerical_feature(text, label, feature_keywords):

for feature in feature_numbers:
if feature_numbers[feature]:
# TODO: this doesn't make sense; update it to use the closest feature when more than one is found, instead of the most frequent value
feature_numbers[feature] = max(
set(feature_numbers[feature]), key=feature_numbers[feature].count
)
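A minimal sketch of the closest-feature selection the TODO asks for, assuming each candidate number is collected together with the character offset where it was matched; the helper name pick_closest_value and the (value, offset) structure are illustrative, not part of this commit.

def pick_closest_value(candidates, keyword_pos):
    """Return the candidate value whose match position is nearest the keyword.

    `candidates` is assumed to be a list of (value, char_offset) pairs gathered
    while scanning the text; `keyword_pos` is the offset of the feature keyword.
    Returns None when nothing was matched.
    """
    if not candidates:
        return None
    value, _ = min(candidates, key=lambda pair: abs(pair[1] - keyword_pos))
    return value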
@@ -353,4 +354,6 @@ def extract(input_text):
)
feature_values[feature] = numerical_values[feature]

for feature, value in feature_values.items():
print(f"{feature}: {value}")
return feature_values
4 changes: 1 addition & 3 deletions src/main.py
@@ -214,9 +214,7 @@ def time_it(description, func, *args, **kwargs):
elapsed_time = end_time - start_time
return result, elapsed_time
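Only the tail of time_it is visible in this hunk; a plausible reconstruction of the full helper, assuming it simply wraps the call with time.time() (everything except the last two lines is a guess, not the committed code):

import time

def time_it(description, func, *args, **kwargs):
    # Run func, report how long it took, and return the result with the timing.
    start_time = time.time()
    result = func(*args, **kwargs)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"{description} took {elapsed_time:.2f} seconds")
    return result, elapsed_time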

text = (
"A residential concrete building with raft, a basement and timber joists floors"
)
text = "A combination of concrete and steel supports the structure, with the exterior finished in glass and aluminum paneling."

print(f"From text: {text}.")
inputs, time_elapsed_extraction = time_it(
43 changes: 13 additions & 30 deletions trainers/model_train_validate.py
@@ -1,35 +1,28 @@
"""
This script is designed to load a dataset, preprocess it, train multiple machine learning models,
evaluate their performance using cross-validation, and save the models along with relevant metadata.
The script limits the dataset size for model training, tunes the models, and logs their performance metrics.
"""

import os
import joblib
import json

from datetime import datetime
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.base import clone

from model_utils import (
tune_model,
load_datasets,
prepare_datasets,
save_model_and_data,
)
from model_utils import tune_model, load_datasets, prepare_datasets, save_model_and_data

# Define the base directory and model paths
current_dir = os.path.dirname(os.path.abspath(__file__))
model_dir = os.path.join(current_dir, "../src/model")

# Create directories if they don't exist
os.makedirs(model_dir, exist_ok=True)

# Load the dataset
df = load_datasets()

# Extract and save unique values before any preprocessing
# Save unique values from the dataset before preprocessing for later use
unique_values = {col: df[col].dropna().unique().tolist() for col in df.columns}
joblib.dump(unique_values, os.path.join(model_dir, "unique_values.pkl"))

# Prepare datasets for model training
X_cleaned, y_cleaned, cleaned_label_encoders = prepare_datasets(df)

# Save feature names for later use
@@ -38,53 +31,43 @@
# Save label encoders for later use
joblib.dump(cleaned_label_encoders, os.path.join(model_dir, "label_encoders.pkl"))

# Set the limiter for the number of data points
LIMITER = 150000 # You can change this value as needed
# Limiter for the number of data points to train
LIMITER = 150000 # Modify this value as needed

# Ensure LIMITER does not exceed the available data points
LIMITER = min(LIMITER, X_cleaned.shape[0])

# Reduce dataset size based on the limiter
X_cleaned_limited = X_cleaned.iloc[:LIMITER]
y_cleaned_limited = y_cleaned.iloc[:LIMITER]

# Tune models and store the best estimators
model_cleaned = tune_model(X_cleaned_limited, y_cleaned_limited)

# Save the preprocessing pipelines along with the model
# joblib.dump(model_cleaned['RandomForest'], os.path.join(model_dir, 'pipeline.pkl'))

# Initialize performance logs
performance_logs = []

# Train and evaluate models
for model_name, model in model_cleaned.items():
full_model_name = f"synthetic_{model_name}"

# First split: train and test
# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
X_cleaned_limited, y_cleaned_limited, test_size=0.3, random_state=42
)

# Train and tune the model using the training and validation sets
model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)

r_squared_train = r2_score(y_train, y_train_pred)

print(f"R-squared for {full_model_name} on training set: {r_squared_train}")

performance_logs.append(f"{full_model_name}: Training R-squared: {r_squared_train}")

# Evaluate the model on the test set
y_test_pred = model.predict(X_test)
r_squared_test = r2_score(y_test, y_test_pred)
print(f"R-squared for {full_model_name} on testing set: {r_squared_test}")

performance_logs.append(f"{full_model_name}: Testing R-squared: {r_squared_test}")

# Clone the model with verbosity turned off
# Clone the model to perform cross-validation without affecting the original model
model_cv = clone(model)
if hasattr(model_cv, "verbose"):
model_cv.verbose = 0
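The cross-validation call itself sits in the collapsed part of this hunk; given the cross_val_score import above and the error handling below, it likely resembles the following sketch (the fold count and scoring metric are assumptions, not visible in this diff):

try:
    # Cross-validate the silenced clone so per-fold training output stays quiet
    cv_scores = cross_val_score(
        model_cv, X_cleaned_limited, y_cleaned_limited, cv=5, scoring="r2"
    )
    print(f"Mean cross-validation R-squared for {full_model_name}: {cv_scores.mean()}")
    performance_logs.append(
        f"{full_model_name}: Mean CV R-squared: {cv_scores.mean()}"
    )
except Exception as e:
    print(f"Error in cross-validation for {full_model_name}: {e}")
    performance_logs.append(f"{full_model_name}: Cross-validation error: {e}\n")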
Expand All @@ -108,7 +91,7 @@
print(f"Error in cross-validation for {full_model_name}: {e}")
performance_logs.append(f"{full_model_name}: Cross-validation error: {e}\n")

# Save the model
# Save the model and associated data
save_model_and_data(model, full_model_name, model_dir, performance_logs)

# Save performance logs to a text file with date and time
15 changes: 8 additions & 7 deletions trainers/model_utils.py
@@ -6,12 +6,8 @@
GradientBoostingRegressor,
RandomForestRegressor,
StackingRegressor,
ExtraTreesRegressor,
)
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
@@ -51,9 +47,14 @@
"HistGradientBoosting": (
HistGradientBoostingRegressor(random_state=42, verbose=1),
param_dist_hgb,
# "GradientBoosting": (
# GradientBoostingRegressor(random_state=42, verbose=1),
# param_dist_gb,
),
"GradientBoosting": (
GradientBoostingRegressor(random_state=42, verbose=1),
param_dist_gb,
),
"RandomForest": (
RandomForestRegressor(random_state=42, verbose=1),
param_dist_rf,
),
}
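For context, a hedged sketch of how tune_model might consume the dictionary above using the RandomizedSearchCV import visible in this file; the dictionary name models and the search settings (n_iter, cv, scoring) are assumptions, since the real function body is not part of this diff.

def tune_model(X, y, n_iter=20, cv=3):
    # Randomized-search each candidate estimator and keep the best one found.
    best_estimators = {}
    for name, (estimator, param_dist) in models.items():
        search = RandomizedSearchCV(
            estimator,
            param_distributions=param_dist,
            n_iter=n_iter,
            cv=cv,
            scoring="r2",
            random_state=42,
            n_jobs=-1,
        )
        search.fit(X, y)
        best_estimators[name] = search.best_estimator_
    return best_estimators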

