Latest code push
dfoshidero committed Aug 31, 2024
1 parent 9224fa5 commit 74ee46f
Showing 4 changed files with 25 additions and 40 deletions.
3 changes: 3 additions & 0 deletions src/feature_extractor.py
@@ -252,6 +252,7 @@ def extract_numerical_feature(text, label, feature_keywords):

for feature in feature_numbers:
if feature_numbers[feature]:
# TODO: this doesn't make sense; update it to use the closest feature when more than one is found, instead of the most frequent value
feature_numbers[feature] = max(
set(feature_numbers[feature]), key=feature_numbers[feature].count
)
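A minimal sketch of the closest-feature selection the TODO asks for, assuming each candidate number is collected together with the character offset where it was matched; the helper name pick_closest_value and the (value, offset) structure are illustrative, not part of this commit.

def pick_closest_value(candidates, keyword_pos):
    """Return the candidate value whose match position is nearest the keyword.

    `candidates` is assumed to be a list of (value, char_offset) pairs gathered
    while scanning the text; `keyword_pos` is the offset of the feature keyword.
    Returns None when nothing was matched.
    """
    if not candidates:
        return None
    value, _ = min(candidates, key=lambda pair: abs(pair[1] - keyword_pos))
    return value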
@@ -353,4 +354,6 @@ def extract(input_text):
)
feature_values[feature] = numerical_values[feature]

for feature, value in feature_values.items():
print(f"{feature}: {value}")
return feature_values
4 changes: 1 addition & 3 deletions src/main.py
@@ -214,9 +214,7 @@ def time_it(description, func, *args, **kwargs):
elapsed_time = end_time - start_time
return result, elapsed_time
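Only the tail of time_it is visible in this hunk; a plausible reconstruction of the full helper, assuming it simply wraps the call with time.time() (everything except the last two lines is a guess, not the committed code):

import time

def time_it(description, func, *args, **kwargs):
    # Run func, report how long it took, and return the result with the timing.
    start_time = time.time()
    result = func(*args, **kwargs)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"{description} took {elapsed_time:.2f} seconds")
    return result, elapsed_time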

text = (
"A residential concrete building with raft, a basement and timber joists floors"
)
text = "A combination of concrete and steel supports the structure, with the exterior finished in glass and aluminum paneling."

print(f"From text: {text}.")
inputs, time_elapsed_extraction = time_it(
43 changes: 13 additions & 30 deletions trainers/model_train_validate.py
@@ -1,35 +1,28 @@
"""
This script is designed to load a dataset, preprocess it, train multiple machine learning models,
evaluate their performance using cross-validation, and save the models along with relevant metadata.
The script limits the dataset size for model training, tunes the models, and logs their performance metrics.
"""

import os
import joblib
import json

from datetime import datetime
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.base import clone

from model_utils import (
tune_model,
load_datasets,
prepare_datasets,
save_model_and_data,
)
from model_utils import tune_model, load_datasets, prepare_datasets, save_model_and_data

# Define the base directory and model paths
current_dir = os.path.dirname(os.path.abspath(__file__))
model_dir = os.path.join(current_dir, "../src/model")

# Create directories if they don't exist
os.makedirs(model_dir, exist_ok=True)

# Load the dataset
df = load_datasets()

# Extract and save unique values before any preprocessing
# Save unique values from the dataset before preprocessing for later use
unique_values = {col: df[col].dropna().unique().tolist() for col in df.columns}
joblib.dump(unique_values, os.path.join(model_dir, "unique_values.pkl"))

# Prepare datasets for model training
X_cleaned, y_cleaned, cleaned_label_encoders = prepare_datasets(df)

# Save feature names for later use
@@ -38,53 +31,43 @@
# Save label encoders for later use
joblib.dump(cleaned_label_encoders, os.path.join(model_dir, "label_encoders.pkl"))

# Set the limiter for the number of data points
LIMITER = 150000 # You can change this value as needed
# Limiter for the number of data points to train
LIMITER = 150000 # Modify this value as needed

# Ensure LIMITER does not exceed the available data points
LIMITER = min(LIMITER, X_cleaned.shape[0])

# Reduce dataset size based on the limiter
X_cleaned_limited = X_cleaned.iloc[:LIMITER]
y_cleaned_limited = y_cleaned.iloc[:LIMITER]

# Tune models and store the best estimators
model_cleaned = tune_model(X_cleaned_limited, y_cleaned_limited)

# Save the preprocessing pipelines along with the model
# joblib.dump(model_cleaned['RandomForest'], os.path.join(model_dir, 'pipeline.pkl'))

# Initialize performance logs
performance_logs = []

# Train and evaluate models
for model_name, model in model_cleaned.items():
full_model_name = f"synthetic_{model_name}"

# First split: train and test
# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
X_cleaned_limited, y_cleaned_limited, test_size=0.3, random_state=42
)

# Train and tune the model using the training and validation sets
model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)

r_squared_train = r2_score(y_train, y_train_pred)

print(f"R-squared for {full_model_name} on training set: {r_squared_train}")

performance_logs.append(f"{full_model_name}: Training R-squared: {r_squared_train}")

# Evaluate the model on the test set
y_test_pred = model.predict(X_test)
r_squared_test = r2_score(y_test, y_test_pred)
print(f"R-squared for {full_model_name} on testing set: {r_squared_test}")

performance_logs.append(f"{full_model_name}: Testing R-squared: {r_squared_test}")

# Clone the model with verbosity turned off
# Clone the model to perform cross-validation without affecting the original model
model_cv = clone(model)
if hasattr(model_cv, "verbose"):
model_cv.verbose = 0
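The cross-validation call itself sits in the collapsed part of this hunk; given the cross_val_score import above and the error handling below, it likely resembles the following sketch (the fold count and scoring metric are assumptions, not visible in this diff):

try:
    # Cross-validate the silenced clone so per-fold training output stays quiet
    cv_scores = cross_val_score(
        model_cv, X_cleaned_limited, y_cleaned_limited, cv=5, scoring="r2"
    )
    print(f"Mean cross-validation R-squared for {full_model_name}: {cv_scores.mean()}")
    performance_logs.append(
        f"{full_model_name}: Mean CV R-squared: {cv_scores.mean()}"
    )
except Exception as e:
    print(f"Error in cross-validation for {full_model_name}: {e}")
    performance_logs.append(f"{full_model_name}: Cross-validation error: {e}\n")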
Expand All @@ -108,7 +91,7 @@
print(f"Error in cross-validation for {full_model_name}: {e}")
performance_logs.append(f"{full_model_name}: Cross-validation error: {e}\n")

# Save the model
# Save the model and associated data
save_model_and_data(model, full_model_name, model_dir, performance_logs)

# Save performance logs to a text file with date and time
15 changes: 8 additions & 7 deletions trainers/model_utils.py
@@ -6,12 +6,8 @@
GradientBoostingRegressor,
RandomForestRegressor,
StackingRegressor,
ExtraTreesRegressor,
)
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
@@ -51,9 +47,14 @@
"HistGradientBoosting": (
HistGradientBoostingRegressor(random_state=42, verbose=1),
param_dist_hgb,
# "GradientBoosting": (
# GradientBoostingRegressor(random_state=42, verbose=1),
# param_dist_gb,
),
"GradientBoosting": (
GradientBoostingRegressor(random_state=42, verbose=1),
param_dist_gb,
),
"RandomForest": (
RandomForestRegressor(random_state=42, verbose=1),
param_dist_rf,
),
}
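For context, a hedged sketch of how tune_model might consume the dictionary above using the RandomizedSearchCV import visible in this file; the dictionary name models and the search settings (n_iter, cv, scoring) are assumptions, since the real function body is not part of this diff.

def tune_model(X, y, n_iter=20, cv=3):
    # Randomized-search each candidate estimator and keep the best one found.
    best_estimators = {}
    for name, (estimator, param_dist) in models.items():
        search = RandomizedSearchCV(
            estimator,
            param_distributions=param_dist,
            n_iter=n_iter,
            cv=cv,
            scoring="r2",
            random_state=42,
            n_jobs=-1,
        )
        search.fit(X, y)
        best_estimators[name] = search.best_estimator_
    return best_estimators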

