# trainingModel.py
"""
This is the Entry point for Training the Machine Learning Model.
Written By: iNeuron Intelligence
Version: 1.0
Revisions: None
"""
# Doing the necessary imports
from sklearn.model_selection import train_test_split

# Project-local packages
from data_ingestion import data_loader
from data_preprocessing import preprocessing
from data_preprocessing import clustering
from best_model_finder import tuner
from file_operations import file_methods
from application_logging import logger
from azure_file import azure_methodes

import os
import yaml

class trainModel:

    def __init__(self):
        # Read the training configuration from the YAML config file
        with open(os.path.join("configfile", "hyperparameter.yaml"), "r") as file:
            self.config = yaml.safe_load(file)
        self.drop_cols = self.config["columns"]["drop_cols"]
        self.label = self.config["columns"]["label_column_name"]
        self.random_state = self.config["base"]["random_state"]
        self.test_size = self.config["base"]["test_size"]
        # Creating the common logging object
        self.log_writer = logger.App_Logger()
        self.file_object = "ModelTrainingLog"
        self.file = "Azurelog"
        self.azure = azure_methodes(logger_object=self.log_writer, file_object=self.file)
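
    # A minimal sketch of the expected configfile/hyperparameter.yaml layout,
    # inferred from the keys accessed in __init__ above. The numeric values are
    # illustrative assumptions; the column names match the EDA notes referenced
    # in trainingModel below. The real file may hold additional sections, e.g.
    # per-model hyperparameters consumed by the tuner.
    #
    #   base:
    #     random_state: 42
    #     test_size: 0.33
    #   columns:
    #     label_column_name: 'Calories'
    #     drop_cols: ['Id', 'ActivityDate', 'TotalDistance', 'TrackerDistance']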
    def trainingModel(self):
        # Logging the start of training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object, self.log_writer)
            data = data_getter.get_data()

            # --- Data preprocessing ---
            preprocessor = preprocessing.Preprocessor(self.file_object, self.log_writer)

            # Removing unwanted columns as discussed in the EDA part of the notebook,
            # e.g. ['Id', 'ActivityDate', 'TotalDistance', 'TrackerDistance']
            data = preprocessor.dropUnnecessaryColumns(data, self.drop_cols)

            # Replacing 'na' values with np.nan as discussed in the EDA part
            data = preprocessor.replaceInvalidValuesWithNull(data)

            # Check whether missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(data)

            # If missing values are present, impute them appropriately
            if is_null_present:
                data = preprocessor.impute_missing_values(data)  # missing value imputation

            # Create separate feature and label sets; the label column name
            # (e.g. 'Calories') comes from the config file
            X, Y = preprocessor.separate_label_feature(data, label_column_name=self.label)

            # --- Applying the clustering approach ---
            kmeans = clustering.KMeansClustering(self.file_object, self.log_writer)  # object initialization
            number_of_clusters = kmeans.elbow_plot(X)  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # Create a new column in the dataset holding the corresponding labels
            X['Labels'] = Y

            # Getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()

            # --- Parsing all the clusters and looking for the best ML algorithm for each individual cluster ---
            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] == i]  # filter the data for one cluster

                # Prepare the feature and label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'], axis=1)
                cluster_label = cluster_data['Labels']

                # Splitting the data into training and test sets for each cluster, one by one
                x_train, x_test, y_train, y_test = train_test_split(cluster_features, cluster_label,
                                                                    test_size=self.test_size,
                                                                    random_state=self.random_state)
                x_train_scaled = preprocessor.standardScalingData(x_train)
                x_test_scaled = preprocessor.standardScalingData(x_test)

                model_finder = tuner.Model_Finder(self.file_object, self.log_writer)  # object initialization

                # Getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(x_train_scaled, y_train,
                                                                          x_test_scaled, y_test, cluster=i)

                # Saving the best model to the directory, suffixed with its cluster number
                file_op = file_methods.File_Operation(self.file_object, self.log_writer)
                save_model = file_op.save_model(best_model, best_model_name + str(i))

            # Logging the successful end of training
            self.log_writer.log(self.file_object, 'Successful End of Training')

        except Exception:
            # Logging the unsuccessful end of training
            self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
            raise  # re-raise the original exception so the caller sees the full traceback
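

# A minimal usage sketch, assuming this module is run directly; the original
# project may instead invoke trainModel from a separate entry script or API route.
if __name__ == "__main__":
    train_model_obj = trainModel()   # loads configfile/hyperparameter.yaml
    train_model_obj.trainingModel()  # ingest, preprocess, cluster, tune, and save models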