import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, f_regression, RFE
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from keras.models import Sequential
from keras.layers import Dense
# Load the dataset
data = pd.read_csv('/mnt/data/data_clean.csv')
# Preprocess the data
# Use pd.get_dummies to encode the 'industry' column and any other categorical columns
data = pd.get_dummies(data, columns=['industry','country'], drop_first=True)
# Encoding the 'stage_grouped' column if necessary
if data['stage_grouped'].dtype == 'object':
    from sklearn.preprocessing import LabelEncoder
    label_encoder = LabelEncoder()
    data['stage_grouped'] = label_encoder.fit_transform(data['stage_grouped'])
# Drop any unnecessary columns
columns_to_drop = ['customer_number', 'stage_grouped', 'max_closed_date', 'cloud_revenue', 'total_opp_amount']
# Feature and target split for classification
# (columns_to_drop is already a list; wrapping it in another list would raise a KeyError)
X_class = data.drop(columns=columns_to_drop)
y_class = data['stage_grouped']
# Feature and target split for regression
X_reg = data.drop(columns=columns_to_drop)
y_reg = data['total_opp_amount']
# Keep the feature names before scaling converts the frames to plain arrays
feature_names = X_class.columns
# Standardize the features
# (Note: fitting the scaler on the full dataset leaks test-set statistics into
# training; fitting on the training split only would be the stricter approach.)
scaler = StandardScaler()
X_class = scaler.fit_transform(X_class)
X_reg = scaler.fit_transform(X_reg)
# Split the data
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)
### 1. Correlation Analysis
print("Correlation Analysis:")
correlation_matrix = pd.DataFrame(X_class, columns=feature_names).corrwith(y_class).abs()
print(correlation_matrix.sort_values(ascending=False))
# Plot correlation heatmap for the top features
top_corr = correlation_matrix.sort_values(ascending=False).head(15)
plt.figure(figsize=(10, 8))
sns.heatmap(top_corr.values.reshape(-1, 1), annot=True, cmap='coolwarm', cbar=True, yticklabels=top_corr.index)
plt.title("Feature Correlation with Target (Classification)")
plt.show()
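# As an alternative univariate filter, SelectKBest (imported above but otherwise
# unused) scores each feature with an ANOVA F-test. A minimal sketch, assuming
# k=10 is a reasonable cut-off for this dataset:
kbest = SelectKBest(score_func=f_classif, k=10)
X_train_kbest = kbest.fit_transform(X_train_class, y_train_class)
X_test_kbest = kbest.transform(X_test_class)
print(f"SelectKBest features: {feature_names[kbest.get_support()].tolist()}")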
### 2. Recursive Feature Elimination (RFE)
print("Recursive Feature Elimination (RFE):")
rfe_classifier = RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=10)
X_train_rfe = rfe_classifier.fit_transform(X_train_class, y_train_class)
X_test_rfe = rfe_classifier.transform(X_test_class)
print(f"Selected Features: {rfe_classifier.support_}")
### 3. Feature Importance using Random Forest
print("Feature Importance from Random Forest:")
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_class, y_train_class)
importances = rf_classifier.feature_importances_
# Plot feature importance (labels must match the columns the model was trained on)
importance_series = pd.Series(importances, index=feature_names).sort_values(ascending=False)
plt.figure(figsize=(10, 8))
sns.barplot(x=importance_series.values, y=importance_series.index)
plt.title("Feature Importance (Classification)")
plt.show()
### 4. PCA for Dimensionality Reduction
print("Principal Component Analysis (PCA):")
pca = PCA(n_components=0.95) # Keep 95% of the variance
X_train_pca = pca.fit_transform(X_train_class)
X_test_pca = pca.transform(X_test_class)
print(f"Number of components selected: {pca.n_components_}")
# Hyperparameter Tuning for Classification Models
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(probability=True),
    'K-Nearest Neighbors': KNeighborsClassifier()
}
param_grid = {
    'Logistic Regression': {'C': [0.01, 0.1, 1, 10, 100]},
    'Random Forest': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20, 30]},
    'SVM': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'K-Nearest Neighbors': {'n_neighbors': [3, 5, 7, 9]}
}
print("\nClassification Results with Hyperparameter Tuning:")
for name, model in classifiers.items():
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid[name], scoring='accuracy', cv=5)
    grid_search.fit(X_train_rfe, y_train_class)  # using the RFE-selected features
    best_model = grid_search.best_estimator_
    y_pred_class = best_model.predict(X_test_rfe)
    y_prob_class = best_model.predict_proba(X_test_rfe)[:, 1]
    # Calculate AUC (assumes a binary target; for a multiclass encoding,
    # roc_auc_score would need multi_class='ovr' and the full probability matrix)
    auc = roc_auc_score(y_test_class, y_prob_class)
    print(f"{name} Best Parameters: {grid_search.best_params_}")
    print(f"{name} Accuracy: {accuracy_score(y_test_class, y_pred_class):.4f}")
    print(f"{name} AUC: {auc:.4f}")
    print(classification_report(y_test_class, y_pred_class))
    # Plot confusion matrix
    cm = confusion_matrix(y_test_class, y_pred_class)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title(f"{name} - Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()
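# Optional: an ROC curve for the last tuned classifier from the loop above; a
# sketch that assumes a binary target and scikit-learn >= 1.0 (RocCurveDisplay).
from sklearn.metrics import RocCurveDisplay
RocCurveDisplay.from_estimator(best_model, X_test_rfe, y_test_class)
plt.title("ROC Curve - Last Tuned Classifier")
plt.show()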
# Hyperparameter Tuning for Regression Models
regressors = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'SVR': SVR(),
    'K-Nearest Neighbors': KNeighborsRegressor()
}
param_grid_reg = {
    'Linear Regression': {},  # no hyperparameters to tune
    'Random Forest': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20, 30]},
    'SVR': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'K-Nearest Neighbors': {'n_neighbors': [3, 5, 7, 9]}
}
print("\nRegression Results with Hyperparameter Tuning:")
for name, model in regressors.items():
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid_reg[name], scoring='r2', cv=5)
    grid_search.fit(X_train_reg, y_train_reg)
    best_model = grid_search.best_estimator_
    y_pred_reg = best_model.predict(X_test_reg)
    print(f"{name} Best Parameters: {grid_search.best_params_}")
    print(f"{name} MSE: {mean_squared_error(y_test_reg, y_pred_reg):.4f}")
    print(f"{name} R2 Score: {r2_score(y_test_reg, y_pred_reg):.4f}")
# Deep Learning Model for Classification
print("\nDeep Learning Classification:")
model_class = Sequential()
model_class.add(Dense(64, input_dim=X_train_rfe.shape[1], activation='relu'))
model_class.add(Dense(32, activation='relu'))
model_class.add(Dense(1, activation='sigmoid'))  # single sigmoid unit: assumes the encoded target is binary
model_class.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_class.fit(X_train_rfe, y_train_class, epochs=50, batch_size=10, verbose=1)
# Evaluate the classification model
loss, accuracy = model_class.evaluate(X_test_rfe, y_test_class)
print(f"Deep Learning Classification Accuracy: {accuracy:.4f}")
# Deep Learning Model for Regression
print("\nDeep Learning Regression:")
model_reg = Sequential()
model_reg.add(Dense(64, input_dim=X_train_reg.shape[1], activation='relu'))
model_reg.add(Dense(32, activation='relu'))
model_reg.add(Dense(1))
model_reg.compile(optimizer='adam', loss='mean_squared_error')
model_reg.fit(X_train_reg, y_train_reg, epochs=50, batch_size=10, verbose=1)
# Evaluate the regression model
y_pred_reg_dl = model_reg.predict(X_test_reg).flatten()
print(f"Deep Learning Regression MSE: {mean_squared_error(y_test_reg, y_pred_reg_dl):.4f}")
print(f"Deep Learning Regression R2 Score: {r2_score(y_test_reg, y_pred_reg_dl):.4f}")