Logistic regression in sklearn:
# Feature names, in the column order of the training DataFrame
original_variables = X_train.columns

# Coefficients of the fitted logistic regression (first row of coef_)
model_coefficients = log_reg_clf.coef_[0]

# Tabulate one row per variable with its coefficient, then display the table
coefficient_df = pd.DataFrame(
    {
        "Variable": original_variables,
        "Coefficient": model_coefficients,
    }
)
print(coefficient_df)

# Show the three variables with the largest (most positive) coefficients
ranked_df = coefficient_df.sort_values(by="Coefficient", ascending=False)
top_three_df = ranked_df.head(3)
print(top_three_df)
Random Forest:
# Pull the 7th tree (index 6) out of the fitted random forest
chosen_tree = rf_clf.estimators_[6]

# Display the provided visualization image
imgplot = plt.imshow(tree_viz_image)
plt.show()

# Read the split feature index and threshold stored for node 0 (the top node)
fitted_tree = chosen_tree.tree_
split_column = fitted_tree.feature[0]
split_value = fitted_tree.threshold[0]

# Translate the feature index into the matching training column name
split_column_name = X_train.columns[split_column]

# Report the feature and level
print(
    "This node split on feature {}, at a value of {}".format(
        split_column_name, split_value
    )
)
# Print out the old estimator; notice which hyperparameter is badly set
print(rf_clf_old)

# Get confusion matrix & accuracy for the old rf_model.
# Both metrics are called as (true labels, predictions) so the old-model
# report uses the same argument order as the new-model report below.
print("Confusion Matrix: \n\n {} \n Accuracy Score: \n\n {}".format(
    confusion_matrix(y_test, rf_old_predictions),
    accuracy_score(y_test, rf_old_predictions)))

# Create a new random forest classifier with better hyperparameters
rf_clf_new = RandomForestClassifier(n_estimators=500)

# Fit this to the data and obtain predictions
rf_new_predictions = rf_clf_new.fit(X_train, y_train).predict(X_test)

# Assess the new model (using the new predictions!)
print("Confusion Matrix: \n\n", confusion_matrix(y_test, rf_new_predictions))
print("Accuracy Score: \n\n", accuracy_score(y_test, rf_new_predictions))
# One k-nearest-neighbours estimator per candidate value of n_neighbors
knn_5 = KNeighborsClassifier(n_neighbors=5)
knn_10 = KNeighborsClassifier(n_neighbors=10)
knn_20 = KNeighborsClassifier(n_neighbors=20)

# Train each estimator on the training split, then predict the test split
knn_5.fit(X_train, y_train)
knn_5_predictions = knn_5.predict(X_test)

knn_10.fit(X_train, y_train)
knn_10_predictions = knn_10.predict(X_test)

knn_20.fit(X_train, y_train)
knn_20_predictions = knn_20.predict(X_test)

# Score each set of predictions against the true test labels
knn_5_accuracy = accuracy_score(y_test, knn_5_predictions)
knn_10_accuracy = accuracy_score(y_test, knn_10_predictions)
knn_20_accuracy = accuracy_score(y_test, knn_20_predictions)

print(
    "The accuracy of 5, 10, 20 neighbours was {}, {}, {}".format(
        knn_5_accuracy, knn_10_accuracy, knn_20_accuracy
    )
)
# Set the learning rates & results storage
learning_rates = [0.001, 0.01, 0.05, 0.1, 0.2, 0.5]
results_list = []

# Evaluate model predictions for each learning rate.
# The loop body must be indented: a fresh model is fitted and scored
# once per learning rate.
for learning_rate in learning_rates:
    model = GradientBoostingClassifier(learning_rate=learning_rate)
    predictions = model.fit(X_train, y_train).predict(X_test)
    # Save the learning rate and accuracy score
    results_list.append([learning_rate, accuracy_score(y_test, predictions)])

# Gather everything into a DataFrame (one row per learning rate)
results_df = pd.DataFrame(results_list, columns=['learning_rate', 'accuracy'])
print(results_df)
# Set the learning rates (30 evenly spaced values from 0.01 to 2) & accuracies list
learn_rates = np.linspace(0.01, 2, num=30)
accuracies = []

# Fit and score one model per learning rate.
# The loop body must be indented so that every rate contributes exactly
# one accuracy value, keeping `accuracies` aligned with `learn_rates`.
for learn_rate in learn_rates:
    # Create the model, predictions & save the accuracies as before
    model = GradientBoostingClassifier(learning_rate=learn_rate)
    predictions = model.fit(X_train, y_train).predict(X_test)
    accuracies.append(accuracy_score(y_test, predictions))

# Plot accuracy against learning rate
plt.plot(learn_rates, accuracies)
plt.gca().set(xlabel='learning_rate', ylabel='Accuracy', title='Accuracy for different learning_rates')
plt.show()
# Report how many combinations are in combinations_list
combination_count = len(combinations_list)
print(combination_count)

# Rank results_df by accuracy (best first) and show the top 10 rows
ranked_results = results_df.sort_values(by='accuracy', ascending=False)
print(ranked_results.head(10))

# List the columns of results_df to confirm which hyperparameters were searched
print(results_df.columns)

# Call visualize_hyperparameter() with each hyperparameter in turn
# (per the original note: scatter plot of column values against accuracy)
for hyperparameter in ('max_depth', 'min_samples_leaf', 'learn_rate'):
    visualize_hyperparameter(hyperparameter)
Hyperopt package:
# The search-space helpers (quniform, uniform, ...) live in the `hyperopt.hp`
# submodule, not on the top-level package, so `hp` must be imported from
# hyperopt rather than used as an alias for it. `fmin` and `tpe` are needed
# to run the search.
from hyperopt import fmin, hp, tpe

# Set up space dictionary with specified hyperparameters:
#   max_depth     - quantized uniform over [2, 10] with step 2
#   learning_rate - uniform over [0.001, 0.9]
space = {
    'max_depth': hp.quniform('max_depth', 2, 10, 2),
    'learning_rate': hp.uniform('learning_rate', 0.001, 0.9),
}
# Set up objective function
def objective(params):
    """Return the loss for one sampled set of hyperparameters.

    Args:
        params: Mapping with 'max_depth' and 'learning_rate' entries.

    Returns:
        1 minus the mean 2-fold cross-validated accuracy of a
        100-estimator GradientBoostingClassifier on X_train / y_train,
        so that minimizing the loss maximizes accuracy.
    """
    # max_depth is cast to int before reaching the estimator
    # (presumably the sampled value arrives as a float -- confirm).
    params = {
        'max_depth': int(params['max_depth']),
        'learning_rate': params['learning_rate'],
    }
    gbm_clf = GradientBoostingClassifier(n_estimators=100, **params)
    best_score = cross_val_score(
        gbm_clf, X_train, y_train, scoring='accuracy', cv=2, n_jobs=4
    ).mean()
    loss = 1 - best_score
    return loss
# Run the TPE search for 20 evaluations of the objective over the space
# NOTE(review): some hyperopt releases expect `rstate` to be a
# numpy.random.Generator rather than a RandomState -- confirm against the
# installed version.
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=20,
    rstate=np.random.RandomState(42),
)
print(best)
TPOT:
# Settings for the TPOT search
number_generations = 3
population_size = 4
offspring_size = 3
scoring_function = 'accuracy'

# Build the TPOT classifier from the settings above
tpot_clf = TPOTClassifier(
    generations=number_generations,
    population_size=population_size,
    offspring_size=offspring_size,
    scoring=scoring_function,
    verbosity=2,
    random_state=2,
    cv=2,
)

# Fit the classifier to the training data
tpot_clf.fit(X_train, y_train)

# Score on the test set and print the result
test_score = tpot_clf.score(X_test, y_test)
print(test_score)