diff --git a/Notebooks/Clean_up_data.ipynb b/Notebooks/Clean_up_data.ipynb
new file mode 100644
index 0000000..2eedc61
--- /dev/null
+++ b/Notebooks/Clean_up_data.ipynb
@@ -0,0 +1,483 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prepair Steps\n",
+ "\n",
+ "1. Load data from data (csv file) to dataframe\n",
+ "2. Clean up all NA columns, all row NA, columns cannot be features such as Id, or having only one value"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Dataframe with panda\n",
+ "import pandas as pd\n",
+ "# IO path\n",
+ "from path import Path\n",
+ "# Count Distint in array or series\n",
+ "from collections import Counter\n",
+ "# Numpy\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# set the data path\n",
+ "data = Path('../Resources/LoanStats_2019Q1.csv')\n",
+ "# read data from the path (low_memory=False to avoid warning for dtype size)\n",
+ "# do not read data until 2nd row (skip 1 row)\n",
+ "df = pd.read_csv(data, low_memory=False,skiprows=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(115677, 144)"
+ ]
+ },
+ "execution_count": 64,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# find the shape\n",
+ "df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id 2\n",
+ "member_id 0\n",
+ "loan_amnt 115675\n",
+ "funded_amnt 115675\n",
+ "funded_amnt_inv 115675\n",
+ " ... \n",
+ "settlement_status 0\n",
+ "settlement_date 0\n",
+ "settlement_amount 0\n",
+ "settlement_percentage 0\n",
+ "settlement_term 0\n",
+ "Length: 144, dtype: int64"
+ ]
+ },
+ "execution_count": 65,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Find columns contain all NA or Empty value\n",
+ "df.count()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',\n",
+ " 'installment', 'grade', 'sub_grade', 'emp_title',\n",
+ " ...\n",
+ " 'sec_app_mort_acc', 'sec_app_open_acc', 'sec_app_revol_util',\n",
+ " 'sec_app_open_act_il', 'sec_app_num_rev_accts',\n",
+ " 'sec_app_chargeoff_within_12_mths',\n",
+ " 'sec_app_collections_12_mths_ex_med',\n",
+ " 'sec_app_mths_since_last_major_derog', 'hardship_flag',\n",
+ " 'debt_settlement_flag'],\n",
+ " dtype='object', length=121)"
+ ]
+ },
+ "execution_count": 66,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Drop columns all NA\n",
+ "df = df.dropna(axis='columns',how='all')\n",
+ "df.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(115677, 121)"
+ ]
+ },
+ "execution_count": 67,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# the dataframe shape \n",
+ "df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(115675, 120)"
+ ]
+ },
+ "execution_count": 68,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# we do not need id column\n",
+ "# id has only 2 rows not NA\n",
+ "df= df.drop(columns = [\"id\"])\n",
+ "# drop rows all NA\n",
+ "df = df.dropna(axis='rows',how='all')\n",
+ "# the dataframe shape \n",
+ "df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(115675, 120)"
+ ]
+ },
+ "execution_count": 69,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# the dataframe shape \n",
+ "df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "loan_amnt 115675\n",
+ "funded_amnt 115675\n",
+ "funded_amnt_inv 115675\n",
+ "term 115675\n",
+ "int_rate 115675\n",
+ " ... \n",
+ "sec_app_chargeoff_within_12_mths 16681\n",
+ "sec_app_collections_12_mths_ex_med 16681\n",
+ "sec_app_mths_since_last_major_derog 4901\n",
+ "hardship_flag 115675\n",
+ "debt_settlement_flag 115675\n",
+ "Length: 120, dtype: int64"
+ ]
+ },
+ "execution_count": 73,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# check if NA columns\n",
+ "df.count()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 91,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Column name pymnt_plan:\n",
+ "Column name recoveries:\n",
+ "Column name collection_recovery_fee:\n",
+ "Column name policy_code:\n",
+ "Column name acc_now_delinq:\n",
+ "Column name num_tl_120dpd_2m:\n",
+ "Column name num_tl_30dpd:\n",
+ "Column name tax_liens:\n",
+ "Column name hardship_flag:\n",
+ "Column name debt_settlement_flag:\n",
+ "['pymnt_plan', 'recoveries', 'collection_recovery_fee', 'policy_code', 'acc_now_delinq', 'num_tl_120dpd_2m', 'num_tl_30dpd', 'tax_liens', 'hardship_flag', 'debt_settlement_flag']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# find columns that have only one value\n",
+ "# They are not feature\n",
+ "columns_1_value = []\n",
+ "for column in df.columns:\n",
+ " if (len(df[column].value_counts()) == 1):\n",
+ " columns_1_value.append(column)\n",
+ " print (f\"Column name {column}:\")\n",
+ "print(columns_1_value)"
+ ]
+ },
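+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a side note, the same columns can be found without an explicit loop. The sketch below is an equivalent approach (not part of the original flow) using `DataFrame.nunique`, which counts distinct non-NA values per column, matching `value_counts`, which also ignores NA by default."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: vectorized equivalent of the loop above\n",
+ "# nunique() counts distinct non-NA values per column; == 1 flags single-value columns\n",
+ "single_value_columns = df.columns[df.nunique() == 1].tolist()\n",
+ "single_value_columns"
+ ]
+ },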
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(115675, 110)"
+ ]
+ },
+ "execution_count": 92,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Drop columns that have only one value\n",
+ "df= df.drop(columns = columns_1_value)\n",
+ "df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 103,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['term',\n",
+ " 'int_rate',\n",
+ " 'grade',\n",
+ " 'sub_grade',\n",
+ " 'emp_title',\n",
+ " 'emp_length',\n",
+ " 'home_ownership',\n",
+ " 'verification_status',\n",
+ " 'issue_d',\n",
+ " 'loan_status',\n",
+ " 'purpose',\n",
+ " 'title',\n",
+ " 'zip_code',\n",
+ " 'addr_state',\n",
+ " 'earliest_cr_line',\n",
+ " 'revol_util',\n",
+ " 'initial_list_status',\n",
+ " 'last_pymnt_d',\n",
+ " 'next_pymnt_d',\n",
+ " 'last_credit_pull_d',\n",
+ " 'application_type',\n",
+ " 'verification_status_joint',\n",
+ " 'sec_app_earliest_cr_line']"
+ ]
+ },
+ "execution_count": 103,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "obj_columns = list(df.dtypes[df.dtypes == np.object].index)\n",
+ "obj_columns"
+ ]
+ },
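+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "An equivalent way to list the string columns is `DataFrame.select_dtypes`, which filters columns by dtype directly. This is a sketch of the alternative; it should produce the same list as the cell above."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: select_dtypes filters columns by dtype without comparing df.dtypes manually\n",
+ "obj_columns_alt = df.select_dtypes(include='object').columns.tolist()\n",
+ "obj_columns_alt"
+ ]
+ },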
+ {
+ "cell_type": "code",
+ "execution_count": 107,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Current 94116\n",
+ "Issued 18835\n",
+ "Fully Paid 2157\n",
+ "In Grace Period 233\n",
+ "Late (16-30 days) 155\n",
+ "Late (31-120 days) 138\n",
+ "Charged Off 41\n",
+ "Name: loan_status, dtype: int64"
+ ]
+ },
+ "execution_count": 107,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['loan_status'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 104,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['loan_amnt',\n",
+ " 'funded_amnt',\n",
+ " 'funded_amnt_inv',\n",
+ " 'installment',\n",
+ " 'annual_inc',\n",
+ " 'dti',\n",
+ " 'delinq_2yrs',\n",
+ " 'inq_last_6mths',\n",
+ " 'mths_since_last_delinq',\n",
+ " 'mths_since_last_record',\n",
+ " 'open_acc',\n",
+ " 'pub_rec',\n",
+ " 'revol_bal',\n",
+ " 'total_acc',\n",
+ " 'out_prncp',\n",
+ " 'out_prncp_inv',\n",
+ " 'total_pymnt',\n",
+ " 'total_pymnt_inv',\n",
+ " 'total_rec_prncp',\n",
+ " 'total_rec_int',\n",
+ " 'total_rec_late_fee',\n",
+ " 'last_pymnt_amnt',\n",
+ " 'collections_12_mths_ex_med',\n",
+ " 'mths_since_last_major_derog',\n",
+ " 'annual_inc_joint',\n",
+ " 'dti_joint',\n",
+ " 'tot_coll_amt',\n",
+ " 'tot_cur_bal',\n",
+ " 'open_acc_6m',\n",
+ " 'open_act_il',\n",
+ " 'open_il_12m',\n",
+ " 'open_il_24m',\n",
+ " 'mths_since_rcnt_il',\n",
+ " 'total_bal_il',\n",
+ " 'il_util',\n",
+ " 'open_rv_12m',\n",
+ " 'open_rv_24m',\n",
+ " 'max_bal_bc',\n",
+ " 'all_util',\n",
+ " 'total_rev_hi_lim',\n",
+ " 'inq_fi',\n",
+ " 'total_cu_tl',\n",
+ " 'inq_last_12m',\n",
+ " 'acc_open_past_24mths',\n",
+ " 'avg_cur_bal',\n",
+ " 'bc_open_to_buy',\n",
+ " 'bc_util',\n",
+ " 'chargeoff_within_12_mths',\n",
+ " 'delinq_amnt',\n",
+ " 'mo_sin_old_il_acct',\n",
+ " 'mo_sin_old_rev_tl_op',\n",
+ " 'mo_sin_rcnt_rev_tl_op',\n",
+ " 'mo_sin_rcnt_tl',\n",
+ " 'mort_acc',\n",
+ " 'mths_since_recent_bc',\n",
+ " 'mths_since_recent_bc_dlq',\n",
+ " 'mths_since_recent_inq',\n",
+ " 'mths_since_recent_revol_delinq',\n",
+ " 'num_accts_ever_120_pd',\n",
+ " 'num_actv_bc_tl',\n",
+ " 'num_actv_rev_tl',\n",
+ " 'num_bc_sats',\n",
+ " 'num_bc_tl',\n",
+ " 'num_il_tl',\n",
+ " 'num_op_rev_tl',\n",
+ " 'num_rev_accts',\n",
+ " 'num_rev_tl_bal_gt_0',\n",
+ " 'num_sats',\n",
+ " 'num_tl_90g_dpd_24m',\n",
+ " 'num_tl_op_past_12m',\n",
+ " 'pct_tl_nvr_dlq',\n",
+ " 'percent_bc_gt_75',\n",
+ " 'pub_rec_bankruptcies',\n",
+ " 'tot_hi_cred_lim',\n",
+ " 'total_bal_ex_mort',\n",
+ " 'total_bc_limit',\n",
+ " 'total_il_high_credit_limit',\n",
+ " 'revol_bal_joint',\n",
+ " 'sec_app_inq_last_6mths',\n",
+ " 'sec_app_mort_acc',\n",
+ " 'sec_app_open_acc',\n",
+ " 'sec_app_revol_util',\n",
+ " 'sec_app_open_act_il',\n",
+ " 'sec_app_num_rev_accts',\n",
+ " 'sec_app_chargeoff_within_12_mths',\n",
+ " 'sec_app_collections_12_mths_ex_med',\n",
+ " 'sec_app_mths_since_last_major_derog']"
+ ]
+ },
+ "execution_count": 104,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "num_columns = list(df.dtypes[df.dtypes != np.object].index)\n",
+ "num_columns"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "mlenv",
+ "language": "python",
+ "name": "mlenv"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/Notebooks/credit_risk_ensemble.ipynb b/Notebooks/credit_risk_ensemble.ipynb
new file mode 100644
index 0000000..6681596
--- /dev/null
+++ b/Notebooks/credit_risk_ensemble.ipynb
@@ -0,0 +1,1750 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings('ignore')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from pathlib import Path\n",
+ "from collections import Counter"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.metrics import balanced_accuracy_score\n",
+ "from sklearn.metrics import confusion_matrix\n",
+ "from imblearn.metrics import classification_report_imbalanced"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Read the CSV and Perform Basic Data Cleaning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# https://help.lendingclub.com/hc/en-us/articles/215488038-What-do-the-different-Note-statuses-mean-\n",
+ "\n",
+ "columns = [\n",
+ " \"loan_amnt\", \"int_rate\", \"installment\", \"home_ownership\",\n",
+ " \"annual_inc\", \"verification_status\", \"issue_d\", \"loan_status\",\n",
+ " \"pymnt_plan\", \"dti\", \"delinq_2yrs\", \"inq_last_6mths\",\n",
+ " \"open_acc\", \"pub_rec\", \"revol_bal\", \"total_acc\",\n",
+ " \"initial_list_status\", \"out_prncp\", \"out_prncp_inv\", \"total_pymnt\",\n",
+ " \"total_pymnt_inv\", \"total_rec_prncp\", \"total_rec_int\", \"total_rec_late_fee\",\n",
+ " \"recoveries\", \"collection_recovery_fee\", \"last_pymnt_amnt\", \"next_pymnt_d\",\n",
+ " \"collections_12_mths_ex_med\", \"policy_code\", \"application_type\", \"acc_now_delinq\",\n",
+ " \"tot_coll_amt\", \"tot_cur_bal\", \"open_acc_6m\", \"open_act_il\",\n",
+ " \"open_il_12m\", \"open_il_24m\", \"mths_since_rcnt_il\", \"total_bal_il\",\n",
+ " \"il_util\", \"open_rv_12m\", \"open_rv_24m\", \"max_bal_bc\",\n",
+ " \"all_util\", \"total_rev_hi_lim\", \"inq_fi\", \"total_cu_tl\",\n",
+ " \"inq_last_12m\", \"acc_open_past_24mths\", \"avg_cur_bal\", \"bc_open_to_buy\",\n",
+ " \"bc_util\", \"chargeoff_within_12_mths\", \"delinq_amnt\", \"mo_sin_old_il_acct\",\n",
+ " \"mo_sin_old_rev_tl_op\", \"mo_sin_rcnt_rev_tl_op\", \"mo_sin_rcnt_tl\", \"mort_acc\",\n",
+ " \"mths_since_recent_bc\", \"mths_since_recent_inq\", \"num_accts_ever_120_pd\", \"num_actv_bc_tl\",\n",
+ " \"num_actv_rev_tl\", \"num_bc_sats\", \"num_bc_tl\", \"num_il_tl\",\n",
+ " \"num_op_rev_tl\", \"num_rev_accts\", \"num_rev_tl_bal_gt_0\",\n",
+ " \"num_sats\", \"num_tl_120dpd_2m\", \"num_tl_30dpd\", \"num_tl_90g_dpd_24m\",\n",
+ " \"num_tl_op_past_12m\", \"pct_tl_nvr_dlq\", \"percent_bc_gt_75\", \"pub_rec_bankruptcies\",\n",
+ " \"tax_liens\", \"tot_hi_cred_lim\", \"total_bal_ex_mort\", \"total_bc_limit\",\n",
+ " \"total_il_high_credit_limit\", \"hardship_flag\", \"debt_settlement_flag\"\n",
+ "]\n",
+ "\n",
+ "target = [\"loan_status\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " loan_amnt | \n",
+ " int_rate | \n",
+ " installment | \n",
+ " home_ownership | \n",
+ " annual_inc | \n",
+ " verification_status | \n",
+ " issue_d | \n",
+ " loan_status | \n",
+ " pymnt_plan | \n",
+ " dti | \n",
+ " ... | \n",
+ " pct_tl_nvr_dlq | \n",
+ " percent_bc_gt_75 | \n",
+ " pub_rec_bankruptcies | \n",
+ " tax_liens | \n",
+ " tot_hi_cred_lim | \n",
+ " total_bal_ex_mort | \n",
+ " total_bc_limit | \n",
+ " total_il_high_credit_limit | \n",
+ " hardship_flag | \n",
+ " debt_settlement_flag | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10500.0 | \n",
+ " 0.1719 | \n",
+ " 375.35 | \n",
+ " RENT | \n",
+ " 66000.0 | \n",
+ " Source Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 27.24 | \n",
+ " ... | \n",
+ " 85.7 | \n",
+ " 100.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 65687.0 | \n",
+ " 38199.0 | \n",
+ " 2000.0 | \n",
+ " 61987.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 25000.0 | \n",
+ " 0.2000 | \n",
+ " 929.09 | \n",
+ " MORTGAGE | \n",
+ " 105000.0 | \n",
+ " Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 20.23 | \n",
+ " ... | \n",
+ " 91.2 | \n",
+ " 50.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 271427.0 | \n",
+ " 60641.0 | \n",
+ " 41200.0 | \n",
+ " 49197.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 20000.0 | \n",
+ " 0.2000 | \n",
+ " 529.88 | \n",
+ " MORTGAGE | \n",
+ " 56000.0 | \n",
+ " Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 24.26 | \n",
+ " ... | \n",
+ " 66.7 | \n",
+ " 50.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 60644.0 | \n",
+ " 45684.0 | \n",
+ " 7500.0 | \n",
+ " 43144.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 10000.0 | \n",
+ " 0.1640 | \n",
+ " 353.55 | \n",
+ " RENT | \n",
+ " 92000.0 | \n",
+ " Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 31.44 | \n",
+ " ... | \n",
+ " 100.0 | \n",
+ " 50.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 99506.0 | \n",
+ " 68784.0 | \n",
+ " 19700.0 | \n",
+ " 76506.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 22000.0 | \n",
+ " 0.1474 | \n",
+ " 520.39 | \n",
+ " MORTGAGE | \n",
+ " 52000.0 | \n",
+ " Not Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 18.76 | \n",
+ " ... | \n",
+ " 100.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 219750.0 | \n",
+ " 25919.0 | \n",
+ " 27600.0 | \n",
+ " 20000.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 86 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " loan_amnt int_rate installment home_ownership annual_inc \\\n",
+ "0 10500.0 0.1719 375.35 RENT 66000.0 \n",
+ "1 25000.0 0.2000 929.09 MORTGAGE 105000.0 \n",
+ "2 20000.0 0.2000 529.88 MORTGAGE 56000.0 \n",
+ "3 10000.0 0.1640 353.55 RENT 92000.0 \n",
+ "4 22000.0 0.1474 520.39 MORTGAGE 52000.0 \n",
+ "\n",
+ " verification_status issue_d loan_status pymnt_plan dti ... \\\n",
+ "0 Source Verified Mar-2019 low_risk n 27.24 ... \n",
+ "1 Verified Mar-2019 low_risk n 20.23 ... \n",
+ "2 Verified Mar-2019 low_risk n 24.26 ... \n",
+ "3 Verified Mar-2019 low_risk n 31.44 ... \n",
+ "4 Not Verified Mar-2019 low_risk n 18.76 ... \n",
+ "\n",
+ " pct_tl_nvr_dlq percent_bc_gt_75 pub_rec_bankruptcies tax_liens \\\n",
+ "0 85.7 100.0 0.0 0.0 \n",
+ "1 91.2 50.0 1.0 0.0 \n",
+ "2 66.7 50.0 0.0 0.0 \n",
+ "3 100.0 50.0 1.0 0.0 \n",
+ "4 100.0 0.0 0.0 0.0 \n",
+ "\n",
+ " tot_hi_cred_lim total_bal_ex_mort total_bc_limit \\\n",
+ "0 65687.0 38199.0 2000.0 \n",
+ "1 271427.0 60641.0 41200.0 \n",
+ "2 60644.0 45684.0 7500.0 \n",
+ "3 99506.0 68784.0 19700.0 \n",
+ "4 219750.0 25919.0 27600.0 \n",
+ "\n",
+ " total_il_high_credit_limit hardship_flag debt_settlement_flag \n",
+ "0 61987.0 N N \n",
+ "1 49197.0 N N \n",
+ "2 43144.0 N N \n",
+ "3 76506.0 N N \n",
+ "4 20000.0 N N \n",
+ "\n",
+ "[5 rows x 86 columns]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Load the data\n",
+ "file_path = Path('../Resources/LoanStats_2019Q1.csv')\n",
+ "df = pd.read_csv(file_path, skiprows=1)[:-2]\n",
+ "df = df.loc[:, columns].copy()\n",
+ "\n",
+ "# Drop the null columns where all values are null\n",
+ "df = df.dropna(axis='columns', how='all')\n",
+ "\n",
+ "# Drop the null rows\n",
+ "df = df.dropna()\n",
+ "\n",
+ "# Remove the `Issued` loan status\n",
+ "issued_mask = df['loan_status'] != 'Issued'\n",
+ "df = df.loc[issued_mask]\n",
+ "\n",
+ "# convert interest rate to numerical\n",
+ "df['int_rate'] = df['int_rate'].str.replace('%', '')\n",
+ "df['int_rate'] = df['int_rate'].astype('float') / 100\n",
+ "\n",
+ "\n",
+ "# Convert the target column values to low_risk and high_risk based on their values\n",
+ "x = {'Current': 'low_risk'} \n",
+ "df = df.replace(x)\n",
+ "\n",
+ "x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk') \n",
+ "df = df.replace(x)\n",
+ "\n",
+ "df.reset_index(inplace=True, drop=True)\n",
+ "\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Split the Data into Training and Testing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['home_ownership',\n",
+ " 'verification_status',\n",
+ " 'issue_d',\n",
+ " 'pymnt_plan',\n",
+ " 'initial_list_status',\n",
+ " 'next_pymnt_d',\n",
+ " 'application_type',\n",
+ " 'hardship_flag',\n",
+ " 'debt_settlement_flag']"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Reserve the original data frame by copying\n",
+ "df_temp = df.copy()\n",
+ "#Drop the target column\n",
+ "df_temp = df.drop(columns=target)\n",
+ "#Find columns that is string\n",
+ "obj_columns = list(df_temp.dtypes[df_temp.dtypes == np.object].index)\n",
+ "obj_columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " loan_amnt | \n",
+ " int_rate | \n",
+ " installment | \n",
+ " annual_inc | \n",
+ " dti | \n",
+ " delinq_2yrs | \n",
+ " inq_last_6mths | \n",
+ " open_acc | \n",
+ " pub_rec | \n",
+ " revol_bal | \n",
+ " ... | \n",
+ " issue_d_Mar-2019 | \n",
+ " pymnt_plan_n | \n",
+ " initial_list_status_f | \n",
+ " initial_list_status_w | \n",
+ " next_pymnt_d_Apr-2019 | \n",
+ " next_pymnt_d_May-2019 | \n",
+ " application_type_Individual | \n",
+ " application_type_Joint App | \n",
+ " hardship_flag_N | \n",
+ " debt_settlement_flag_N | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10500.0 | \n",
+ " 0.1719 | \n",
+ " 375.35 | \n",
+ " 66000.0 | \n",
+ " 27.24 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 8.0 | \n",
+ " 0.0 | \n",
+ " 1609.0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 25000.0 | \n",
+ " 0.2000 | \n",
+ " 929.09 | \n",
+ " 105000.0 | \n",
+ " 20.23 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 17.0 | \n",
+ " 1.0 | \n",
+ " 18368.0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 20000.0 | \n",
+ " 0.2000 | \n",
+ " 529.88 | \n",
+ " 56000.0 | \n",
+ " 24.26 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 8.0 | \n",
+ " 0.0 | \n",
+ " 13247.0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 10000.0 | \n",
+ " 0.1640 | \n",
+ " 353.55 | \n",
+ " 92000.0 | \n",
+ " 31.44 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 10.0 | \n",
+ " 1.0 | \n",
+ " 17996.0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 22000.0 | \n",
+ " 0.1474 | \n",
+ " 520.39 | \n",
+ " 52000.0 | \n",
+ " 18.76 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 14.0 | \n",
+ " 0.0 | \n",
+ " 9091.0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 68812 | \n",
+ " 10000.0 | \n",
+ " 0.1502 | \n",
+ " 346.76 | \n",
+ " 26000.0 | \n",
+ " 9.60 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 9.0 | \n",
+ " 0.0 | \n",
+ " 2684.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 68813 | \n",
+ " 12000.0 | \n",
+ " 0.2727 | \n",
+ " 368.37 | \n",
+ " 63000.0 | \n",
+ " 29.07 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 8.0 | \n",
+ " 0.0 | \n",
+ " 13314.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 68814 | \n",
+ " 5000.0 | \n",
+ " 0.1992 | \n",
+ " 185.62 | \n",
+ " 52000.0 | \n",
+ " 14.86 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 1.0 | \n",
+ " 3715.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 68815 | \n",
+ " 40000.0 | \n",
+ " 0.0646 | \n",
+ " 1225.24 | \n",
+ " 520000.0 | \n",
+ " 9.96 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 21.0 | \n",
+ " 0.0 | \n",
+ " 59529.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 68816 | \n",
+ " 16000.0 | \n",
+ " 0.1131 | \n",
+ " 350.36 | \n",
+ " 72000.0 | \n",
+ " 7.02 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 12.0 | \n",
+ " 1.0 | \n",
+ " 11882.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
68817 rows × 95 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " loan_amnt int_rate installment annual_inc dti delinq_2yrs \\\n",
+ "0 10500.0 0.1719 375.35 66000.0 27.24 0.0 \n",
+ "1 25000.0 0.2000 929.09 105000.0 20.23 0.0 \n",
+ "2 20000.0 0.2000 529.88 56000.0 24.26 0.0 \n",
+ "3 10000.0 0.1640 353.55 92000.0 31.44 0.0 \n",
+ "4 22000.0 0.1474 520.39 52000.0 18.76 0.0 \n",
+ "... ... ... ... ... ... ... \n",
+ "68812 10000.0 0.1502 346.76 26000.0 9.60 0.0 \n",
+ "68813 12000.0 0.2727 368.37 63000.0 29.07 0.0 \n",
+ "68814 5000.0 0.1992 185.62 52000.0 14.86 0.0 \n",
+ "68815 40000.0 0.0646 1225.24 520000.0 9.96 0.0 \n",
+ "68816 16000.0 0.1131 350.36 72000.0 7.02 2.0 \n",
+ "\n",
+ " inq_last_6mths open_acc pub_rec revol_bal ... issue_d_Mar-2019 \\\n",
+ "0 0.0 8.0 0.0 1609.0 ... 1 \n",
+ "1 0.0 17.0 1.0 18368.0 ... 1 \n",
+ "2 0.0 8.0 0.0 13247.0 ... 1 \n",
+ "3 1.0 10.0 1.0 17996.0 ... 1 \n",
+ "4 1.0 14.0 0.0 9091.0 ... 1 \n",
+ "... ... ... ... ... ... ... \n",
+ "68812 0.0 9.0 0.0 2684.0 ... 0 \n",
+ "68813 0.0 8.0 0.0 13314.0 ... 0 \n",
+ "68814 0.0 5.0 1.0 3715.0 ... 0 \n",
+ "68815 1.0 21.0 0.0 59529.0 ... 0 \n",
+ "68816 0.0 12.0 1.0 11882.0 ... 0 \n",
+ "\n",
+ " pymnt_plan_n initial_list_status_f initial_list_status_w \\\n",
+ "0 1 0 1 \n",
+ "1 1 0 1 \n",
+ "2 1 0 1 \n",
+ "3 1 0 1 \n",
+ "4 1 0 1 \n",
+ "... ... ... ... \n",
+ "68812 1 0 1 \n",
+ "68813 1 0 1 \n",
+ "68814 1 0 1 \n",
+ "68815 1 1 0 \n",
+ "68816 1 0 1 \n",
+ "\n",
+ " next_pymnt_d_Apr-2019 next_pymnt_d_May-2019 \\\n",
+ "0 0 1 \n",
+ "1 0 1 \n",
+ "2 0 1 \n",
+ "3 0 1 \n",
+ "4 0 1 \n",
+ "... ... ... \n",
+ "68812 0 1 \n",
+ "68813 0 1 \n",
+ "68814 0 1 \n",
+ "68815 0 1 \n",
+ "68816 0 1 \n",
+ "\n",
+ " application_type_Individual application_type_Joint App \\\n",
+ "0 1 0 \n",
+ "1 1 0 \n",
+ "2 1 0 \n",
+ "3 1 0 \n",
+ "4 1 0 \n",
+ "... ... ... \n",
+ "68812 1 0 \n",
+ "68813 1 0 \n",
+ "68814 1 0 \n",
+ "68815 1 0 \n",
+ "68816 1 0 \n",
+ "\n",
+ " hardship_flag_N debt_settlement_flag_N \n",
+ "0 1 1 \n",
+ "1 1 1 \n",
+ "2 1 1 \n",
+ "3 1 1 \n",
+ "4 1 1 \n",
+ "... ... ... \n",
+ "68812 1 1 \n",
+ "68813 1 1 \n",
+ "68814 1 1 \n",
+ "68815 1 1 \n",
+ "68816 1 1 \n",
+ "\n",
+ "[68817 rows x 95 columns]"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Convert categorical variable into dummy/indicator variables\n",
+ "df_temp = pd.get_dummies(df_temp, columns= obj_columns)\n",
+ "df_temp"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create our features\n",
+ "X = df_temp\n",
+ "# Create our target\n",
+ "y = df[target]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " loan_amnt | \n",
+ " int_rate | \n",
+ " installment | \n",
+ " annual_inc | \n",
+ " dti | \n",
+ " delinq_2yrs | \n",
+ " inq_last_6mths | \n",
+ " open_acc | \n",
+ " pub_rec | \n",
+ " revol_bal | \n",
+ " ... | \n",
+ " issue_d_Mar-2019 | \n",
+ " pymnt_plan_n | \n",
+ " initial_list_status_f | \n",
+ " initial_list_status_w | \n",
+ " next_pymnt_d_Apr-2019 | \n",
+ " next_pymnt_d_May-2019 | \n",
+ " application_type_Individual | \n",
+ " application_type_Joint App | \n",
+ " hardship_flag_N | \n",
+ " debt_settlement_flag_N | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 6.881700e+04 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " ... | \n",
+ " 68817.000000 | \n",
+ " 68817.0 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.0 | \n",
+ " 68817.0 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 16677.594562 | \n",
+ " 0.127718 | \n",
+ " 480.652863 | \n",
+ " 8.821371e+04 | \n",
+ " 21.778153 | \n",
+ " 0.217766 | \n",
+ " 0.497697 | \n",
+ " 12.587340 | \n",
+ " 0.126030 | \n",
+ " 17604.142828 | \n",
+ " ... | \n",
+ " 0.177238 | \n",
+ " 1.0 | \n",
+ " 0.123879 | \n",
+ " 0.876121 | \n",
+ " 0.383161 | \n",
+ " 0.616839 | \n",
+ " 0.860340 | \n",
+ " 0.139660 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 10277.348590 | \n",
+ " 0.048130 | \n",
+ " 288.062432 | \n",
+ " 1.155800e+05 | \n",
+ " 20.199244 | \n",
+ " 0.718367 | \n",
+ " 0.758122 | \n",
+ " 6.022869 | \n",
+ " 0.336797 | \n",
+ " 21835.880400 | \n",
+ " ... | \n",
+ " 0.381873 | \n",
+ " 0.0 | \n",
+ " 0.329446 | \n",
+ " 0.329446 | \n",
+ " 0.486161 | \n",
+ " 0.486161 | \n",
+ " 0.346637 | \n",
+ " 0.346637 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 1000.000000 | \n",
+ " 0.060000 | \n",
+ " 30.890000 | \n",
+ " 4.000000e+01 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 2.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 9000.000000 | \n",
+ " 0.088100 | \n",
+ " 265.730000 | \n",
+ " 5.000000e+04 | \n",
+ " 13.890000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 8.000000 | \n",
+ " 0.000000 | \n",
+ " 6293.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 15000.000000 | \n",
+ " 0.118000 | \n",
+ " 404.560000 | \n",
+ " 7.300000e+04 | \n",
+ " 19.760000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 11.000000 | \n",
+ " 0.000000 | \n",
+ " 12068.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 24000.000000 | \n",
+ " 0.155700 | \n",
+ " 648.100000 | \n",
+ " 1.040000e+05 | \n",
+ " 26.660000 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 16.000000 | \n",
+ " 0.000000 | \n",
+ " 21735.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 40000.000000 | \n",
+ " 0.308400 | \n",
+ " 1676.230000 | \n",
+ " 8.797500e+06 | \n",
+ " 999.000000 | \n",
+ " 18.000000 | \n",
+ " 5.000000 | \n",
+ " 72.000000 | \n",
+ " 4.000000 | \n",
+ " 587191.000000 | \n",
+ " ... | \n",
+ " 1.000000 | \n",
+ " 1.0 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
8 rows × 95 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " loan_amnt int_rate installment annual_inc dti \\\n",
+ "count 68817.000000 68817.000000 68817.000000 6.881700e+04 68817.000000 \n",
+ "mean 16677.594562 0.127718 480.652863 8.821371e+04 21.778153 \n",
+ "std 10277.348590 0.048130 288.062432 1.155800e+05 20.199244 \n",
+ "min 1000.000000 0.060000 30.890000 4.000000e+01 0.000000 \n",
+ "25% 9000.000000 0.088100 265.730000 5.000000e+04 13.890000 \n",
+ "50% 15000.000000 0.118000 404.560000 7.300000e+04 19.760000 \n",
+ "75% 24000.000000 0.155700 648.100000 1.040000e+05 26.660000 \n",
+ "max 40000.000000 0.308400 1676.230000 8.797500e+06 999.000000 \n",
+ "\n",
+ " delinq_2yrs inq_last_6mths open_acc pub_rec \\\n",
+ "count 68817.000000 68817.000000 68817.000000 68817.000000 \n",
+ "mean 0.217766 0.497697 12.587340 0.126030 \n",
+ "std 0.718367 0.758122 6.022869 0.336797 \n",
+ "min 0.000000 0.000000 2.000000 0.000000 \n",
+ "25% 0.000000 0.000000 8.000000 0.000000 \n",
+ "50% 0.000000 0.000000 11.000000 0.000000 \n",
+ "75% 0.000000 1.000000 16.000000 0.000000 \n",
+ "max 18.000000 5.000000 72.000000 4.000000 \n",
+ "\n",
+ " revol_bal ... issue_d_Mar-2019 pymnt_plan_n \\\n",
+ "count 68817.000000 ... 68817.000000 68817.0 \n",
+ "mean 17604.142828 ... 0.177238 1.0 \n",
+ "std 21835.880400 ... 0.381873 0.0 \n",
+ "min 0.000000 ... 0.000000 1.0 \n",
+ "25% 6293.000000 ... 0.000000 1.0 \n",
+ "50% 12068.000000 ... 0.000000 1.0 \n",
+ "75% 21735.000000 ... 0.000000 1.0 \n",
+ "max 587191.000000 ... 1.000000 1.0 \n",
+ "\n",
+ " initial_list_status_f initial_list_status_w next_pymnt_d_Apr-2019 \\\n",
+ "count 68817.000000 68817.000000 68817.000000 \n",
+ "mean 0.123879 0.876121 0.383161 \n",
+ "std 0.329446 0.329446 0.486161 \n",
+ "min 0.000000 0.000000 0.000000 \n",
+ "25% 0.000000 1.000000 0.000000 \n",
+ "50% 0.000000 1.000000 0.000000 \n",
+ "75% 0.000000 1.000000 1.000000 \n",
+ "max 1.000000 1.000000 1.000000 \n",
+ "\n",
+ " next_pymnt_d_May-2019 application_type_Individual \\\n",
+ "count 68817.000000 68817.000000 \n",
+ "mean 0.616839 0.860340 \n",
+ "std 0.486161 0.346637 \n",
+ "min 0.000000 0.000000 \n",
+ "25% 0.000000 1.000000 \n",
+ "50% 1.000000 1.000000 \n",
+ "75% 1.000000 1.000000 \n",
+ "max 1.000000 1.000000 \n",
+ "\n",
+ " application_type_Joint App hardship_flag_N debt_settlement_flag_N \n",
+ "count 68817.000000 68817.0 68817.0 \n",
+ "mean 0.139660 1.0 1.0 \n",
+ "std 0.346637 0.0 0.0 \n",
+ "min 0.000000 1.0 1.0 \n",
+ "25% 0.000000 1.0 1.0 \n",
+ "50% 0.000000 1.0 1.0 \n",
+ "75% 0.000000 1.0 1.0 \n",
+ "max 1.000000 1.0 1.0 \n",
+ "\n",
+ "[8 rows x 95 columns]"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "low_risk 68470\n",
+ "high_risk 347\n",
+ "Name: loan_status, dtype: int64"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Check the balance of our target values\n",
+ "y['loan_status'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Counter({'low_risk': 68470, 'high_risk': 347})"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y = y['loan_status'].ravel()\n",
+ "Counter(y)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Counter({'low_risk': 51366, 'high_risk': 246})"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Create X_train, X_test, y_train, y_test\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)\n",
+ "Counter(y_train)"
+ ]
+ },
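+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "With only 347 `high_risk` rows, a plain random split can shift the class ratio between the train and test sets. The cell below is a hedged sketch (not part of the original flow) of the same split with `stratify=y`, which keeps the class proportions equal on both sides."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: stratified split keeps the high_risk/low_risk ratio equal in train and test\n",
+ "X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(\n",
+ "    X, y, random_state=1, stratify=y)\n",
+ "Counter(y_train_s)"
+ ]
+ },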
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Ensemble Learners\n",
+ "\n",
+ "In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost classifier . For each algorithm, be sure to complete the folliowing steps:\n",
+ "\n",
+ "1. Train the model using the training data. \n",
+ "2. Calculate the balanced accuracy score from sklearn.metrics.\n",
+ "3. Print the confusion matrix from sklearn.metrics.\n",
+ "4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.\n",
+ "5. For the Balanced Random Forest Classifier onely, print the feature importance sorted in descending order (most important feature to least important) along with the feature score\n",
+ "\n",
+ "Note: Use a random state of 1 for each algorithm to ensure consistency between tests"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Balanced Random Forest Classifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from imblearn.ensemble import BalancedRandomForestClassifier\n",
+ "bf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Fitting the model\n",
+ "bf_model = bf_model.fit(X_train, y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['low_risk', 'low_risk', 'high_risk', ..., 'low_risk', 'low_risk',\n",
+ " 'low_risk'], dtype=object)"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y_pred = bf_model.predict(X_test)\n",
+ "y_pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.8964254577157803"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Calculated the balanced accuracy score\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "accuracy_score(y_test, y_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Predicted high risk | \n",
+ " Predicted low risk | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Actual high risk | \n",
+ " 68 | \n",
+ " 33 | \n",
+ "
\n",
+ " \n",
+ " Actual low risk | \n",
+ " 1749 | \n",
+ " 15355 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Predicted high risk Predicted low risk\n",
+ "Actual high risk 68 33\n",
+ "Actual low risk 1749 15355"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Display the confusion matrix\n",
+ "from sklearn.metrics import confusion_matrix\n",
+ "# Calculating the confusion matrix.\n",
+ "cm = confusion_matrix(y_test, y_pred)\n",
+ "\n",
+ "# Create a DataFrame from the confusion matrix.\n",
+ "cm_df = pd.DataFrame(\n",
+ " cm,index=[\"Actual high risk\", \"Actual low risk\"], columns=[\"Predicted high risk\", \"Predicted low risk\"])\n",
+ "\n",
+ "cm_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " high_risk 0.04 0.67 0.07 101\n",
+ " low_risk 1.00 0.90 0.95 17104\n",
+ "\n",
+ " accuracy 0.90 17205\n",
+ " macro avg 0.52 0.79 0.51 17205\n",
+ "weighted avg 0.99 0.90 0.94 17205\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print the imbalanced classification report\n",
+ "from sklearn.metrics import classification_report\n",
+ "print(classification_report(y_test, y_pred))"
+ ]
+ },
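+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The imports at the top of this notebook bring in `balanced_accuracy_score` and `classification_report_imbalanced`, which the step list above calls for but the cells so far do not use. The sketch below applies them to the Balanced Random Forest predictions."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: imbalanced-learn metrics using the imports from the top of the notebook\n",
+ "# balanced_accuracy_score averages recall per class, so the 347 high_risk rows\n",
+ "# count as much as the 68470 low_risk rows\n",
+ "print(balanced_accuracy_score(y_test, y_pred))\n",
+ "print(classification_report_imbalanced(y_test, y_pred))"
+ ]
+ },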
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analysis\n",
+ "\n",
+ "**Accuracy**\n",
+ "\n",
+ "The accuracy score of Balanced Random Forest Classifier is .90 or 90%. Accuracy score formula is (TP+TN)/Total, which TP (True Positive) is 68 and TN(True Nagative) is 15355, and total is 17205, so (68+15355)/17205 = .8964. **The accuracy score is very good number. It can be a good model for this dataset. Let take a look into Recall (sensitivity) becuase this rate is important to detect high risk cases of this dataset**\n",
+ "\n",
+ "**Recall (sensitivity):**(Refer: The confustion matrix and the classification report) \n",
+ "\n",
+ "- High risk rate is 67% so every 100 high risk cases the model detects right 67 high risk cases and categorizes 33 high risk cases to become low risk cases. This is a number that we want to see a larger percent becuase we do not want high risk cases to become low risk cases without any caution. **Recall rate is average number. It is not impressive like the accuracy. It tells that the low risk cases are detected better than the high risk cases**\n",
+ "\n",
+ "- Low risk rate is 90% so every 100 low risk cases the model categorizes 10 low risk cases to become high risk cases. This is just alarmed cases that need to be reviewed by business line.\n",
+ "\n",
+ "**Precision:**\n",
+ "\n",
+ "- High risk rate is closed 0.04 (~4%). It tells that the number of low risk cases predicted as high risk cases is much larger than actual high risk cases \n",
+ "- Low risk rate is closed 1.00 (~100%). It tells that the actual low risk cases are overrule the precision rate. This is very true in the imbalanced classificantion cases.\n",
+ "\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([0.00995374, 0.03195562, 0.01707892, 0.01577554, 0.0160705 ,\n",
+ " 0.00327507, 0.00416516, 0.00782073, 0.00089767, 0.01599866,\n",
+ " 0.00897678, 0.01482801, 0.01653796, 0.05764917, 0.06410003,\n",
+ " 0.09175752, 0.0572968 , 0.00598147, 0. , 0. ,\n",
+ " 0.05174788, 0.00051683, 0. , 0. , 0.00218998,\n",
+ " 0.01073642, 0.00747188, 0.00553016, 0.00401646, 0.00641355,\n",
+ " 0.01431883, 0.01440243, 0.01315152, 0.00606626, 0.00458729,\n",
+ " 0.01616972, 0.01156494, 0.01240632, 0.00760805, 0.00686214,\n",
+ " 0.00960365, 0.01038009, 0.01263346, 0.01140916, 0.0127846 ,\n",
+ " 0. , 0. , 0.0116874 , 0.01502927, 0.00631944,\n",
+ " 0.00804558, 0.00687485, 0.01009753, 0.01701486, 0.0026174 ,\n",
+ " 0.00616047, 0.00953742, 0.00758615, 0.00887062, 0.00982172,\n",
+ " 0.00874511, 0.01145588, 0.00727378, 0.0077987 , 0. ,\n",
+ " 0. , 0.00057341, 0.00755451, 0.0099141 , 0.00683872,\n",
+ " 0.00121513, 0. , 0.01535561, 0.01263661, 0.01464882,\n",
+ " 0.01310158, 0. , 0.00227967, 0.00184048, 0.00208511,\n",
+ " 0.00275652, 0.00154722, 0.00173602, 0.00496182, 0.02353679,\n",
+ " 0.01351987, 0. , 0.00041925, 0.00081252, 0.00790625,\n",
+ " 0.0046852 , 0.00122131, 0.00122633, 0. , 0. ])"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# List the features sorted in descending order by feature importance\n",
+ "importances = bf_model.feature_importances_\n",
+ "importances"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[(0.09175752102205247, 'total_rec_prncp'),\n",
+ " (0.06410003199501778, 'total_pymnt_inv'),\n",
+ " (0.05764917485461809, 'total_pymnt'),\n",
+ " (0.05729679526683975, 'total_rec_int'),\n",
+ " (0.05174788106507317, 'last_pymnt_amnt'),\n",
+ " (0.031955619175665397, 'int_rate'),\n",
+ " (0.02353678623968216, 'issue_d_Jan-2019'),\n",
+ " (0.017078915518993903, 'installment'),\n",
+ " (0.017014861224701222, 'mths_since_recent_inq'),\n",
+ " (0.016537957646730293, 'out_prncp_inv'),\n",
+ " (0.016169718411077325, 'max_bal_bc'),\n",
+ " (0.01607049983545137, 'dti'),\n",
+ " (0.01599866290723441, 'revol_bal'),\n",
+ " (0.015775537221600675, 'annual_inc'),\n",
+ " (0.01535560674178928, 'tot_hi_cred_lim'),\n",
+ " (0.015029265003541079, 'mo_sin_old_rev_tl_op'),\n",
+ " (0.014828006488636946, 'out_prncp'),\n",
+ " (0.01464881608833323, 'total_bc_limit'),\n",
+ " (0.014402430445752665, 'total_bal_il'),\n",
+ " (0.014318832248876989, 'mths_since_rcnt_il'),\n",
+ " (0.013519867193755364, 'issue_d_Mar-2019'),\n",
+ " (0.013151520216882331, 'il_util'),\n",
+ " (0.013101578263049833, 'total_il_high_credit_limit'),\n",
+ " (0.012784600558682344, 'bc_util'),\n",
+ " (0.012636608914961465, 'total_bal_ex_mort'),\n",
+ " (0.012633464965390648, 'avg_cur_bal'),\n",
+ " (0.012406321468566728, 'total_rev_hi_lim'),\n",
+ " (0.011687404692448701, 'mo_sin_old_il_acct'),\n",
+ " (0.01156494245653799, 'all_util'),\n",
+ " (0.011455878011762288, 'num_rev_accts'),\n",
+ " (0.011409157520644688, 'bc_open_to_buy'),\n",
+ " (0.01073641504525053, 'tot_cur_bal'),\n",
+ " (0.010380085181706624, 'acc_open_past_24mths'),\n",
+ " (0.010097528131347774, 'mths_since_recent_bc'),\n",
+ " (0.00995373830638152, 'loan_amnt'),\n",
+ " (0.00991410213601043, 'pct_tl_nvr_dlq'),\n",
+ " (0.009821715826953788, 'num_il_tl'),\n",
+ " (0.009603648248133598, 'inq_last_12m'),\n",
+ " (0.009537423049553, 'num_actv_rev_tl'),\n",
+ " (0.008976776055926955, 'total_acc'),\n",
+ " (0.008870623013604539, 'num_bc_tl'),\n",
+ " (0.008745106187024114, 'num_op_rev_tl'),\n",
+ " (0.008045578273709669, 'mo_sin_rcnt_tl'),\n",
+ " (0.007906251501807723, 'next_pymnt_d_Apr-2019'),\n",
+ " (0.00782073260901301, 'open_acc'),\n",
+ " (0.007798696767389274, 'num_sats'),\n",
+ " (0.007608045628523077, 'inq_fi'),\n",
+ " (0.0075861537897335815, 'num_bc_sats'),\n",
+ " (0.007554511001273182, 'num_tl_op_past_12m'),\n",
+ " (0.007471884930172615, 'open_acc_6m'),\n",
+ " (0.007273779915807858, 'num_rev_tl_bal_gt_0'),\n",
+ " (0.006874845464745796, 'mort_acc'),\n",
+ " (0.006862142977394886, 'total_cu_tl'),\n",
+ " (0.006838718858820505, 'percent_bc_gt_75'),\n",
+ " (0.006413554699909871, 'open_il_24m'),\n",
+ " (0.006319439816216779, 'mo_sin_rcnt_rev_tl_op'),\n",
+ " (0.006160469432535709, 'num_actv_bc_tl'),\n",
+ " (0.006066257227997291, 'open_rv_12m'),\n",
+ " (0.005981472544437747, 'total_rec_late_fee'),\n",
+ " (0.0055301594524349495, 'open_act_il'),\n",
+ " (0.004961823663836347, 'issue_d_Feb-2019'),\n",
+ " (0.004685198497435334, 'next_pymnt_d_May-2019'),\n",
+ " (0.0045872929977180356, 'open_rv_24m'),\n",
+ " (0.0041651633321967895, 'inq_last_6mths'),\n",
+ " (0.004016461341161775, 'open_il_12m'),\n",
+ " (0.0032750717701661657, 'delinq_2yrs'),\n",
+ " (0.0027565184136781346, 'verification_status_Not Verified'),\n",
+ " (0.0026174030074401656, 'num_accts_ever_120_pd'),\n",
+ " (0.002279671873697176, 'home_ownership_MORTGAGE'),\n",
+ " (0.0021899772867773103, 'tot_coll_amt'),\n",
+ " (0.0020851101815353096, 'home_ownership_RENT'),\n",
+ " (0.0018404849590376573, 'home_ownership_OWN'),\n",
+ " (0.001736019018028134, 'verification_status_Verified'),\n",
+ " (0.0015472230884974506, 'verification_status_Source Verified'),\n",
+ " (0.0012263315437383057, 'application_type_Joint App'),\n",
+ " (0.0012213148580230454, 'application_type_Individual'),\n",
+ " (0.0012151288883862276, 'pub_rec_bankruptcies'),\n",
+ " (0.0008976722260399365, 'pub_rec'),\n",
+ " (0.0008125182396705508, 'initial_list_status_w'),\n",
+ " (0.000573414997420326, 'num_tl_90g_dpd_24m'),\n",
+ " (0.0005168345750594915, 'collections_12_mths_ex_med'),\n",
+ " (0.0004192455022893127, 'initial_list_status_f'),\n",
+ " (0.0, 'tax_liens'),\n",
+ " (0.0, 'recoveries'),\n",
+ " (0.0, 'pymnt_plan_n'),\n",
+ " (0.0, 'policy_code'),\n",
+ " (0.0, 'num_tl_30dpd'),\n",
+ " (0.0, 'num_tl_120dpd_2m'),\n",
+ " (0.0, 'home_ownership_ANY'),\n",
+ " (0.0, 'hardship_flag_N'),\n",
+ " (0.0, 'delinq_amnt'),\n",
+ " (0.0, 'debt_settlement_flag_N'),\n",
+ " (0.0, 'collection_recovery_fee'),\n",
+ " (0.0, 'chargeoff_within_12_mths'),\n",
+ " (0.0, 'acc_now_delinq')]"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sorted(zip(bf_model.feature_importances_, X.columns), reverse=True)"
+ ]
+ },
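+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The same ranking can be built as a pandas Series, which is convenient for slicing or plotting the top features; a small sketch:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: feature importances as a sorted pandas Series (top 10 shown)\n",
+ "feature_importances = pd.Series(bf_model.feature_importances_, index=X.columns)\n",
+ "feature_importances.sort_values(ascending=False).head(10)"
+ ]
+ },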
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Easy Ensemble AdaBoost Classifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Train the Classifier\n",
+ "from imblearn.ensemble import EasyEnsembleClassifier\n",
+ "ee_model = EasyEnsembleClassifier(random_state=1)\n",
+ "# Fitting the model\n",
+ "ee_model = ee_model.fit(X_train, y_train)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['low_risk', 'low_risk', 'low_risk', ..., 'low_risk', 'low_risk',\n",
+ " 'low_risk'], dtype=object)"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y_pred = ee_model.predict(X_test)\n",
+ "y_pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.9004359197907585"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Calculated the balanced accuracy score\n",
+ "accuracy_score(y_test, y_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Predicted high risk | \n",
+ " Predicted low risk | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Actual high risk | \n",
+ " 94 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " Actual low risk | \n",
+ " 1706 | \n",
+ " 15398 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Predicted high risk Predicted low risk\n",
+ "Actual high risk 94 7\n",
+ "Actual low risk 1706 15398"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Display the confusion matrix\n",
+ "# Calculating the confusion matrix.\n",
+ "cm = confusion_matrix(y_test, y_pred)\n",
+ "\n",
+ "# Create a DataFrame from the confusion matrix.\n",
+ "cm_df = pd.DataFrame(\n",
+ " cm,index=[\"Actual high risk\", \"Actual low risk\"], columns=[\"Predicted high risk\", \"Predicted low risk\"])\n",
+ "\n",
+ "cm_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " high_risk 0.05 0.93 0.10 101\n",
+ " low_risk 1.00 0.90 0.95 17104\n",
+ "\n",
+ " accuracy 0.90 17205\n",
+ " macro avg 0.53 0.92 0.52 17205\n",
+ "weighted avg 0.99 0.90 0.94 17205\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print the imbalanced classification report\n",
+ "print(classification_report(y_test, y_pred))"
+ ]
+ },
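+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The same imbalanced-learn metrics apply here. Since `y_pred` now holds the Easy Ensemble predictions, the sketch below reports its balanced accuracy and imbalanced classification report."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: balanced accuracy and imbalanced report for the Easy Ensemble predictions\n",
+ "print(balanced_accuracy_score(y_test, y_pred))\n",
+ "print(classification_report_imbalanced(y_test, y_pred))"
+ ]
+ },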
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analysis\n",
+ "\n",
+ "**Accuracy**\n",
+ "\n",
+ "The accuracy score of Easy Ensemble AdaBoost Classifier is .90 or 90%. Accuracy score formula is (TP+TN)/Total, which TP (True Positive) is 94 and TN(True Nagative) is 15398, and total is 17205, so (94+15398)/17205 = .9004. **The accuracy score is very impressive number. It can be a good model for this dataset. Let take a look into Recall (sensitivity) becuase this rate is important to detect high risk cases of this dataset**\n",
+ "\n",
+ "**Recall (sensitivity):**(Refer: The confustion matrix and the classification report) \n",
+ "\n",
+ "- High risk rate is 93% so every 100 high risk cases the model detects right 93 high risk cases and categorizes 7 high risk cases to become low risk cases. This is a number that we want to see a larger percent becuase we do not want high risk cases to become low risk cases without any caution. **Recall rate is also very impressive number. Both the accuracy and the recall rate are over 90%. This is a good model to detect the high risk cases of this dataset**\n",
+ "\n",
+ "- Low risk rate is 90% so every 100 low risk cases the model categorizes 10 low risk cases to become high risk cases. This is just alarmed cases that need to be reviewed by business line.\n",
+ "\n",
+ "**Precision:**\n",
+ "\n",
+ "- High risk rate is closed 0.05 (~4%). It tells that the number of low risk cases predicted as high risk cases is much larger than actual high risk cases \n",
+ "- Low risk rate is closed 1.00 (~100%). It tells that the actual low risk cases are overrule the precision rate. This is very true in the imbalanced classificantion cases.\n",
+ "\n",
+ "***"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "mlenv",
+ "language": "python",
+ "name": "mlenv"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Notebooks/credit_risk_resampling.ipynb b/Notebooks/credit_risk_resampling.ipynb
new file mode 100644
index 0000000..9bc3981
--- /dev/null
+++ b/Notebooks/credit_risk_resampling.ipynb
@@ -0,0 +1,2028 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Credit Risk Resampling Techniques"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings('ignore')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from pathlib import Path\n",
+ "from collections import Counter"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Read the CSV and Perform Basic Data Cleaning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "columns = [\n",
+ " \"loan_amnt\", \"int_rate\", \"installment\", \"home_ownership\",\n",
+ " \"annual_inc\", \"verification_status\", \"issue_d\", \"loan_status\",\n",
+ " \"pymnt_plan\", \"dti\", \"delinq_2yrs\", \"inq_last_6mths\",\n",
+ " \"open_acc\", \"pub_rec\", \"revol_bal\", \"total_acc\",\n",
+ " \"initial_list_status\", \"out_prncp\", \"out_prncp_inv\", \"total_pymnt\",\n",
+ " \"total_pymnt_inv\", \"total_rec_prncp\", \"total_rec_int\", \"total_rec_late_fee\",\n",
+ " \"recoveries\", \"collection_recovery_fee\", \"last_pymnt_amnt\", \"next_pymnt_d\",\n",
+ " \"collections_12_mths_ex_med\", \"policy_code\", \"application_type\", \"acc_now_delinq\",\n",
+ " \"tot_coll_amt\", \"tot_cur_bal\", \"open_acc_6m\", \"open_act_il\",\n",
+ " \"open_il_12m\", \"open_il_24m\", \"mths_since_rcnt_il\", \"total_bal_il\",\n",
+ " \"il_util\", \"open_rv_12m\", \"open_rv_24m\", \"max_bal_bc\",\n",
+ " \"all_util\", \"total_rev_hi_lim\", \"inq_fi\", \"total_cu_tl\",\n",
+ " \"inq_last_12m\", \"acc_open_past_24mths\", \"avg_cur_bal\", \"bc_open_to_buy\",\n",
+ " \"bc_util\", \"chargeoff_within_12_mths\", \"delinq_amnt\", \"mo_sin_old_il_acct\",\n",
+ " \"mo_sin_old_rev_tl_op\", \"mo_sin_rcnt_rev_tl_op\", \"mo_sin_rcnt_tl\", \"mort_acc\",\n",
+ " \"mths_since_recent_bc\", \"mths_since_recent_inq\", \"num_accts_ever_120_pd\", \"num_actv_bc_tl\",\n",
+ " \"num_actv_rev_tl\", \"num_bc_sats\", \"num_bc_tl\", \"num_il_tl\",\n",
+ " \"num_op_rev_tl\", \"num_rev_accts\", \"num_rev_tl_bal_gt_0\",\n",
+ " \"num_sats\", \"num_tl_120dpd_2m\", \"num_tl_30dpd\", \"num_tl_90g_dpd_24m\",\n",
+ " \"num_tl_op_past_12m\", \"pct_tl_nvr_dlq\", \"percent_bc_gt_75\", \"pub_rec_bankruptcies\",\n",
+ " \"tax_liens\", \"tot_hi_cred_lim\", \"total_bal_ex_mort\", \"total_bc_limit\",\n",
+ " \"total_il_high_credit_limit\", \"hardship_flag\", \"debt_settlement_flag\"\n",
+ "]\n",
+ "\n",
+ "target = [\"loan_status\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " loan_amnt | \n",
+ " int_rate | \n",
+ " installment | \n",
+ " home_ownership | \n",
+ " annual_inc | \n",
+ " verification_status | \n",
+ " issue_d | \n",
+ " loan_status | \n",
+ " pymnt_plan | \n",
+ " dti | \n",
+ " ... | \n",
+ " pct_tl_nvr_dlq | \n",
+ " percent_bc_gt_75 | \n",
+ " pub_rec_bankruptcies | \n",
+ " tax_liens | \n",
+ " tot_hi_cred_lim | \n",
+ " total_bal_ex_mort | \n",
+ " total_bc_limit | \n",
+ " total_il_high_credit_limit | \n",
+ " hardship_flag | \n",
+ " debt_settlement_flag | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10500.0 | \n",
+ " 0.1719 | \n",
+ " 375.35 | \n",
+ " RENT | \n",
+ " 66000.0 | \n",
+ " Source Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 27.24 | \n",
+ " ... | \n",
+ " 85.7 | \n",
+ " 100.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 65687.0 | \n",
+ " 38199.0 | \n",
+ " 2000.0 | \n",
+ " 61987.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 25000.0 | \n",
+ " 0.2000 | \n",
+ " 929.09 | \n",
+ " MORTGAGE | \n",
+ " 105000.0 | \n",
+ " Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 20.23 | \n",
+ " ... | \n",
+ " 91.2 | \n",
+ " 50.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 271427.0 | \n",
+ " 60641.0 | \n",
+ " 41200.0 | \n",
+ " 49197.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 20000.0 | \n",
+ " 0.2000 | \n",
+ " 529.88 | \n",
+ " MORTGAGE | \n",
+ " 56000.0 | \n",
+ " Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 24.26 | \n",
+ " ... | \n",
+ " 66.7 | \n",
+ " 50.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 60644.0 | \n",
+ " 45684.0 | \n",
+ " 7500.0 | \n",
+ " 43144.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 10000.0 | \n",
+ " 0.1640 | \n",
+ " 353.55 | \n",
+ " RENT | \n",
+ " 92000.0 | \n",
+ " Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 31.44 | \n",
+ " ... | \n",
+ " 100.0 | \n",
+ " 50.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 99506.0 | \n",
+ " 68784.0 | \n",
+ " 19700.0 | \n",
+ " 76506.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 22000.0 | \n",
+ " 0.1474 | \n",
+ " 520.39 | \n",
+ " MORTGAGE | \n",
+ " 52000.0 | \n",
+ " Not Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 18.76 | \n",
+ " ... | \n",
+ " 100.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 219750.0 | \n",
+ " 25919.0 | \n",
+ " 27600.0 | \n",
+ " 20000.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 86 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " loan_amnt int_rate installment home_ownership annual_inc \\\n",
+ "0 10500.0 0.1719 375.35 RENT 66000.0 \n",
+ "1 25000.0 0.2000 929.09 MORTGAGE 105000.0 \n",
+ "2 20000.0 0.2000 529.88 MORTGAGE 56000.0 \n",
+ "3 10000.0 0.1640 353.55 RENT 92000.0 \n",
+ "4 22000.0 0.1474 520.39 MORTGAGE 52000.0 \n",
+ "\n",
+ " verification_status issue_d loan_status pymnt_plan dti ... \\\n",
+ "0 Source Verified Mar-2019 low_risk n 27.24 ... \n",
+ "1 Verified Mar-2019 low_risk n 20.23 ... \n",
+ "2 Verified Mar-2019 low_risk n 24.26 ... \n",
+ "3 Verified Mar-2019 low_risk n 31.44 ... \n",
+ "4 Not Verified Mar-2019 low_risk n 18.76 ... \n",
+ "\n",
+ " pct_tl_nvr_dlq percent_bc_gt_75 pub_rec_bankruptcies tax_liens \\\n",
+ "0 85.7 100.0 0.0 0.0 \n",
+ "1 91.2 50.0 1.0 0.0 \n",
+ "2 66.7 50.0 0.0 0.0 \n",
+ "3 100.0 50.0 1.0 0.0 \n",
+ "4 100.0 0.0 0.0 0.0 \n",
+ "\n",
+ " tot_hi_cred_lim total_bal_ex_mort total_bc_limit \\\n",
+ "0 65687.0 38199.0 2000.0 \n",
+ "1 271427.0 60641.0 41200.0 \n",
+ "2 60644.0 45684.0 7500.0 \n",
+ "3 99506.0 68784.0 19700.0 \n",
+ "4 219750.0 25919.0 27600.0 \n",
+ "\n",
+ " total_il_high_credit_limit hardship_flag debt_settlement_flag \n",
+ "0 61987.0 N N \n",
+ "1 49197.0 N N \n",
+ "2 43144.0 N N \n",
+ "3 76506.0 N N \n",
+ "4 20000.0 N N \n",
+ "\n",
+ "[5 rows x 86 columns]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Load the data\n",
+ "file_path = Path('../Resources/LoanStats_2019Q1.csv')\n",
+ "df = pd.read_csv(file_path, skiprows=1)[:-2]\n",
+ "df = df.loc[:, columns].copy()\n",
+ "\n",
+ "# Drop the null columns where all values are null\n",
+ "df = df.dropna(axis='columns', how='all')\n",
+ "\n",
+ "# Drop the null rows\n",
+ "df = df.dropna()\n",
+ "\n",
+ "# Remove the `Issued` loan status\n",
+ "issued_mask = df['loan_status'] != 'Issued'\n",
+ "df = df.loc[issued_mask]\n",
+ "\n",
+ "# convert interest rate to numerical\n",
+ "df['int_rate'] = df['int_rate'].str.replace('%', '')\n",
+ "df['int_rate'] = df['int_rate'].astype('float') / 100\n",
+ "\n",
+ "\n",
+ "# Convert the target column values to low_risk and high_risk based on their values\n",
+ "x = {'Current': 'low_risk'} \n",
+ "df = df.replace(x)\n",
+ "\n",
+ "x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk') \n",
+ "df = df.replace(x)\n",
+ "\n",
+ "df.reset_index(inplace=True, drop=True)\n",
+ "\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Split the Data into Training and Testing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['home_ownership',\n",
+ " 'verification_status',\n",
+ " 'issue_d',\n",
+ " 'pymnt_plan',\n",
+ " 'initial_list_status',\n",
+ " 'next_pymnt_d',\n",
+ " 'application_type',\n",
+ " 'hardship_flag',\n",
+ " 'debt_settlement_flag']"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Reserve the original data frame by copying\n",
+ "df_temp = df.copy()\n",
+ "#Drop the target column\n",
+ "df_temp = df.drop(columns=target)\n",
+ "#Find columns that is string\n",
+ "obj_columns = list(df_temp.dtypes[df_temp.dtypes == np.object].index)\n",
+ "obj_columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " loan_amnt | \n",
+ " int_rate | \n",
+ " installment | \n",
+ " annual_inc | \n",
+ " dti | \n",
+ " delinq_2yrs | \n",
+ " inq_last_6mths | \n",
+ " open_acc | \n",
+ " pub_rec | \n",
+ " revol_bal | \n",
+ " ... | \n",
+ " issue_d_Mar-2019 | \n",
+ " pymnt_plan_n | \n",
+ " initial_list_status_f | \n",
+ " initial_list_status_w | \n",
+ " next_pymnt_d_Apr-2019 | \n",
+ " next_pymnt_d_May-2019 | \n",
+ " application_type_Individual | \n",
+ " application_type_Joint App | \n",
+ " hardship_flag_N | \n",
+ " debt_settlement_flag_N | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10500.0 | \n",
+ " 0.1719 | \n",
+ " 375.35 | \n",
+ " 66000.0 | \n",
+ " 27.24 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 8.0 | \n",
+ " 0.0 | \n",
+ " 1609.0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 25000.0 | \n",
+ " 0.2000 | \n",
+ " 929.09 | \n",
+ " 105000.0 | \n",
+ " 20.23 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 17.0 | \n",
+ " 1.0 | \n",
+ " 18368.0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 20000.0 | \n",
+ " 0.2000 | \n",
+ " 529.88 | \n",
+ " 56000.0 | \n",
+ " 24.26 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 8.0 | \n",
+ " 0.0 | \n",
+ " 13247.0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 10000.0 | \n",
+ " 0.1640 | \n",
+ " 353.55 | \n",
+ " 92000.0 | \n",
+ " 31.44 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 10.0 | \n",
+ " 1.0 | \n",
+ " 17996.0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 22000.0 | \n",
+ " 0.1474 | \n",
+ " 520.39 | \n",
+ " 52000.0 | \n",
+ " 18.76 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 14.0 | \n",
+ " 0.0 | \n",
+ " 9091.0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 68812 | \n",
+ " 10000.0 | \n",
+ " 0.1502 | \n",
+ " 346.76 | \n",
+ " 26000.0 | \n",
+ " 9.60 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 9.0 | \n",
+ " 0.0 | \n",
+ " 2684.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 68813 | \n",
+ " 12000.0 | \n",
+ " 0.2727 | \n",
+ " 368.37 | \n",
+ " 63000.0 | \n",
+ " 29.07 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 8.0 | \n",
+ " 0.0 | \n",
+ " 13314.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 68814 | \n",
+ " 5000.0 | \n",
+ " 0.1992 | \n",
+ " 185.62 | \n",
+ " 52000.0 | \n",
+ " 14.86 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 1.0 | \n",
+ " 3715.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 68815 | \n",
+ " 40000.0 | \n",
+ " 0.0646 | \n",
+ " 1225.24 | \n",
+ " 520000.0 | \n",
+ " 9.96 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 21.0 | \n",
+ " 0.0 | \n",
+ " 59529.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 68816 | \n",
+ " 16000.0 | \n",
+ " 0.1131 | \n",
+ " 350.36 | \n",
+ " 72000.0 | \n",
+ " 7.02 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 12.0 | \n",
+ " 1.0 | \n",
+ " 11882.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
68817 rows × 95 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " loan_amnt int_rate installment annual_inc dti delinq_2yrs \\\n",
+ "0 10500.0 0.1719 375.35 66000.0 27.24 0.0 \n",
+ "1 25000.0 0.2000 929.09 105000.0 20.23 0.0 \n",
+ "2 20000.0 0.2000 529.88 56000.0 24.26 0.0 \n",
+ "3 10000.0 0.1640 353.55 92000.0 31.44 0.0 \n",
+ "4 22000.0 0.1474 520.39 52000.0 18.76 0.0 \n",
+ "... ... ... ... ... ... ... \n",
+ "68812 10000.0 0.1502 346.76 26000.0 9.60 0.0 \n",
+ "68813 12000.0 0.2727 368.37 63000.0 29.07 0.0 \n",
+ "68814 5000.0 0.1992 185.62 52000.0 14.86 0.0 \n",
+ "68815 40000.0 0.0646 1225.24 520000.0 9.96 0.0 \n",
+ "68816 16000.0 0.1131 350.36 72000.0 7.02 2.0 \n",
+ "\n",
+ " inq_last_6mths open_acc pub_rec revol_bal ... issue_d_Mar-2019 \\\n",
+ "0 0.0 8.0 0.0 1609.0 ... 1 \n",
+ "1 0.0 17.0 1.0 18368.0 ... 1 \n",
+ "2 0.0 8.0 0.0 13247.0 ... 1 \n",
+ "3 1.0 10.0 1.0 17996.0 ... 1 \n",
+ "4 1.0 14.0 0.0 9091.0 ... 1 \n",
+ "... ... ... ... ... ... ... \n",
+ "68812 0.0 9.0 0.0 2684.0 ... 0 \n",
+ "68813 0.0 8.0 0.0 13314.0 ... 0 \n",
+ "68814 0.0 5.0 1.0 3715.0 ... 0 \n",
+ "68815 1.0 21.0 0.0 59529.0 ... 0 \n",
+ "68816 0.0 12.0 1.0 11882.0 ... 0 \n",
+ "\n",
+ " pymnt_plan_n initial_list_status_f initial_list_status_w \\\n",
+ "0 1 0 1 \n",
+ "1 1 0 1 \n",
+ "2 1 0 1 \n",
+ "3 1 0 1 \n",
+ "4 1 0 1 \n",
+ "... ... ... ... \n",
+ "68812 1 0 1 \n",
+ "68813 1 0 1 \n",
+ "68814 1 0 1 \n",
+ "68815 1 1 0 \n",
+ "68816 1 0 1 \n",
+ "\n",
+ " next_pymnt_d_Apr-2019 next_pymnt_d_May-2019 \\\n",
+ "0 0 1 \n",
+ "1 0 1 \n",
+ "2 0 1 \n",
+ "3 0 1 \n",
+ "4 0 1 \n",
+ "... ... ... \n",
+ "68812 0 1 \n",
+ "68813 0 1 \n",
+ "68814 0 1 \n",
+ "68815 0 1 \n",
+ "68816 0 1 \n",
+ "\n",
+ " application_type_Individual application_type_Joint App \\\n",
+ "0 1 0 \n",
+ "1 1 0 \n",
+ "2 1 0 \n",
+ "3 1 0 \n",
+ "4 1 0 \n",
+ "... ... ... \n",
+ "68812 1 0 \n",
+ "68813 1 0 \n",
+ "68814 1 0 \n",
+ "68815 1 0 \n",
+ "68816 1 0 \n",
+ "\n",
+ " hardship_flag_N debt_settlement_flag_N \n",
+ "0 1 1 \n",
+ "1 1 1 \n",
+ "2 1 1 \n",
+ "3 1 1 \n",
+ "4 1 1 \n",
+ "... ... ... \n",
+ "68812 1 1 \n",
+ "68813 1 1 \n",
+ "68814 1 1 \n",
+ "68815 1 1 \n",
+ "68816 1 1 \n",
+ "\n",
+ "[68817 rows x 95 columns]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Convert categorical variable into dummy/indicator variables\n",
+ "df_temp = pd.get_dummies(df_temp, columns= obj_columns)\n",
+ "df_temp"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create our features\n",
+ "X = df_temp\n",
+ "# Create our target\n",
+ "y = df[target]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " loan_amnt | \n",
+ " int_rate | \n",
+ " installment | \n",
+ " annual_inc | \n",
+ " dti | \n",
+ " delinq_2yrs | \n",
+ " inq_last_6mths | \n",
+ " open_acc | \n",
+ " pub_rec | \n",
+ " revol_bal | \n",
+ " ... | \n",
+ " issue_d_Mar-2019 | \n",
+ " pymnt_plan_n | \n",
+ " initial_list_status_f | \n",
+ " initial_list_status_w | \n",
+ " next_pymnt_d_Apr-2019 | \n",
+ " next_pymnt_d_May-2019 | \n",
+ " application_type_Individual | \n",
+ " application_type_Joint App | \n",
+ " hardship_flag_N | \n",
+ " debt_settlement_flag_N | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 6.881700e+04 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " ... | \n",
+ " 68817.000000 | \n",
+ " 68817.0 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.0 | \n",
+ " 68817.0 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 16677.594562 | \n",
+ " 0.127718 | \n",
+ " 480.652863 | \n",
+ " 8.821371e+04 | \n",
+ " 21.778153 | \n",
+ " 0.217766 | \n",
+ " 0.497697 | \n",
+ " 12.587340 | \n",
+ " 0.126030 | \n",
+ " 17604.142828 | \n",
+ " ... | \n",
+ " 0.177238 | \n",
+ " 1.0 | \n",
+ " 0.123879 | \n",
+ " 0.876121 | \n",
+ " 0.383161 | \n",
+ " 0.616839 | \n",
+ " 0.860340 | \n",
+ " 0.139660 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 10277.348590 | \n",
+ " 0.048130 | \n",
+ " 288.062432 | \n",
+ " 1.155800e+05 | \n",
+ " 20.199244 | \n",
+ " 0.718367 | \n",
+ " 0.758122 | \n",
+ " 6.022869 | \n",
+ " 0.336797 | \n",
+ " 21835.880400 | \n",
+ " ... | \n",
+ " 0.381873 | \n",
+ " 0.0 | \n",
+ " 0.329446 | \n",
+ " 0.329446 | \n",
+ " 0.486161 | \n",
+ " 0.486161 | \n",
+ " 0.346637 | \n",
+ " 0.346637 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 1000.000000 | \n",
+ " 0.060000 | \n",
+ " 30.890000 | \n",
+ " 4.000000e+01 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 2.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 9000.000000 | \n",
+ " 0.088100 | \n",
+ " 265.730000 | \n",
+ " 5.000000e+04 | \n",
+ " 13.890000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 8.000000 | \n",
+ " 0.000000 | \n",
+ " 6293.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 15000.000000 | \n",
+ " 0.118000 | \n",
+ " 404.560000 | \n",
+ " 7.300000e+04 | \n",
+ " 19.760000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 11.000000 | \n",
+ " 0.000000 | \n",
+ " 12068.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 24000.000000 | \n",
+ " 0.155700 | \n",
+ " 648.100000 | \n",
+ " 1.040000e+05 | \n",
+ " 26.660000 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 16.000000 | \n",
+ " 0.000000 | \n",
+ " 21735.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 40000.000000 | \n",
+ " 0.308400 | \n",
+ " 1676.230000 | \n",
+ " 8.797500e+06 | \n",
+ " 999.000000 | \n",
+ " 18.000000 | \n",
+ " 5.000000 | \n",
+ " 72.000000 | \n",
+ " 4.000000 | \n",
+ " 587191.000000 | \n",
+ " ... | \n",
+ " 1.000000 | \n",
+ " 1.0 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
8 rows × 95 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " loan_amnt int_rate installment annual_inc dti \\\n",
+ "count 68817.000000 68817.000000 68817.000000 6.881700e+04 68817.000000 \n",
+ "mean 16677.594562 0.127718 480.652863 8.821371e+04 21.778153 \n",
+ "std 10277.348590 0.048130 288.062432 1.155800e+05 20.199244 \n",
+ "min 1000.000000 0.060000 30.890000 4.000000e+01 0.000000 \n",
+ "25% 9000.000000 0.088100 265.730000 5.000000e+04 13.890000 \n",
+ "50% 15000.000000 0.118000 404.560000 7.300000e+04 19.760000 \n",
+ "75% 24000.000000 0.155700 648.100000 1.040000e+05 26.660000 \n",
+ "max 40000.000000 0.308400 1676.230000 8.797500e+06 999.000000 \n",
+ "\n",
+ " delinq_2yrs inq_last_6mths open_acc pub_rec \\\n",
+ "count 68817.000000 68817.000000 68817.000000 68817.000000 \n",
+ "mean 0.217766 0.497697 12.587340 0.126030 \n",
+ "std 0.718367 0.758122 6.022869 0.336797 \n",
+ "min 0.000000 0.000000 2.000000 0.000000 \n",
+ "25% 0.000000 0.000000 8.000000 0.000000 \n",
+ "50% 0.000000 0.000000 11.000000 0.000000 \n",
+ "75% 0.000000 1.000000 16.000000 0.000000 \n",
+ "max 18.000000 5.000000 72.000000 4.000000 \n",
+ "\n",
+ " revol_bal ... issue_d_Mar-2019 pymnt_plan_n \\\n",
+ "count 68817.000000 ... 68817.000000 68817.0 \n",
+ "mean 17604.142828 ... 0.177238 1.0 \n",
+ "std 21835.880400 ... 0.381873 0.0 \n",
+ "min 0.000000 ... 0.000000 1.0 \n",
+ "25% 6293.000000 ... 0.000000 1.0 \n",
+ "50% 12068.000000 ... 0.000000 1.0 \n",
+ "75% 21735.000000 ... 0.000000 1.0 \n",
+ "max 587191.000000 ... 1.000000 1.0 \n",
+ "\n",
+ " initial_list_status_f initial_list_status_w next_pymnt_d_Apr-2019 \\\n",
+ "count 68817.000000 68817.000000 68817.000000 \n",
+ "mean 0.123879 0.876121 0.383161 \n",
+ "std 0.329446 0.329446 0.486161 \n",
+ "min 0.000000 0.000000 0.000000 \n",
+ "25% 0.000000 1.000000 0.000000 \n",
+ "50% 0.000000 1.000000 0.000000 \n",
+ "75% 0.000000 1.000000 1.000000 \n",
+ "max 1.000000 1.000000 1.000000 \n",
+ "\n",
+ " next_pymnt_d_May-2019 application_type_Individual \\\n",
+ "count 68817.000000 68817.000000 \n",
+ "mean 0.616839 0.860340 \n",
+ "std 0.486161 0.346637 \n",
+ "min 0.000000 0.000000 \n",
+ "25% 0.000000 1.000000 \n",
+ "50% 1.000000 1.000000 \n",
+ "75% 1.000000 1.000000 \n",
+ "max 1.000000 1.000000 \n",
+ "\n",
+ " application_type_Joint App hardship_flag_N debt_settlement_flag_N \n",
+ "count 68817.000000 68817.0 68817.0 \n",
+ "mean 0.139660 1.0 1.0 \n",
+ "std 0.346637 0.0 0.0 \n",
+ "min 0.000000 1.0 1.0 \n",
+ "25% 0.000000 1.0 1.0 \n",
+ "50% 0.000000 1.0 1.0 \n",
+ "75% 0.000000 1.0 1.0 \n",
+ "max 1.000000 1.0 1.0 \n",
+ "\n",
+ "[8 rows x 95 columns]"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "low_risk 68470\n",
+ "high_risk 347\n",
+ "Name: loan_status, dtype: int64"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Check the balance of our target values\n",
+ "y['loan_status'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Counter({'low_risk': 68470, 'high_risk': 347})"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y = y['loan_status'].ravel()\n",
+ "Counter(y)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Counter({'low_risk': 51366, 'high_risk': 246})"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Create X_train, X_test, y_train, y_test\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)\n",
+ "Counter(y_train)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Oversampling\n",
+ "\n",
+ "In this section, you will compare two oversampling algorithms to determine which algorithm results in the best performance. You will oversample the data using the naive random oversampling algorithm and the SMOTE algorithm. For each algorithm, be sure to complete the folliowing steps:\n",
+ "\n",
+ "1. View the count of the target classes using `Counter` from the collections library. \n",
+ "3. Use the resampled data to train a logistic regression model.\n",
+ "3. Calculate the balanced accuracy score from sklearn.metrics.\n",
+ "4. Print the confusion matrix from sklearn.metrics.\n",
+ "5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.\n",
+ "\n",
+ "Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Naive Random Oversampling"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Counter({'low_risk': 51366, 'high_risk': 51366})"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Resample the training data with the RandomOversampler\n",
+ "from imblearn.over_sampling import RandomOverSampler\n",
+ "ros = RandomOverSampler(random_state=1)\n",
+ "X_resampled, y_resampled = ros.fit_resample(X_train, y_train)\n",
+ "Counter(y_resampled)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
+ " intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
+ " multi_class='auto', n_jobs=None, penalty='l2',\n",
+ " random_state=1, solver='lbfgs', tol=0.0001, verbose=0,\n",
+ " warm_start=False)"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Train the Logistic Regression model using the resampled data\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "\n",
+ "model = LogisticRegression(solver='lbfgs', random_state=1)\n",
+ "model.fit(X_resampled, y_resampled)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.6870331414572701"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Run the test with model\n",
+ "y_pred = model.predict(X_test)\n",
+ "# Calculated the balanced accuracy score\n",
+ "from sklearn.metrics import balanced_accuracy_score\n",
+ "balanced_accuracy_score(y_test, y_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Predicted high risk | \n",
+ " Predicted low risk | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Actual high risk | \n",
+ " 78 | \n",
+ " 23 | \n",
+ "
\n",
+ " \n",
+ " Actual low risk | \n",
+ " 6811 | \n",
+ " 10293 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Predicted high risk Predicted low risk\n",
+ "Actual high risk 78 23\n",
+ "Actual low risk 6811 10293"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Display the confusion matrix\n",
+ "from sklearn.metrics import confusion_matrix\n",
+ "\n",
+ "# Calculating the confusion matrix.\n",
+ "cm = confusion_matrix(y_test, y_pred)\n",
+ "\n",
+ "# Create a DataFrame from the confusion matrix.\n",
+ "cm_df = pd.DataFrame(\n",
+ " cm,index=[\"Actual high risk\", \"Actual low risk\"], columns=[\"Predicted high risk\", \"Predicted low risk\"])\n",
+ "\n",
+ "cm_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " pre rec spe f1 geo iba sup\n",
+ "\n",
+ " high_risk 0.01 0.77 0.60 0.02 0.68 0.47 101\n",
+ " low_risk 1.00 0.60 0.77 0.75 0.68 0.46 17104\n",
+ "\n",
+ "avg / total 0.99 0.60 0.77 0.75 0.68 0.46 17205\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print the imbalanced classification report\n",
+ "from imblearn.metrics import classification_report_imbalanced\n",
+ "print(classification_report_imbalanced(y_test, y_pred))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analysis\n",
+ "\n",
+ "**Accuracy**\n",
+ "\n",
+ "The balanced accuracy score of Naive Random Oversampling is .69 or 69% after resampled the data training with the Random Over Sampler while Accuracy score is .60 or 60% based on the confustion matrix. Accuracy score formula is (TP+TN)/Total, which TP (True Positive) is 78 and TN(True Nagative) is 10293, and total is 17205, so (78+10293)/17205 = .602789. This is a important factor that we need to evaluate the Machine Learning (ML) method becuase of the accuracy prediction of the corrected cases in the dataset. **Naive Random Oversampling is actually improved the accuracy prediction**\n",
+ "\n",
+ "**Recall (sensitivity):** (Refer: the confustion matrix and the classification report)\n",
+ "\n",
+ "- High risk rate is 77% so every 100 high risk cases the model detects right 77 high risk cases and categorizes 33 high risk cases to become low risk cases. This is a number that we want to see a larger percent becuase we do not want high risk cases to become low risk cases without any caution. \n",
+ "- Low risk rate is 60% so every 100 low risk cases the model categorizes 40 low risk cases to become high risk cases. This is just alarmed cases that need to be reviewed by business line\n",
+ "\n",
+ "**Precision:**\n",
+ "\n",
+ "- High risk rate is closed 0.01 (~1%). It tells that the number of low risk cases predicted as high risk cases is much larger than actual high risk cases \n",
+ "- Low risk rate is closed 1.00 (~100%). It tells that the actual low risk cases are overrule the precision rate. This is very true in the imbalanced classificantion cases.\n",
+ "\n",
+ "**For this dateset, Naive Random Oversampling accuracy and recall rate is the best among below models: Naive Random Oversampling, SMOTE,Undersampling,Combination Sampling. However, I recommend to use the Easy Ensemble AdaBoost Classifier model for the dataset because the accuracy and recall rate are very impressive(over 90%) (Refer: ReadMe.md)**\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### SMOTE Oversampling"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Counter({'low_risk': 51366, 'high_risk': 51366})"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Resample the training data with SMOTE\n",
+ "from imblearn.over_sampling import SMOTE\n",
+ "smote = SMOTE(random_state=1)\n",
+ "X_resampled, y_resampled = smote.fit_resample(X_train, y_train)\n",
+ "Counter(y_resampled)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
+ " intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
+ " multi_class='auto', n_jobs=None, penalty='l2',\n",
+ " random_state=1, solver='lbfgs', tol=0.0001, verbose=0,\n",
+ " warm_start=False)"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Train the Logistic Regression model using the resampled data\n",
+ "model = LogisticRegression(solver='lbfgs', random_state=1)\n",
+ "model.fit(X_resampled, y_resampled)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.6642357991645751"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Calculated the balanced accuracy score\n",
+ "y_pred = model.predict(X_test)\n",
+ "balanced_accuracy_score(y_test, y_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Predicted high risk | \n",
+ " Predicted low risk | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Actual high risk | \n",
+ " 64 | \n",
+ " 37 | \n",
+ "
\n",
+ " \n",
+ " Actual low risk | \n",
+ " 5220 | \n",
+ " 11884 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Predicted high risk Predicted low risk\n",
+ "Actual high risk 64 37\n",
+ "Actual low risk 5220 11884"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Display the confusion matrix\n",
+ "\n",
+ "# Calculating the confusion matrix.\n",
+ "cm = confusion_matrix(y_test, y_pred)\n",
+ "\n",
+ "# Create a DataFrame from the confusion matrix.\n",
+ "cm_df = pd.DataFrame(\n",
+ " cm,index=[\"Actual high risk\", \"Actual low risk\"], columns=[\"Predicted high risk\", \"Predicted low risk\"])\n",
+ "\n",
+ "cm_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " pre rec spe f1 geo iba sup\n",
+ "\n",
+ " high_risk 0.01 0.63 0.69 0.02 0.66 0.44 101\n",
+ " low_risk 1.00 0.69 0.63 0.82 0.66 0.44 17104\n",
+ "\n",
+ "avg / total 0.99 0.69 0.63 0.81 0.66 0.44 17205\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print the imbalanced classification report\n",
+ "print(classification_report_imbalanced(y_test, y_pred))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analysis\n",
+ "\n",
+ "**Accuracy**\n",
+ "\n",
+ "The balanced accuracy score of SMOTE Oversampling is .66 or 66% after resampled the data training with the SMOTE while Accuracy score is .69 or 69% based on the confustion matrix. Accuracy score formula is (TP+TN)/Total, which TP (True Positive) is 64 and TN(True Nagative) is 11884, and total is 17205, so (64+11884)/17205 = .69444. With SMOTE model, the balanced accuracy rate is lower than the actual accuracy rate. It means the good loan (low risk) cases predicted better. **SMOTE model accuracy is very close and similar to Naive Random Oversampling, but the Naive Random Oversampling is better selection**\n",
+ "\n",
+ "**Recall (sensitivity):** (Refer: the confustion matrix and the classification report)\n",
+ "\n",
+ "- High risk rate is 63% so every 100 high risk cases the model detects right 63 high risk cases and categorizes 37 high risk cases to become low risk cases. This is a number that we want to see a larger percent becuase we do not want high risk cases to become low risk cases without any caution. **SMOTE model recall (sensitivity) rate is lower than Naive Random Oversampling**\n",
+ "\n",
+ "- Low risk rate is 69% so every 100 low risk cases the model categorizes 41 low risk cases to become high risk cases. This is just alarmed cases that need to be reviewed by business line\n",
+ "\n",
+ "**Precision:**\n",
+ "\n",
+ "- High risk rate is closed 0.01 (~1%). It tells that the number of low risk cases predicted as high risk cases is much larger than actual high risk cases \n",
+ "- Low risk rate is closed 1.00 (~100%). It tells that the actual low risk cases are overrule the precision rate. This is very true in the imbalanced classificantion cases.\n",
+ "\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Undersampling\n",
+ "\n",
+ "In this section, you will test an undersampling algorithms to determine which algorithm results in the best performance compared to the oversampling algorithms above. You will undersample the data using the Cluster Centroids algorithm and complete the folliowing steps:\n",
+ "\n",
+ "1. View the count of the target classes using `Counter` from the collections library. \n",
+ "3. Use the resampled data to train a logistic regression model.\n",
+ "3. Calculate the balanced accuracy score from sklearn.metrics.\n",
+ "4. Print the confusion matrix from sklearn.metrics.\n",
+ "5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.\n",
+ "\n",
+ "Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Counter({'high_risk': 246, 'low_risk': 246})"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Resample the data using the ClusterCentroids resampler\n",
+ "from imblearn.under_sampling import ClusterCentroids\n",
+ "cc = ClusterCentroids(random_state=1)\n",
+ "X_resampled, y_resampled = cc.fit_resample(X_train, y_train)\n",
+ "Counter(y_resampled)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
+ " intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
+ " multi_class='auto', n_jobs=None, penalty='l2',\n",
+ " random_state=1, solver='lbfgs', tol=0.0001, verbose=0,\n",
+ " warm_start=False)"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Train the Logistic Regression model using the resampled data\n",
+ "model = LogisticRegression(solver='lbfgs', random_state=1)\n",
+ "model.fit(X_resampled, y_resampled)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.5330103432466726"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Calculated the balanced accuracy score\n",
+ "y_pred = model.predict(X_test)\n",
+ "balanced_accuracy_score(y_test, y_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Predicted high risk | \n",
+ " Predicted low risk | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Actual high risk | \n",
+ " 67 | \n",
+ " 34 | \n",
+ "
\n",
+ " \n",
+ " Actual low risk | \n",
+ " 10217 | \n",
+ " 6887 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Predicted high risk Predicted low risk\n",
+ "Actual high risk 67 34\n",
+ "Actual low risk 10217 6887"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Display the confusion matrix\n",
+ "\n",
+ "# Calculating the confusion matrix.\n",
+ "cm = confusion_matrix(y_test, y_pred)\n",
+ "\n",
+ "# Create a DataFrame from the confusion matrix.\n",
+ "cm_df = pd.DataFrame(\n",
+ " cm,index=[\"Actual high risk\", \"Actual low risk\"], columns=[\"Predicted high risk\", \"Predicted low risk\"])\n",
+ "\n",
+ "cm_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " pre rec spe f1 geo iba sup\n",
+ "\n",
+ " high_risk 0.01 0.66 0.40 0.01 0.52 0.27 101\n",
+ " low_risk 1.00 0.40 0.66 0.57 0.52 0.26 17104\n",
+ "\n",
+ "avg / total 0.99 0.40 0.66 0.57 0.52 0.26 17205\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print the imbalanced classification report\n",
+ "print(classification_report_imbalanced(y_test, y_pred))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analysis\n",
+ "\n",
+ "**Accuracy**\n",
+ "\n",
+ "The balanced accuracy score of Undersampling is .53 or 53% after resampled the data training with the ClusterCentroids while Accuracy score is .40 or 40% based on the confustion matrix. Accuracy score formula is (TP+TN)/Total, which TP (True Positive) is 67 and TN(True Nagative) is 6887, and total is 17205, so (67+6887)/17205 = .4041. With Undersampling model, the balanced accuracy rate is higher than the actual accuracy rate. **Undersampling model accuracy is very low. This method may not the righ model for this dataset**\n",
+ "\n",
+ "**Recall (sensitivity):**(Refer: The confustion matrix and the classification report) \n",
+ "\n",
+ "- High risk rate is 66% so every 100 high risk cases the model detects right 66 high risk cases and categorizes 34 high risk cases to become low risk cases. This is a number that we want to see a larger percent becuase we do not want high risk cases to become low risk cases without any caution. **Naive Random Oversampling still the best model selection because the accurary and recall rate**\n",
+ "\n",
+ "- Low risk rate is 40% so every 100 low risk cases the model categorizes 60 low risk cases to become high risk cases. This is just alarmed cases that need to be reviewed by business line, but it is **very high alarmed cases**\n",
+ "\n",
+ "**Precision:**\n",
+ "\n",
+ "- High risk rate is closed 0.01 (~1%). It tells that the number of low risk cases predicted as high risk cases is much larger than actual high risk cases \n",
+ "- Low risk rate is closed 1.00 (~100%). It tells that the actual low risk cases are overrule the precision rate. This is very true in the imbalanced classificantion cases.\n",
+ "\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Combination (Over and Under) Sampling\n",
+ "\n",
+ "In this section, you will test a combination over- and under-sampling algorithm to determine if the algorithm results in the best performance compared to the other sampling algorithms above. You will resample the data using the SMOTEENN algorithm and complete the folliowing steps:\n",
+ "\n",
+ "1. View the count of the target classes using `Counter` from the collections library. \n",
+ "3. Use the resampled data to train a logistic regression model.\n",
+ "3. Calculate the balanced accuracy score from sklearn.metrics.\n",
+ "4. Print the confusion matrix from sklearn.metrics.\n",
+ "5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.\n",
+ "\n",
+ "Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Counter({'high_risk': 51359, 'low_risk': 46660})"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Resample the training data with SMOTEENN\n",
+ "from imblearn.combine import SMOTEENN\n",
+ "\n",
+ "smote_enn = SMOTEENN(random_state=0)\n",
+ "X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)\n",
+ "Counter(y_resampled)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
+ " intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
+ " multi_class='auto', n_jobs=None, penalty='l2',\n",
+ " random_state=1, solver='lbfgs', tol=0.0001, verbose=0,\n",
+ " warm_start=False)"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Train the Logistic Regression model using the resampled data\n",
+ "model = LogisticRegression(solver='lbfgs', random_state=1)\n",
+ "model.fit(X_resampled, y_resampled)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.6347701655104706"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Calculated the balanced accuracy score\n",
+ "y_pred = model.predict(X_test)\n",
+ "\n",
+ "balanced_accuracy_score(y_test, y_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Predicted high risk | \n",
+ " Predicted low risk | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Actual high risk | \n",
+ " 67 | \n",
+ " 34 | \n",
+ "
\n",
+ " \n",
+ " Actual low risk | \n",
+ " 6736 | \n",
+ " 10368 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Predicted high risk Predicted low risk\n",
+ "Actual high risk 67 34\n",
+ "Actual low risk 6736 10368"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Display the confusion matrix\n",
+ "\n",
+ "# Calculating the confusion matrix.\n",
+ "cm = confusion_matrix(y_test, y_pred)\n",
+ "\n",
+ "# Create a DataFrame from the confusion matrix.\n",
+ "cm_df = pd.DataFrame(\n",
+ " cm,index=[\"Actual high risk\", \"Actual low risk\"], columns=[\"Predicted high risk\", \"Predicted low risk\"])\n",
+ "\n",
+ "cm_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " pre rec spe f1 geo iba sup\n",
+ "\n",
+ " high_risk 0.01 0.66 0.61 0.02 0.63 0.40 101\n",
+ " low_risk 1.00 0.61 0.66 0.75 0.63 0.40 17104\n",
+ "\n",
+ "avg / total 0.99 0.61 0.66 0.75 0.63 0.40 17205\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print the imbalanced classification report\n",
+ "print(classification_report_imbalanced(y_test, y_pred))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analysis\n",
+ "\n",
+ "**Accuracy**\n",
+ "\n",
+ "The balanced accuracy score of Combination (Over and Under) Sampling is .63 or 63% after resampled the data training with the SMOTEENN while Accuracy score is .60 or 60% based on the confustion matrix. Accuracy score formula is (TP+TN)/Total, which TP (True Positive) is 67 and TN(True Nagative) is 10368, and total is 17205, so (67+10368)/17205 = .6065. With SMOTEENN model, the balanced accuracy rate is higher than the actual accuracy rate. **SMOTEENN model accuracy is lower than Naive Random Oversampling, but it is much better than Undersampling model(ClusterCentroids)**\n",
+ "\n",
+ "**Recall (sensitivity):**(Refer: The confustion matrix and the classification report) \n",
+ "\n",
+ "- High risk rate is 66% so every 100 high risk cases the model detects right 66 high risk cases and categorizes 34 high risk cases to become low risk cases. This is a number that we want to see a larger percent becuase we do not want high risk cases to become low risk cases without any caution. **Naive Random Oversampling is still the highest rate so far**\n",
+ "\n",
+ "- Low risk rate is 61% so every 100 low risk cases the model categorizes 39 low risk cases to become high risk cases. This is just alarmed cases that need to be reviewed by business line.\n",
+ "\n",
+ "**Precision:**\n",
+ "\n",
+ "- High risk rate is closed 0.01 (~1%). It tells that the number of low risk cases predicted as high risk cases is much larger than actual high risk cases \n",
+ "- Low risk rate is closed 1.00 (~100%). It tells that the actual low risk cases are overrule the precision rate. This is very true in the imbalanced classificantion cases.\n",
+ "\n",
+ "***"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "mlenv",
+ "language": "python",
+ "name": "mlenv"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/README.md b/README.md
index 8aac655..cc2a96d 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,268 @@
-# LM-LendingClub
\ No newline at end of file
+# Lending Club - Credit Risk (Machine Learning)
+
+## Summary
+
+Credit risk is an inherently unbalanced classification problem, as the number of good loans easily outnumbers the number of risky loans. The purpose of this project is:
+
+1. To use different techniques to train and evaluate models with unbalanced classes.
+2. To evaluate the performance of these models and make a recommendation on whether they should be used to predict credit risk.
+
+## Objectives
+The goals:
+
+1. Implement machine learning models.
+2. Use resampling to attempt to address class imbalance.
+3. Evaluate the performance of machine learning models.
+
+Accuracy and recall (sensitivity) are the two most important factors for selecting the right model on this imbalanced dataset, where the good loans overwhelm the bad loans.
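+
+Both rates fall straight out of the confusion matrix. As a quick reference, here is a minimal sketch of how they are computed (assuming `y_test` and `y_pred` from any of the models below):
+
+```python
+from sklearn.metrics import balanced_accuracy_score, confusion_matrix
+
+# Rows are actual classes, columns are predicted classes
+cm = confusion_matrix(y_test, y_pred, labels=["high_risk", "low_risk"])
+(tp, fn), (fp, tn) = cm  # high_risk is treated as the positive class
+
+accuracy = (tp + tn) / (tp + tn + fp + fn)          # plain accuracy
+recall_high = tp / (tp + fn)                        # high risk recall (sensitivity)
+recall_low = tn / (tn + fp)                         # low risk recall
+balanced = balanced_accuracy_score(y_test, y_pred)  # the mean of the two recalls
+```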
+
+
+## Naive Random Oversampling
+
+**Balanced Accuracy Score**
+
+0.69
+
+**Confusion Matrix**
+
+| |Predicted high risk | Predicted low risk |
+|-|-|-|
+|Actual high risk |78 | 23 |
+|Actual low risk |6811| 10293|
+
+**Report Table**
+
+| |pre|rec|spe|f1|geo|iba|sup|
+|-|-|-|-|-|-|-|-|
+|high_risk | 0.01 | 0.77 | 0.60 | 0.02 | 0.68 | 0.47 | 101|
+|low_risk | 1.00 | 0.60 | 0.77 | 0.75 | 0.68 | 0.46 | 17104|
+|avg / total| 0.99 | 0.60 | 0.77 | 0.75 | 0.68 | 0.46 | 17205|
+
+### Analysis
+
+**Accuracy**
+
+The balanced accuracy score for Naive Random Oversampling is .69 (69%) after resampling the training data with RandomOverSampler, while the plain accuracy score from the confusion matrix is .60 (60%). The accuracy formula is (TP+TN)/Total, where TP (True Positive) is 78, TN (True Negative) is 10293, and the total is 17205, so (78+10293)/17205 = .6028. Accuracy is an important factor when evaluating a machine learning method because it measures the share of cases the model predicts correctly. **Naive Random Oversampling actually improved the accuracy of the predictions.**
+
+**Recall (sensitivity):** (refer to the confusion matrix and the classification report)
+
+- The high risk recall is 77%: out of every 100 high risk cases, the model correctly detects 77 and misclassifies the other 23 as low risk. We want this percentage to be as large as possible, because we do not want high risk cases slipping through as low risk without any caution.
+- The low risk recall is 60%: out of every 100 low risk cases, the model flags 40 as high risk. These are simply alerts that the business line needs to review.
+
+**Precision:**
+
+- The high risk precision is close to 0.01 (~1%). It tells us that the number of low risk cases predicted as high risk is much larger than the number of actual high risk cases.
+- The low risk precision is close to 1.00 (~100%). It tells us that the actual low risk cases dominate the precision rate, which is typical of imbalanced classification problems.
+
+**For this dataset, Naive Random Oversampling has the best accuracy and recall among these models: Naive Random Oversampling, SMOTE, Undersampling, and Combination Sampling. However, I recommend the Easy Ensemble AdaBoost Classifier for this dataset because its accuracy and recall are very impressive, over 90% (see the Extension section below).**
+***
+
+
+## SMOTE Oversampling
+
+**Balanced Accuracy Score**
+
+0.66
+
+**Confusion Matrix**
+
+| |Predicted high risk | Predicted low risk |
+|-|-|-|
+|Actual high risk |64 | 37 |
+|Actual low risk |5220| 11884|
+
+**Report Table**
+
+| |pre|rec|spe|f1|geo|iba|sup|
+|-|-|-|-|-|-|-|-|
+|high_risk | 0.01| 0.63| 0.69| 0.02| 0.66| 0.44| 101|
+|low_risk | 1.00| 0.69| 0.63| 0.82| 0.66| 0.44| 17104|
+|avg / total| 0.99| 0.69| 0.63| 0.81| 0.66| 0.44| 17205|
+
+### Analysis
+
+**Accuracy**
+
+The balanced accuracy score for SMOTE Oversampling is .66 (66%) after resampling the training data with SMOTE, while the plain accuracy score from the confusion matrix is .69 (69%). The accuracy formula is (TP+TN)/Total, where TP (True Positive) is 64, TN (True Negative) is 11884, and the total is 17205, so (64+11884)/17205 = .6944. With SMOTE, the balanced accuracy rate is lower than the plain accuracy rate, which means the good-loan (low risk) cases are predicted better. **The SMOTE accuracy is very close to Naive Random Oversampling, but Naive Random Oversampling is the better selection.**
+
+**Recall (sensitivity):** (refer to the confusion matrix and the classification report)
+
+- The high risk recall is 63%: out of every 100 high risk cases, the model correctly detects 63 and misclassifies the other 37 as low risk. We want this percentage to be as large as possible, because we do not want high risk cases slipping through as low risk without any caution. **The SMOTE recall (sensitivity) rate is lower than Naive Random Oversampling.**
+
+- The low risk recall is 69%: out of every 100 low risk cases, the model flags 31 as high risk. These are simply alerts that the business line needs to review.
+
+**Precision:**
+
+- The high risk precision is close to 0.01 (~1%). It tells us that the number of low risk cases predicted as high risk is much larger than the number of actual high risk cases.
+- The low risk precision is close to 1.00 (~100%). It tells us that the actual low risk cases dominate the precision rate, which is typical of imbalanced classification problems.
+
+***
+
+
+## Cluster Centroids
+
+**Balanced Accuracy Score**
+
+0.53
+
+**Confusion Matrix**
+
+| |Predicted high risk | Predicted low risk |
+|-|-|-|
+|Actual high risk |67 | 34 |
+|Actual low risk |10217| 6887|
+
+**Report Table**
+
+| |pre|rec|spe|f1|geo|iba|sup|
+|-|-|-|-|-|-|-|-|
+|high_risk | 0.01| 0.66| 0.40| 0.01| 0.52| 0.27| 101|
+|low_risk | 1.00| 0.40| 0.66| 0.57| 0.52| 0.26| 17104|
+|avg / total| 0.99| 0.40| 0.66| 0.57| 0.52| 0.26| 17205|
+
+### Analysis
+
+**Accuracy**
+
+The balanced accuracy score for Undersampling is .53 (53%) after resampling the training data with ClusterCentroids, while the plain accuracy score from the confusion matrix is .40 (40%). The accuracy formula is (TP+TN)/Total, where TP (True Positive) is 67, TN (True Negative) is 6887, and the total is 17205, so (67+6887)/17205 = .4042. With undersampling, the balanced accuracy rate is higher than the plain accuracy rate. **The undersampling accuracy is very low; this method may not be the right model for this dataset.**
+
+**Recall (sensitivity):** (refer to the confusion matrix and the classification report)
+
+- The high risk recall is 66%: out of every 100 high risk cases, the model correctly detects 66 and misclassifies the other 34 as low risk. We want this percentage to be as large as possible, because we do not want high risk cases slipping through as low risk without any caution. **Naive Random Oversampling is still the best model selection because of its accuracy and recall rates.**
+
+- The low risk recall is 40%: out of every 100 low risk cases, the model flags 60 as high risk. These are simply alerts that the business line needs to review, but this is **a very high number of alerts**.
+
+**Precision:**
+
+- The high risk precision is close to 0.01 (~1%). It tells us that the number of low risk cases predicted as high risk is much larger than the number of actual high risk cases.
+- The low risk precision is close to 1.00 (~100%). It tells us that the actual low risk cases dominate the precision rate, which is typical of imbalanced classification problems.
+
+***
+
+## Combination (Over and Under) Sampling
+
+**Balanced Accuracy Score**
+
+0.63
+
+**Confusion Matrix**
+
+| |Predicted high risk | Predicted low risk |
+|-|-|-|
+|Actual high risk |67 | 34 |
+|Actual low risk |6736| 10368|
+
+**Report Table**
+
+| |pre|rec|spe|f1|geo|iba|sup|
+|-|-|-|-|-|-|-|-|
+|high_risk | 0.01| 0.66| 0.61| 0.02| 0.63| 0.40| 101|
+|low_risk | 1.00| 0.61| 0.66| 0.75| 0.63| 0.40| 17104|
+|avg / total| 0.99| 0.61| 0.66| 0.75| 0.63| 0.40| 17205|
+
+### Analysis
+
+**Accuracy**
+
+The balanced accuracy score for Combination (Over and Under) Sampling is .63 (63%) after resampling the training data with SMOTEENN, while the plain accuracy score from the confusion matrix is .60 (60%). The accuracy formula is (TP+TN)/Total, where TP (True Positive) is 67, TN (True Negative) is 10368, and the total is 17205, so (67+10368)/17205 = .6065. With SMOTEENN, the balanced accuracy rate is higher than the plain accuracy rate. **The SMOTEENN accuracy is lower than Naive Random Oversampling, but it is much better than the undersampling model (ClusterCentroids).**
+
+**Recall (sensitivity):** (refer to the confusion matrix and the classification report)
+
+- The high risk recall is 66%: out of every 100 high risk cases, the model correctly detects 66 and misclassifies the other 34 as low risk. We want this percentage to be as large as possible, because we do not want high risk cases slipping through as low risk without any caution. **Naive Random Oversampling still has the highest rate so far.**
+
+- The low risk recall is 61%: out of every 100 low risk cases, the model flags 39 as high risk. These are simply alerts that the business line needs to review.
+
+**Precision:**
+
+- The high risk precision is close to 0.01 (~1%). It tells us that the number of low risk cases predicted as high risk is much larger than the number of actual high risk cases.
+- The low risk precision is close to 1.00 (~100%). It tells us that the actual low risk cases dominate the precision rate, which is typical of imbalanced classification problems.
+
+***
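+
+Every sampler above is paired with the same logistic regression, so the whole comparison can be expressed compactly. A sketch (assuming `X_train`, `y_train`, `X_test`, `y_test` from the notebook; note the notebook seeds SMOTEENN with `random_state=0` and the others with 1):
+
+```python
+from collections import Counter
+
+from imblearn.combine import SMOTEENN
+from imblearn.over_sampling import RandomOverSampler, SMOTE
+from imblearn.under_sampling import ClusterCentroids
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import balanced_accuracy_score
+
+samplers = {
+    "Naive Random Oversampling": RandomOverSampler(random_state=1),
+    "SMOTE": SMOTE(random_state=1),
+    "Cluster Centroids": ClusterCentroids(random_state=1),
+    "SMOTEENN": SMOTEENN(random_state=0),
+}
+
+for name, sampler in samplers.items():
+    # Resample only the training data, never the test data
+    X_res, y_res = sampler.fit_resample(X_train, y_train)
+    model = LogisticRegression(solver="lbfgs", random_state=1)
+    model.fit(X_res, y_res)
+    score = balanced_accuracy_score(y_test, model.predict(X_test))
+    print(f"{name}: {Counter(y_res)} -> balanced accuracy {score:.2f}")
+```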
+
+# Extension (Machine Learning)
+
+
+## BalancedRandomForestClassifier
+
+**Accuracy Score**
+
+0.90
+
+**Confusion Matrix**
+
+| |Predicted high risk | Predicted low risk |
+|-|-|-|
+|Actual high risk |68 | 33 |
+|Actual low risk |1749| 15355|
+
+**Report Table**
+
+| |precision| recall| f1-score| support|
+|-|-|-|-|-|
+| high_risk| 0.04| 0.67| 0.07| 101|
+| low_risk| 1.00| 0.90| 0.95| 17104|
+| accuracy| | | 0.90| 17205|
+| macro avg| 0.52| 0.79| 0.51| 17205|
+|weighted avg| 0.99| 0.90| 0.94| 17205|
+
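+A minimal sketch of how these figures could be reproduced, assuming the same `X_train`/`y_train` split as above (`n_estimators=100` and `random_state=1` are assumptions, not the exact original setup):
+
+```python
+from imblearn.ensemble import BalancedRandomForestClassifier
+
+# Each tree is trained on a bootstrap sample that is randomly
+# undersampled to balance the two classes
+brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
+brf.fit(X_train, y_train)
+y_pred = brf.predict(X_test)
+```
+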
+### Analysis
+
+**Accuracy**
+
+The accuracy score of the Balanced Random Forest Classifier is .90 (90%). The accuracy formula is (TP+TN)/Total, where TP (True Positive) is 68, TN (True Negative) is 15355, and the total is 17205, so (68+15355)/17205 = .8964. **The accuracy score is a very good number, so this could be a good model for this dataset. Let's take a look at recall (sensitivity), because that rate is what matters for detecting the high risk cases in this dataset.**
+
+**Recall (sensitivity):** (refer to the confusion matrix and the classification report)
+
+- The high risk recall is 67%: out of every 100 high risk cases, the model correctly detects 67 and misclassifies the other 33 as low risk. We want this percentage to be as large as possible, because we do not want high risk cases slipping through as low risk without any caution. **The recall rate is only average; it is not as impressive as the accuracy, and it tells us that the low risk cases are detected better than the high risk cases.**
+
+- The low risk recall is 90%: out of every 100 low risk cases, the model flags 10 as high risk. These are simply alerts that the business line needs to review.
+
+**Precision:**
+
+- The high risk precision is close to 0.04 (~4%). It tells us that the number of low risk cases predicted as high risk is much larger than the number of actual high risk cases.
+- The low risk precision is close to 1.00 (~100%). It tells us that the actual low risk cases dominate the precision rate, which is typical of imbalanced classification problems.
+
+***
+
+## EasyEnsembleClassifier
+
+**Accuracy Score**
+
+0.90
+
+**Confusion Matrix**
+
+| |Predicted high risk | Predicted low risk |
+|-|-|-|
+|Actual high risk |94 | 7 |
+|Actual low risk |1706| 15398|
+
+**Report Table**
+
+| |precision| recall| f1-score| support|
+|-|-|-|-|-|
+| high_risk| 0.05| 0.93| 0.10| 101|
+| low_risk| 1.00| 0.90| 0.95| 17104|
+| accuracy| | | 0.90| 17205|
+| macro avg| 0.53| 0.92| 0.51| 17205|
+|weighted avg| 0.99| 0.90| 0.94| 17205|
+
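+The same scaffolding as the previous sketch applies here; only the estimator changes (again a sketch, with `n_estimators=100` and `random_state=1` assumed):
+
+```python
+from imblearn.ensemble import EasyEnsembleClassifier
+
+# An ensemble of AdaBoost learners, each trained on a
+# randomly undersampled (balanced) subset of the data
+eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
+eec.fit(X_train, y_train)
+y_pred = eec.predict(X_test)
+```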
+
+### Analysis (The best model)
+
+**Accuracy**
+
+The accuracy score of the Easy Ensemble AdaBoost Classifier is .90 (90%). The accuracy formula is (TP+TN)/Total, where TP (True Positive) is 94, TN (True Negative) is 15398, and the total is 17205, so (94+15398)/17205 = .9004. **The accuracy score is a very impressive number, so this can be a good model for this dataset. Let's take a look at recall (sensitivity), because that rate is what matters for detecting the high risk cases in this dataset.**
+
+**Recall (sensitivity):** (refer to the confusion matrix and the classification report)
+
+- The high risk recall is 93%: out of every 100 high risk cases, the model correctly detects 93 and misclassifies the other 7 as low risk. We want this percentage to be as large as possible, because we do not want high risk cases slipping through as low risk without any caution. **The recall rate is also very impressive. Both the accuracy and the recall are over 90%, which makes this a good model for detecting the high risk cases in this dataset.**
+
+- The low risk recall is 90%: out of every 100 low risk cases, the model flags 10 as high risk. These are simply alerts that the business line needs to review.
+
+**Precision:**
+
+- The high risk precision is close to 0.05 (~5%). It tells us that the number of low risk cases predicted as high risk is much larger than the number of actual high risk cases.
+- The low risk precision is close to 1.00 (~100%). It tells us that the actual low risk cases dominate the precision rate, which is typical of imbalanced classification problems.
+
+***