diff --git a/Notebooks/Clean_up_data.ipynb b/Notebooks/Clean_up_data.ipynb
new file mode 100644
index 0000000..2eedc61
--- /dev/null
+++ b/Notebooks/Clean_up_data.ipynb
@@ -0,0 +1,483 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prepair Steps\n",
+ "\n",
+ "1. Load data from data (csv file) to dataframe\n",
+ "2. Clean up all NA columns, all row NA, columns cannot be features such as Id, or having only one value"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Dataframe with panda\n",
+ "import pandas as pd\n",
+ "# IO path\n",
+ "from path import Path\n",
+ "# Count Distint in array or series\n",
+ "from collections import Counter\n",
+ "# Numpy\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# set the data path\n",
+ "data = Path('../Resources/LoanStats_2019Q1.csv')\n",
+ "# read data from the path (low_memory=False to avoid warning for dtype size)\n",
+ "# do not read data until 2nd row (skip 1 row)\n",
+ "df = pd.read_csv(data, low_memory=False,skiprows=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(115677, 144)"
+ ]
+ },
+ "execution_count": 64,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# find the shape\n",
+ "df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id 2\n",
+ "member_id 0\n",
+ "loan_amnt 115675\n",
+ "funded_amnt 115675\n",
+ "funded_amnt_inv 115675\n",
+ " ... \n",
+ "settlement_status 0\n",
+ "settlement_date 0\n",
+ "settlement_amount 0\n",
+ "settlement_percentage 0\n",
+ "settlement_term 0\n",
+ "Length: 144, dtype: int64"
+ ]
+ },
+ "execution_count": 65,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Find columns contain all NA or Empty value\n",
+ "df.count()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',\n",
+ " 'installment', 'grade', 'sub_grade', 'emp_title',\n",
+ " ...\n",
+ " 'sec_app_mort_acc', 'sec_app_open_acc', 'sec_app_revol_util',\n",
+ " 'sec_app_open_act_il', 'sec_app_num_rev_accts',\n",
+ " 'sec_app_chargeoff_within_12_mths',\n",
+ " 'sec_app_collections_12_mths_ex_med',\n",
+ " 'sec_app_mths_since_last_major_derog', 'hardship_flag',\n",
+ " 'debt_settlement_flag'],\n",
+ " dtype='object', length=121)"
+ ]
+ },
+ "execution_count": 66,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Drop columns all NA\n",
+ "df = df.dropna(axis='columns',how='all')\n",
+ "df.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(115677, 121)"
+ ]
+ },
+ "execution_count": 67,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# the dataframe shape \n",
+ "df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(115675, 120)"
+ ]
+ },
+ "execution_count": 68,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# we do not need id column\n",
+ "# id has only 2 rows not NA\n",
+ "df= df.drop(columns = [\"id\"])\n",
+ "# drop rows all NA\n",
+ "df = df.dropna(axis='rows',how='all')\n",
+ "# the dataframe shape \n",
+ "df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(115675, 120)"
+ ]
+ },
+ "execution_count": 69,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# the dataframe shape \n",
+ "df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "loan_amnt 115675\n",
+ "funded_amnt 115675\n",
+ "funded_amnt_inv 115675\n",
+ "term 115675\n",
+ "int_rate 115675\n",
+ " ... \n",
+ "sec_app_chargeoff_within_12_mths 16681\n",
+ "sec_app_collections_12_mths_ex_med 16681\n",
+ "sec_app_mths_since_last_major_derog 4901\n",
+ "hardship_flag 115675\n",
+ "debt_settlement_flag 115675\n",
+ "Length: 120, dtype: int64"
+ ]
+ },
+ "execution_count": 73,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# check if NA columns\n",
+ "df.count()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 91,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Column name pymnt_plan:\n",
+ "Column name recoveries:\n",
+ "Column name collection_recovery_fee:\n",
+ "Column name policy_code:\n",
+ "Column name acc_now_delinq:\n",
+ "Column name num_tl_120dpd_2m:\n",
+ "Column name num_tl_30dpd:\n",
+ "Column name tax_liens:\n",
+ "Column name hardship_flag:\n",
+ "Column name debt_settlement_flag:\n",
+ "['pymnt_plan', 'recoveries', 'collection_recovery_fee', 'policy_code', 'acc_now_delinq', 'num_tl_120dpd_2m', 'num_tl_30dpd', 'tax_liens', 'hardship_flag', 'debt_settlement_flag']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# find columns that have only one value\n",
+ "# They are not feature\n",
+ "columns_1_value = []\n",
+ "for column in df.columns:\n",
+ " if (len(df[column].value_counts()) == 1):\n",
+ " columns_1_value.append(column)\n",
+ " print (f\"Column name {column}:\")\n",
+ "print(columns_1_value)"
+ ]
+ },
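+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a side note, the same columns can be found without an explicit loop. The sketch below is an equivalent approach (not part of the original flow) using `DataFrame.nunique`, which counts distinct non-NA values per column, matching `value_counts`, which also ignores NA by default."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: vectorized equivalent of the loop above\n",
+ "# nunique() counts distinct non-NA values per column; == 1 flags single-value columns\n",
+ "single_value_columns = df.columns[df.nunique() == 1].tolist()\n",
+ "single_value_columns"
+ ]
+ },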
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(115675, 110)"
+ ]
+ },
+ "execution_count": 92,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Drop columns that have only one value\n",
+ "df= df.drop(columns = columns_1_value)\n",
+ "df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 103,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['term',\n",
+ " 'int_rate',\n",
+ " 'grade',\n",
+ " 'sub_grade',\n",
+ " 'emp_title',\n",
+ " 'emp_length',\n",
+ " 'home_ownership',\n",
+ " 'verification_status',\n",
+ " 'issue_d',\n",
+ " 'loan_status',\n",
+ " 'purpose',\n",
+ " 'title',\n",
+ " 'zip_code',\n",
+ " 'addr_state',\n",
+ " 'earliest_cr_line',\n",
+ " 'revol_util',\n",
+ " 'initial_list_status',\n",
+ " 'last_pymnt_d',\n",
+ " 'next_pymnt_d',\n",
+ " 'last_credit_pull_d',\n",
+ " 'application_type',\n",
+ " 'verification_status_joint',\n",
+ " 'sec_app_earliest_cr_line']"
+ ]
+ },
+ "execution_count": 103,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "obj_columns = list(df.dtypes[df.dtypes == np.object].index)\n",
+ "obj_columns"
+ ]
+ },
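+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "An equivalent way to list the string columns is `DataFrame.select_dtypes`, which filters columns by dtype directly. This is a sketch of the alternative; it should produce the same list as the cell above."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: select_dtypes filters columns by dtype without comparing df.dtypes manually\n",
+ "obj_columns_alt = df.select_dtypes(include='object').columns.tolist()\n",
+ "obj_columns_alt"
+ ]
+ },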
+ {
+ "cell_type": "code",
+ "execution_count": 107,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Current 94116\n",
+ "Issued 18835\n",
+ "Fully Paid 2157\n",
+ "In Grace Period 233\n",
+ "Late (16-30 days) 155\n",
+ "Late (31-120 days) 138\n",
+ "Charged Off 41\n",
+ "Name: loan_status, dtype: int64"
+ ]
+ },
+ "execution_count": 107,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['loan_status'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 104,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['loan_amnt',\n",
+ " 'funded_amnt',\n",
+ " 'funded_amnt_inv',\n",
+ " 'installment',\n",
+ " 'annual_inc',\n",
+ " 'dti',\n",
+ " 'delinq_2yrs',\n",
+ " 'inq_last_6mths',\n",
+ " 'mths_since_last_delinq',\n",
+ " 'mths_since_last_record',\n",
+ " 'open_acc',\n",
+ " 'pub_rec',\n",
+ " 'revol_bal',\n",
+ " 'total_acc',\n",
+ " 'out_prncp',\n",
+ " 'out_prncp_inv',\n",
+ " 'total_pymnt',\n",
+ " 'total_pymnt_inv',\n",
+ " 'total_rec_prncp',\n",
+ " 'total_rec_int',\n",
+ " 'total_rec_late_fee',\n",
+ " 'last_pymnt_amnt',\n",
+ " 'collections_12_mths_ex_med',\n",
+ " 'mths_since_last_major_derog',\n",
+ " 'annual_inc_joint',\n",
+ " 'dti_joint',\n",
+ " 'tot_coll_amt',\n",
+ " 'tot_cur_bal',\n",
+ " 'open_acc_6m',\n",
+ " 'open_act_il',\n",
+ " 'open_il_12m',\n",
+ " 'open_il_24m',\n",
+ " 'mths_since_rcnt_il',\n",
+ " 'total_bal_il',\n",
+ " 'il_util',\n",
+ " 'open_rv_12m',\n",
+ " 'open_rv_24m',\n",
+ " 'max_bal_bc',\n",
+ " 'all_util',\n",
+ " 'total_rev_hi_lim',\n",
+ " 'inq_fi',\n",
+ " 'total_cu_tl',\n",
+ " 'inq_last_12m',\n",
+ " 'acc_open_past_24mths',\n",
+ " 'avg_cur_bal',\n",
+ " 'bc_open_to_buy',\n",
+ " 'bc_util',\n",
+ " 'chargeoff_within_12_mths',\n",
+ " 'delinq_amnt',\n",
+ " 'mo_sin_old_il_acct',\n",
+ " 'mo_sin_old_rev_tl_op',\n",
+ " 'mo_sin_rcnt_rev_tl_op',\n",
+ " 'mo_sin_rcnt_tl',\n",
+ " 'mort_acc',\n",
+ " 'mths_since_recent_bc',\n",
+ " 'mths_since_recent_bc_dlq',\n",
+ " 'mths_since_recent_inq',\n",
+ " 'mths_since_recent_revol_delinq',\n",
+ " 'num_accts_ever_120_pd',\n",
+ " 'num_actv_bc_tl',\n",
+ " 'num_actv_rev_tl',\n",
+ " 'num_bc_sats',\n",
+ " 'num_bc_tl',\n",
+ " 'num_il_tl',\n",
+ " 'num_op_rev_tl',\n",
+ " 'num_rev_accts',\n",
+ " 'num_rev_tl_bal_gt_0',\n",
+ " 'num_sats',\n",
+ " 'num_tl_90g_dpd_24m',\n",
+ " 'num_tl_op_past_12m',\n",
+ " 'pct_tl_nvr_dlq',\n",
+ " 'percent_bc_gt_75',\n",
+ " 'pub_rec_bankruptcies',\n",
+ " 'tot_hi_cred_lim',\n",
+ " 'total_bal_ex_mort',\n",
+ " 'total_bc_limit',\n",
+ " 'total_il_high_credit_limit',\n",
+ " 'revol_bal_joint',\n",
+ " 'sec_app_inq_last_6mths',\n",
+ " 'sec_app_mort_acc',\n",
+ " 'sec_app_open_acc',\n",
+ " 'sec_app_revol_util',\n",
+ " 'sec_app_open_act_il',\n",
+ " 'sec_app_num_rev_accts',\n",
+ " 'sec_app_chargeoff_within_12_mths',\n",
+ " 'sec_app_collections_12_mths_ex_med',\n",
+ " 'sec_app_mths_since_last_major_derog']"
+ ]
+ },
+ "execution_count": 104,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "num_columns = list(df.dtypes[df.dtypes != np.object].index)\n",
+ "num_columns"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "mlenv",
+ "language": "python",
+ "name": "mlenv"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/Notebooks/credit_risk_ensemble.ipynb b/Notebooks/credit_risk_ensemble.ipynb
new file mode 100644
index 0000000..6681596
--- /dev/null
+++ b/Notebooks/credit_risk_ensemble.ipynb
@@ -0,0 +1,1750 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings('ignore')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from pathlib import Path\n",
+ "from collections import Counter"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.metrics import balanced_accuracy_score\n",
+ "from sklearn.metrics import confusion_matrix\n",
+ "from imblearn.metrics import classification_report_imbalanced"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Read the CSV and Perform Basic Data Cleaning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# https://help.lendingclub.com/hc/en-us/articles/215488038-What-do-the-different-Note-statuses-mean-\n",
+ "\n",
+ "columns = [\n",
+ " \"loan_amnt\", \"int_rate\", \"installment\", \"home_ownership\",\n",
+ " \"annual_inc\", \"verification_status\", \"issue_d\", \"loan_status\",\n",
+ " \"pymnt_plan\", \"dti\", \"delinq_2yrs\", \"inq_last_6mths\",\n",
+ " \"open_acc\", \"pub_rec\", \"revol_bal\", \"total_acc\",\n",
+ " \"initial_list_status\", \"out_prncp\", \"out_prncp_inv\", \"total_pymnt\",\n",
+ " \"total_pymnt_inv\", \"total_rec_prncp\", \"total_rec_int\", \"total_rec_late_fee\",\n",
+ " \"recoveries\", \"collection_recovery_fee\", \"last_pymnt_amnt\", \"next_pymnt_d\",\n",
+ " \"collections_12_mths_ex_med\", \"policy_code\", \"application_type\", \"acc_now_delinq\",\n",
+ " \"tot_coll_amt\", \"tot_cur_bal\", \"open_acc_6m\", \"open_act_il\",\n",
+ " \"open_il_12m\", \"open_il_24m\", \"mths_since_rcnt_il\", \"total_bal_il\",\n",
+ " \"il_util\", \"open_rv_12m\", \"open_rv_24m\", \"max_bal_bc\",\n",
+ " \"all_util\", \"total_rev_hi_lim\", \"inq_fi\", \"total_cu_tl\",\n",
+ " \"inq_last_12m\", \"acc_open_past_24mths\", \"avg_cur_bal\", \"bc_open_to_buy\",\n",
+ " \"bc_util\", \"chargeoff_within_12_mths\", \"delinq_amnt\", \"mo_sin_old_il_acct\",\n",
+ " \"mo_sin_old_rev_tl_op\", \"mo_sin_rcnt_rev_tl_op\", \"mo_sin_rcnt_tl\", \"mort_acc\",\n",
+ " \"mths_since_recent_bc\", \"mths_since_recent_inq\", \"num_accts_ever_120_pd\", \"num_actv_bc_tl\",\n",
+ " \"num_actv_rev_tl\", \"num_bc_sats\", \"num_bc_tl\", \"num_il_tl\",\n",
+ " \"num_op_rev_tl\", \"num_rev_accts\", \"num_rev_tl_bal_gt_0\",\n",
+ " \"num_sats\", \"num_tl_120dpd_2m\", \"num_tl_30dpd\", \"num_tl_90g_dpd_24m\",\n",
+ " \"num_tl_op_past_12m\", \"pct_tl_nvr_dlq\", \"percent_bc_gt_75\", \"pub_rec_bankruptcies\",\n",
+ " \"tax_liens\", \"tot_hi_cred_lim\", \"total_bal_ex_mort\", \"total_bc_limit\",\n",
+ " \"total_il_high_credit_limit\", \"hardship_flag\", \"debt_settlement_flag\"\n",
+ "]\n",
+ "\n",
+ "target = [\"loan_status\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " loan_amnt | \n",
+ " int_rate | \n",
+ " installment | \n",
+ " home_ownership | \n",
+ " annual_inc | \n",
+ " verification_status | \n",
+ " issue_d | \n",
+ " loan_status | \n",
+ " pymnt_plan | \n",
+ " dti | \n",
+ " ... | \n",
+ " pct_tl_nvr_dlq | \n",
+ " percent_bc_gt_75 | \n",
+ " pub_rec_bankruptcies | \n",
+ " tax_liens | \n",
+ " tot_hi_cred_lim | \n",
+ " total_bal_ex_mort | \n",
+ " total_bc_limit | \n",
+ " total_il_high_credit_limit | \n",
+ " hardship_flag | \n",
+ " debt_settlement_flag | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10500.0 | \n",
+ " 0.1719 | \n",
+ " 375.35 | \n",
+ " RENT | \n",
+ " 66000.0 | \n",
+ " Source Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 27.24 | \n",
+ " ... | \n",
+ " 85.7 | \n",
+ " 100.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 65687.0 | \n",
+ " 38199.0 | \n",
+ " 2000.0 | \n",
+ " 61987.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 25000.0 | \n",
+ " 0.2000 | \n",
+ " 929.09 | \n",
+ " MORTGAGE | \n",
+ " 105000.0 | \n",
+ " Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 20.23 | \n",
+ " ... | \n",
+ " 91.2 | \n",
+ " 50.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 271427.0 | \n",
+ " 60641.0 | \n",
+ " 41200.0 | \n",
+ " 49197.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 20000.0 | \n",
+ " 0.2000 | \n",
+ " 529.88 | \n",
+ " MORTGAGE | \n",
+ " 56000.0 | \n",
+ " Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 24.26 | \n",
+ " ... | \n",
+ " 66.7 | \n",
+ " 50.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 60644.0 | \n",
+ " 45684.0 | \n",
+ " 7500.0 | \n",
+ " 43144.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 10000.0 | \n",
+ " 0.1640 | \n",
+ " 353.55 | \n",
+ " RENT | \n",
+ " 92000.0 | \n",
+ " Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 31.44 | \n",
+ " ... | \n",
+ " 100.0 | \n",
+ " 50.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 99506.0 | \n",
+ " 68784.0 | \n",
+ " 19700.0 | \n",
+ " 76506.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 22000.0 | \n",
+ " 0.1474 | \n",
+ " 520.39 | \n",
+ " MORTGAGE | \n",
+ " 52000.0 | \n",
+ " Not Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 18.76 | \n",
+ " ... | \n",
+ " 100.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 219750.0 | \n",
+ " 25919.0 | \n",
+ " 27600.0 | \n",
+ " 20000.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 86 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " loan_amnt int_rate installment home_ownership annual_inc \\\n",
+ "0 10500.0 0.1719 375.35 RENT 66000.0 \n",
+ "1 25000.0 0.2000 929.09 MORTGAGE 105000.0 \n",
+ "2 20000.0 0.2000 529.88 MORTGAGE 56000.0 \n",
+ "3 10000.0 0.1640 353.55 RENT 92000.0 \n",
+ "4 22000.0 0.1474 520.39 MORTGAGE 52000.0 \n",
+ "\n",
+ " verification_status issue_d loan_status pymnt_plan dti ... \\\n",
+ "0 Source Verified Mar-2019 low_risk n 27.24 ... \n",
+ "1 Verified Mar-2019 low_risk n 20.23 ... \n",
+ "2 Verified Mar-2019 low_risk n 24.26 ... \n",
+ "3 Verified Mar-2019 low_risk n 31.44 ... \n",
+ "4 Not Verified Mar-2019 low_risk n 18.76 ... \n",
+ "\n",
+ " pct_tl_nvr_dlq percent_bc_gt_75 pub_rec_bankruptcies tax_liens \\\n",
+ "0 85.7 100.0 0.0 0.0 \n",
+ "1 91.2 50.0 1.0 0.0 \n",
+ "2 66.7 50.0 0.0 0.0 \n",
+ "3 100.0 50.0 1.0 0.0 \n",
+ "4 100.0 0.0 0.0 0.0 \n",
+ "\n",
+ " tot_hi_cred_lim total_bal_ex_mort total_bc_limit \\\n",
+ "0 65687.0 38199.0 2000.0 \n",
+ "1 271427.0 60641.0 41200.0 \n",
+ "2 60644.0 45684.0 7500.0 \n",
+ "3 99506.0 68784.0 19700.0 \n",
+ "4 219750.0 25919.0 27600.0 \n",
+ "\n",
+ " total_il_high_credit_limit hardship_flag debt_settlement_flag \n",
+ "0 61987.0 N N \n",
+ "1 49197.0 N N \n",
+ "2 43144.0 N N \n",
+ "3 76506.0 N N \n",
+ "4 20000.0 N N \n",
+ "\n",
+ "[5 rows x 86 columns]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Load the data\n",
+ "file_path = Path('../Resources/LoanStats_2019Q1.csv')\n",
+ "df = pd.read_csv(file_path, skiprows=1)[:-2]\n",
+ "df = df.loc[:, columns].copy()\n",
+ "\n",
+ "# Drop the null columns where all values are null\n",
+ "df = df.dropna(axis='columns', how='all')\n",
+ "\n",
+ "# Drop the null rows\n",
+ "df = df.dropna()\n",
+ "\n",
+ "# Remove the `Issued` loan status\n",
+ "issued_mask = df['loan_status'] != 'Issued'\n",
+ "df = df.loc[issued_mask]\n",
+ "\n",
+ "# convert interest rate to numerical\n",
+ "df['int_rate'] = df['int_rate'].str.replace('%', '')\n",
+ "df['int_rate'] = df['int_rate'].astype('float') / 100\n",
+ "\n",
+ "\n",
+ "# Convert the target column values to low_risk and high_risk based on their values\n",
+ "x = {'Current': 'low_risk'} \n",
+ "df = df.replace(x)\n",
+ "\n",
+ "x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk') \n",
+ "df = df.replace(x)\n",
+ "\n",
+ "df.reset_index(inplace=True, drop=True)\n",
+ "\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Split the Data into Training and Testing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['home_ownership',\n",
+ " 'verification_status',\n",
+ " 'issue_d',\n",
+ " 'pymnt_plan',\n",
+ " 'initial_list_status',\n",
+ " 'next_pymnt_d',\n",
+ " 'application_type',\n",
+ " 'hardship_flag',\n",
+ " 'debt_settlement_flag']"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Reserve the original data frame by copying\n",
+ "df_temp = df.copy()\n",
+ "#Drop the target column\n",
+ "df_temp = df.drop(columns=target)\n",
+ "#Find columns that is string\n",
+ "obj_columns = list(df_temp.dtypes[df_temp.dtypes == np.object].index)\n",
+ "obj_columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " loan_amnt | \n",
+ " int_rate | \n",
+ " installment | \n",
+ " annual_inc | \n",
+ " dti | \n",
+ " delinq_2yrs | \n",
+ " inq_last_6mths | \n",
+ " open_acc | \n",
+ " pub_rec | \n",
+ " revol_bal | \n",
+ " ... | \n",
+ " issue_d_Mar-2019 | \n",
+ " pymnt_plan_n | \n",
+ " initial_list_status_f | \n",
+ " initial_list_status_w | \n",
+ " next_pymnt_d_Apr-2019 | \n",
+ " next_pymnt_d_May-2019 | \n",
+ " application_type_Individual | \n",
+ " application_type_Joint App | \n",
+ " hardship_flag_N | \n",
+ " debt_settlement_flag_N | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10500.0 | \n",
+ " 0.1719 | \n",
+ " 375.35 | \n",
+ " 66000.0 | \n",
+ " 27.24 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 8.0 | \n",
+ " 0.0 | \n",
+ " 1609.0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 25000.0 | \n",
+ " 0.2000 | \n",
+ " 929.09 | \n",
+ " 105000.0 | \n",
+ " 20.23 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 17.0 | \n",
+ " 1.0 | \n",
+ " 18368.0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 20000.0 | \n",
+ " 0.2000 | \n",
+ " 529.88 | \n",
+ " 56000.0 | \n",
+ " 24.26 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 8.0 | \n",
+ " 0.0 | \n",
+ " 13247.0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 10000.0 | \n",
+ " 0.1640 | \n",
+ " 353.55 | \n",
+ " 92000.0 | \n",
+ " 31.44 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 10.0 | \n",
+ " 1.0 | \n",
+ " 17996.0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 22000.0 | \n",
+ " 0.1474 | \n",
+ " 520.39 | \n",
+ " 52000.0 | \n",
+ " 18.76 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 14.0 | \n",
+ " 0.0 | \n",
+ " 9091.0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 68812 | \n",
+ " 10000.0 | \n",
+ " 0.1502 | \n",
+ " 346.76 | \n",
+ " 26000.0 | \n",
+ " 9.60 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 9.0 | \n",
+ " 0.0 | \n",
+ " 2684.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 68813 | \n",
+ " 12000.0 | \n",
+ " 0.2727 | \n",
+ " 368.37 | \n",
+ " 63000.0 | \n",
+ " 29.07 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 8.0 | \n",
+ " 0.0 | \n",
+ " 13314.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 68814 | \n",
+ " 5000.0 | \n",
+ " 0.1992 | \n",
+ " 185.62 | \n",
+ " 52000.0 | \n",
+ " 14.86 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 1.0 | \n",
+ " 3715.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 68815 | \n",
+ " 40000.0 | \n",
+ " 0.0646 | \n",
+ " 1225.24 | \n",
+ " 520000.0 | \n",
+ " 9.96 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 21.0 | \n",
+ " 0.0 | \n",
+ " 59529.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 68816 | \n",
+ " 16000.0 | \n",
+ " 0.1131 | \n",
+ " 350.36 | \n",
+ " 72000.0 | \n",
+ " 7.02 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 12.0 | \n",
+ " 1.0 | \n",
+ " 11882.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
68817 rows × 95 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " loan_amnt int_rate installment annual_inc dti delinq_2yrs \\\n",
+ "0 10500.0 0.1719 375.35 66000.0 27.24 0.0 \n",
+ "1 25000.0 0.2000 929.09 105000.0 20.23 0.0 \n",
+ "2 20000.0 0.2000 529.88 56000.0 24.26 0.0 \n",
+ "3 10000.0 0.1640 353.55 92000.0 31.44 0.0 \n",
+ "4 22000.0 0.1474 520.39 52000.0 18.76 0.0 \n",
+ "... ... ... ... ... ... ... \n",
+ "68812 10000.0 0.1502 346.76 26000.0 9.60 0.0 \n",
+ "68813 12000.0 0.2727 368.37 63000.0 29.07 0.0 \n",
+ "68814 5000.0 0.1992 185.62 52000.0 14.86 0.0 \n",
+ "68815 40000.0 0.0646 1225.24 520000.0 9.96 0.0 \n",
+ "68816 16000.0 0.1131 350.36 72000.0 7.02 2.0 \n",
+ "\n",
+ " inq_last_6mths open_acc pub_rec revol_bal ... issue_d_Mar-2019 \\\n",
+ "0 0.0 8.0 0.0 1609.0 ... 1 \n",
+ "1 0.0 17.0 1.0 18368.0 ... 1 \n",
+ "2 0.0 8.0 0.0 13247.0 ... 1 \n",
+ "3 1.0 10.0 1.0 17996.0 ... 1 \n",
+ "4 1.0 14.0 0.0 9091.0 ... 1 \n",
+ "... ... ... ... ... ... ... \n",
+ "68812 0.0 9.0 0.0 2684.0 ... 0 \n",
+ "68813 0.0 8.0 0.0 13314.0 ... 0 \n",
+ "68814 0.0 5.0 1.0 3715.0 ... 0 \n",
+ "68815 1.0 21.0 0.0 59529.0 ... 0 \n",
+ "68816 0.0 12.0 1.0 11882.0 ... 0 \n",
+ "\n",
+ " pymnt_plan_n initial_list_status_f initial_list_status_w \\\n",
+ "0 1 0 1 \n",
+ "1 1 0 1 \n",
+ "2 1 0 1 \n",
+ "3 1 0 1 \n",
+ "4 1 0 1 \n",
+ "... ... ... ... \n",
+ "68812 1 0 1 \n",
+ "68813 1 0 1 \n",
+ "68814 1 0 1 \n",
+ "68815 1 1 0 \n",
+ "68816 1 0 1 \n",
+ "\n",
+ " next_pymnt_d_Apr-2019 next_pymnt_d_May-2019 \\\n",
+ "0 0 1 \n",
+ "1 0 1 \n",
+ "2 0 1 \n",
+ "3 0 1 \n",
+ "4 0 1 \n",
+ "... ... ... \n",
+ "68812 0 1 \n",
+ "68813 0 1 \n",
+ "68814 0 1 \n",
+ "68815 0 1 \n",
+ "68816 0 1 \n",
+ "\n",
+ " application_type_Individual application_type_Joint App \\\n",
+ "0 1 0 \n",
+ "1 1 0 \n",
+ "2 1 0 \n",
+ "3 1 0 \n",
+ "4 1 0 \n",
+ "... ... ... \n",
+ "68812 1 0 \n",
+ "68813 1 0 \n",
+ "68814 1 0 \n",
+ "68815 1 0 \n",
+ "68816 1 0 \n",
+ "\n",
+ " hardship_flag_N debt_settlement_flag_N \n",
+ "0 1 1 \n",
+ "1 1 1 \n",
+ "2 1 1 \n",
+ "3 1 1 \n",
+ "4 1 1 \n",
+ "... ... ... \n",
+ "68812 1 1 \n",
+ "68813 1 1 \n",
+ "68814 1 1 \n",
+ "68815 1 1 \n",
+ "68816 1 1 \n",
+ "\n",
+ "[68817 rows x 95 columns]"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Convert categorical variable into dummy/indicator variables\n",
+ "df_temp = pd.get_dummies(df_temp, columns= obj_columns)\n",
+ "df_temp"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create our features\n",
+ "X = df_temp\n",
+ "# Create our target\n",
+ "y = df[target]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " loan_amnt | \n",
+ " int_rate | \n",
+ " installment | \n",
+ " annual_inc | \n",
+ " dti | \n",
+ " delinq_2yrs | \n",
+ " inq_last_6mths | \n",
+ " open_acc | \n",
+ " pub_rec | \n",
+ " revol_bal | \n",
+ " ... | \n",
+ " issue_d_Mar-2019 | \n",
+ " pymnt_plan_n | \n",
+ " initial_list_status_f | \n",
+ " initial_list_status_w | \n",
+ " next_pymnt_d_Apr-2019 | \n",
+ " next_pymnt_d_May-2019 | \n",
+ " application_type_Individual | \n",
+ " application_type_Joint App | \n",
+ " hardship_flag_N | \n",
+ " debt_settlement_flag_N | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 6.881700e+04 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " ... | \n",
+ " 68817.000000 | \n",
+ " 68817.0 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.0 | \n",
+ " 68817.0 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 16677.594562 | \n",
+ " 0.127718 | \n",
+ " 480.652863 | \n",
+ " 8.821371e+04 | \n",
+ " 21.778153 | \n",
+ " 0.217766 | \n",
+ " 0.497697 | \n",
+ " 12.587340 | \n",
+ " 0.126030 | \n",
+ " 17604.142828 | \n",
+ " ... | \n",
+ " 0.177238 | \n",
+ " 1.0 | \n",
+ " 0.123879 | \n",
+ " 0.876121 | \n",
+ " 0.383161 | \n",
+ " 0.616839 | \n",
+ " 0.860340 | \n",
+ " 0.139660 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 10277.348590 | \n",
+ " 0.048130 | \n",
+ " 288.062432 | \n",
+ " 1.155800e+05 | \n",
+ " 20.199244 | \n",
+ " 0.718367 | \n",
+ " 0.758122 | \n",
+ " 6.022869 | \n",
+ " 0.336797 | \n",
+ " 21835.880400 | \n",
+ " ... | \n",
+ " 0.381873 | \n",
+ " 0.0 | \n",
+ " 0.329446 | \n",
+ " 0.329446 | \n",
+ " 0.486161 | \n",
+ " 0.486161 | \n",
+ " 0.346637 | \n",
+ " 0.346637 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 1000.000000 | \n",
+ " 0.060000 | \n",
+ " 30.890000 | \n",
+ " 4.000000e+01 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 2.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 9000.000000 | \n",
+ " 0.088100 | \n",
+ " 265.730000 | \n",
+ " 5.000000e+04 | \n",
+ " 13.890000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 8.000000 | \n",
+ " 0.000000 | \n",
+ " 6293.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 15000.000000 | \n",
+ " 0.118000 | \n",
+ " 404.560000 | \n",
+ " 7.300000e+04 | \n",
+ " 19.760000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 11.000000 | \n",
+ " 0.000000 | \n",
+ " 12068.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 24000.000000 | \n",
+ " 0.155700 | \n",
+ " 648.100000 | \n",
+ " 1.040000e+05 | \n",
+ " 26.660000 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 16.000000 | \n",
+ " 0.000000 | \n",
+ " 21735.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 40000.000000 | \n",
+ " 0.308400 | \n",
+ " 1676.230000 | \n",
+ " 8.797500e+06 | \n",
+ " 999.000000 | \n",
+ " 18.000000 | \n",
+ " 5.000000 | \n",
+ " 72.000000 | \n",
+ " 4.000000 | \n",
+ " 587191.000000 | \n",
+ " ... | \n",
+ " 1.000000 | \n",
+ " 1.0 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
8 rows × 95 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " loan_amnt int_rate installment annual_inc dti \\\n",
+ "count 68817.000000 68817.000000 68817.000000 6.881700e+04 68817.000000 \n",
+ "mean 16677.594562 0.127718 480.652863 8.821371e+04 21.778153 \n",
+ "std 10277.348590 0.048130 288.062432 1.155800e+05 20.199244 \n",
+ "min 1000.000000 0.060000 30.890000 4.000000e+01 0.000000 \n",
+ "25% 9000.000000 0.088100 265.730000 5.000000e+04 13.890000 \n",
+ "50% 15000.000000 0.118000 404.560000 7.300000e+04 19.760000 \n",
+ "75% 24000.000000 0.155700 648.100000 1.040000e+05 26.660000 \n",
+ "max 40000.000000 0.308400 1676.230000 8.797500e+06 999.000000 \n",
+ "\n",
+ " delinq_2yrs inq_last_6mths open_acc pub_rec \\\n",
+ "count 68817.000000 68817.000000 68817.000000 68817.000000 \n",
+ "mean 0.217766 0.497697 12.587340 0.126030 \n",
+ "std 0.718367 0.758122 6.022869 0.336797 \n",
+ "min 0.000000 0.000000 2.000000 0.000000 \n",
+ "25% 0.000000 0.000000 8.000000 0.000000 \n",
+ "50% 0.000000 0.000000 11.000000 0.000000 \n",
+ "75% 0.000000 1.000000 16.000000 0.000000 \n",
+ "max 18.000000 5.000000 72.000000 4.000000 \n",
+ "\n",
+ " revol_bal ... issue_d_Mar-2019 pymnt_plan_n \\\n",
+ "count 68817.000000 ... 68817.000000 68817.0 \n",
+ "mean 17604.142828 ... 0.177238 1.0 \n",
+ "std 21835.880400 ... 0.381873 0.0 \n",
+ "min 0.000000 ... 0.000000 1.0 \n",
+ "25% 6293.000000 ... 0.000000 1.0 \n",
+ "50% 12068.000000 ... 0.000000 1.0 \n",
+ "75% 21735.000000 ... 0.000000 1.0 \n",
+ "max 587191.000000 ... 1.000000 1.0 \n",
+ "\n",
+ " initial_list_status_f initial_list_status_w next_pymnt_d_Apr-2019 \\\n",
+ "count 68817.000000 68817.000000 68817.000000 \n",
+ "mean 0.123879 0.876121 0.383161 \n",
+ "std 0.329446 0.329446 0.486161 \n",
+ "min 0.000000 0.000000 0.000000 \n",
+ "25% 0.000000 1.000000 0.000000 \n",
+ "50% 0.000000 1.000000 0.000000 \n",
+ "75% 0.000000 1.000000 1.000000 \n",
+ "max 1.000000 1.000000 1.000000 \n",
+ "\n",
+ " next_pymnt_d_May-2019 application_type_Individual \\\n",
+ "count 68817.000000 68817.000000 \n",
+ "mean 0.616839 0.860340 \n",
+ "std 0.486161 0.346637 \n",
+ "min 0.000000 0.000000 \n",
+ "25% 0.000000 1.000000 \n",
+ "50% 1.000000 1.000000 \n",
+ "75% 1.000000 1.000000 \n",
+ "max 1.000000 1.000000 \n",
+ "\n",
+ " application_type_Joint App hardship_flag_N debt_settlement_flag_N \n",
+ "count 68817.000000 68817.0 68817.0 \n",
+ "mean 0.139660 1.0 1.0 \n",
+ "std 0.346637 0.0 0.0 \n",
+ "min 0.000000 1.0 1.0 \n",
+ "25% 0.000000 1.0 1.0 \n",
+ "50% 0.000000 1.0 1.0 \n",
+ "75% 0.000000 1.0 1.0 \n",
+ "max 1.000000 1.0 1.0 \n",
+ "\n",
+ "[8 rows x 95 columns]"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "low_risk 68470\n",
+ "high_risk 347\n",
+ "Name: loan_status, dtype: int64"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Check the balance of our target values\n",
+ "y['loan_status'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Counter({'low_risk': 68470, 'high_risk': 347})"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y = y['loan_status'].ravel()\n",
+ "Counter(y)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Counter({'low_risk': 51366, 'high_risk': 246})"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Create X_train, X_test, y_train, y_test\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)\n",
+ "Counter(y_train)"
+ ]
+ },
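+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "With only 347 `high_risk` rows, a plain random split can shift the class ratio between the train and test sets. The cell below is a hedged sketch (not part of the original flow) of the same split with `stratify=y`, which keeps the class proportions equal on both sides."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: stratified split keeps the high_risk/low_risk ratio equal in train and test\n",
+ "X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(\n",
+ "    X, y, random_state=1, stratify=y)\n",
+ "Counter(y_train_s)"
+ ]
+ },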
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Ensemble Learners\n",
+ "\n",
+ "In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost classifier . For each algorithm, be sure to complete the folliowing steps:\n",
+ "\n",
+ "1. Train the model using the training data. \n",
+ "2. Calculate the balanced accuracy score from sklearn.metrics.\n",
+ "3. Print the confusion matrix from sklearn.metrics.\n",
+ "4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.\n",
+ "5. For the Balanced Random Forest Classifier onely, print the feature importance sorted in descending order (most important feature to least important) along with the feature score\n",
+ "\n",
+ "Note: Use a random state of 1 for each algorithm to ensure consistency between tests"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Balanced Random Forest Classifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from imblearn.ensemble import BalancedRandomForestClassifier\n",
+ "bf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Fitting the model\n",
+ "bf_model = bf_model.fit(X_train, y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['low_risk', 'low_risk', 'high_risk', ..., 'low_risk', 'low_risk',\n",
+ " 'low_risk'], dtype=object)"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y_pred = bf_model.predict(X_test)\n",
+ "y_pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.8964254577157803"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Calculated the balanced accuracy score\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "accuracy_score(y_test, y_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Predicted high risk | \n",
+ " Predicted low risk | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Actual high risk | \n",
+ " 68 | \n",
+ " 33 | \n",
+ "
\n",
+ " \n",
+ " Actual low risk | \n",
+ " 1749 | \n",
+ " 15355 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Predicted high risk Predicted low risk\n",
+ "Actual high risk 68 33\n",
+ "Actual low risk 1749 15355"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Display the confusion matrix\n",
+ "from sklearn.metrics import confusion_matrix\n",
+ "# Calculating the confusion matrix.\n",
+ "cm = confusion_matrix(y_test, y_pred)\n",
+ "\n",
+ "# Create a DataFrame from the confusion matrix.\n",
+ "cm_df = pd.DataFrame(\n",
+ " cm,index=[\"Actual high risk\", \"Actual low risk\"], columns=[\"Predicted high risk\", \"Predicted low risk\"])\n",
+ "\n",
+ "cm_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " high_risk 0.04 0.67 0.07 101\n",
+ " low_risk 1.00 0.90 0.95 17104\n",
+ "\n",
+ " accuracy 0.90 17205\n",
+ " macro avg 0.52 0.79 0.51 17205\n",
+ "weighted avg 0.99 0.90 0.94 17205\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print the imbalanced classification report\n",
+ "from sklearn.metrics import classification_report\n",
+ "print(classification_report(y_test, y_pred))"
+ ]
+ },
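+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The imports at the top of this notebook bring in `balanced_accuracy_score` and `classification_report_imbalanced`, which the step list above calls for but the cells so far do not use. The sketch below applies them to the Balanced Random Forest predictions."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: imbalanced-learn metrics using the imports from the top of the notebook\n",
+ "# balanced_accuracy_score averages recall per class, so the 347 high_risk rows\n",
+ "# count as much as the 68470 low_risk rows\n",
+ "print(balanced_accuracy_score(y_test, y_pred))\n",
+ "print(classification_report_imbalanced(y_test, y_pred))"
+ ]
+ },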
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analysis\n",
+ "\n",
+ "**Accuracy**\n",
+ "\n",
+ "The accuracy score of Balanced Random Forest Classifier is .90 or 90%. Accuracy score formula is (TP+TN)/Total, which TP (True Positive) is 68 and TN(True Nagative) is 15355, and total is 17205, so (68+15355)/17205 = .8964. **The accuracy score is very good number. It can be a good model for this dataset. Let take a look into Recall (sensitivity) becuase this rate is important to detect high risk cases of this dataset**\n",
+ "\n",
+ "**Recall (sensitivity):**(Refer: The confustion matrix and the classification report) \n",
+ "\n",
+ "- High risk rate is 67% so every 100 high risk cases the model detects right 67 high risk cases and categorizes 33 high risk cases to become low risk cases. This is a number that we want to see a larger percent becuase we do not want high risk cases to become low risk cases without any caution. **Recall rate is average number. It is not impressive like the accuracy. It tells that the low risk cases are detected better than the high risk cases**\n",
+ "\n",
+ "- Low risk rate is 90% so every 100 low risk cases the model categorizes 10 low risk cases to become high risk cases. This is just alarmed cases that need to be reviewed by business line.\n",
+ "\n",
+ "**Precision:**\n",
+ "\n",
+ "- High risk rate is closed 0.04 (~4%). It tells that the number of low risk cases predicted as high risk cases is much larger than actual high risk cases \n",
+ "- Low risk rate is closed 1.00 (~100%). It tells that the actual low risk cases are overrule the precision rate. This is very true in the imbalanced classificantion cases.\n",
+ "\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([0.00995374, 0.03195562, 0.01707892, 0.01577554, 0.0160705 ,\n",
+ " 0.00327507, 0.00416516, 0.00782073, 0.00089767, 0.01599866,\n",
+ " 0.00897678, 0.01482801, 0.01653796, 0.05764917, 0.06410003,\n",
+ " 0.09175752, 0.0572968 , 0.00598147, 0. , 0. ,\n",
+ " 0.05174788, 0.00051683, 0. , 0. , 0.00218998,\n",
+ " 0.01073642, 0.00747188, 0.00553016, 0.00401646, 0.00641355,\n",
+ " 0.01431883, 0.01440243, 0.01315152, 0.00606626, 0.00458729,\n",
+ " 0.01616972, 0.01156494, 0.01240632, 0.00760805, 0.00686214,\n",
+ " 0.00960365, 0.01038009, 0.01263346, 0.01140916, 0.0127846 ,\n",
+ " 0. , 0. , 0.0116874 , 0.01502927, 0.00631944,\n",
+ " 0.00804558, 0.00687485, 0.01009753, 0.01701486, 0.0026174 ,\n",
+ " 0.00616047, 0.00953742, 0.00758615, 0.00887062, 0.00982172,\n",
+ " 0.00874511, 0.01145588, 0.00727378, 0.0077987 , 0. ,\n",
+ " 0. , 0.00057341, 0.00755451, 0.0099141 , 0.00683872,\n",
+ " 0.00121513, 0. , 0.01535561, 0.01263661, 0.01464882,\n",
+ " 0.01310158, 0. , 0.00227967, 0.00184048, 0.00208511,\n",
+ " 0.00275652, 0.00154722, 0.00173602, 0.00496182, 0.02353679,\n",
+ " 0.01351987, 0. , 0.00041925, 0.00081252, 0.00790625,\n",
+ " 0.0046852 , 0.00122131, 0.00122633, 0. , 0. ])"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# List the features sorted in descending order by feature importance\n",
+ "importances = bf_model.feature_importances_\n",
+ "importances"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[(0.09175752102205247, 'total_rec_prncp'),\n",
+ " (0.06410003199501778, 'total_pymnt_inv'),\n",
+ " (0.05764917485461809, 'total_pymnt'),\n",
+ " (0.05729679526683975, 'total_rec_int'),\n",
+ " (0.05174788106507317, 'last_pymnt_amnt'),\n",
+ " (0.031955619175665397, 'int_rate'),\n",
+ " (0.02353678623968216, 'issue_d_Jan-2019'),\n",
+ " (0.017078915518993903, 'installment'),\n",
+ " (0.017014861224701222, 'mths_since_recent_inq'),\n",
+ " (0.016537957646730293, 'out_prncp_inv'),\n",
+ " (0.016169718411077325, 'max_bal_bc'),\n",
+ " (0.01607049983545137, 'dti'),\n",
+ " (0.01599866290723441, 'revol_bal'),\n",
+ " (0.015775537221600675, 'annual_inc'),\n",
+ " (0.01535560674178928, 'tot_hi_cred_lim'),\n",
+ " (0.015029265003541079, 'mo_sin_old_rev_tl_op'),\n",
+ " (0.014828006488636946, 'out_prncp'),\n",
+ " (0.01464881608833323, 'total_bc_limit'),\n",
+ " (0.014402430445752665, 'total_bal_il'),\n",
+ " (0.014318832248876989, 'mths_since_rcnt_il'),\n",
+ " (0.013519867193755364, 'issue_d_Mar-2019'),\n",
+ " (0.013151520216882331, 'il_util'),\n",
+ " (0.013101578263049833, 'total_il_high_credit_limit'),\n",
+ " (0.012784600558682344, 'bc_util'),\n",
+ " (0.012636608914961465, 'total_bal_ex_mort'),\n",
+ " (0.012633464965390648, 'avg_cur_bal'),\n",
+ " (0.012406321468566728, 'total_rev_hi_lim'),\n",
+ " (0.011687404692448701, 'mo_sin_old_il_acct'),\n",
+ " (0.01156494245653799, 'all_util'),\n",
+ " (0.011455878011762288, 'num_rev_accts'),\n",
+ " (0.011409157520644688, 'bc_open_to_buy'),\n",
+ " (0.01073641504525053, 'tot_cur_bal'),\n",
+ " (0.010380085181706624, 'acc_open_past_24mths'),\n",
+ " (0.010097528131347774, 'mths_since_recent_bc'),\n",
+ " (0.00995373830638152, 'loan_amnt'),\n",
+ " (0.00991410213601043, 'pct_tl_nvr_dlq'),\n",
+ " (0.009821715826953788, 'num_il_tl'),\n",
+ " (0.009603648248133598, 'inq_last_12m'),\n",
+ " (0.009537423049553, 'num_actv_rev_tl'),\n",
+ " (0.008976776055926955, 'total_acc'),\n",
+ " (0.008870623013604539, 'num_bc_tl'),\n",
+ " (0.008745106187024114, 'num_op_rev_tl'),\n",
+ " (0.008045578273709669, 'mo_sin_rcnt_tl'),\n",
+ " (0.007906251501807723, 'next_pymnt_d_Apr-2019'),\n",
+ " (0.00782073260901301, 'open_acc'),\n",
+ " (0.007798696767389274, 'num_sats'),\n",
+ " (0.007608045628523077, 'inq_fi'),\n",
+ " (0.0075861537897335815, 'num_bc_sats'),\n",
+ " (0.007554511001273182, 'num_tl_op_past_12m'),\n",
+ " (0.007471884930172615, 'open_acc_6m'),\n",
+ " (0.007273779915807858, 'num_rev_tl_bal_gt_0'),\n",
+ " (0.006874845464745796, 'mort_acc'),\n",
+ " (0.006862142977394886, 'total_cu_tl'),\n",
+ " (0.006838718858820505, 'percent_bc_gt_75'),\n",
+ " (0.006413554699909871, 'open_il_24m'),\n",
+ " (0.006319439816216779, 'mo_sin_rcnt_rev_tl_op'),\n",
+ " (0.006160469432535709, 'num_actv_bc_tl'),\n",
+ " (0.006066257227997291, 'open_rv_12m'),\n",
+ " (0.005981472544437747, 'total_rec_late_fee'),\n",
+ " (0.0055301594524349495, 'open_act_il'),\n",
+ " (0.004961823663836347, 'issue_d_Feb-2019'),\n",
+ " (0.004685198497435334, 'next_pymnt_d_May-2019'),\n",
+ " (0.0045872929977180356, 'open_rv_24m'),\n",
+ " (0.0041651633321967895, 'inq_last_6mths'),\n",
+ " (0.004016461341161775, 'open_il_12m'),\n",
+ " (0.0032750717701661657, 'delinq_2yrs'),\n",
+ " (0.0027565184136781346, 'verification_status_Not Verified'),\n",
+ " (0.0026174030074401656, 'num_accts_ever_120_pd'),\n",
+ " (0.002279671873697176, 'home_ownership_MORTGAGE'),\n",
+ " (0.0021899772867773103, 'tot_coll_amt'),\n",
+ " (0.0020851101815353096, 'home_ownership_RENT'),\n",
+ " (0.0018404849590376573, 'home_ownership_OWN'),\n",
+ " (0.001736019018028134, 'verification_status_Verified'),\n",
+ " (0.0015472230884974506, 'verification_status_Source Verified'),\n",
+ " (0.0012263315437383057, 'application_type_Joint App'),\n",
+ " (0.0012213148580230454, 'application_type_Individual'),\n",
+ " (0.0012151288883862276, 'pub_rec_bankruptcies'),\n",
+ " (0.0008976722260399365, 'pub_rec'),\n",
+ " (0.0008125182396705508, 'initial_list_status_w'),\n",
+ " (0.000573414997420326, 'num_tl_90g_dpd_24m'),\n",
+ " (0.0005168345750594915, 'collections_12_mths_ex_med'),\n",
+ " (0.0004192455022893127, 'initial_list_status_f'),\n",
+ " (0.0, 'tax_liens'),\n",
+ " (0.0, 'recoveries'),\n",
+ " (0.0, 'pymnt_plan_n'),\n",
+ " (0.0, 'policy_code'),\n",
+ " (0.0, 'num_tl_30dpd'),\n",
+ " (0.0, 'num_tl_120dpd_2m'),\n",
+ " (0.0, 'home_ownership_ANY'),\n",
+ " (0.0, 'hardship_flag_N'),\n",
+ " (0.0, 'delinq_amnt'),\n",
+ " (0.0, 'debt_settlement_flag_N'),\n",
+ " (0.0, 'collection_recovery_fee'),\n",
+ " (0.0, 'chargeoff_within_12_mths'),\n",
+ " (0.0, 'acc_now_delinq')]"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sorted(zip(bf_model.feature_importances_, X.columns), reverse=True)"
+ ]
+ },
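+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The same ranking can be built as a pandas Series, which is convenient for slicing or plotting the top features; a small sketch:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: feature importances as a sorted pandas Series (top 10 shown)\n",
+ "feature_importances = pd.Series(bf_model.feature_importances_, index=X.columns)\n",
+ "feature_importances.sort_values(ascending=False).head(10)"
+ ]
+ },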
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Easy Ensemble AdaBoost Classifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Train the Classifier\n",
+ "from imblearn.ensemble import EasyEnsembleClassifier\n",
+ "ee_model = EasyEnsembleClassifier(random_state=1)\n",
+ "# Fitting the model\n",
+ "ee_model = ee_model.fit(X_train, y_train)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['low_risk', 'low_risk', 'low_risk', ..., 'low_risk', 'low_risk',\n",
+ " 'low_risk'], dtype=object)"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y_pred = ee_model.predict(X_test)\n",
+ "y_pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.9004359197907585"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Calculated the balanced accuracy score\n",
+ "accuracy_score(y_test, y_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Predicted high risk | \n",
+ " Predicted low risk | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Actual high risk | \n",
+ " 94 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " Actual low risk | \n",
+ " 1706 | \n",
+ " 15398 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Predicted high risk Predicted low risk\n",
+ "Actual high risk 94 7\n",
+ "Actual low risk 1706 15398"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Display the confusion matrix\n",
+ "# Calculating the confusion matrix.\n",
+ "cm = confusion_matrix(y_test, y_pred)\n",
+ "\n",
+ "# Create a DataFrame from the confusion matrix.\n",
+ "cm_df = pd.DataFrame(\n",
+ " cm,index=[\"Actual high risk\", \"Actual low risk\"], columns=[\"Predicted high risk\", \"Predicted low risk\"])\n",
+ "\n",
+ "cm_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " high_risk 0.05 0.93 0.10 101\n",
+ " low_risk 1.00 0.90 0.95 17104\n",
+ "\n",
+ " accuracy 0.90 17205\n",
+ " macro avg 0.53 0.92 0.52 17205\n",
+ "weighted avg 0.99 0.90 0.94 17205\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print the imbalanced classification report\n",
+ "print(classification_report(y_test, y_pred))"
+ ]
+ },
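+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The same imbalanced-learn metrics apply here. Since `y_pred` now holds the Easy Ensemble predictions, the sketch below reports its balanced accuracy and imbalanced classification report."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: balanced accuracy and imbalanced report for the Easy Ensemble predictions\n",
+ "print(balanced_accuracy_score(y_test, y_pred))\n",
+ "print(classification_report_imbalanced(y_test, y_pred))"
+ ]
+ },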
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analysis\n",
+ "\n",
+ "**Accuracy**\n",
+ "\n",
+ "The accuracy score of Easy Ensemble AdaBoost Classifier is .90 or 90%. Accuracy score formula is (TP+TN)/Total, which TP (True Positive) is 94 and TN(True Nagative) is 15398, and total is 17205, so (94+15398)/17205 = .9004. **The accuracy score is very impressive number. It can be a good model for this dataset. Let take a look into Recall (sensitivity) becuase this rate is important to detect high risk cases of this dataset**\n",
+ "\n",
+ "**Recall (sensitivity):**(Refer: The confustion matrix and the classification report) \n",
+ "\n",
+ "- High risk rate is 93% so every 100 high risk cases the model detects right 93 high risk cases and categorizes 7 high risk cases to become low risk cases. This is a number that we want to see a larger percent becuase we do not want high risk cases to become low risk cases without any caution. **Recall rate is also very impressive number. Both the accuracy and the recall rate are over 90%. This is a good model to detect the high risk cases of this dataset**\n",
+ "\n",
+ "- Low risk rate is 90% so every 100 low risk cases the model categorizes 10 low risk cases to become high risk cases. This is just alarmed cases that need to be reviewed by business line.\n",
+ "\n",
+ "**Precision:**\n",
+ "\n",
+ "- High risk rate is closed 0.05 (~4%). It tells that the number of low risk cases predicted as high risk cases is much larger than actual high risk cases \n",
+ "- Low risk rate is closed 1.00 (~100%). It tells that the actual low risk cases are overrule the precision rate. This is very true in the imbalanced classificantion cases.\n",
+ "\n",
+ "***"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "mlenv",
+ "language": "python",
+ "name": "mlenv"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Notebooks/credit_risk_resampling.ipynb b/Notebooks/credit_risk_resampling.ipynb
new file mode 100644
index 0000000..9bc3981
--- /dev/null
+++ b/Notebooks/credit_risk_resampling.ipynb
@@ -0,0 +1,2028 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Credit Risk Resampling Techniques"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings('ignore')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from pathlib import Path\n",
+ "from collections import Counter"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Read the CSV and Perform Basic Data Cleaning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "columns = [\n",
+ " \"loan_amnt\", \"int_rate\", \"installment\", \"home_ownership\",\n",
+ " \"annual_inc\", \"verification_status\", \"issue_d\", \"loan_status\",\n",
+ " \"pymnt_plan\", \"dti\", \"delinq_2yrs\", \"inq_last_6mths\",\n",
+ " \"open_acc\", \"pub_rec\", \"revol_bal\", \"total_acc\",\n",
+ " \"initial_list_status\", \"out_prncp\", \"out_prncp_inv\", \"total_pymnt\",\n",
+ " \"total_pymnt_inv\", \"total_rec_prncp\", \"total_rec_int\", \"total_rec_late_fee\",\n",
+ " \"recoveries\", \"collection_recovery_fee\", \"last_pymnt_amnt\", \"next_pymnt_d\",\n",
+ " \"collections_12_mths_ex_med\", \"policy_code\", \"application_type\", \"acc_now_delinq\",\n",
+ " \"tot_coll_amt\", \"tot_cur_bal\", \"open_acc_6m\", \"open_act_il\",\n",
+ " \"open_il_12m\", \"open_il_24m\", \"mths_since_rcnt_il\", \"total_bal_il\",\n",
+ " \"il_util\", \"open_rv_12m\", \"open_rv_24m\", \"max_bal_bc\",\n",
+ " \"all_util\", \"total_rev_hi_lim\", \"inq_fi\", \"total_cu_tl\",\n",
+ " \"inq_last_12m\", \"acc_open_past_24mths\", \"avg_cur_bal\", \"bc_open_to_buy\",\n",
+ " \"bc_util\", \"chargeoff_within_12_mths\", \"delinq_amnt\", \"mo_sin_old_il_acct\",\n",
+ " \"mo_sin_old_rev_tl_op\", \"mo_sin_rcnt_rev_tl_op\", \"mo_sin_rcnt_tl\", \"mort_acc\",\n",
+ " \"mths_since_recent_bc\", \"mths_since_recent_inq\", \"num_accts_ever_120_pd\", \"num_actv_bc_tl\",\n",
+ " \"num_actv_rev_tl\", \"num_bc_sats\", \"num_bc_tl\", \"num_il_tl\",\n",
+ " \"num_op_rev_tl\", \"num_rev_accts\", \"num_rev_tl_bal_gt_0\",\n",
+ " \"num_sats\", \"num_tl_120dpd_2m\", \"num_tl_30dpd\", \"num_tl_90g_dpd_24m\",\n",
+ " \"num_tl_op_past_12m\", \"pct_tl_nvr_dlq\", \"percent_bc_gt_75\", \"pub_rec_bankruptcies\",\n",
+ " \"tax_liens\", \"tot_hi_cred_lim\", \"total_bal_ex_mort\", \"total_bc_limit\",\n",
+ " \"total_il_high_credit_limit\", \"hardship_flag\", \"debt_settlement_flag\"\n",
+ "]\n",
+ "\n",
+ "target = [\"loan_status\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " loan_amnt | \n",
+ " int_rate | \n",
+ " installment | \n",
+ " home_ownership | \n",
+ " annual_inc | \n",
+ " verification_status | \n",
+ " issue_d | \n",
+ " loan_status | \n",
+ " pymnt_plan | \n",
+ " dti | \n",
+ " ... | \n",
+ " pct_tl_nvr_dlq | \n",
+ " percent_bc_gt_75 | \n",
+ " pub_rec_bankruptcies | \n",
+ " tax_liens | \n",
+ " tot_hi_cred_lim | \n",
+ " total_bal_ex_mort | \n",
+ " total_bc_limit | \n",
+ " total_il_high_credit_limit | \n",
+ " hardship_flag | \n",
+ " debt_settlement_flag | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10500.0 | \n",
+ " 0.1719 | \n",
+ " 375.35 | \n",
+ " RENT | \n",
+ " 66000.0 | \n",
+ " Source Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 27.24 | \n",
+ " ... | \n",
+ " 85.7 | \n",
+ " 100.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 65687.0 | \n",
+ " 38199.0 | \n",
+ " 2000.0 | \n",
+ " 61987.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 25000.0 | \n",
+ " 0.2000 | \n",
+ " 929.09 | \n",
+ " MORTGAGE | \n",
+ " 105000.0 | \n",
+ " Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 20.23 | \n",
+ " ... | \n",
+ " 91.2 | \n",
+ " 50.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 271427.0 | \n",
+ " 60641.0 | \n",
+ " 41200.0 | \n",
+ " 49197.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 20000.0 | \n",
+ " 0.2000 | \n",
+ " 529.88 | \n",
+ " MORTGAGE | \n",
+ " 56000.0 | \n",
+ " Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 24.26 | \n",
+ " ... | \n",
+ " 66.7 | \n",
+ " 50.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 60644.0 | \n",
+ " 45684.0 | \n",
+ " 7500.0 | \n",
+ " 43144.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 10000.0 | \n",
+ " 0.1640 | \n",
+ " 353.55 | \n",
+ " RENT | \n",
+ " 92000.0 | \n",
+ " Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 31.44 | \n",
+ " ... | \n",
+ " 100.0 | \n",
+ " 50.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 99506.0 | \n",
+ " 68784.0 | \n",
+ " 19700.0 | \n",
+ " 76506.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 22000.0 | \n",
+ " 0.1474 | \n",
+ " 520.39 | \n",
+ " MORTGAGE | \n",
+ " 52000.0 | \n",
+ " Not Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 18.76 | \n",
+ " ... | \n",
+ " 100.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 219750.0 | \n",
+ " 25919.0 | \n",
+ " 27600.0 | \n",
+ " 20000.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 86 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " loan_amnt int_rate installment home_ownership annual_inc \\\n",
+ "0 10500.0 0.1719 375.35 RENT 66000.0 \n",
+ "1 25000.0 0.2000 929.09 MORTGAGE 105000.0 \n",
+ "2 20000.0 0.2000 529.88 MORTGAGE 56000.0 \n",
+ "3 10000.0 0.1640 353.55 RENT 92000.0 \n",
+ "4 22000.0 0.1474 520.39 MORTGAGE 52000.0 \n",
+ "\n",
+ " verification_status issue_d loan_status pymnt_plan dti ... \\\n",
+ "0 Source Verified Mar-2019 low_risk n 27.24 ... \n",
+ "1 Verified Mar-2019 low_risk n 20.23 ... \n",
+ "2 Verified Mar-2019 low_risk n 24.26 ... \n",
+ "3 Verified Mar-2019 low_risk n 31.44 ... \n",
+ "4 Not Verified Mar-2019 low_risk n 18.76 ... \n",
+ "\n",
+ " pct_tl_nvr_dlq percent_bc_gt_75 pub_rec_bankruptcies tax_liens \\\n",
+ "0 85.7 100.0 0.0 0.0 \n",
+ "1 91.2 50.0 1.0 0.0 \n",
+ "2 66.7 50.0 0.0 0.0 \n",
+ "3 100.0 50.0 1.0 0.0 \n",
+ "4 100.0 0.0 0.0 0.0 \n",
+ "\n",
+ " tot_hi_cred_lim total_bal_ex_mort total_bc_limit \\\n",
+ "0 65687.0 38199.0 2000.0 \n",
+ "1 271427.0 60641.0 41200.0 \n",
+ "2 60644.0 45684.0 7500.0 \n",
+ "3 99506.0 68784.0 19700.0 \n",
+ "4 219750.0 25919.0 27600.0 \n",
+ "\n",
+ " total_il_high_credit_limit hardship_flag debt_settlement_flag \n",
+ "0 61987.0 N N \n",
+ "1 49197.0 N N \n",
+ "2 43144.0 N N \n",
+ "3 76506.0 N N \n",
+ "4 20000.0 N N \n",
+ "\n",
+ "[5 rows x 86 columns]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Load the data\n",
+ "file_path = Path('../Resources/LoanStats_2019Q1.csv')\n",
+ "df = pd.read_csv(file_path, skiprows=1)[:-2]\n",
+ "df = df.loc[:, columns].copy()\n",
+ "\n",
+ "# Drop the null columns where all values are null\n",
+ "df = df.dropna(axis='columns', how='all')\n",
+ "\n",
+ "# Drop the null rows\n",
+ "df = df.dropna()\n",
+ "\n",
+ "# Remove the `Issued` loan status\n",
+ "issued_mask = df['loan_status'] != 'Issued'\n",
+ "df = df.loc[issued_mask]\n",
+ "\n",
+ "# convert interest rate to numerical\n",
+ "df['int_rate'] = df['int_rate'].str.replace('%', '')\n",
+ "df['int_rate'] = df['int_rate'].astype('float') / 100\n",
+ "\n",
+ "\n",
+ "# Convert the target column values to low_risk and high_risk based on their values\n",
+ "x = {'Current': 'low_risk'} \n",
+ "df = df.replace(x)\n",
+ "\n",
+ "x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk') \n",
+ "df = df.replace(x)\n",
+ "\n",
+ "df.reset_index(inplace=True, drop=True)\n",
+ "\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Split the Data into Training and Testing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['home_ownership',\n",
+ " 'verification_status',\n",
+ " 'issue_d',\n",
+ " 'pymnt_plan',\n",
+ " 'initial_list_status',\n",
+ " 'next_pymnt_d',\n",
+ " 'application_type',\n",
+ " 'hardship_flag',\n",
+ " 'debt_settlement_flag']"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Reserve the original data frame by copying\n",
+ "df_temp = df.copy()\n",
+ "#Drop the target column\n",
+ "df_temp = df.drop(columns=target)\n",
+ "#Find columns that is string\n",
+ "obj_columns = list(df_temp.dtypes[df_temp.dtypes == np.object].index)\n",
+ "obj_columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " loan_amnt | \n",
+ " int_rate | \n",
+ " installment | \n",
+ " annual_inc | \n",
+ " dti | \n",
+ " delinq_2yrs | \n",
+ " inq_last_6mths | \n",
+ " open_acc | \n",
+ " pub_rec | \n",
+ " revol_bal | \n",
+ " ... | \n",
+ " issue_d_Mar-2019 | \n",
+ " pymnt_plan_n | \n",
+ " initial_list_status_f | \n",
+ " initial_list_status_w | \n",
+ " next_pymnt_d_Apr-2019 | \n",
+ " next_pymnt_d_May-2019 | \n",
+ " application_type_Individual | \n",
+ " application_type_Joint App | \n",
+ " hardship_flag_N | \n",
+ " debt_settlement_flag_N | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10500.0 | \n",
+ " 0.1719 | \n",
+ " 375.35 | \n",
+ " 66000.0 | \n",
+ " 27.24 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 8.0 | \n",
+ " 0.0 | \n",
+ " 1609.0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 25000.0 | \n",
+ " 0.2000 | \n",
+ " 929.09 | \n",
+ " 105000.0 | \n",
+ " 20.23 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 17.0 | \n",
+ " 1.0 | \n",
+ " 18368.0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 20000.0 | \n",
+ " 0.2000 | \n",
+ " 529.88 | \n",
+ " 56000.0 | \n",
+ " 24.26 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 8.0 | \n",
+ " 0.0 | \n",
+ " 13247.0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 10000.0 | \n",
+ " 0.1640 | \n",
+ " 353.55 | \n",
+ " 92000.0 | \n",
+ " 31.44 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 10.0 | \n",
+ " 1.0 | \n",
+ " 17996.0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 22000.0 | \n",
+ " 0.1474 | \n",
+ " 520.39 | \n",
+ " 52000.0 | \n",
+ " 18.76 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 14.0 | \n",
+ " 0.0 | \n",
+ " 9091.0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 68812 | \n",
+ " 10000.0 | \n",
+ " 0.1502 | \n",
+ " 346.76 | \n",
+ " 26000.0 | \n",
+ " 9.60 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 9.0 | \n",
+ " 0.0 | \n",
+ " 2684.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 68813 | \n",
+ " 12000.0 | \n",
+ " 0.2727 | \n",
+ " 368.37 | \n",
+ " 63000.0 | \n",
+ " 29.07 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 8.0 | \n",
+ " 0.0 | \n",
+ " 13314.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 68814 | \n",
+ " 5000.0 | \n",
+ " 0.1992 | \n",
+ " 185.62 | \n",
+ " 52000.0 | \n",
+ " 14.86 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 1.0 | \n",
+ " 3715.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 68815 | \n",
+ " 40000.0 | \n",
+ " 0.0646 | \n",
+ " 1225.24 | \n",
+ " 520000.0 | \n",
+ " 9.96 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 21.0 | \n",
+ " 0.0 | \n",
+ " 59529.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 68816 | \n",
+ " 16000.0 | \n",
+ " 0.1131 | \n",
+ " 350.36 | \n",
+ " 72000.0 | \n",
+ " 7.02 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 12.0 | \n",
+ " 1.0 | \n",
+ " 11882.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
68817 rows × 95 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " loan_amnt int_rate installment annual_inc dti delinq_2yrs \\\n",
+ "0 10500.0 0.1719 375.35 66000.0 27.24 0.0 \n",
+ "1 25000.0 0.2000 929.09 105000.0 20.23 0.0 \n",
+ "2 20000.0 0.2000 529.88 56000.0 24.26 0.0 \n",
+ "3 10000.0 0.1640 353.55 92000.0 31.44 0.0 \n",
+ "4 22000.0 0.1474 520.39 52000.0 18.76 0.0 \n",
+ "... ... ... ... ... ... ... \n",
+ "68812 10000.0 0.1502 346.76 26000.0 9.60 0.0 \n",
+ "68813 12000.0 0.2727 368.37 63000.0 29.07 0.0 \n",
+ "68814 5000.0 0.1992 185.62 52000.0 14.86 0.0 \n",
+ "68815 40000.0 0.0646 1225.24 520000.0 9.96 0.0 \n",
+ "68816 16000.0 0.1131 350.36 72000.0 7.02 2.0 \n",
+ "\n",
+ " inq_last_6mths open_acc pub_rec revol_bal ... issue_d_Mar-2019 \\\n",
+ "0 0.0 8.0 0.0 1609.0 ... 1 \n",
+ "1 0.0 17.0 1.0 18368.0 ... 1 \n",
+ "2 0.0 8.0 0.0 13247.0 ... 1 \n",
+ "3 1.0 10.0 1.0 17996.0 ... 1 \n",
+ "4 1.0 14.0 0.0 9091.0 ... 1 \n",
+ "... ... ... ... ... ... ... \n",
+ "68812 0.0 9.0 0.0 2684.0 ... 0 \n",
+ "68813 0.0 8.0 0.0 13314.0 ... 0 \n",
+ "68814 0.0 5.0 1.0 3715.0 ... 0 \n",
+ "68815 1.0 21.0 0.0 59529.0 ... 0 \n",
+ "68816 0.0 12.0 1.0 11882.0 ... 0 \n",
+ "\n",
+ " pymnt_plan_n initial_list_status_f initial_list_status_w \\\n",
+ "0 1 0 1 \n",
+ "1 1 0 1 \n",
+ "2 1 0 1 \n",
+ "3 1 0 1 \n",
+ "4 1 0 1 \n",
+ "... ... ... ... \n",
+ "68812 1 0 1 \n",
+ "68813 1 0 1 \n",
+ "68814 1 0 1 \n",
+ "68815 1 1 0 \n",
+ "68816 1 0 1 \n",
+ "\n",
+ " next_pymnt_d_Apr-2019 next_pymnt_d_May-2019 \\\n",
+ "0 0 1 \n",
+ "1 0 1 \n",
+ "2 0 1 \n",
+ "3 0 1 \n",
+ "4 0 1 \n",
+ "... ... ... \n",
+ "68812 0 1 \n",
+ "68813 0 1 \n",
+ "68814 0 1 \n",
+ "68815 0 1 \n",
+ "68816 0 1 \n",
+ "\n",
+ " application_type_Individual application_type_Joint App \\\n",
+ "0 1 0 \n",
+ "1 1 0 \n",
+ "2 1 0 \n",
+ "3 1 0 \n",
+ "4 1 0 \n",
+ "... ... ... \n",
+ "68812 1 0 \n",
+ "68813 1 0 \n",
+ "68814 1 0 \n",
+ "68815 1 0 \n",
+ "68816 1 0 \n",
+ "\n",
+ " hardship_flag_N debt_settlement_flag_N \n",
+ "0 1 1 \n",
+ "1 1 1 \n",
+ "2 1 1 \n",
+ "3 1 1 \n",
+ "4 1 1 \n",
+ "... ... ... \n",
+ "68812 1 1 \n",
+ "68813 1 1 \n",
+ "68814 1 1 \n",
+ "68815 1 1 \n",
+ "68816 1 1 \n",
+ "\n",
+ "[68817 rows x 95 columns]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Convert categorical variable into dummy/indicator variables\n",
+ "df_temp = pd.get_dummies(df_temp, columns= obj_columns)\n",
+ "df_temp"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create our features\n",
+ "X = df_temp\n",
+ "# Create our target\n",
+ "y = df[target]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " loan_amnt | \n",
+ " int_rate | \n",
+ " installment | \n",
+ " annual_inc | \n",
+ " dti | \n",
+ " delinq_2yrs | \n",
+ " inq_last_6mths | \n",
+ " open_acc | \n",
+ " pub_rec | \n",
+ " revol_bal | \n",
+ " ... | \n",
+ " issue_d_Mar-2019 | \n",
+ " pymnt_plan_n | \n",
+ " initial_list_status_f | \n",
+ " initial_list_status_w | \n",
+ " next_pymnt_d_Apr-2019 | \n",
+ " next_pymnt_d_May-2019 | \n",
+ " application_type_Individual | \n",
+ " application_type_Joint App | \n",
+ " hardship_flag_N | \n",
+ " debt_settlement_flag_N | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 6.881700e+04 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " ... | \n",
+ " 68817.000000 | \n",
+ " 68817.0 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.0 | \n",
+ " 68817.0 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 16677.594562 | \n",
+ " 0.127718 | \n",
+ " 480.652863 | \n",
+ " 8.821371e+04 | \n",
+ " 21.778153 | \n",
+ " 0.217766 | \n",
+ " 0.497697 | \n",
+ " 12.587340 | \n",
+ " 0.126030 | \n",
+ " 17604.142828 | \n",
+ " ... | \n",
+ " 0.177238 | \n",
+ " 1.0 | \n",
+ " 0.123879 | \n",
+ " 0.876121 | \n",
+ " 0.383161 | \n",
+ " 0.616839 | \n",
+ " 0.860340 | \n",
+ " 0.139660 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 10277.348590 | \n",
+ " 0.048130 | \n",
+ " 288.062432 | \n",
+ " 1.155800e+05 | \n",
+ " 20.199244 | \n",
+ " 0.718367 | \n",
+ " 0.758122 | \n",
+ " 6.022869 | \n",
+ " 0.336797 | \n",
+ " 21835.880400 | \n",
+ " ... | \n",
+ " 0.381873 | \n",
+ " 0.0 | \n",
+ " 0.329446 | \n",
+ " 0.329446 | \n",
+ " 0.486161 | \n",
+ " 0.486161 | \n",
+ " 0.346637 | \n",
+ " 0.346637 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 1000.000000 | \n",
+ " 0.060000 | \n",
+ " 30.890000 | \n",
+ " 4.000000e+01 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 2.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 9000.000000 | \n",
+ " 0.088100 | \n",
+ " 265.730000 | \n",
+ " 5.000000e+04 | \n",
+ " 13.890000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 8.000000 | \n",
+ " 0.000000 | \n",
+ " 6293.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 15000.000000 | \n",
+ " 0.118000 | \n",
+ " 404.560000 | \n",
+ " 7.300000e+04 | \n",
+ " 19.760000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 11.000000 | \n",
+ " 0.000000 | \n",
+ " 12068.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 24000.000000 | \n",
+ " 0.155700 | \n",
+ " 648.100000 | \n",
+ " 1.040000e+05 | \n",
+ " 26.660000 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 16.000000 | \n",
+ " 0.000000 | \n",
+ " 21735.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 40000.000000 | \n",
+ " 0.308400 | \n",
+ " 1676.230000 | \n",
+ " 8.797500e+06 | \n",
+ " 999.000000 | \n",
+ " 18.000000 | \n",
+ " 5.000000 | \n",
+ " 72.000000 | \n",
+ " 4.000000 | \n",
+ " 587191.000000 | \n",
+ " ... | \n",
+ " 1.000000 | \n",
+ " 1.0 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
8 rows × 95 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " loan_amnt int_rate installment annual_inc dti \\\n",
+ "count 68817.000000 68817.000000 68817.000000 6.881700e+04 68817.000000 \n",
+ "mean 16677.594562 0.127718 480.652863 8.821371e+04 21.778153 \n",
+ "std 10277.348590 0.048130 288.062432 1.155800e+05 20.199244 \n",
+ "min 1000.000000 0.060000 30.890000 4.000000e+01 0.000000 \n",
+ "25% 9000.000000 0.088100 265.730000 5.000000e+04 13.890000 \n",
+ "50% 15000.000000 0.118000 404.560000 7.300000e+04 19.760000 \n",
+ "75% 24000.000000 0.155700 648.100000 1.040000e+05 26.660000 \n",
+ "max 40000.000000 0.308400 1676.230000 8.797500e+06 999.000000 \n",
+ "\n",
+ " delinq_2yrs inq_last_6mths open_acc pub_rec \\\n",
+ "count 68817.000000 68817.000000 68817.000000 68817.000000 \n",
+ "mean 0.217766 0.497697 12.587340 0.126030 \n",
+ "std 0.718367 0.758122 6.022869 0.336797 \n",
+ "min 0.000000 0.000000 2.000000 0.000000 \n",
+ "25% 0.000000 0.000000 8.000000 0.000000 \n",
+ "50% 0.000000 0.000000 11.000000 0.000000 \n",
+ "75% 0.000000 1.000000 16.000000 0.000000 \n",
+ "max 18.000000 5.000000 72.000000 4.000000 \n",
+ "\n",
+ " revol_bal ... issue_d_Mar-2019 pymnt_plan_n \\\n",
+ "count 68817.000000 ... 68817.000000 68817.0 \n",
+ "mean 17604.142828 ... 0.177238 1.0 \n",
+ "std 21835.880400 ... 0.381873 0.0 \n",
+ "min 0.000000 ... 0.000000 1.0 \n",
+ "25% 6293.000000 ... 0.000000 1.0 \n",
+ "50% 12068.000000 ... 0.000000 1.0 \n",
+ "75% 21735.000000 ... 0.000000 1.0 \n",
+ "max 587191.000000 ... 1.000000 1.0 \n",
+ "\n",
+ " initial_list_status_f initial_list_status_w next_pymnt_d_Apr-2019 \\\n",
+ "count 68817.000000 68817.000000 68817.000000 \n",
+ "mean 0.123879 0.876121 0.383161 \n",
+ "std 0.329446 0.329446 0.486161 \n",
+ "min 0.000000 0.000000 0.000000 \n",
+ "25% 0.000000 1.000000 0.000000 \n",
+ "50% 0.000000 1.000000 0.000000 \n",
+ "75% 0.000000 1.000000 1.000000 \n",
+ "max 1.000000 1.000000 1.000000 \n",
+ "\n",
+ " next_pymnt_d_May-2019 application_type_Individual \\\n",
+ "count 68817.000000 68817.000000 \n",
+ "mean 0.616839 0.860340 \n",
+ "std 0.486161 0.346637 \n",
+ "min 0.000000 0.000000 \n",
+ "25% 0.000000 1.000000 \n",
+ "50% 1.000000 1.000000 \n",
+ "75% 1.000000 1.000000 \n",
+ "max 1.000000 1.000000 \n",
+ "\n",
+ " application_type_Joint App hardship_flag_N debt_settlement_flag_N \n",
+ "count 68817.000000 68817.0 68817.0 \n",
+ "mean 0.139660 1.0 1.0 \n",
+ "std 0.346637 0.0 0.0 \n",
+ "min 0.000000 1.0 1.0 \n",
+ "25% 0.000000 1.0 1.0 \n",
+ "50% 0.000000 1.0 1.0 \n",
+ "75% 0.000000 1.0 1.0 \n",
+ "max 1.000000 1.0 1.0 \n",
+ "\n",
+ "[8 rows x 95 columns]"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "low_risk 68470\n",
+ "high_risk 347\n",
+ "Name: loan_status, dtype: int64"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Check the balance of our target values\n",
+ "y['loan_status'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Counter({'low_risk': 68470, 'high_risk': 347})"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y = y['loan_status'].ravel()\n",
+ "Counter(y)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Counter({'low_risk': 51366, 'high_risk': 246})"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Create X_train, X_test, y_train, y_test\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)\n",
+ "Counter(y_train)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Oversampling\n",
+ "\n",
+ "In this section, you will compare two oversampling algorithms to determine which algorithm results in the best performance. You will oversample the data using the naive random oversampling algorithm and the SMOTE algorithm. For each algorithm, be sure to complete the folliowing steps:\n",
+ "\n",
+ "1. View the count of the target classes using `Counter` from the collections library. \n",
+ "3. Use the resampled data to train a logistic regression model.\n",
+ "3. Calculate the balanced accuracy score from sklearn.metrics.\n",
+ "4. Print the confusion matrix from sklearn.metrics.\n",
+ "5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.\n",
+ "\n",
+ "Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Naive Random Oversampling"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Counter({'low_risk': 51366, 'high_risk': 51366})"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Resample the training data with the RandomOversampler\n",
+ "from imblearn.over_sampling import RandomOverSampler\n",
+ "ros = RandomOverSampler(random_state=1)\n",
+ "X_resampled, y_resampled = ros.fit_resample(X_train, y_train)\n",
+ "Counter(y_resampled)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
+ " intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
+ " multi_class='auto', n_jobs=None, penalty='l2',\n",
+ " random_state=1, solver='lbfgs', tol=0.0001, verbose=0,\n",
+ " warm_start=False)"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Train the Logistic Regression model using the resampled data\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "\n",
+ "model = LogisticRegression(solver='lbfgs', random_state=1)\n",
+ "model.fit(X_resampled, y_resampled)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.6870331414572701"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Run the test with model\n",
+ "y_pred = model.predict(X_test)\n",
+ "# Calculated the balanced accuracy score\n",
+ "from sklearn.metrics import balanced_accuracy_score\n",
+ "balanced_accuracy_score(y_test, y_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Predicted high risk | \n",
+ " Predicted low risk | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Actual high risk | \n",
+ " 78 | \n",
+ " 23 | \n",
+ "
\n",
+ " \n",
+ " Actual low risk | \n",
+ " 6811 | \n",
+ " 10293 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Predicted high risk Predicted low risk\n",
+ "Actual high risk 78 23\n",
+ "Actual low risk 6811 10293"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Display the confusion matrix\n",
+ "from sklearn.metrics import confusion_matrix\n",
+ "\n",
+ "# Calculating the confusion matrix.\n",
+ "cm = confusion_matrix(y_test, y_pred)\n",
+ "\n",
+ "# Create a DataFrame from the confusion matrix.\n",
+ "cm_df = pd.DataFrame(\n",
+ " cm,index=[\"Actual high risk\", \"Actual low risk\"], columns=[\"Predicted high risk\", \"Predicted low risk\"])\n",
+ "\n",
+ "cm_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " pre rec spe f1 geo iba sup\n",
+ "\n",
+ " high_risk 0.01 0.77 0.60 0.02 0.68 0.47 101\n",
+ " low_risk 1.00 0.60 0.77 0.75 0.68 0.46 17104\n",
+ "\n",
+ "avg / total 0.99 0.60 0.77 0.75 0.68 0.46 17205\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print the imbalanced classification report\n",
+ "from imblearn.metrics import classification_report_imbalanced\n",
+ "print(classification_report_imbalanced(y_test, y_pred))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analysis\n",
+ "\n",
+ "**Accuracy**\n",
+ "\n",
+ "The balanced accuracy score of Naive Random Oversampling is .69 or 69% after resampled the data training with the Random Over Sampler while Accuracy score is .60 or 60% based on the confustion matrix. Accuracy score formula is (TP+TN)/Total, which TP (True Positive) is 78 and TN(True Nagative) is 10293, and total is 17205, so (78+10293)/17205 = .602789. This is a important factor that we need to evaluate the Machine Learning (ML) method becuase of the accuracy prediction of the corrected cases in the dataset. **Naive Random Oversampling is actually improved the accuracy prediction**\n",
+ "\n",
+ "**Recall (sensitivity):** (Refer: the confustion matrix and the classification report)\n",
+ "\n",
+ "- High risk rate is 77% so every 100 high risk cases the model detects right 77 high risk cases and categorizes 33 high risk cases to become low risk cases. This is a number that we want to see a larger percent becuase we do not want high risk cases to become low risk cases without any caution. \n",
+ "- Low risk rate is 60% so every 100 low risk cases the model categorizes 40 low risk cases to become high risk cases. This is just alarmed cases that need to be reviewed by business line\n",
+ "\n",
+ "**Precision:**\n",
+ "\n",
+ "- High risk rate is closed 0.01 (~1%). It tells that the number of low risk cases predicted as high risk cases is much larger than actual high risk cases \n",
+ "- Low risk rate is closed 1.00 (~100%). It tells that the actual low risk cases are overrule the precision rate. This is very true in the imbalanced classificantion cases.\n",
+ "\n",
+ "**For this dateset, Naive Random Oversampling accuracy and recall rate is the best among below models: Naive Random Oversampling, SMOTE,Undersampling,Combination Sampling. However, I recommend to use the Easy Ensemble AdaBoost Classifier model for the dataset because the accuracy and recall rate are very impressive(over 90%) (Refer: ReadMe.md)**\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### SMOTE Oversampling"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Counter({'low_risk': 51366, 'high_risk': 51366})"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Resample the training data with SMOTE\n",
+ "from imblearn.over_sampling import SMOTE\n",
+ "smote = SMOTE(random_state=1)\n",
+ "X_resampled, y_resampled = smote.fit_resample(X_train, y_train)\n",
+ "Counter(y_resampled)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
+ " intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
+ " multi_class='auto', n_jobs=None, penalty='l2',\n",
+ " random_state=1, solver='lbfgs', tol=0.0001, verbose=0,\n",
+ " warm_start=False)"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Train the Logistic Regression model using the resampled data\n",
+ "model = LogisticRegression(solver='lbfgs', random_state=1)\n",
+ "model.fit(X_resampled, y_resampled)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.6642357991645751"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Calculated the balanced accuracy score\n",
+ "y_pred = model.predict(X_test)\n",
+ "balanced_accuracy_score(y_test, y_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Predicted high risk | \n",
+ " Predicted low risk | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Actual high risk | \n",
+ " 64 | \n",
+ " 37 | \n",
+ "
\n",
+ " \n",
+ " Actual low risk | \n",
+ " 5220 | \n",
+ " 11884 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Predicted high risk Predicted low risk\n",
+ "Actual high risk 64 37\n",
+ "Actual low risk 5220 11884"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Display the confusion matrix\n",
+ "\n",
+ "# Calculating the confusion matrix.\n",
+ "cm = confusion_matrix(y_test, y_pred)\n",
+ "\n",
+ "# Create a DataFrame from the confusion matrix.\n",
+ "cm_df = pd.DataFrame(\n",
+ " cm,index=[\"Actual high risk\", \"Actual low risk\"], columns=[\"Predicted high risk\", \"Predicted low risk\"])\n",
+ "\n",
+ "cm_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " pre rec spe f1 geo iba sup\n",
+ "\n",
+ " high_risk 0.01 0.63 0.69 0.02 0.66 0.44 101\n",
+ " low_risk 1.00 0.69 0.63 0.82 0.66 0.44 17104\n",
+ "\n",
+ "avg / total 0.99 0.69 0.63 0.81 0.66 0.44 17205\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print the imbalanced classification report\n",
+ "print(classification_report_imbalanced(y_test, y_pred))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analysis\n",
+ "\n",
+ "**Accuracy**\n",
+ "\n",
+ "The balanced accuracy score of SMOTE Oversampling is .66 or 66% after resampled the data training with the SMOTE while Accuracy score is .69 or 69% based on the confustion matrix. Accuracy score formula is (TP+TN)/Total, which TP (True Positive) is 64 and TN(True Nagative) is 11884, and total is 17205, so (64+11884)/17205 = .69444. With SMOTE model, the balanced accuracy rate is lower than the actual accuracy rate. It means the good loan (low risk) cases predicted better. **SMOTE model accuracy is very close and similar to Naive Random Oversampling, but the Naive Random Oversampling is better selection**\n",
+ "\n",
+ "**Recall (sensitivity):** (Refer: the confustion matrix and the classification report)\n",
+ "\n",
+ "- High risk rate is 63% so every 100 high risk cases the model detects right 63 high risk cases and categorizes 37 high risk cases to become low risk cases. This is a number that we want to see a larger percent becuase we do not want high risk cases to become low risk cases without any caution. **SMOTE model recall (sensitivity) rate is lower than Naive Random Oversampling**\n",
+ "\n",
+ "- Low risk rate is 69% so every 100 low risk cases the model categorizes 41 low risk cases to become high risk cases. This is just alarmed cases that need to be reviewed by business line\n",
+ "\n",
+ "**Precision:**\n",
+ "\n",
+ "- High risk rate is closed 0.01 (~1%). It tells that the number of low risk cases predicted as high risk cases is much larger than actual high risk cases \n",
+ "- Low risk rate is closed 1.00 (~100%). It tells that the actual low risk cases are overrule the precision rate. This is very true in the imbalanced classificantion cases.\n",
+ "\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Undersampling\n",
+ "\n",
+ "In this section, you will test an undersampling algorithms to determine which algorithm results in the best performance compared to the oversampling algorithms above. You will undersample the data using the Cluster Centroids algorithm and complete the folliowing steps:\n",
+ "\n",
+ "1. View the count of the target classes using `Counter` from the collections library. \n",
+ "3. Use the resampled data to train a logistic regression model.\n",
+ "3. Calculate the balanced accuracy score from sklearn.metrics.\n",
+ "4. Print the confusion matrix from sklearn.metrics.\n",
+ "5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.\n",
+ "\n",
+ "Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Counter({'high_risk': 246, 'low_risk': 246})"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Resample the data using the ClusterCentroids resampler\n",
+ "from imblearn.under_sampling import ClusterCentroids\n",
+ "cc = ClusterCentroids(random_state=1)\n",
+ "X_resampled, y_resampled = cc.fit_resample(X_train, y_train)\n",
+ "Counter(y_resampled)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
+ " intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
+ " multi_class='auto', n_jobs=None, penalty='l2',\n",
+ " random_state=1, solver='lbfgs', tol=0.0001, verbose=0,\n",
+ " warm_start=False)"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Train the Logistic Regression model using the resampled data\n",
+ "model = LogisticRegression(solver='lbfgs', random_state=1)\n",
+ "model.fit(X_resampled, y_resampled)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.5330103432466726"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Calculated the balanced accuracy score\n",
+ "y_pred = model.predict(X_test)\n",
+ "balanced_accuracy_score(y_test, y_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Predicted high risk | \n",
+ " Predicted low risk | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Actual high risk | \n",
+ " 67 | \n",
+ " 34 | \n",
+ "
\n",
+ " \n",
+ " Actual low risk | \n",
+ " 10217 | \n",
+ " 6887 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Predicted high risk Predicted low risk\n",
+ "Actual high risk 67 34\n",
+ "Actual low risk 10217 6887"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Display the confusion matrix\n",
+ "\n",
+ "# Calculating the confusion matrix.\n",
+ "cm = confusion_matrix(y_test, y_pred)\n",
+ "\n",
+ "# Create a DataFrame from the confusion matrix.\n",
+ "cm_df = pd.DataFrame(\n",
+ " cm,index=[\"Actual high risk\", \"Actual low risk\"], columns=[\"Predicted high risk\", \"Predicted low risk\"])\n",
+ "\n",
+ "cm_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " pre rec spe f1 geo iba sup\n",
+ "\n",
+ " high_risk 0.01 0.66 0.40 0.01 0.52 0.27 101\n",
+ " low_risk 1.00 0.40 0.66 0.57 0.52 0.26 17104\n",
+ "\n",
+ "avg / total 0.99 0.40 0.66 0.57 0.52 0.26 17205\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print the imbalanced classification report\n",
+ "print(classification_report_imbalanced(y_test, y_pred))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analysis\n",
+ "\n",
+ "**Accuracy**\n",
+ "\n",
+ "The balanced accuracy score of Undersampling is .53 or 53% after resampled the data training with the ClusterCentroids while Accuracy score is .40 or 40% based on the confustion matrix. Accuracy score formula is (TP+TN)/Total, which TP (True Positive) is 67 and TN(True Nagative) is 6887, and total is 17205, so (67+6887)/17205 = .4041. With Undersampling model, the balanced accuracy rate is higher than the actual accuracy rate. **Undersampling model accuracy is very low. This method may not the righ model for this dataset**\n",
+ "\n",
+ "**Recall (sensitivity):**(Refer: The confustion matrix and the classification report) \n",
+ "\n",
+ "- High risk rate is 66% so every 100 high risk cases the model detects right 66 high risk cases and categorizes 34 high risk cases to become low risk cases. This is a number that we want to see a larger percent becuase we do not want high risk cases to become low risk cases without any caution. **Naive Random Oversampling still the best model selection because the accurary and recall rate**\n",
+ "\n",
+ "- Low risk rate is 40% so every 100 low risk cases the model categorizes 60 low risk cases to become high risk cases. This is just alarmed cases that need to be reviewed by business line, but it is **very high alarmed cases**\n",
+ "\n",
+ "**Precision:**\n",
+ "\n",
+ "- High risk rate is closed 0.01 (~1%). It tells that the number of low risk cases predicted as high risk cases is much larger than actual high risk cases \n",
+ "- Low risk rate is closed 1.00 (~100%). It tells that the actual low risk cases are overrule the precision rate. This is very true in the imbalanced classificantion cases.\n",
+ "\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Combination (Over and Under) Sampling\n",
+ "\n",
+ "In this section, you will test a combination over- and under-sampling algorithm to determine if the algorithm results in the best performance compared to the other sampling algorithms above. You will resample the data using the SMOTEENN algorithm and complete the folliowing steps:\n",
+ "\n",
+ "1. View the count of the target classes using `Counter` from the collections library. \n",
+ "3. Use the resampled data to train a logistic regression model.\n",
+ "3. Calculate the balanced accuracy score from sklearn.metrics.\n",
+ "4. Print the confusion matrix from sklearn.metrics.\n",
+ "5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.\n",
+ "\n",
+ "Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Counter({'high_risk': 51359, 'low_risk': 46660})"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Resample the training data with SMOTEENN\n",
+ "from imblearn.combine import SMOTEENN\n",
+ "\n",
+ "smote_enn = SMOTEENN(random_state=0)\n",
+ "X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)\n",
+ "Counter(y_resampled)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
+ " intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
+ " multi_class='auto', n_jobs=None, penalty='l2',\n",
+ " random_state=1, solver='lbfgs', tol=0.0001, verbose=0,\n",
+ " warm_start=False)"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Train the Logistic Regression model using the resampled data\n",
+ "model = LogisticRegression(solver='lbfgs', random_state=1)\n",
+ "model.fit(X_resampled, y_resampled)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.6347701655104706"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Calculated the balanced accuracy score\n",
+ "y_pred = model.predict(X_test)\n",
+ "\n",
+ "balanced_accuracy_score(y_test, y_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Predicted high risk | \n",
+ " Predicted low risk | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Actual high risk | \n",
+ " 67 | \n",
+ " 34 | \n",
+ "
\n",
+ " \n",
+ " Actual low risk | \n",
+ " 6736 | \n",
+ " 10368 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Predicted high risk Predicted low risk\n",
+ "Actual high risk 67 34\n",
+ "Actual low risk 6736 10368"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Display the confusion matrix\n",
+ "\n",
+ "# Calculating the confusion matrix.\n",
+ "cm = confusion_matrix(y_test, y_pred)\n",
+ "\n",
+ "# Create a DataFrame from the confusion matrix.\n",
+ "cm_df = pd.DataFrame(\n",
+ " cm,index=[\"Actual high risk\", \"Actual low risk\"], columns=[\"Predicted high risk\", \"Predicted low risk\"])\n",
+ "\n",
+ "cm_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " pre rec spe f1 geo iba sup\n",
+ "\n",
+ " high_risk 0.01 0.66 0.61 0.02 0.63 0.40 101\n",
+ " low_risk 1.00 0.61 0.66 0.75 0.63 0.40 17104\n",
+ "\n",
+ "avg / total 0.99 0.61 0.66 0.75 0.63 0.40 17205\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print the imbalanced classification report\n",
+ "print(classification_report_imbalanced(y_test, y_pred))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analysis\n",
+ "\n",
+ "**Accuracy**\n",
+ "\n",
+ "The balanced accuracy score of Combination (Over and Under) Sampling is .63 or 63% after resampled the data training with the SMOTEENN while Accuracy score is .60 or 60% based on the confustion matrix. Accuracy score formula is (TP+TN)/Total, which TP (True Positive) is 67 and TN(True Nagative) is 10368, and total is 17205, so (67+10368)/17205 = .6065. With SMOTEENN model, the balanced accuracy rate is higher than the actual accuracy rate. **SMOTEENN model accuracy is lower than Naive Random Oversampling, but it is much better than Undersampling model(ClusterCentroids)**\n",
+ "\n",
+ "**Recall (sensitivity):**(Refer: The confustion matrix and the classification report) \n",
+ "\n",
+ "- High risk rate is 66% so every 100 high risk cases the model detects right 66 high risk cases and categorizes 34 high risk cases to become low risk cases. This is a number that we want to see a larger percent becuase we do not want high risk cases to become low risk cases without any caution. **Naive Random Oversampling is still the highest rate so far**\n",
+ "\n",
+ "- Low risk rate is 61% so every 100 low risk cases the model categorizes 39 low risk cases to become high risk cases. This is just alarmed cases that need to be reviewed by business line.\n",
+ "\n",
+ "**Precision:**\n",
+ "\n",
+ "- High risk rate is closed 0.01 (~1%). It tells that the number of low risk cases predicted as high risk cases is much larger than actual high risk cases \n",
+ "- Low risk rate is closed 1.00 (~100%). It tells that the actual low risk cases are overrule the precision rate. This is very true in the imbalanced classificantion cases.\n",
+ "\n",
+ "***"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "mlenv",
+ "language": "python",
+ "name": "mlenv"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/README.md b/README.md
index 8aac655..cc2a96d 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,268 @@
-# LM-LendingClub
\ No newline at end of file
+# Lending Club - Credit Risk (Machine Learning)
+
+## Summary
+
+Credit risk is an inherently unbalanced classification problem, as the number of good loans easily outnumbers the number of risky loans. The purpose of this project is:
+
+1. To use different techniques to train and evaluate models with unbalanced classes.
+2. To evaluate the performance of these models and make a recommendation on whether they should be used to predict credit risk.
+
+## Objectives
+The goals:
+
+1. Implement machine learning models.
+2. Use resampling to attempt to address class imbalance.
+3. Evaluate the performance of machine learning models.
+
+Accuracy and recall (sensitivity) are the two most important factors for selecting the right model on this imbalanced dataset, where the good loans overwhelm the bad loans.
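+
+Both rates fall straight out of the confusion matrix. As a quick reference, here is a minimal sketch of how they are computed (assuming `y_test` and `y_pred` from any of the models below):
+
+```python
+from sklearn.metrics import balanced_accuracy_score, confusion_matrix
+
+# Rows are actual classes, columns are predicted classes
+cm = confusion_matrix(y_test, y_pred, labels=["high_risk", "low_risk"])
+(tp, fn), (fp, tn) = cm  # high_risk is treated as the positive class
+
+accuracy = (tp + tn) / (tp + tn + fp + fn)          # plain accuracy
+recall_high = tp / (tp + fn)                        # high risk recall (sensitivity)
+recall_low = tn / (tn + fp)                         # low risk recall
+balanced = balanced_accuracy_score(y_test, y_pred)  # the mean of the two recalls
+```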
+
+
+## Naive Random Oversampling
+
+**Balanced Accuracy Score**
+
+0.69
+
+**Confusion Matrix**
+
+| |Predicted high risk | Predicted low risk |
+|-|-|-|
+|Actual high risk |78 | 23 |
+|Actual low risk |6811| 10293|
+
+**Report Table**
+
+| |pre|rec|spe|f1|geo|iba|sup|
+|-|-|-|-|-|-|-|-|
+|high_risk | 0.01 | 0.77 | 0.60 | 0.02 | 0.68 | 0.47 | 101|
+|low_risk | 1.00 | 0.60 | 0.77 | 0.75 | 0.68 | 0.46 | 17104|
+|avg / total| 0.99 | 0.60 | 0.77 | 0.75 | 0.68 | 0.46 | 17205|
+
+### Analysis
+
+**Accuracy**
+
+The balanced accuracy score for Naive Random Oversampling is .69 (69%) after resampling the training data with RandomOverSampler, while the plain accuracy score from the confusion matrix is .60 (60%). The accuracy formula is (TP+TN)/Total, where TP (True Positive) is 78, TN (True Negative) is 10293, and the total is 17205, so (78+10293)/17205 = .6028. Accuracy is an important factor when evaluating a machine learning method because it measures the share of cases the model predicts correctly. **Naive Random Oversampling actually improved the accuracy of the predictions.**
+
+**Recall (sensitivity):** (refer to the confusion matrix and the classification report)
+
+- The high risk recall is 77%: out of every 100 high risk cases, the model correctly detects 77 and misclassifies the other 23 as low risk. We want this percentage to be as large as possible, because we do not want high risk cases slipping through as low risk without any caution.
+- The low risk recall is 60%: out of every 100 low risk cases, the model flags 40 as high risk. These are simply alerts that the business line needs to review.
+
+**Precision:**
+
+- The high risk precision is close to 0.01 (~1%). It tells us that the number of low risk cases predicted as high risk is much larger than the number of actual high risk cases.
+- The low risk precision is close to 1.00 (~100%). It tells us that the actual low risk cases dominate the precision rate, which is typical of imbalanced classification problems.
+
+**For this dataset, Naive Random Oversampling has the best accuracy and recall among these models: Naive Random Oversampling, SMOTE, Undersampling, and Combination Sampling. However, I recommend the Easy Ensemble AdaBoost Classifier for this dataset because its accuracy and recall are very impressive, over 90% (see the Extension section below).**
+***
+
+
+## SMOTE Oversampling
+
+**Balanced Accuracy Score**
+
+0.66
+
+**Confusion Matrix**
+
+| |Predicted high risk | Predicted low risk |
+|-|-|-|
+|Actual high risk |64 | 37 |
+|Actual low risk |5220| 11884|
+
+**Report Table**
+
+| |pre|rec|spe|f1|geo|iba|sup|
+|-|-|-|-|-|-|-|-|
+|high_risk | 0.01| 0.63| 0.69| 0.02| 0.66| 0.44| 101|
+|low_risk | 1.00| 0.69| 0.63| 0.82| 0.66| 0.44| 17104|
+|avg / total| 0.99| 0.69| 0.63| 0.81| 0.66| 0.44| 17205|
+
+### Analysis
+
+**Accuracy**
+
+The balanced accuracy score for SMOTE Oversampling is .66 (66%) after resampling the training data with SMOTE, while the plain accuracy score from the confusion matrix is .69 (69%). The accuracy formula is (TP+TN)/Total, where TP (True Positive) is 64, TN (True Negative) is 11884, and the total is 17205, so (64+11884)/17205 = .6944. With SMOTE, the balanced accuracy rate is lower than the plain accuracy rate, which means the good-loan (low risk) cases are predicted better. **The SMOTE accuracy is very close to Naive Random Oversampling, but Naive Random Oversampling is the better selection.**
+
+**Recall (sensitivity):** (refer to the confusion matrix and the classification report)
+
+- The high risk recall is 63%: out of every 100 high risk cases, the model correctly detects 63 and misclassifies the other 37 as low risk. We want this percentage to be as large as possible, because we do not want high risk cases slipping through as low risk without any caution. **The SMOTE recall (sensitivity) rate is lower than Naive Random Oversampling.**
+
+- The low risk recall is 69%: out of every 100 low risk cases, the model flags 31 as high risk. These are simply alerts that the business line needs to review.
+
+**Precision:**
+
+- The high risk precision is close to 0.01 (~1%). It tells us that the number of low risk cases predicted as high risk is much larger than the number of actual high risk cases.
+- The low risk precision is close to 1.00 (~100%). It tells us that the actual low risk cases dominate the precision rate, which is typical of imbalanced classification problems.
+
+***
+
+
+## Cluster Centroids
+
+**Balanced Accuracy Score**
+
+0.53
+
+**Confusion Matrix**
+
+| |Predicted high risk | Predicted low risk |
+|-|-|-|
+|Actual high risk |67 | 34 |
+|Actual low risk |10217| 6887|
+
+**Report Table**
+
+| |pre|rec|spe|f1|geo|iba|sup|
+|-|-|-|-|-|-|-|-|
+|high_risk | 0.01| 0.66| 0.40| 0.01| 0.52| 0.27| 101|
+|low_risk | 1.00| 0.40| 0.66| 0.57| 0.52| 0.26| 17104|
+|avg / total| 0.99| 0.40| 0.66| 0.57| 0.52| 0.26| 17205|
+
+### Analysis
+
+**Accuracy**
+
+The balanced accuracy score for Undersampling is .53 (53%) after resampling the training data with ClusterCentroids, while the plain accuracy score from the confusion matrix is .40 (40%). The accuracy formula is (TP+TN)/Total, where TP (True Positive) is 67, TN (True Negative) is 6887, and the total is 17205, so (67+6887)/17205 = .4042. With undersampling, the balanced accuracy rate is higher than the plain accuracy rate. **The undersampling accuracy is very low; this method may not be the right model for this dataset.**
+
+**Recall (sensitivity):** (refer to the confusion matrix and the classification report)
+
+- The high risk recall is 66%: out of every 100 high risk cases, the model correctly detects 66 and misclassifies the other 34 as low risk. We want this percentage to be as large as possible, because we do not want high risk cases slipping through as low risk without any caution. **Naive Random Oversampling is still the best model selection because of its accuracy and recall rates.**
+
+- The low risk recall is 40%: out of every 100 low risk cases, the model flags 60 as high risk. These are simply alerts that the business line needs to review, but this is **a very high number of alerts**.
+
+**Precision:**
+
+- The high risk precision is close to 0.01 (~1%). It tells us that the number of low risk cases predicted as high risk is much larger than the number of actual high risk cases.
+- The low risk precision is close to 1.00 (~100%). It tells us that the actual low risk cases dominate the precision rate, which is typical of imbalanced classification problems.
+
+***
+
+## Combination (Over and Under) Sampling
+
+**Balanced Accuracy Score**
+
+0.63
+
+**Confusion Matrix**
+
+| |Predicted high risk | Predicted low risk |
+|-|-|-|
+|Actual high risk |67 | 34 |
+|Actual low risk |6736| 10368|
+
+**Report Table**
+
+| |pre|rec|spe|f1|geo|iba|sup|
+|-|-|-|-|-|-|-|-|
+|high_risk | 0.01| 0.66| 0.61| 0.02| 0.63| 0.40| 101|
+|low_risk | 1.00| 0.61| 0.66| 0.75| 0.63| 0.40| 17104|
+|avg / total| 0.99| 0.61| 0.66| 0.75| 0.63| 0.40| 17205|
+
+### Analysis
+
+**Accuracy**
+
+The balanced accuracy score for Combination (Over and Under) Sampling is .63 (63%) after resampling the training data with SMOTEENN, while the plain accuracy score from the confusion matrix is .60 (60%). The accuracy formula is (TP+TN)/Total, where TP (True Positive) is 67, TN (True Negative) is 10368, and the total is 17205, so (67+10368)/17205 = .6065. With SMOTEENN, the balanced accuracy rate is higher than the plain accuracy rate. **The SMOTEENN accuracy is lower than Naive Random Oversampling, but it is much better than the undersampling model (ClusterCentroids).**
+
+**Recall (sensitivity):** (refer to the confusion matrix and the classification report)
+
+- The high risk recall is 66%: out of every 100 high risk cases, the model correctly detects 66 and misclassifies the other 34 as low risk. We want this percentage to be as large as possible, because we do not want high risk cases slipping through as low risk without any caution. **Naive Random Oversampling still has the highest rate so far.**
+
+- The low risk recall is 61%: out of every 100 low risk cases, the model flags 39 as high risk. These are simply alerts that the business line needs to review.
+
+**Precision:**
+
+- The high risk precision is close to 0.01 (~1%). It tells us that the number of low risk cases predicted as high risk is much larger than the number of actual high risk cases.
+- The low risk precision is close to 1.00 (~100%). It tells us that the actual low risk cases dominate the precision rate, which is typical of imbalanced classification problems.
+
+***
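+
+Every sampler above is paired with the same logistic regression, so the whole comparison can be expressed compactly. A sketch (assuming `X_train`, `y_train`, `X_test`, `y_test` from the notebook; note the notebook seeds SMOTEENN with `random_state=0` and the others with 1):
+
+```python
+from collections import Counter
+
+from imblearn.combine import SMOTEENN
+from imblearn.over_sampling import RandomOverSampler, SMOTE
+from imblearn.under_sampling import ClusterCentroids
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import balanced_accuracy_score
+
+samplers = {
+    "Naive Random Oversampling": RandomOverSampler(random_state=1),
+    "SMOTE": SMOTE(random_state=1),
+    "Cluster Centroids": ClusterCentroids(random_state=1),
+    "SMOTEENN": SMOTEENN(random_state=0),
+}
+
+for name, sampler in samplers.items():
+    # Resample only the training data, never the test data
+    X_res, y_res = sampler.fit_resample(X_train, y_train)
+    model = LogisticRegression(solver="lbfgs", random_state=1)
+    model.fit(X_res, y_res)
+    score = balanced_accuracy_score(y_test, model.predict(X_test))
+    print(f"{name}: {Counter(y_res)} -> balanced accuracy {score:.2f}")
+```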
+
+# Extension (Machine Learning)
+
+
+## BalancedRandomForestClassifier
+
+**Accuracy Score**
+
+0.90
+
+**Confusion Matrix**
+
+| |Predicted high risk | Predicted low risk |
+|-|-|-|
+|Actual high risk |68 | 33 |
+|Actual low risk |1749| 15355|
+
+**Report Table**
+
+| |precision| recall| f1-score| support|
+|-|-|-|-|-|
+| high_risk| 0.04| 0.67| 0.07| 101|
+| low_risk| 1.00| 0.90| 0.95| 17104|
+| accuracy| | | 0.90| 17205|
+| macro avg| 0.52| 0.79| 0.51| 17205|
+|weighted avg| 0.99| 0.90| 0.94| 17205|
+
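+A minimal sketch of how these figures could be reproduced, assuming the same `X_train`/`y_train` split as above (`n_estimators=100` and `random_state=1` are assumptions, not the exact original setup):
+
+```python
+from imblearn.ensemble import BalancedRandomForestClassifier
+
+# Each tree is trained on a bootstrap sample that is randomly
+# undersampled to balance the two classes
+brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
+brf.fit(X_train, y_train)
+y_pred = brf.predict(X_test)
+```
+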
+### Analysis
+
+**Accuracy**
+
+The accuracy score of the Balanced Random Forest Classifier is .90 (90%). The accuracy formula is (TP+TN)/Total, where TP (True Positive) is 68, TN (True Negative) is 15355, and the total is 17205, so (68+15355)/17205 = .8964. **The accuracy score is a very good number, so this could be a good model for this dataset. Let's take a look at recall (sensitivity), because that rate is what matters for detecting the high risk cases in this dataset.**
+
+**Recall (sensitivity):** (refer to the confusion matrix and the classification report)
+
+- The high risk recall is 67%: out of every 100 high risk cases, the model correctly detects 67 and misclassifies the other 33 as low risk. We want this percentage to be as large as possible, because we do not want high risk cases slipping through as low risk without any caution. **The recall rate is only average; it is not as impressive as the accuracy, and it tells us that the low risk cases are detected better than the high risk cases.**
+
+- The low risk recall is 90%: out of every 100 low risk cases, the model flags 10 as high risk. These are simply alerts that the business line needs to review.
+
+**Precision:**
+
+- The high risk precision is close to 0.04 (~4%). It tells us that the number of low risk cases predicted as high risk is much larger than the number of actual high risk cases.
+- The low risk precision is close to 1.00 (~100%). It tells us that the actual low risk cases dominate the precision rate, which is typical of imbalanced classification problems.
+
+***
+
+## EasyEnsembleClassifier
+
+**Accuracy Score**
+
+0.90
+
+**Confusion Matrix**
+
+| |Predicted high risk | Predicted low risk |
+|-|-|-|
+|Actual high risk |94 | 7 |
+|Actual low risk |1706| 15398|
+
+**Report Table**
+
+| |precision| recall| f1-score| support|
+|-|-|-|-|-|
+| high_risk| 0.05| 0.93| 0.10| 101|
+| low_risk| 1.00| 0.90| 0.95| 17104|
+| accuracy| | | 0.90| 17205|
+| macro avg| 0.53| 0.92| 0.51| 17205|
+|weighted avg| 0.99| 0.90| 0.94| 17205|
+
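+The same scaffolding as the previous sketch applies here; only the estimator changes (again a sketch, with `n_estimators=100` and `random_state=1` assumed):
+
+```python
+from imblearn.ensemble import EasyEnsembleClassifier
+
+# An ensemble of AdaBoost learners, each trained on a
+# randomly undersampled (balanced) subset of the data
+eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
+eec.fit(X_train, y_train)
+y_pred = eec.predict(X_test)
+```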
+
+### Analysis (The best model)
+
+**Accuracy**
+
+The accuracy score of the Easy Ensemble AdaBoost Classifier is .90 (90%). The accuracy formula is (TP+TN)/Total, where TP (True Positive) is 94, TN (True Negative) is 15398, and the total is 17205, so (94+15398)/17205 = .9004. **The accuracy score is a very impressive number, so this can be a good model for this dataset. Let's take a look at recall (sensitivity), because that rate is what matters for detecting the high risk cases in this dataset.**
+
+**Recall (sensitivity):** (refer to the confusion matrix and the classification report)
+
+- The high risk recall is 93%: out of every 100 high risk cases, the model correctly detects 93 and misclassifies the other 7 as low risk. We want this percentage to be as large as possible, because we do not want high risk cases slipping through as low risk without any caution. **The recall rate is also very impressive. Both the accuracy and the recall are over 90%, which makes this a good model for detecting the high risk cases in this dataset.**
+
+- The low risk recall is 90%: out of every 100 low risk cases, the model flags 10 as high risk. These are simply alerts that the business line needs to review.
+
+**Precision:**
+
+- The high risk precision is close to 0.05 (~5%). It tells us that the number of low risk cases predicted as high risk is much larger than the number of actual high risk cases.
+- The low risk precision is close to 1.00 (~100%). It tells us that the actual low risk cases dominate the precision rate, which is typical of imbalanced classification problems.
+
+***