diff --git a/README.md b/README.md index 23d4907..70cf408 100644 --- a/README.md +++ b/README.md @@ -20,19 +20,33 @@ This notebook performs Exploratory Data Analysis (EDA) on the cleaned dataset. T - **Correlation Analysis**: Understanding relationships between numerical features. - **Outlier Detection**: Using box plots to identify potential outliers. + +### 3. `Feature_Engineering.ipynb` +This notebook focuses on feature engineering to enhance the dataset for modeling. Key tasks include: +- **Aggregate Features**: Creating new features such as total transaction amount, average transaction amount, transaction count, and standard deviation of transaction amounts for each customer. +- **Time-Based Features**: Extracting features from the transaction timestamp (hour, day, month, year). +- **Encoding Categorical Variables**: Applying Weight of Evidence (WOE) transformation to categorical features for better model interpretability. +- **Handling Missing Values**: Implementing strategies for filling or removing missing values in the dataset. +- **Normalization/Standardization**: Scaling numerical features to ensure they are on a similar scale, improving model performance. + + ## Requirements To run the notebooks, you will need: - Python 3.x - Pandas +- NumPy - Matplotlib - Seaborn +- Scikit-learn +- Scorecardpy ## Getting Started 1. Clone the repository or download the project files. 2. Install the required packages listed in `requirements.txt`. 3. Open the notebooks in Jupyter Notebook or any compatible IDE. -4. Execute the cells in the order provided to complete the data cleaning and EDA. +4. Execute the cells in the order provided to complete the data cleaning, EDA, and feature engineering processes. + ## Conclusion The outputs from the EDA and feature engineering notebooks will be utilized in subsequent modeling tasks to develop a robust credit scoring model. Your contributions and feedback are welcome! 
diff --git a/notebooks/Feature_Engineering.ipynb b/notebooks/Feature_Engineering.ipynb new file mode 100644 index 0000000..1cf8b14 --- /dev/null +++ b/notebooks/Feature_Engineering.ipynb @@ -0,0 +1,610 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Feature Engineering" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "# Import necessary libraries\n", + "import pandas as pd\n", + "from sklearn.preprocessing import StandardScaler\n", + "import scorecardpy as sc" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the cleaned dataset (hard-coded local path; change it to match your environment)\n", + "df = pd.read_csv('C:/Users/Administrator/Documents/kifiya/Week_6/cleaned_data.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',\n", + " 'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',\n", + " 'ProductCategory', 'ChannelId', 'Amount', 'Value',\n", + " 'TransactionStartTime', 'PricingStrategy', 'FraudResult'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "print(df.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Create Aggregate Features" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "# Total Transaction Amount per customer\n", + "df['TotalTransactionAmount'] = df.groupby('CustomerId')['Amount'].transform('sum')\n", + "\n", + "# Average Transaction Amount per customer\n", + "df['AverageTransactionAmount'] = df.groupby('CustomerId')['Amount'].transform('mean')\n", + "\n", + "# Transaction Count per customer\n", + "df['TransactionCount'] = df.groupby('CustomerId')['TransactionId'].transform('count')\n", + "\n", + "# Standard Deviation of Transaction Amounts per customer\n", + "df['TransactionAmountStd'] = df.groupby('CustomerId')['Amount'].transform('std').fillna(0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Extract Time-Based Features" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'], errors='coerce')\n", + "\n", + "df['TransactionHour'] = df['TransactionStartTime'].dt.hour\n", + "df['TransactionDay'] = df['TransactionStartTime'].dt.day\n", + "df['TransactionMonth'] = df['TransactionStartTime'].dt.month\n", + "df['TransactionYear'] = df['TransactionStartTime'].dt.year" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Encode Categorical Variables using WOE" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] creating woe binning ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\condition_fun.py:40: FutureWarning: errors='ignore' is deprecated and will raise in a future version. 
Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\condition_fun.py:40: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\condition_fun.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\condition_fun.py:40: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\condition_fun.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\condition_fun.py:40: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\condition_fun.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:361: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n", + " init_bin = init_bin.groupby('brkp', group_keys=False).agg({\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:361: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. 
To keep current behavior pass the string \"sum\" instead.\n", + " init_bin = init_bin.groupby('brkp', group_keys=False).agg({\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:361: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n", + " init_bin = init_bin.groupby('brkp', group_keys=False).agg({\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:361: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n", + " init_bin = init_bin.groupby('brkp', group_keys=False).agg({\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:361: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n", + " init_bin = init_bin.groupby('brkp', group_keys=False).agg({\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:410: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n", + " total_iv_all_brks = pd.melt(\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:410: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. 
To keep current behavior pass the string \"sum\" instead.\n", + " total_iv_all_brks = pd.melt(\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:446: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", + " binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\\\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:446: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n", + " binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\\\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:446: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n", + " binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\\\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:410: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n", + " total_iv_all_brks = pd.melt(\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:410: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. 
To keep current behavior pass the string \"sum\" instead.\n", + " total_iv_all_brks = pd.melt(\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:446: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", + " binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\\\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:446: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n", + " binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\\\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:446: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n", + " binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\\\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:361: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n", + " init_bin = init_bin.groupby('brkp', group_keys=False).agg({\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:410: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. 
To keep current behavior pass the string \"sum\" instead.\n", + " total_iv_all_brks = pd.melt(\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:410: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n", + " total_iv_all_brks = pd.melt(\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:446: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", + " binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\\\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:446: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n", + " binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\\\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:446: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n", + " binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\\\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:410: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. 
To keep current behavior pass the string \"sum\" instead.\n", + " total_iv_all_brks = pd.melt(\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:410: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n", + " total_iv_all_brks = pd.melt(\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:446: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", + " binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\\\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:446: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n", + " binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\\\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:446: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n", + " binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\\\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:361: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. 
To keep current behavior pass the string \"sum\" instead.\n", + " init_bin = init_bin.groupby('brkp', group_keys=False).agg({\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:410: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n", + " total_iv_all_brks = pd.melt(\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:410: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n", + " total_iv_all_brks = pd.melt(\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:446: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", + " binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\\\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:446: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n", + " binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\\\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:446: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. 
To keep current behavior pass the string \"sum\" instead.\n", + " binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\\\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:410: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n", + " total_iv_all_brks = pd.melt(\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:410: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n", + " total_iv_all_brks = pd.melt(\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:446: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", + " binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\\\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:446: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n", + " binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\\\n", + "c:\\Users\\Administrator\\miniconda3\\envs\\jojo\\lib\\site-packages\\scorecardpy\\woebin.py:446: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. 
To keep current behavior pass the string \"sum\" instead.\n", + " binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\\\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] converting into woe values ...\n" + ] + } + ], + "source": [ + "# Assuming 'FraudResult' is the target variable and 'ProductCategory', 'ProviderId', 'ChannelId' are the features\n", + "features = ['ProductCategory', 'ProviderId', 'ChannelId']\n", + "\n", + "# Calculate the WOE and IV for each feature\n", + "bins = sc.woebin(df, y='FraudResult', x=features)\n", + "\n", + "# Apply the WOE transformation to the dataset\n", + "df_woe = sc.woebin_ply(df, bins)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['TransactionDay', 'TransactionHour', 'BatchId', 'TransactionId',\n", + " 'Value', 'TransactionYear', 'ProductId', 'CurrencyCode',\n", + " 'TotalTransactionAmount', 'TransactionCount', 'Amount',\n", + " 'SubscriptionId', 'TransactionMonth', 'PricingStrategy', 'AccountId',\n", + " 'FraudResult', 'AverageTransactionAmount', 'TransactionStartTime',\n", + " 'CountryCode', 'TransactionAmountStd', 'CustomerId', 'ProviderId_woe',\n", + " 'ProductCategory_woe', 'ChannelId_woe'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "print(df_woe.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Handle Missing Values" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Administrator\\AppData\\Local\\Temp\\ipykernel_15060\\1868837317.py:3: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. 
This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df_woe[col].fillna(df_woe[col].median(), inplace=True)\n", + "C:\\Users\\Administrator\\AppData\\Local\\Temp\\ipykernel_15060\\1868837317.py:3: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df_woe[col].fillna(df_woe[col].median(), inplace=True)\n", + "C:\\Users\\Administrator\\AppData\\Local\\Temp\\ipykernel_15060\\1868837317.py:3: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. 
This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df_woe[col].fillna(df_woe[col].median(), inplace=True)\n", + "C:\\Users\\Administrator\\AppData\\Local\\Temp\\ipykernel_15060\\1868837317.py:3: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df_woe[col].fillna(df_woe[col].median(), inplace=True)\n", + "C:\\Users\\Administrator\\AppData\\Local\\Temp\\ipykernel_15060\\1868837317.py:3: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. 
This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df_woe[col].fillna(df_woe[col].median(), inplace=True)\n", + "C:\\Users\\Administrator\\AppData\\Local\\Temp\\ipykernel_15060\\1868837317.py:7: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df_woe[col].fillna(df_woe[col].mode()[0], inplace=True)\n" + ] + } + ], + "source": [ + "# Fill missing numerical columns with the column median (assign back: chained fillna(inplace=True) only modifies a temporary copy)\n", + "for col in ['Amount', 'Value', 'TotalTransactionAmount', 'AverageTransactionAmount', 'TransactionAmountStd']:\n", + " df_woe[col] = df_woe[col].fillna(df_woe[col].median())\n", + "\n", + "# Fill missing values in the WOE-encoded columns and the raw CurrencyCode column with the most frequent value\n", + "for col in ['ProductCategory_woe', 'CurrencyCode', 'ProviderId_woe', 'ChannelId_woe']:\n", + " df_woe[col] = df_woe[col].fillna(df_woe[col].mode()[0])\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. 
Normalize/Standardize Numerical Features" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "# Standardize (mean=0, std=1) the numerical features\n", + "scaler = StandardScaler()\n", + "df_woe[['Amount', 'Value', 'TotalTransactionAmount', 'AverageTransactionAmount', 'TransactionAmountStd']] = scaler.fit_transform(\n", + " df_woe[['Amount', 'Value', 'TotalTransactionAmount', 'AverageTransactionAmount', 'TransactionAmountStd']]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TransactionDayTransactionHourBatchIdTransactionIdValueTransactionYearProductIdCurrencyCodeTotalTransactionAmountTransactionCount...AccountIdFraudResultAverageTransactionAmountTransactionStartTimeCountryCodeTransactionAmountStdCustomerIdProviderId_woeProductCategory_woeChannelId_woe
0152BatchId_36123TransactionId_76871-0.0722912018ProductId_10UGX0.170118119...AccountId_39570-0.0676232018-11-15 02:18:49+00:00256-0.167016CustomerId_4406-2.906446-1.6908240.484515
1152BatchId_15642TransactionId_73770-0.0802512018ProductId_6UGX0.170118119...AccountId_48410-0.0676232018-11-15 02:19:08+00:00256-0.167016CustomerId_4406-2.9064460.607033-2.736867
2152BatchId_53941TransactionId_26203-0.0763522018ProductId_1UGX0.1651222...AccountId_42290-0.0725682018-11-15 02:44:21+00:00256-0.201209CustomerId_4683-2.906446-1.6908240.484515
3153BatchId_102363TransactionId_3800.0966482018ProductId_21UGX0.17556738...AccountId_6480-0.0081552018-11-15 03:32:55+00:00256-0.008243CustomerId_9881.9394420.6070330.484515
4153BatchId_38780TransactionId_28195-0.0751832018ProductId_6UGX0.17556738...AccountId_48410-0.0081552018-11-15 03:34:21+00:00256-0.008243CustomerId_988-2.9064460.607033-2.736867
\n", + "

5 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + " TransactionDay TransactionHour BatchId TransactionId \\\n", + "0 15 2 BatchId_36123 TransactionId_76871 \n", + "1 15 2 BatchId_15642 TransactionId_73770 \n", + "2 15 2 BatchId_53941 TransactionId_26203 \n", + "3 15 3 BatchId_102363 TransactionId_380 \n", + "4 15 3 BatchId_38780 TransactionId_28195 \n", + "\n", + " Value TransactionYear ProductId CurrencyCode \\\n", + "0 -0.072291 2018 ProductId_10 UGX \n", + "1 -0.080251 2018 ProductId_6 UGX \n", + "2 -0.076352 2018 ProductId_1 UGX \n", + "3 0.096648 2018 ProductId_21 UGX \n", + "4 -0.075183 2018 ProductId_6 UGX \n", + "\n", + " TotalTransactionAmount TransactionCount ... AccountId FraudResult \\\n", + "0 0.170118 119 ... AccountId_3957 0 \n", + "1 0.170118 119 ... AccountId_4841 0 \n", + "2 0.165122 2 ... AccountId_4229 0 \n", + "3 0.175567 38 ... AccountId_648 0 \n", + "4 0.175567 38 ... AccountId_4841 0 \n", + "\n", + " AverageTransactionAmount TransactionStartTime CountryCode \\\n", + "0 -0.067623 2018-11-15 02:18:49+00:00 256 \n", + "1 -0.067623 2018-11-15 02:19:08+00:00 256 \n", + "2 -0.072568 2018-11-15 02:44:21+00:00 256 \n", + "3 -0.008155 2018-11-15 03:32:55+00:00 256 \n", + "4 -0.008155 2018-11-15 03:34:21+00:00 256 \n", + "\n", + " TransactionAmountStd CustomerId ProviderId_woe ProductCategory_woe \\\n", + "0 -0.167016 CustomerId_4406 -2.906446 -1.690824 \n", + "1 -0.167016 CustomerId_4406 -2.906446 0.607033 \n", + "2 -0.201209 CustomerId_4683 -2.906446 -1.690824 \n", + "3 -0.008243 CustomerId_988 1.939442 0.607033 \n", + "4 -0.008243 CustomerId_988 -2.906446 0.607033 \n", + "\n", + " ChannelId_woe \n", + "0 0.484515 \n", + "1 -2.736867 \n", + "2 0.484515 \n", + "3 0.484515 \n", + "4 -2.736867 \n", + "\n", + "[5 rows x 24 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Final check of the dataset\n", + "display(df_woe.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + 
"source": [ + "# Save the processed dataset to CSV \n", + "df_woe.to_csv('C:/Users/Administrator/Documents/kifiya/Week_6/processed_data.csv', index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "jojo", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.20" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/requirements.txt b/requirements.txt index 3b9a24f..bfd7d91 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,7 @@ pandas matplotlib -seaborn \ No newline at end of file +seaborn +scorecardpy +Scikit-learn + +