diff --git a/project_classification.ipynb b/project_classification.ipynb
new file mode 100644
index 0000000..946422f
--- /dev/null
+++ b/project_classification.ipynb
@@ -0,0 +1,5532 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Import"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Import of basic packages\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import operator\n",
+ "import warnings\n",
+ "\n",
+ "warnings.filterwarnings('ignore')\n",
+ "\n",
+ "# Import of chart packages\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "import plotly.express as px\n",
+ "import altair as alt\n",
+ "\n",
+ "# Import of machine learning metric packages\n",
+ "from sklearn.metrics import f1_score, classification_report, confusion_matrix, mean_squared_error, r2_score, accuracy_score, recall_score, precision_score, roc_auc_score, roc_curve, fbeta_score\n",
+ "from sklearn import metrics\n",
+ "\n",
+ "# Import of preprocessing packages\n",
+ "from sklearn import preprocessing\n",
+ "from sklearn.impute import SimpleImputer\n",
+ "from sklearn.compose import ColumnTransformer\n",
+ "from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, LabelBinarizer, PolynomialFeatures\n",
+ "\n",
+ "# Import of machine learning packages\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict, RandomizedSearchCV\n",
+ "from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression, SGDClassifier\n",
+ "from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier\n",
+ "from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree\n",
+ "from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, VotingClassifier, StackingRegressor, StackingClassifier, AdaBoostClassifier\n",
+ "from xgboost import XGBClassifier\n",
+ "from sklearn.svm import SVC\n",
+ "\n",
+ "# Set random seed \n",
+ "RSEED = 0"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Style"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pd.set_option('display.max_columns', None)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ID \n",
+ " DATOP \n",
+ " FLTID \n",
+ " DEPSTN \n",
+ " ARRSTN \n",
+ " STD \n",
+ " STA \n",
+ " STATUS \n",
+ " AC \n",
+ " target \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " train_id_0 \n",
+ " 2016-01-03 \n",
+ " TU 0712 \n",
+ " CMN \n",
+ " TUN \n",
+ " 2016-01-03 10:30:00 \n",
+ " 2016-01-03 12.55.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 260.0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " train_id_1 \n",
+ " 2016-01-13 \n",
+ " TU 0757 \n",
+ " MXP \n",
+ " TUN \n",
+ " 2016-01-13 15:05:00 \n",
+ " 2016-01-13 16.55.00 \n",
+ " ATA \n",
+ " TU 31BIMO \n",
+ " 20.0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " train_id_2 \n",
+ " 2016-01-16 \n",
+ " TU 0214 \n",
+ " TUN \n",
+ " IST \n",
+ " 2016-01-16 04:10:00 \n",
+ " 2016-01-16 06.45.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ID DATOP FLTID DEPSTN ARRSTN STD \n",
+ "0 train_id_0 2016-01-03 TU 0712 CMN TUN 2016-01-03 10:30:00 \\\n",
+ "1 train_id_1 2016-01-13 TU 0757 MXP TUN 2016-01-13 15:05:00 \n",
+ "2 train_id_2 2016-01-16 TU 0214 TUN IST 2016-01-16 04:10:00 \n",
+ "\n",
+ " STA STATUS AC target \n",
+ "0 2016-01-03 12.55.00 ATA TU 32AIMN 260.0 \n",
+ "1 2016-01-13 16.55.00 ATA TU 31BIMO 20.0 \n",
+ "2 2016-01-16 06.45.00 ATA TU 32AIMN 0.0 "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Load csv\n",
+ "df = pd.read_csv('data/train.csv')\n",
+ "df.head(3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Exploratory data analysis"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " statistic \n",
+ " ID \n",
+ " DATOP \n",
+ " FLTID \n",
+ " DEPSTN \n",
+ " ARRSTN \n",
+ " STD \n",
+ " STA \n",
+ " STATUS \n",
+ " AC \n",
+ " target \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " dtype \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " float64 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " mean \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 48.7 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " std \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 117.1 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " min \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 25% \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " 50% \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 14.0 \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " 75% \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 43.0 \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " max \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 3451.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " statistic ID DATOP FLTID DEPSTN ARRSTN STD STA STATUS \n",
+ "0 dtype object object object object object object object object \\\n",
+ "1 mean NaN NaN NaN NaN NaN NaN NaN NaN \n",
+ "2 std NaN NaN NaN NaN NaN NaN NaN NaN \n",
+ "3 min NaN NaN NaN NaN NaN NaN NaN NaN \n",
+ "4 25% NaN NaN NaN NaN NaN NaN NaN NaN \n",
+ "5 50% NaN NaN NaN NaN NaN NaN NaN NaN \n",
+ "6 75% NaN NaN NaN NaN NaN NaN NaN NaN \n",
+ "7 max NaN NaN NaN NaN NaN NaN NaN NaN \n",
+ "\n",
+ " AC target \n",
+ "0 object float64 \n",
+ "1 NaN 48.7 \n",
+ "2 NaN 117.1 \n",
+ "3 NaN 0.0 \n",
+ "4 NaN 0.0 \n",
+ "5 NaN 14.0 \n",
+ "6 NaN 43.0 \n",
+ "7 NaN 3451.0 "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# EDA overview: column dtypes followed by descriptive statistics of the numeric columns\n",
+ "summary_rows = [\n",
+ "    df.dtypes,\n",
+ "    df.mean(numeric_only=True),\n",
+ "    df.std(numeric_only=True),\n",
+ "    df.min(numeric_only=True),\n",
+ "    df.quantile(0.25, numeric_only=True),\n",
+ "    df.quantile(0.5, numeric_only=True),\n",
+ "    df.quantile(0.75, numeric_only=True),\n",
+ "    df.max(numeric_only=True),\n",
+ "]\n",
+ "\n",
+ "# Each Series becomes one row; int/float cells are rounded to one decimal, all other cells are kept as-is\n",
+ "info = pd.concat([row.to_frame().T for row in summary_rows], ignore_index=True)\n",
+ "info = info.applymap(lambda cell: round(cell, 1) if isinstance(cell, (int, float)) else cell)\n",
+ "\n",
+ "# Label the rows in the same order as summary_rows\n",
+ "info.insert(0, 'statistic', ['dtype', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])\n",
+ "info"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ID \n",
+ " DATOP \n",
+ " FLTID \n",
+ " DEPSTN \n",
+ " ARRSTN \n",
+ " STD \n",
+ " STA \n",
+ " STATUS \n",
+ " AC \n",
+ " target \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Empty DataFrame\n",
+ "Columns: [ID, DATOP, FLTID, DEPSTN, ARRSTN, STD, STA, STATUS, AC, target]\n",
+ "Index: []"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# EDA duplicates: count repeated rows, express them as a share of all rows, and show every row that has a duplicate\n",
+ "duplicates = df.duplicated().sum()\n",
+ "duplicate_percentage = round((duplicates / len(df)) * 100, 1)\n",
+ "df.loc[df.duplicated(keep=False)].head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Amount \n",
+ " Percentage \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Empty DataFrame\n",
+ "Columns: [Amount, Percentage]\n",
+ "Index: []"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# EDA NaNs: missing values per column (absolute and in percent), restricted to columns that have any\n",
+ "missing = df.isnull().sum().to_frame(name='Amount')\n",
+ "missing['Percentage'] = ((missing['Amount'] / len(df)) * 100).round(1)\n",
+ "missing.loc[missing['Amount'] != 0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of rows and columns: (107833, 10)\n",
+ "--------------------------------------------------\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ID \n",
+ " DATOP \n",
+ " FLTID \n",
+ " DEPSTN \n",
+ " ARRSTN \n",
+ " STD \n",
+ " STA \n",
+ " STATUS \n",
+ " AC \n",
+ " target \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " train_id_0 \n",
+ " 2016-01-03 \n",
+ " TU 0712 \n",
+ " CMN \n",
+ " TUN \n",
+ " 2016-01-03 10:30:00 \n",
+ " 2016-01-03 12.55.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 260.0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " train_id_1 \n",
+ " 2016-01-13 \n",
+ " TU 0757 \n",
+ " MXP \n",
+ " TUN \n",
+ " 2016-01-13 15:05:00 \n",
+ " 2016-01-13 16.55.00 \n",
+ " ATA \n",
+ " TU 31BIMO \n",
+ " 20.0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " train_id_2 \n",
+ " 2016-01-16 \n",
+ " TU 0214 \n",
+ " TUN \n",
+ " IST \n",
+ " 2016-01-16 04:10:00 \n",
+ " 2016-01-16 06.45.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " train_id_107830 \n",
+ " 2018-11-07 \n",
+ " SGT 0000 \n",
+ " TUN \n",
+ " TUN \n",
+ " 2018-11-07 05:00:00 \n",
+ " 2018-11-07 12.50.00 \n",
+ " SCH \n",
+ " TU 736IOK \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " train_id_107831 \n",
+ " 2018-01-23 \n",
+ " UG 0010 \n",
+ " TUN \n",
+ " DJE \n",
+ " 2018-01-23 18:00:00 \n",
+ " 2018-01-23 18.45.00 \n",
+ " ATA \n",
+ " TU CR9ISA \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " train_id_107832 \n",
+ " 2018-11-13 \n",
+ " UG 0002 \n",
+ " TUN \n",
+ " DJE \n",
+ " 2018-11-13 06:15:00 \n",
+ " 2018-11-13 07.05.00 \n",
+ " SCH \n",
+ " TU CR9ISA \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ID DATOP FLTID DEPSTN ARRSTN STD \n",
+ "0 train_id_0 2016-01-03 TU 0712 CMN TUN 2016-01-03 10:30:00 \\\n",
+ "1 train_id_1 2016-01-13 TU 0757 MXP TUN 2016-01-13 15:05:00 \n",
+ "2 train_id_2 2016-01-16 TU 0214 TUN IST 2016-01-16 04:10:00 \n",
+ "3 train_id_107830 2018-11-07 SGT 0000 TUN TUN 2018-11-07 05:00:00 \n",
+ "4 train_id_107831 2018-01-23 UG 0010 TUN DJE 2018-01-23 18:00:00 \n",
+ "5 train_id_107832 2018-11-13 UG 0002 TUN DJE 2018-11-13 06:15:00 \n",
+ "\n",
+ " STA STATUS AC target \n",
+ "0 2016-01-03 12.55.00 ATA TU 32AIMN 260.0 \n",
+ "1 2016-01-13 16.55.00 ATA TU 31BIMO 20.0 \n",
+ "2 2016-01-16 06.45.00 ATA TU 32AIMN 0.0 \n",
+ "3 2018-11-07 12.50.00 SCH TU 736IOK 0.0 \n",
+ "4 2018-01-23 18.45.00 ATA TU CR9ISA 0.0 \n",
+ "5 2018-11-13 07.05.00 SCH TU CR9ISA 0.0 "
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# EDA shape: dimensions of the frame plus its first and last three rows\n",
+ "print('Number of rows and columns: ',df.shape)\n",
+ "print('-' * 50)\n",
+ "pd.concat([df.head(3), df.tail(3)], ignore_index=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ID \n",
+ " STA \n",
+ " STD \n",
+ " FLTID \n",
+ " DATOP \n",
+ " target \n",
+ " DEPSTN \n",
+ " ARRSTN \n",
+ " AC \n",
+ " STATUS \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " Amount \n",
+ " 107833 \n",
+ " 85136 \n",
+ " 81697 \n",
+ " 1861 \n",
+ " 1011 \n",
+ " 968 \n",
+ " 132 \n",
+ " 128 \n",
+ " 68 \n",
+ " 5 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ID STA STD FLTID DATOP target DEPSTN ARRSTN AC STATUS\n",
+ "Amount 107833 85136 81697 1861 1011 968 132 128 68 5"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# EDA uniques: number of distinct values per column, highest cardinality first\n",
+ "unique_counts = (\n",
+ "    df.nunique()\n",
+ "    .to_frame(name='Amount')\n",
+ "    .sort_values(by='Amount', ascending=False)\n",
+ "    .T\n",
+ ")\n",
+ "unique_counts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ID \n",
+ " DATOP \n",
+ " FLTID \n",
+ " DEPSTN \n",
+ " ARRSTN \n",
+ " STD \n",
+ " STA \n",
+ " STATUS \n",
+ " AC \n",
+ " target \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " train_id_0 \n",
+ " 2016-01-03 \n",
+ " TU 0712 \n",
+ " CMN \n",
+ " TUN \n",
+ " 2016-01-03 10:30:00 \n",
+ " 2016-01-03 12.55.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 260.0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " train_id_1 \n",
+ " 2016-01-13 \n",
+ " TU 0757 \n",
+ " MXP \n",
+ " TUN \n",
+ " 2016-01-13 15:05:00 \n",
+ " 2016-01-13 16.55.00 \n",
+ " ATA \n",
+ " TU 31BIMO \n",
+ " 20.0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " train_id_2 \n",
+ " 2016-01-16 \n",
+ " TU 0214 \n",
+ " TUN \n",
+ " IST \n",
+ " 2016-01-16 04:10:00 \n",
+ " 2016-01-16 06.45.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " train_id_3 \n",
+ " 2016-01-17 \n",
+ " TU 0480 \n",
+ " DJE \n",
+ " NTE \n",
+ " 2016-01-17 14:10:00 \n",
+ " 2016-01-17 17.00.00 \n",
+ " ATA \n",
+ " TU 736IOK \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " train_id_4 \n",
+ " 2016-01-17 \n",
+ " TU 0338 \n",
+ " TUN \n",
+ " ALG \n",
+ " 2016-01-17 14:30:00 \n",
+ " 2016-01-17 15.50.00 \n",
+ " ATA \n",
+ " TU 320IMU \n",
+ " 22.0 \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " train_id_5 \n",
+ " 2016-01-17 \n",
+ " TU 0283 \n",
+ " TLS \n",
+ " TUN \n",
+ " 2016-01-17 16:20:00 \n",
+ " 2016-01-17 18.15.00 \n",
+ " ATA \n",
+ " TU 736IOP \n",
+ " 53.0 \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " train_id_6 \n",
+ " 2016-01-18 \n",
+ " TU 0514 \n",
+ " TUN \n",
+ " BCN \n",
+ " 2016-01-18 07:15:00 \n",
+ " 2016-01-18 09.00.00 \n",
+ " ATA \n",
+ " TU 32AIMH \n",
+ " 10.0 \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " train_id_7 \n",
+ " 2016-01-18 \n",
+ " TU 0716 \n",
+ " TUN \n",
+ " ORY \n",
+ " 2016-01-18 07:35:00 \n",
+ " 2016-01-18 09.55.00 \n",
+ " ATA \n",
+ " TU 32AIMI \n",
+ " 15.0 \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " train_id_8 \n",
+ " 2016-01-18 \n",
+ " TU 0752 \n",
+ " TUN \n",
+ " FCO \n",
+ " 2016-01-18 07:40:00 \n",
+ " 2016-01-18 09.00.00 \n",
+ " ATA \n",
+ " TU 32AIMC \n",
+ " 16.0 \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " train_id_9 \n",
+ " 2016-01-18 \n",
+ " TU 0996 \n",
+ " TUN \n",
+ " NCE \n",
+ " 2016-01-18 07:45:00 \n",
+ " 2016-01-18 09.15.00 \n",
+ " ATA \n",
+ " TU 31AIMK \n",
+ " 21.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ID DATOP FLTID DEPSTN ARRSTN STD \n",
+ "0 train_id_0 2016-01-03 TU 0712 CMN TUN 2016-01-03 10:30:00 \\\n",
+ "1 train_id_1 2016-01-13 TU 0757 MXP TUN 2016-01-13 15:05:00 \n",
+ "2 train_id_2 2016-01-16 TU 0214 TUN IST 2016-01-16 04:10:00 \n",
+ "3 train_id_3 2016-01-17 TU 0480 DJE NTE 2016-01-17 14:10:00 \n",
+ "4 train_id_4 2016-01-17 TU 0338 TUN ALG 2016-01-17 14:30:00 \n",
+ "5 train_id_5 2016-01-17 TU 0283 TLS TUN 2016-01-17 16:20:00 \n",
+ "6 train_id_6 2016-01-18 TU 0514 TUN BCN 2016-01-18 07:15:00 \n",
+ "7 train_id_7 2016-01-18 TU 0716 TUN ORY 2016-01-18 07:35:00 \n",
+ "8 train_id_8 2016-01-18 TU 0752 TUN FCO 2016-01-18 07:40:00 \n",
+ "9 train_id_9 2016-01-18 TU 0996 TUN NCE 2016-01-18 07:45:00 \n",
+ "\n",
+ " STA STATUS AC target \n",
+ "0 2016-01-03 12.55.00 ATA TU 32AIMN 260.0 \n",
+ "1 2016-01-13 16.55.00 ATA TU 31BIMO 20.0 \n",
+ "2 2016-01-16 06.45.00 ATA TU 32AIMN 0.0 \n",
+ "3 2016-01-17 17.00.00 ATA TU 736IOK 0.0 \n",
+ "4 2016-01-17 15.50.00 ATA TU 320IMU 22.0 \n",
+ "5 2016-01-17 18.15.00 ATA TU 736IOP 53.0 \n",
+ "6 2016-01-18 09.00.00 ATA TU 32AIMH 10.0 \n",
+ "7 2016-01-18 09.55.00 ATA TU 32AIMI 15.0 \n",
+ "8 2016-01-18 09.00.00 ATA TU 32AIMC 16.0 \n",
+ "9 2016-01-18 09.15.00 ATA TU 31AIMK 21.0 "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['CMN', 'MXP', 'TUN', 'DJE', 'TLS', 'IST', 'ORY', 'MIR', 'BRU',\n",
+ " 'ABJ', 'VCE', 'AMS', 'FRA', 'BCN', 'JED', 'ALG', 'LIS', 'SXB',\n",
+ " 'LYS', 'OUA', 'LGW', 'BEY', 'NCE', 'OPO', 'MRS', 'DUS', 'SFA',\n",
+ " 'FCO', 'CDG', 'NKC', 'NTE', 'ZRH', 'GVA', 'OUD', 'MUC', 'SXF',\n",
+ " 'HAM', 'NDR', 'NBE', 'CAI', 'BEG', 'VIE', 'ORN', 'MAD', 'TOE',\n",
+ " 'BKO', 'DKR', 'KGL', 'BLQ', 'MLA', 'AHU', 'LHR', 'BOD', 'PRG',\n",
+ " 'LJU', 'SVO', 'MED', 'BUD', 'ARN', 'CPH', 'CRL', 'OST', 'TNG',\n",
+ " 'GAF', 'NAP', 'BRQ', 'OSR', 'YUL', 'NIM', 'TMR', 'JIB', 'CGN',\n",
+ " 'EBL', 'GAE', 'BJA', 'AYT', 'RAK', 'LFW', 'LIL', 'PMO', 'FBM',\n",
+ " 'TBJ', 'PSA', 'KRT', 'GNB', 'CZL', 'MVB', 'VNO', 'ESB', 'LBV',\n",
+ " 'CKY', 'LED', 'KSC', 'BTS', 'AMM', 'OTP', 'TRN', 'IEV', 'HBE',\n",
+ " 'CAG', 'KBP', 'ATH', 'SKG', 'ADB', 'DSS', 'DOH', 'COO', 'LUX',\n",
+ " 'FIH', 'BYJ', 'KEF', 'EBM', 'BDS', 'AAL', 'VKO', 'AAE', 'BRI',\n",
+ " 'VRN', 'SKX', 'HAJ', 'BLL', 'TLL', 'VOG', 'LAD', 'GHA', 'KTW',\n",
+ " 'SJJ', 'KRR', 'RTM', 'STR', 'TPS', 'CTA'], dtype=object)"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Get unique departure airports\n",
+ "df[\"DEPSTN\"].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['TUN', 'IST', 'NTE', 'ALG', 'BCN', 'ORY', 'FCO', 'NCE', 'MRS',\n",
+ " 'MED', 'FRA', 'BRU', 'DJE', 'LYS', 'CMN', 'BEG', 'OUA', 'GVA',\n",
+ " 'MXP', 'BEY', 'MAD', 'JED', 'ABJ', 'VIE', 'MLA', 'BLQ', 'SFA',\n",
+ " 'LIS', 'LHR', 'CDG', 'MIR', 'CAI', 'DUS', 'HAM', 'NBE', 'ZRH',\n",
+ " 'AMS', 'NDR', 'TLS', 'VCE', 'SXB', 'MUC', 'LGW', 'CRL', 'ORN',\n",
+ " 'DKR', 'BOD', 'SXF', 'LJU', 'OST', 'NKC', 'BKO', 'TOE', 'AHU',\n",
+ " 'YUL', 'PRG', 'CPH', 'ARN', 'OUD', 'BRQ', 'GAF', 'JIB', 'BUD',\n",
+ " 'OPO', 'KGL', 'NIM', 'SVO', 'LIL', 'OSR', 'EBL', 'TNG', 'PSA',\n",
+ " 'CGN', 'AYT', 'GAE', 'NAP', 'BJA', 'KRT', 'LFW', 'TBJ', 'PMO',\n",
+ " 'TMR', 'FBM', 'RAK', 'GNB', 'ESB', 'CZL', 'LBV', 'KSC', 'CKY',\n",
+ " 'AMM', 'LED', 'BTS', 'MVB', 'HBE', 'OTP', 'CAG', 'VNO', 'TRN',\n",
+ " 'ATH', 'ADB', 'SKG', 'BYJ', 'DSS', 'COO', 'IEV', 'LUX', 'KBP',\n",
+ " 'DOH', 'FIH', 'EBM', 'BDS', 'VKO', 'AAE', 'BLL', 'HAJ', 'BRI',\n",
+ " 'CTA', 'VRN', 'SKX', 'VOG', 'BGY', 'LAD', 'KRR', 'SJJ', 'GHA',\n",
+ " 'RTM', 'TPS'], dtype=object)"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Get unique arrival airports\n",
+ "df[\"ARRSTN\"].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Index: 69665 entries, 0 to 107675\n",
+ "Data columns (total 10 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 ID 69665 non-null object \n",
+ " 1 DATOP 69665 non-null object \n",
+ " 2 FLTID 69665 non-null object \n",
+ " 3 DEPSTN 69665 non-null object \n",
+ " 4 ARRSTN 69665 non-null object \n",
+ " 5 STD 69665 non-null object \n",
+ " 6 STA 69665 non-null object \n",
+ " 7 STATUS 69665 non-null object \n",
+ " 8 AC 69665 non-null object \n",
+ " 9 target 69665 non-null float64\n",
+ "dtypes: float64(1), object(9)\n",
+ "memory usage: 5.8+ MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Independent copy that holds only the delayed flights (target > 0)\n",
+ "df_delayed = df.loc[df[\"target\"] > 0].copy()\n",
+ "df_delayed.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "General statistics of target:\n",
+ " count 107833.000000\n",
+ "mean 48.733013\n",
+ "std 117.135562\n",
+ "min 0.000000\n",
+ "25% 0.000000\n",
+ "50% 14.000000\n",
+ "75% 43.000000\n",
+ "max 3451.000000\n",
+ "Name: target, dtype: float64\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Overview of target distribution\n",
+ "print(\"General statistics of target:\\n\", df['target'].describe())\n",
+ "\n",
+ "# Two stacked axes that share the x axis: a slim boxplot above the histogram\n",
+ "fig, (ax_box, ax_hist) = plt.subplots(nrows=2, sharex=True, gridspec_kw={\"height_ratios\": (.15, .85)})\n",
+ "\n",
+ "# Draw each chart on its own axis\n",
+ "sns.boxplot(df[\"target\"], orient=\"h\", ax=ax_box, color='lightblue')\n",
+ "sns.histplot(data=df, x=\"target\", ax=ax_hist, color='lightblue')\n",
+ "\n",
+ "# The boxplot gets no x label; the x limits also apply to the histogram because the x axis is shared\n",
+ "ax_box.set_xlabel('')\n",
+ "ax_box.set_xlim(-100, 500)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "General statistics of target (of delayed flights):\n",
+ " count 69665.000000\n",
+ "mean 75.432814\n",
+ "std 138.650946\n",
+ "min 1.000000\n",
+ "25% 15.000000\n",
+ "50% 30.000000\n",
+ "75% 74.000000\n",
+ "max 3451.000000\n",
+ "Name: target, dtype: float64\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Overview of target distribution (of delayed flights)\n",
+ "print(\"General statistics of target (of delayed flights):\\n\", df_delayed['target'].describe())\n",
+ "\n",
+ "# Two stacked axes that share the x axis: a slim boxplot above the histogram\n",
+ "fig, (ax_box, ax_hist) = plt.subplots(nrows=2, sharex=True, gridspec_kw={\"height_ratios\": (.15, .85)})\n",
+ "\n",
+ "# Draw each chart on its own axis\n",
+ "sns.boxplot(df_delayed[\"target\"], orient=\"h\", ax=ax_box, color='lightblue')\n",
+ "sns.histplot(data=df_delayed, x=\"target\", ax=ax_hist, color='lightblue')\n",
+ "\n",
+ "# The boxplot gets no x label; the x limits also apply to the histogram because the x axis is shared\n",
+ "ax_box.set_xlabel('')\n",
+ "ax_box.set_xlim(-20, 500)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Data cleaning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Normalise column names: underscores instead of spaces, everything lower case\n",
+ "df.columns = df.columns.str.replace(' ', '_').str.lower()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The feature 'ac' contains a code for the airplane model. We extract this code into its own column and then map it to the manufacturer."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " datop \n",
+ " fltid \n",
+ " depstn \n",
+ " arrstn \n",
+ " std \n",
+ " sta \n",
+ " status \n",
+ " ac \n",
+ " target \n",
+ " airplane_model \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " train_id_0 \n",
+ " 2016-01-03 \n",
+ " TU 0712 \n",
+ " CMN \n",
+ " TUN \n",
+ " 2016-01-03 10:30:00 \n",
+ " 2016-01-03 12.55.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 260.0 \n",
+ " 32A \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " train_id_1 \n",
+ " 2016-01-13 \n",
+ " TU 0757 \n",
+ " MXP \n",
+ " TUN \n",
+ " 2016-01-13 15:05:00 \n",
+ " 2016-01-13 16.55.00 \n",
+ " ATA \n",
+ " TU 31BIMO \n",
+ " 20.0 \n",
+ " 31B \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " train_id_2 \n",
+ " 2016-01-16 \n",
+ " TU 0214 \n",
+ " TUN \n",
+ " IST \n",
+ " 2016-01-16 04:10:00 \n",
+ " 2016-01-16 06.45.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 0.0 \n",
+ " 32A \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " train_id_3 \n",
+ " 2016-01-17 \n",
+ " TU 0480 \n",
+ " DJE \n",
+ " NTE \n",
+ " 2016-01-17 14:10:00 \n",
+ " 2016-01-17 17.00.00 \n",
+ " ATA \n",
+ " TU 736IOK \n",
+ " 0.0 \n",
+ " 736 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " train_id_4 \n",
+ " 2016-01-17 \n",
+ " TU 0338 \n",
+ " TUN \n",
+ " ALG \n",
+ " 2016-01-17 14:30:00 \n",
+ " 2016-01-17 15.50.00 \n",
+ " ATA \n",
+ " TU 320IMU \n",
+ " 22.0 \n",
+ " 320 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id datop fltid depstn arrstn std \n",
+ "0 train_id_0 2016-01-03 TU 0712 CMN TUN 2016-01-03 10:30:00 \\\n",
+ "1 train_id_1 2016-01-13 TU 0757 MXP TUN 2016-01-13 15:05:00 \n",
+ "2 train_id_2 2016-01-16 TU 0214 TUN IST 2016-01-16 04:10:00 \n",
+ "3 train_id_3 2016-01-17 TU 0480 DJE NTE 2016-01-17 14:10:00 \n",
+ "4 train_id_4 2016-01-17 TU 0338 TUN ALG 2016-01-17 14:30:00 \n",
+ "\n",
+ " sta status ac target airplane_model \n",
+ "0 2016-01-03 12.55.00 ATA TU 32AIMN 260.0 32A \n",
+ "1 2016-01-13 16.55.00 ATA TU 31BIMO 20.0 31B \n",
+ "2 2016-01-16 06.45.00 ATA TU 32AIMN 0.0 32A \n",
+ "3 2016-01-17 17.00.00 ATA TU 736IOK 0.0 736 \n",
+ "4 2016-01-17 15.50.00 ATA TU 320IMU 22.0 320 "
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Characters at positions 3 to 5 of 'ac' hold the model code (e.g. 'TU 32AIMN' -> '32A')\n",
+ "df['airplane_model'] = df['ac'].str.slice(3, 6)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " sta \n",
+ " std \n",
+ " fltid \n",
+ " datop \n",
+ " target \n",
+ " depstn \n",
+ " arrstn \n",
+ " ac \n",
+ " airplane_model \n",
+ " status \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " Amount \n",
+ " 107833 \n",
+ " 85136 \n",
+ " 81697 \n",
+ " 1861 \n",
+ " 1011 \n",
+ " 968 \n",
+ " 132 \n",
+ " 128 \n",
+ " 68 \n",
+ " 16 \n",
+ " 5 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id sta std fltid datop target depstn arrstn ac \n",
+ "Amount 107833 85136 81697 1861 1011 968 132 128 68 \\\n",
+ "\n",
+ " airplane_model status \n",
+ "Amount 16 5 "
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# EDA uniques: number of distinct values per column, highest cardinality first\n",
+ "unique_counts = (\n",
+ "    df.nunique()\n",
+ "    .to_frame(name='Amount')\n",
+ "    .sort_values(by='Amount', ascending=False)\n",
+ "    .T\n",
+ ")\n",
+ "unique_counts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'319',\n",
+ " '31A',\n",
+ " '31B',\n",
+ " '320',\n",
+ " '321',\n",
+ " '32A',\n",
+ " '332',\n",
+ " '343',\n",
+ " '345',\n",
+ " '733',\n",
+ " '734',\n",
+ " '736',\n",
+ " '738',\n",
+ " 'AT7',\n",
+ " 'CR9',\n",
+ " 'M87'}"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "set(df['airplane_model'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Map every model code present in 'airplane_model' to its manufacturer.\n",
+ "# '319', '738' and 'M87' were missing before, so 'producer' mixed raw codes with manufacturer names.\n",
+ "replacement_dict = {\n",
+ "    '319': 'Airbus',\n",
+ "    '31A': 'Airbus',\n",
+ "    '31B': 'Airbus',\n",
+ "    '320': 'Airbus',\n",
+ "    '321': 'Airbus',\n",
+ "    '32A': 'Airbus',\n",
+ "    '332': 'Airbus',\n",
+ "    '343': 'Airbus',\n",
+ "    '345': 'Airbus',\n",
+ "    '733': 'Boeing',\n",
+ "    '734': 'Boeing',\n",
+ "    '736': 'Boeing',\n",
+ "    '738': 'Boeing',\n",
+ "    'AT7': 'ATR',\n",
+ "    'CR9': 'Bombardier',\n",
+ "    'M87': 'McDonnell Douglas'\n",
+ "}\n",
+ "\n",
+ "# Derive the 'producer' column from the model code; codes without an entry above are kept unchanged by replace()\n",
+ "df['producer'] = df['airplane_model'].replace(replacement_dict)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " datop \n",
+ " fltid \n",
+ " depstn \n",
+ " arrstn \n",
+ " std \n",
+ " sta \n",
+ " status \n",
+ " ac \n",
+ " target \n",
+ " airplane_model \n",
+ " producer \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " train_id_0 \n",
+ " 2016-01-03 \n",
+ " TU 0712 \n",
+ " CMN \n",
+ " TUN \n",
+ " 2016-01-03 10:30:00 \n",
+ " 2016-01-03 12.55.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 260.0 \n",
+ " 32A \n",
+ " Airbus \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " train_id_1 \n",
+ " 2016-01-13 \n",
+ " TU 0757 \n",
+ " MXP \n",
+ " TUN \n",
+ " 2016-01-13 15:05:00 \n",
+ " 2016-01-13 16.55.00 \n",
+ " ATA \n",
+ " TU 31BIMO \n",
+ " 20.0 \n",
+ " 31B \n",
+ " Airbus \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " train_id_2 \n",
+ " 2016-01-16 \n",
+ " TU 0214 \n",
+ " TUN \n",
+ " IST \n",
+ " 2016-01-16 04:10:00 \n",
+ " 2016-01-16 06.45.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 0.0 \n",
+ " 32A \n",
+ " Airbus \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " train_id_3 \n",
+ " 2016-01-17 \n",
+ " TU 0480 \n",
+ " DJE \n",
+ " NTE \n",
+ " 2016-01-17 14:10:00 \n",
+ " 2016-01-17 17.00.00 \n",
+ " ATA \n",
+ " TU 736IOK \n",
+ " 0.0 \n",
+ " 736 \n",
+ " Boeing \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " train_id_4 \n",
+ " 2016-01-17 \n",
+ " TU 0338 \n",
+ " TUN \n",
+ " ALG \n",
+ " 2016-01-17 14:30:00 \n",
+ " 2016-01-17 15.50.00 \n",
+ " ATA \n",
+ " TU 320IMU \n",
+ " 22.0 \n",
+ " 320 \n",
+ " Airbus \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id datop fltid depstn arrstn std \n",
+ "0 train_id_0 2016-01-03 TU 0712 CMN TUN 2016-01-03 10:30:00 \\\n",
+ "1 train_id_1 2016-01-13 TU 0757 MXP TUN 2016-01-13 15:05:00 \n",
+ "2 train_id_2 2016-01-16 TU 0214 TUN IST 2016-01-16 04:10:00 \n",
+ "3 train_id_3 2016-01-17 TU 0480 DJE NTE 2016-01-17 14:10:00 \n",
+ "4 train_id_4 2016-01-17 TU 0338 TUN ALG 2016-01-17 14:30:00 \n",
+ "\n",
+ " sta status ac target airplane_model producer \n",
+ "0 2016-01-03 12.55.00 ATA TU 32AIMN 260.0 32A Airbus \n",
+ "1 2016-01-13 16.55.00 ATA TU 31BIMO 20.0 31B Airbus \n",
+ "2 2016-01-16 06.45.00 ATA TU 32AIMN 0.0 32A Airbus \n",
+ "3 2016-01-17 17.00.00 ATA TU 736IOK 0.0 736 Boeing \n",
+ "4 2016-01-17 15.50.00 ATA TU 320IMU 22.0 320 Airbus "
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " datop \n",
+ " fltid \n",
+ " depstn \n",
+ " arrstn \n",
+ " std \n",
+ " sta \n",
+ " status \n",
+ " ac \n",
+ " target \n",
+ " airplane_model \n",
+ " producer \n",
+ " airline_1 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " train_id_0 \n",
+ " 2016-01-03 \n",
+ " TU 0712 \n",
+ " CMN \n",
+ " TUN \n",
+ " 2016-01-03 10:30:00 \n",
+ " 2016-01-03 12.55.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 260.0 \n",
+ " 32A \n",
+ " Airbus \n",
+ " TU \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " train_id_1 \n",
+ " 2016-01-13 \n",
+ " TU 0757 \n",
+ " MXP \n",
+ " TUN \n",
+ " 2016-01-13 15:05:00 \n",
+ " 2016-01-13 16.55.00 \n",
+ " ATA \n",
+ " TU 31BIMO \n",
+ " 20.0 \n",
+ " 31B \n",
+ " Airbus \n",
+ " TU \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " train_id_2 \n",
+ " 2016-01-16 \n",
+ " TU 0214 \n",
+ " TUN \n",
+ " IST \n",
+ " 2016-01-16 04:10:00 \n",
+ " 2016-01-16 06.45.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 0.0 \n",
+ " 32A \n",
+ " Airbus \n",
+ " TU \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " train_id_3 \n",
+ " 2016-01-17 \n",
+ " TU 0480 \n",
+ " DJE \n",
+ " NTE \n",
+ " 2016-01-17 14:10:00 \n",
+ " 2016-01-17 17.00.00 \n",
+ " ATA \n",
+ " TU 736IOK \n",
+ " 0.0 \n",
+ " 736 \n",
+ " Boeing \n",
+ " TU \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " train_id_4 \n",
+ " 2016-01-17 \n",
+ " TU 0338 \n",
+ " TUN \n",
+ " ALG \n",
+ " 2016-01-17 14:30:00 \n",
+ " 2016-01-17 15.50.00 \n",
+ " ATA \n",
+ " TU 320IMU \n",
+ " 22.0 \n",
+ " 320 \n",
+ " Airbus \n",
+ " TU \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id datop fltid depstn arrstn std \n",
+ "0 train_id_0 2016-01-03 TU 0712 CMN TUN 2016-01-03 10:30:00 \\\n",
+ "1 train_id_1 2016-01-13 TU 0757 MXP TUN 2016-01-13 15:05:00 \n",
+ "2 train_id_2 2016-01-16 TU 0214 TUN IST 2016-01-16 04:10:00 \n",
+ "3 train_id_3 2016-01-17 TU 0480 DJE NTE 2016-01-17 14:10:00 \n",
+ "4 train_id_4 2016-01-17 TU 0338 TUN ALG 2016-01-17 14:30:00 \n",
+ "\n",
+ " sta status ac target airplane_model producer \n",
+ "0 2016-01-03 12.55.00 ATA TU 32AIMN 260.0 32A Airbus \\\n",
+ "1 2016-01-13 16.55.00 ATA TU 31BIMO 20.0 31B Airbus \n",
+ "2 2016-01-16 06.45.00 ATA TU 32AIMN 0.0 32A Airbus \n",
+ "3 2016-01-17 17.00.00 ATA TU 736IOK 0.0 736 Boeing \n",
+ "4 2016-01-17 15.50.00 ATA TU 320IMU 22.0 320 Airbus \n",
+ "\n",
+ " airline_1 \n",
+ "0 TU \n",
+ "1 TU \n",
+ "2 TU \n",
+ "3 TU \n",
+ "4 TU "
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# airline_1: the first two characters of the flight id\n",
+ "df['airline_1'] = df['fltid'].str.slice(0, 2)\n",
+ "df.head(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'12',\n",
+ " '20',\n",
+ " '6Y',\n",
+ " 'A ',\n",
+ " 'AO',\n",
+ " 'AT',\n",
+ " 'AU',\n",
+ " 'C ',\n",
+ " 'D4',\n",
+ " 'DA',\n",
+ " 'GJ',\n",
+ " 'IN',\n",
+ " 'PR',\n",
+ " 'SG',\n",
+ " 'TU',\n",
+ " 'UG',\n",
+ " 'UH',\n",
+ " 'WK',\n",
+ " 'X9'}"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Distinct two-character prefixes stored in airline_1\n",
+ "set(df['airline_1'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " datop \n",
+ " fltid \n",
+ " depstn \n",
+ " arrstn \n",
+ " std \n",
+ " sta \n",
+ " status \n",
+ " ac \n",
+ " target \n",
+ " airplane_model \n",
+ " producer \n",
+ " airline_1 \n",
+ " airline_2 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " train_id_0 \n",
+ " 2016-01-03 \n",
+ " TU 0712 \n",
+ " CMN \n",
+ " TUN \n",
+ " 2016-01-03 10:30:00 \n",
+ " 2016-01-03 12.55.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 260.0 \n",
+ " 32A \n",
+ " Airbus \n",
+ " TU \n",
+ " TU \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " train_id_1 \n",
+ " 2016-01-13 \n",
+ " TU 0757 \n",
+ " MXP \n",
+ " TUN \n",
+ " 2016-01-13 15:05:00 \n",
+ " 2016-01-13 16.55.00 \n",
+ " ATA \n",
+ " TU 31BIMO \n",
+ " 20.0 \n",
+ " 31B \n",
+ " Airbus \n",
+ " TU \n",
+ " TU \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " train_id_2 \n",
+ " 2016-01-16 \n",
+ " TU 0214 \n",
+ " TUN \n",
+ " IST \n",
+ " 2016-01-16 04:10:00 \n",
+ " 2016-01-16 06.45.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 0.0 \n",
+ " 32A \n",
+ " Airbus \n",
+ " TU \n",
+ " TU \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " train_id_3 \n",
+ " 2016-01-17 \n",
+ " TU 0480 \n",
+ " DJE \n",
+ " NTE \n",
+ " 2016-01-17 14:10:00 \n",
+ " 2016-01-17 17.00.00 \n",
+ " ATA \n",
+ " TU 736IOK \n",
+ " 0.0 \n",
+ " 736 \n",
+ " Boeing \n",
+ " TU \n",
+ " TU \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " train_id_4 \n",
+ " 2016-01-17 \n",
+ " TU 0338 \n",
+ " TUN \n",
+ " ALG \n",
+ " 2016-01-17 14:30:00 \n",
+ " 2016-01-17 15.50.00 \n",
+ " ATA \n",
+ " TU 320IMU \n",
+ " 22.0 \n",
+ " 320 \n",
+ " Airbus \n",
+ " TU \n",
+ " TU \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id datop fltid depstn arrstn std \n",
+ "0 train_id_0 2016-01-03 TU 0712 CMN TUN 2016-01-03 10:30:00 \\\n",
+ "1 train_id_1 2016-01-13 TU 0757 MXP TUN 2016-01-13 15:05:00 \n",
+ "2 train_id_2 2016-01-16 TU 0214 TUN IST 2016-01-16 04:10:00 \n",
+ "3 train_id_3 2016-01-17 TU 0480 DJE NTE 2016-01-17 14:10:00 \n",
+ "4 train_id_4 2016-01-17 TU 0338 TUN ALG 2016-01-17 14:30:00 \n",
+ "\n",
+ " sta status ac target airplane_model producer \n",
+ "0 2016-01-03 12.55.00 ATA TU 32AIMN 260.0 32A Airbus \\\n",
+ "1 2016-01-13 16.55.00 ATA TU 31BIMO 20.0 31B Airbus \n",
+ "2 2016-01-16 06.45.00 ATA TU 32AIMN 0.0 32A Airbus \n",
+ "3 2016-01-17 17.00.00 ATA TU 736IOK 0.0 736 Boeing \n",
+ "4 2016-01-17 15.50.00 ATA TU 320IMU 22.0 320 Airbus \n",
+ "\n",
+ " airline_1 airline_2 \n",
+ "0 TU TU \n",
+ "1 TU TU \n",
+ "2 TU TU \n",
+ "3 TU TU \n",
+ "4 TU TU "
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# airline_2: the first two characters of the aircraft code\n",
+ "df['airline_2'] = df['ac'].str.slice(0, 2)\n",
+ "df.head(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'5K',\n",
+ " '5M',\n",
+ " '6P',\n",
+ " 'BJ',\n",
+ " 'D4',\n",
+ " 'GJ',\n",
+ " 'GW',\n",
+ " 'OL',\n",
+ " 'PS',\n",
+ " 'QS',\n",
+ " 'TU',\n",
+ " 'UG',\n",
+ " 'UJ',\n",
+ " 'X9'}"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Distinct two-character prefixes stored in airline_2\n",
+ "set(df['airline_2'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " datop \n",
+ " fltid \n",
+ " depstn \n",
+ " arrstn \n",
+ " std \n",
+ " sta \n",
+ " status \n",
+ " ac \n",
+ " target \n",
+ " airplane_model \n",
+ " producer \n",
+ " airline_1 \n",
+ " airline_2 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " train_id_0 \n",
+ " 2016-01-03 \n",
+ " TU 0712 \n",
+ " CMN \n",
+ " TUN \n",
+ " 2016-01-03 10:30:00 \n",
+ " 2016-01-03 12.55.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 260.0 \n",
+ " 32A \n",
+ " Airbus \n",
+ " TU \n",
+ " TU \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " train_id_1 \n",
+ " 2016-01-13 \n",
+ " TU 0757 \n",
+ " MXP \n",
+ " TUN \n",
+ " 2016-01-13 15:05:00 \n",
+ " 2016-01-13 16.55.00 \n",
+ " ATA \n",
+ " TU 31BIMO \n",
+ " 20.0 \n",
+ " 31B \n",
+ " Airbus \n",
+ " TU \n",
+ " TU \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " train_id_2 \n",
+ " 2016-01-16 \n",
+ " TU 0214 \n",
+ " TUN \n",
+ " IST \n",
+ " 2016-01-16 04:10:00 \n",
+ " 2016-01-16 06.45.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 0.0 \n",
+ " 32A \n",
+ " Airbus \n",
+ " TU \n",
+ " TU \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " train_id_3 \n",
+ " 2016-01-17 \n",
+ " TU 0480 \n",
+ " DJE \n",
+ " NTE \n",
+ " 2016-01-17 14:10:00 \n",
+ " 2016-01-17 17.00.00 \n",
+ " ATA \n",
+ " TU 736IOK \n",
+ " 0.0 \n",
+ " 736 \n",
+ " Boeing \n",
+ " TU \n",
+ " TU \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " train_id_4 \n",
+ " 2016-01-17 \n",
+ " TU 0338 \n",
+ " TUN \n",
+ " ALG \n",
+ " 2016-01-17 14:30:00 \n",
+ " 2016-01-17 15.50.00 \n",
+ " ATA \n",
+ " TU 320IMU \n",
+ " 22.0 \n",
+ " 320 \n",
+ " Airbus \n",
+ " TU \n",
+ " TU \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id datop fltid depstn arrstn std \n",
+ "0 train_id_0 2016-01-03 TU 0712 CMN TUN 2016-01-03 10:30:00 \\\n",
+ "1 train_id_1 2016-01-13 TU 0757 MXP TUN 2016-01-13 15:05:00 \n",
+ "2 train_id_2 2016-01-16 TU 0214 TUN IST 2016-01-16 04:10:00 \n",
+ "3 train_id_3 2016-01-17 TU 0480 DJE NTE 2016-01-17 14:10:00 \n",
+ "4 train_id_4 2016-01-17 TU 0338 TUN ALG 2016-01-17 14:30:00 \n",
+ "\n",
+ " sta status ac target airplane_model producer \n",
+ "0 2016-01-03 12.55.00 ATA TU 32AIMN 260.0 32A Airbus \\\n",
+ "1 2016-01-13 16.55.00 ATA TU 31BIMO 20.0 31B Airbus \n",
+ "2 2016-01-16 06.45.00 ATA TU 32AIMN 0.0 32A Airbus \n",
+ "3 2016-01-17 17.00.00 ATA TU 736IOK 0.0 736 Boeing \n",
+ "4 2016-01-17 15.50.00 ATA TU 320IMU 22.0 320 Airbus \n",
+ "\n",
+ " airline_1 airline_2 \n",
+ "0 TU TU \n",
+ "1 TU TU \n",
+ "2 TU TU \n",
+ "3 TU TU \n",
+ "4 TU TU "
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Preview the first rows of df\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Load airports data set and clean. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Merge data sets based on airport short handle. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Add calendar features (year, month, weekday, minute of day) next to each date column\n",
+ "\n",
+ "def add_date_parts(frame, col, fmt, with_minutes=True):\n",
+ "    '''Parse frame[col] with fmt and insert derived integer columns right after it.\n",
+ "\n",
+ "    Inserted columns, in this order: col_year, col_month, col_wd\n",
+ "    (weekday with 1 = Monday ... 7 = Sunday) and, if with_minutes is True,\n",
+ "    col_min (minutes since midnight).\n",
+ "\n",
+ "    The frame is modified in place. Derived columns left over from an earlier\n",
+ "    run are replaced, so the cell can be executed more than once.\n",
+ "    '''\n",
+ "    frame[col] = pd.to_datetime(frame[col], format=fmt)\n",
+ "    stamp = frame[col].dt\n",
+ "    # the .dt accessors avoid the platform-specific strftime flags ('%#m', '%#H', '%#M')\n",
+ "    parts = {\n",
+ "        col + '_year': stamp.year,\n",
+ "        col + '_month': stamp.month,\n",
+ "        # dayofweek is 0 = Monday ... 6 = Sunday; shift to the EU convention 1..7\n",
+ "        col + '_wd': stamp.dayofweek + 1,\n",
+ "    }\n",
+ "    if with_minutes:\n",
+ "        parts[col + '_min'] = 60 * stamp.hour + stamp.minute\n",
+ "\n",
+ "    # DataFrame.insert raises if the column already exists, so remove leftovers first\n",
+ "    for name in parts:\n",
+ "        if name in frame.columns:\n",
+ "            del frame[name]\n",
+ "\n",
+ "    anchor = frame.columns.get_loc(col)\n",
+ "    for offset, (name, values) in enumerate(parts.items(), start=1):\n",
+ "        frame.insert(loc=anchor + offset, column=name, value=values.astype(int))\n",
+ "\n",
+ "\n",
+ "add_date_parts(df, 'std', '%Y-%m-%d %H:%M:%S')\n",
+ "add_date_parts(df, 'sta', '%Y-%m-%d %H.%M.%S')  # sta uses dots as the time separator\n",
+ "add_date_parts(df, 'datop', '%Y-%m-%d', with_minutes=False)\n",
+ "\n",
+ "# target arrives as float (e.g. 260.0); store it as an integer like the date parts\n",
+ "df['target'] = df['target'].astype(int)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Geo-encoding of airports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " city \n",
+ " country \n",
+ " short \n",
+ " latitude \n",
+ " longitude \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " Pitt Meadows \n",
+ " Canada \n",
+ " \\N \n",
+ " 49.216099 \n",
+ " -122.709999 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " Blida \n",
+ " Algeria \n",
+ " \\N \n",
+ " 36.503601 \n",
+ " 2.814170 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " Bou Saada \n",
+ " Algeria \n",
+ " \\N \n",
+ " 35.332500 \n",
+ " 4.206390 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " city country short latitude longitude\n",
+ "0 Pitt Meadows Canada \\N 49.216099 -122.709999\n",
+ "1 Blida Algeria \\N 36.503601 2.814170\n",
+ "2 Bou Saada Algeria \\N 35.332500 4.206390"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Load the airport lookup table and keep city, country, short code and coordinates\n",
+ "airport_columns = ['id', 'name', 'city', 'country', 'short', 'rubbish_6', 'latitude', 'longitude', 'rubbish_1', 'rubbish_2', 'rubbish_3', 'rubbish_4', 'type', 'rubbish_5']\n",
+ "kept_columns = ['city', 'country', 'short', 'latitude', 'longitude']\n",
+ "\n",
+ "# NOTE(review): the first line of the file is read as a header and then replaced by the\n",
+ "# names above - confirm that airports.csv really starts with a header row\n",
+ "# '\\N' is the placeholder for a missing value in this file; read it as NaN so that\n",
+ "# dropna below actually removes airports without a short code\n",
+ "df_airports = pd.read_csv('data/airports.csv', na_values=[r'\\N'])\n",
+ "df_airports.columns = airport_columns\n",
+ "df_airports = df_airports[kept_columns].dropna(subset=['short'])\n",
+ "# keep one row per short code: repeated codes would multiply flight rows in a merge\n",
+ "df_airports = df_airports.drop_duplicates(subset='short', keep='first')\n",
+ "df_airports.head(3)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " datop \n",
+ " datop_year \n",
+ " datop_month \n",
+ " datop_wd \n",
+ " fltid \n",
+ " depstn \n",
+ " arrstn \n",
+ " std \n",
+ " std_year \n",
+ " std_month \n",
+ " std_wd \n",
+ " std_min \n",
+ " sta \n",
+ " sta_year \n",
+ " sta_month \n",
+ " sta_wd \n",
+ " sta_min \n",
+ " status \n",
+ " ac \n",
+ " target \n",
+ " airplane_model \n",
+ " producer \n",
+ " airline_1 \n",
+ " airline_2 \n",
+ " city_dep \n",
+ " country_dep \n",
+ " short \n",
+ " latitude_dep \n",
+ " longitude_dep \n",
+ " city_arr \n",
+ " country_arr \n",
+ " short_arr \n",
+ " latitude_arr \n",
+ " longitude_arr \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " train_id_0 \n",
+ " 2016-01-03 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " TU 0712 \n",
+ " CMN \n",
+ " TUN \n",
+ " 2016-01-03 10:30:00 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 630 \n",
+ " 2016-01-03 12:55:00 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 775 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 260 \n",
+ " 32A \n",
+ " Airbus \n",
+ " TU \n",
+ " TU \n",
+ " Casablanca \n",
+ " Morocco \n",
+ " CMN \n",
+ " 33.3675 \n",
+ " -7.58997 \n",
+ " Tunis \n",
+ " Tunisia \n",
+ " TUN \n",
+ " 36.851002 \n",
+ " 10.227200 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " train_id_0 \n",
+ " 2016-01-03 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " TU 0712 \n",
+ " CMN \n",
+ " TUN \n",
+ " 2016-01-03 10:30:00 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 630 \n",
+ " 2016-01-03 12:55:00 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 775 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 260 \n",
+ " 32A \n",
+ " Airbus \n",
+ " TU \n",
+ " TU \n",
+ " Casablanca \n",
+ " Morocco \n",
+ " CMN \n",
+ " 33.3675 \n",
+ " -7.58997 \n",
+ " Tunis \n",
+ " Tunesia \n",
+ " TUN \n",
+ " 36.847685 \n",
+ " 10.217603 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " train_id_1 \n",
+ " 2016-01-13 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 3 \n",
+ " TU 0757 \n",
+ " MXP \n",
+ " TUN \n",
+ " 2016-01-13 15:05:00 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 3 \n",
+ " 905 \n",
+ " 2016-01-13 16:55:00 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 3 \n",
+ " 1015 \n",
+ " ATA \n",
+ " TU 31BIMO \n",
+ " 20 \n",
+ " 31B \n",
+ " Airbus \n",
+ " TU \n",
+ " TU \n",
+ " Milano \n",
+ " Italy \n",
+ " MXP \n",
+ " 45.6306 \n",
+ " 8.72811 \n",
+ " Tunis \n",
+ " Tunisia \n",
+ " TUN \n",
+ " 36.851002 \n",
+ " 10.227200 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id datop datop_year datop_month datop_wd fltid depstn \n",
+ "0 train_id_0 2016-01-03 2016 1 7 TU 0712 CMN \\\n",
+ "1 train_id_0 2016-01-03 2016 1 7 TU 0712 CMN \n",
+ "2 train_id_1 2016-01-13 2016 1 3 TU 0757 MXP \n",
+ "\n",
+ " arrstn std std_year std_month std_wd std_min \n",
+ "0 TUN 2016-01-03 10:30:00 2016 1 7 630 \\\n",
+ "1 TUN 2016-01-03 10:30:00 2016 1 7 630 \n",
+ "2 TUN 2016-01-13 15:05:00 2016 1 3 905 \n",
+ "\n",
+ " sta sta_year sta_month sta_wd sta_min status ac \n",
+ "0 2016-01-03 12:55:00 2016 1 7 775 ATA TU 32AIMN \\\n",
+ "1 2016-01-03 12:55:00 2016 1 7 775 ATA TU 32AIMN \n",
+ "2 2016-01-13 16:55:00 2016 1 3 1015 ATA TU 31BIMO \n",
+ "\n",
+ " target airplane_model producer airline_1 airline_2 city_dep country_dep \n",
+ "0 260 32A Airbus TU TU Casablanca Morocco \\\n",
+ "1 260 32A Airbus TU TU Casablanca Morocco \n",
+ "2 20 31B Airbus TU TU Milano Italy \n",
+ "\n",
+ " short latitude_dep longitude_dep city_arr country_arr short_arr \n",
+ "0 CMN 33.3675 -7.58997 Tunis Tunisia TUN \\\n",
+ "1 CMN 33.3675 -7.58997 Tunis Tunesia TUN \n",
+ "2 MXP 45.6306 8.72811 Tunis Tunisia TUN \n",
+ "\n",
+ " latitude_arr longitude_arr \n",
+ "0 36.851002 10.227200 \n",
+ "1 36.847685 10.217603 \n",
+ "2 36.851002 10.227200 "
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Attach airport metadata twice: for the departure and for the arrival station.\n",
+ "# Merge against one row per short code; a repeated code in the lookup would\n",
+ "# duplicate every flight that uses it.\n",
+ "airports_unique = df_airports.drop_duplicates(subset='short')\n",
+ "rows_before = len(df)\n",
+ "\n",
+ "# Merge based on departure station\n",
+ "df = df.merge(airports_unique, left_on='depstn', right_on='short', how='left', suffixes=('', '_dep'), validate='many_to_one')\n",
+ "\n",
+ "# Merge based on arrival station (clashing lookup columns receive the suffix _arr)\n",
+ "df = df.merge(airports_unique, left_on='arrstn', right_on='short', how='left', suffixes=('', '_arr'), validate='many_to_one')\n",
+ "\n",
+ "# Rename columns for clarity: the first merge added them without a suffix.\n",
+ "# 'short' is left as is (it holds the departure code).\n",
+ "df = df.rename(columns={\n",
+ "    'city': 'city_dep',\n",
+ "    'country': 'country_dep',\n",
+ "    'latitude': 'latitude_dep',\n",
+ "    'longitude': 'longitude_dep'\n",
+ "})\n",
+ "\n",
+ "# a left join against a unique key must leave the number of flights unchanged\n",
+ "assert len(df) == rows_before, 'airport merge changed the number of rows'\n",
+ "\n",
+ "df.head(3)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(197247, 35)"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Check whether the date in datop matches the date in std (total row count for comparison)\n",
+ "df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(197247, 35)"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Rows in which datop equals the calendar date of std.\n",
+ "# Full dates are compared: equal year, month and weekday would also hold for\n",
+ "# two dates that lie 7 days apart within the same month.\n",
+ "df[df['datop'] == df['std'].dt.normalize()].shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " statistic \n",
+ " id \n",
+ " datop \n",
+ " datop_year \n",
+ " datop_month \n",
+ " datop_wd \n",
+ " fltid \n",
+ " depstn \n",
+ " arrstn \n",
+ " std \n",
+ " std_year \n",
+ " std_month \n",
+ " std_wd \n",
+ " std_min \n",
+ " sta \n",
+ " sta_year \n",
+ " sta_month \n",
+ " sta_wd \n",
+ " sta_min \n",
+ " status \n",
+ " ac \n",
+ " target \n",
+ " airplane_model \n",
+ " producer \n",
+ " airline_1 \n",
+ " airline_2 \n",
+ " city_dep \n",
+ " country_dep \n",
+ " short \n",
+ " latitude_dep \n",
+ " longitude_dep \n",
+ " city_arr \n",
+ " country_arr \n",
+ " short_arr \n",
+ " latitude_arr \n",
+ " longitude_arr \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " dtype \n",
+ " object \n",
+ " datetime64[ns] \n",
+ " int32 \n",
+ " int32 \n",
+ " int32 \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " datetime64[ns] \n",
+ " int32 \n",
+ " int32 \n",
+ " int32 \n",
+ " int32 \n",
+ " datetime64[ns] \n",
+ " int32 \n",
+ " int32 \n",
+ " int32 \n",
+ " int32 \n",
+ " object \n",
+ " object \n",
+ " int32 \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " float64 \n",
+ " float64 \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " float64 \n",
+ " float64 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " mean \n",
+ " NaN \n",
+ " NaN \n",
+ " 2017.0 \n",
+ " 6.6 \n",
+ " 4.1 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 2017.0 \n",
+ " 6.6 \n",
+ " 4.1 \n",
+ " 759.7 \n",
+ " NaN \n",
+ " 2017.0 \n",
+ " 6.6 \n",
+ " 4.1 \n",
+ " 793.3 \n",
+ " NaN \n",
+ " NaN \n",
+ " 47.2 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 38.4 \n",
+ " 8.9 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 38.4 \n",
+ " 8.9 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " std \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.8 \n",
+ " 3.3 \n",
+ " 2.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.8 \n",
+ " 3.3 \n",
+ " 2.0 \n",
+ " 315.7 \n",
+ " NaN \n",
+ " 0.8 \n",
+ " 3.3 \n",
+ " 2.0 \n",
+ " 334.4 \n",
+ " NaN \n",
+ " NaN \n",
+ " 112.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 7.2 \n",
+ " 8.4 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 7.1 \n",
+ " 8.4 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " min \n",
+ " NaN \n",
+ " NaN \n",
+ " 2016.0 \n",
+ " 1.0 \n",
+ " 1.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 2016.0 \n",
+ " 1.0 \n",
+ " 1.0 \n",
+ " 0.0 \n",
+ " NaN \n",
+ " 2016.0 \n",
+ " 1.0 \n",
+ " 1.0 \n",
+ " 0.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " -11.6 \n",
+ " -73.7 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " -11.6 \n",
+ " -73.7 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 25% \n",
+ " NaN \n",
+ " NaN \n",
+ " 2016.0 \n",
+ " 4.0 \n",
+ " 2.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 2016.0 \n",
+ " 4.0 \n",
+ " 2.0 \n",
+ " 500.0 \n",
+ " NaN \n",
+ " 2016.0 \n",
+ " 4.0 \n",
+ " 2.0 \n",
+ " 580.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 36.8 \n",
+ " 6.8 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 36.8 \n",
+ " 6.8 \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " 50% \n",
+ " NaN \n",
+ " NaN \n",
+ " 2017.0 \n",
+ " 7.0 \n",
+ " 4.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 2017.0 \n",
+ " 7.0 \n",
+ " 4.0 \n",
+ " 760.0 \n",
+ " NaN \n",
+ " 2017.0 \n",
+ " 7.0 \n",
+ " 4.0 \n",
+ " 795.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 13.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 36.9 \n",
+ " 10.2 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 36.9 \n",
+ " 10.2 \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " 75% \n",
+ " NaN \n",
+ " NaN \n",
+ " 2018.0 \n",
+ " 9.0 \n",
+ " 6.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 2018.0 \n",
+ " 9.0 \n",
+ " 6.0 \n",
+ " 985.0 \n",
+ " NaN \n",
+ " 2018.0 \n",
+ " 9.0 \n",
+ " 6.0 \n",
+ " 1050.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 42.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 43.4 \n",
+ " 10.2 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 43.4 \n",
+ " 10.2 \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " max \n",
+ " NaN \n",
+ " NaN \n",
+ " 2018.0 \n",
+ " 12.0 \n",
+ " 7.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 2018.0 \n",
+ " 12.0 \n",
+ " 7.0 \n",
+ " 1439.0 \n",
+ " NaN \n",
+ " 2019.0 \n",
+ " 12.0 \n",
+ " 7.0 \n",
+ " 1439.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 3451.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 64.0 \n",
+ " 51.6 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 59.8 \n",
+ " 51.6 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " statistic id datop datop_year datop_month datop_wd fltid \n",
+ "0 dtype object datetime64[ns] int32 int32 int32 object \\\n",
+ "1 mean NaN NaN 2017.0 6.6 4.1 NaN \n",
+ "2 std NaN NaN 0.8 3.3 2.0 NaN \n",
+ "3 min NaN NaN 2016.0 1.0 1.0 NaN \n",
+ "4 25% NaN NaN 2016.0 4.0 2.0 NaN \n",
+ "5 50% NaN NaN 2017.0 7.0 4.0 NaN \n",
+ "6 75% NaN NaN 2018.0 9.0 6.0 NaN \n",
+ "7 max NaN NaN 2018.0 12.0 7.0 NaN \n",
+ "\n",
+ " depstn arrstn std std_year std_month std_wd std_min \n",
+ "0 object object datetime64[ns] int32 int32 int32 int32 \\\n",
+ "1 NaN NaN NaN 2017.0 6.6 4.1 759.7 \n",
+ "2 NaN NaN NaN 0.8 3.3 2.0 315.7 \n",
+ "3 NaN NaN NaN 2016.0 1.0 1.0 0.0 \n",
+ "4 NaN NaN NaN 2016.0 4.0 2.0 500.0 \n",
+ "5 NaN NaN NaN 2017.0 7.0 4.0 760.0 \n",
+ "6 NaN NaN NaN 2018.0 9.0 6.0 985.0 \n",
+ "7 NaN NaN NaN 2018.0 12.0 7.0 1439.0 \n",
+ "\n",
+ " sta sta_year sta_month sta_wd sta_min status ac target \n",
+ "0 datetime64[ns] int32 int32 int32 int32 object object int32 \\\n",
+ "1 NaN 2017.0 6.6 4.1 793.3 NaN NaN 47.2 \n",
+ "2 NaN 0.8 3.3 2.0 334.4 NaN NaN 112.0 \n",
+ "3 NaN 2016.0 1.0 1.0 0.0 NaN NaN 0.0 \n",
+ "4 NaN 2016.0 4.0 2.0 580.0 NaN NaN 0.0 \n",
+ "5 NaN 2017.0 7.0 4.0 795.0 NaN NaN 13.0 \n",
+ "6 NaN 2018.0 9.0 6.0 1050.0 NaN NaN 42.0 \n",
+ "7 NaN 2019.0 12.0 7.0 1439.0 NaN NaN 3451.0 \n",
+ "\n",
+ " airplane_model producer airline_1 airline_2 city_dep country_dep short \n",
+ "0 object object object object object object object \\\n",
+ "1 NaN NaN NaN NaN NaN NaN NaN \n",
+ "2 NaN NaN NaN NaN NaN NaN NaN \n",
+ "3 NaN NaN NaN NaN NaN NaN NaN \n",
+ "4 NaN NaN NaN NaN NaN NaN NaN \n",
+ "5 NaN NaN NaN NaN NaN NaN NaN \n",
+ "6 NaN NaN NaN NaN NaN NaN NaN \n",
+ "7 NaN NaN NaN NaN NaN NaN NaN \n",
+ "\n",
+ " latitude_dep longitude_dep city_arr country_arr short_arr latitude_arr \n",
+ "0 float64 float64 object object object float64 \\\n",
+ "1 38.4 8.9 NaN NaN NaN 38.4 \n",
+ "2 7.2 8.4 NaN NaN NaN 7.1 \n",
+ "3 -11.6 -73.7 NaN NaN NaN -11.6 \n",
+ "4 36.8 6.8 NaN NaN NaN 36.8 \n",
+ "5 36.9 10.2 NaN NaN NaN 36.9 \n",
+ "6 43.4 10.2 NaN NaN NaN 43.4 \n",
+ "7 64.0 51.6 NaN NaN NaN 59.8 \n",
+ "\n",
+ " longitude_arr \n",
+ "0 float64 \n",
+ "1 8.9 \n",
+ "2 8.4 \n",
+ "3 -73.7 \n",
+ "4 6.8 \n",
+ "5 10.2 \n",
+ "6 10.2 \n",
+ "7 51.6 "
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# EDA info & describe: one row per statistic, one column per feature\n",
+ "stat_rows = {\n",
+ "    'dtype': df.dtypes,\n",
+ "    'mean': df.mean(numeric_only=True),\n",
+ "    'std': df.std(numeric_only=True),\n",
+ "    'min': df.min(numeric_only=True),\n",
+ "    '25%': df.quantile(0.25, numeric_only=True),\n",
+ "    '50%': df.quantile(0.5, numeric_only=True),\n",
+ "    '75%': df.quantile(0.75, numeric_only=True),\n",
+ "    'max': df.max(numeric_only=True),\n",
+ "}\n",
+ "\n",
+ "def round_if_number(value):\n",
+ "    '''Round plain numbers to one decimal; return anything else unchanged.'''\n",
+ "    if isinstance(value, (int, float)):\n",
+ "        return round(value, 1)\n",
+ "    return value\n",
+ "\n",
+ "info = pd.concat([series.to_frame().T for series in stat_rows.values()], ignore_index=True)\n",
+ "info = info.applymap(round_if_number)\n",
+ "\n",
+ "# label the rows in the same order in which they were stacked\n",
+ "info.insert(0, 'statistic', [*stat_rows])\n",
+ "info"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "## Feature engineering"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " datop_year \n",
+ " datop_month \n",
+ " datop_wd \n",
+ " depstn \n",
+ " arrstn \n",
+ " std_year \n",
+ " std_month \n",
+ " std_wd \n",
+ " std_min \n",
+ " sta_year \n",
+ " sta_month \n",
+ " sta_wd \n",
+ " sta_min \n",
+ " status \n",
+ " target \n",
+ " airline_1 \n",
+ " latitude_dep \n",
+ " longitude_dep \n",
+ " latitude_arr \n",
+ " longitude_arr \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " train_id_0 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " CMN \n",
+ " TUN \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 630 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 775 \n",
+ " ATA \n",
+ " 260 \n",
+ " TU \n",
+ " 33.3675 \n",
+ " -7.58997 \n",
+ " 36.851002 \n",
+ " 10.227200 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " train_id_0 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " CMN \n",
+ " TUN \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 630 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 775 \n",
+ " ATA \n",
+ " 260 \n",
+ " TU \n",
+ " 33.3675 \n",
+ " -7.58997 \n",
+ " 36.847685 \n",
+ " 10.217603 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " train_id_1 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 3 \n",
+ " MXP \n",
+ " TUN \n",
+ " 2016 \n",
+ " 1 \n",
+ " 3 \n",
+ " 905 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 3 \n",
+ " 1015 \n",
+ " ATA \n",
+ " 20 \n",
+ " TU \n",
+ " 45.6306 \n",
+ " 8.72811 \n",
+ " 36.851002 \n",
+ " 10.227200 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id datop_year datop_month datop_wd depstn arrstn std_year \n",
+ "0 train_id_0 2016 1 7 CMN TUN 2016 \\\n",
+ "1 train_id_0 2016 1 7 CMN TUN 2016 \n",
+ "2 train_id_1 2016 1 3 MXP TUN 2016 \n",
+ "\n",
+ " std_month std_wd std_min sta_year sta_month sta_wd sta_min status \n",
+ "0 1 7 630 2016 1 7 775 ATA \\\n",
+ "1 1 7 630 2016 1 7 775 ATA \n",
+ "2 1 3 905 2016 1 3 1015 ATA \n",
+ "\n",
+ " target airline_1 latitude_dep longitude_dep latitude_arr longitude_arr \n",
+ "0 260 TU 33.3675 -7.58997 36.851002 10.227200 \n",
+ "1 260 TU 33.3675 -7.58997 36.847685 10.217603 \n",
+ "2 20 TU 45.6306 8.72811 36.851002 10.227200 "
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Columns removed before modelling: parsed timestamps, raw ids and airport text fields\n",
+ "unused_columns = ['datop', 'fltid', 'std', 'sta', 'ac', 'short', 'short_arr', 'city_dep', 'country_dep', 'city_arr', 'country_arr']\n",
+ "\n",
+ "# aggressive feature drop\n",
+ "aggressive_drop = ['airplane_model', 'producer', 'airline_2']\n",
+ "\n",
+ "df = df.drop(columns=unused_columns + aggressive_drop)\n",
+ "\n",
+ "df.head(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# List of columns to encode\n",
+ "columns_to_encode = ['depstn', 'status', 'arrstn', 'airline_1'] # reduced by aggressive feature drop\n",
+ "\n",
+ "# Binarize each column separately and collect the indicator frames\n",
+ "encoded_parts = []\n",
+ "for column in columns_to_encode:\n",
+ "    lb = LabelBinarizer()\n",
+ "    encoded = lb.fit_transform(df[column])\n",
+ "\n",
+ "    if len(lb.classes_) == 2:\n",
+ "        # binary column: LabelBinarizer returns a single 0/1 column\n",
+ "        names = [f'{column}_encoded']\n",
+ "    else:\n",
+ "        # otherwise: one indicator column per class\n",
+ "        names = [f'{column}_{cls}' for cls in lb.classes_]\n",
+ "    encoded_parts.append(pd.DataFrame(encoded, columns=names, index=df.index))\n",
+ "\n",
+ "# concatenate once instead of growing a frame inside the loop\n",
+ "encoded_features = pd.concat(encoded_parts, axis=1)\n",
+ "\n",
+ "# df_encoded: the non-encoded columns plus the indicator columns.\n",
+ "# Every source column is removed, not only the one the loop visited last.\n",
+ "df_encoded = pd.concat([df.drop(columns=columns_to_encode), encoded_features], axis=1)\n",
+ "\n",
+ "# df: the original columns plus the indicator columns, each label exactly once.\n",
+ "# Concatenating df with df_encoded would repeat every column the two frames share.\n",
+ "df = pd.concat([df, encoded_features], axis=1)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " datop_year \n",
+ " datop_month \n",
+ " datop_wd \n",
+ " std_year \n",
+ " std_month \n",
+ " std_wd \n",
+ " std_min \n",
+ " sta_year \n",
+ " sta_month \n",
+ " sta_wd \n",
+ " sta_min \n",
+ " target \n",
+ " latitude_dep \n",
+ " longitude_dep \n",
+ " latitude_arr \n",
+ " longitude_arr \n",
+ " depstn_AAE \n",
+ " depstn_AAL \n",
+ " depstn_ABJ \n",
+ " depstn_ADB \n",
+ " depstn_AHU \n",
+ " depstn_ALG \n",
+ " depstn_AMM \n",
+ " depstn_AMS \n",
+ " depstn_ARN \n",
+ " depstn_ATH \n",
+ " depstn_AYT \n",
+ " depstn_BCN \n",
+ " depstn_BDS \n",
+ " depstn_BEG \n",
+ " depstn_BEY \n",
+ " depstn_BJA \n",
+ " depstn_BKO \n",
+ " depstn_BLL \n",
+ " depstn_BLQ \n",
+ " depstn_BOD \n",
+ " depstn_BRI \n",
+ " depstn_BRQ \n",
+ " depstn_BRU \n",
+ " depstn_BTS \n",
+ " depstn_BUD \n",
+ " depstn_BYJ \n",
+ " depstn_CAG \n",
+ " depstn_CAI \n",
+ " depstn_CDG \n",
+ " depstn_CGN \n",
+ " depstn_CKY \n",
+ " depstn_CMN \n",
+ " depstn_COO \n",
+ " depstn_CPH \n",
+ " depstn_CRL \n",
+ " depstn_CTA \n",
+ " depstn_CZL \n",
+ " depstn_DJE \n",
+ " depstn_DKR \n",
+ " depstn_DOH \n",
+ " depstn_DSS \n",
+ " depstn_DUS \n",
+ " depstn_EBL \n",
+ " depstn_EBM \n",
+ " depstn_ESB \n",
+ " depstn_FBM \n",
+ " depstn_FCO \n",
+ " depstn_FIH \n",
+ " depstn_FRA \n",
+ " depstn_GAE \n",
+ " depstn_GAF \n",
+ " depstn_GHA \n",
+ " depstn_GNB \n",
+ " depstn_GVA \n",
+ " depstn_HAJ \n",
+ " depstn_HAM \n",
+ " depstn_HBE \n",
+ " depstn_IEV \n",
+ " depstn_IST \n",
+ " depstn_JED \n",
+ " depstn_JIB \n",
+ " depstn_KBP \n",
+ " depstn_KEF \n",
+ " depstn_KGL \n",
+ " depstn_KRR \n",
+ " depstn_KRT \n",
+ " depstn_KSC \n",
+ " depstn_KTW \n",
+ " depstn_LAD \n",
+ " depstn_LBV \n",
+ " depstn_LED \n",
+ " depstn_LFW \n",
+ " depstn_LGW \n",
+ " depstn_LHR \n",
+ " depstn_LIL \n",
+ " depstn_LIS \n",
+ " depstn_LJU \n",
+ " depstn_LUX \n",
+ " depstn_LYS \n",
+ " depstn_MAD \n",
+ " depstn_MED \n",
+ " depstn_MIR \n",
+ " depstn_MLA \n",
+ " depstn_MRS \n",
+ " depstn_MUC \n",
+ " depstn_MVB \n",
+ " depstn_MXP \n",
+ " depstn_NAP \n",
+ " depstn_NBE \n",
+ " depstn_NCE \n",
+ " depstn_NDR \n",
+ " depstn_NIM \n",
+ " depstn_NKC \n",
+ " depstn_NTE \n",
+ " depstn_OPO \n",
+ " depstn_ORN \n",
+ " depstn_ORY \n",
+ " depstn_OSR \n",
+ " depstn_OST \n",
+ " depstn_OTP \n",
+ " depstn_OUA \n",
+ " depstn_OUD \n",
+ " depstn_PMO \n",
+ " depstn_PRG \n",
+ " depstn_PSA \n",
+ " depstn_RAK \n",
+ " depstn_RTM \n",
+ " depstn_SFA \n",
+ " depstn_SJJ \n",
+ " depstn_SKG \n",
+ " depstn_SKX \n",
+ " depstn_STR \n",
+ " depstn_SVO \n",
+ " depstn_SXB \n",
+ " depstn_SXF \n",
+ " depstn_TBJ \n",
+ " depstn_TLL \n",
+ " depstn_TLS \n",
+ " depstn_TMR \n",
+ " depstn_TNG \n",
+ " depstn_TOE \n",
+ " depstn_TPS \n",
+ " depstn_TRN \n",
+ " depstn_TUN \n",
+ " depstn_VCE \n",
+ " depstn_VIE \n",
+ " depstn_VKO \n",
+ " depstn_VNO \n",
+ " depstn_VOG \n",
+ " depstn_VRN \n",
+ " depstn_YUL \n",
+ " depstn_ZRH \n",
+ " status_ATA \n",
+ " status_DEL \n",
+ " status_DEP \n",
+ " status_RTR \n",
+ " status_SCH \n",
+ " arrstn_AAE \n",
+ " arrstn_ABJ \n",
+ " arrstn_ADB \n",
+ " arrstn_AHU \n",
+ " arrstn_ALG \n",
+ " arrstn_AMM \n",
+ " arrstn_AMS \n",
+ " arrstn_ARN \n",
+ " arrstn_ATH \n",
+ " arrstn_AYT \n",
+ " arrstn_BCN \n",
+ " arrstn_BDS \n",
+ " arrstn_BEG \n",
+ " arrstn_BEY \n",
+ " arrstn_BGY \n",
+ " arrstn_BJA \n",
+ " arrstn_BKO \n",
+ " arrstn_BLL \n",
+ " arrstn_BLQ \n",
+ " arrstn_BOD \n",
+ " arrstn_BRI \n",
+ " arrstn_BRQ \n",
+ " arrstn_BRU \n",
+ " arrstn_BTS \n",
+ " arrstn_BUD \n",
+ " arrstn_BYJ \n",
+ " arrstn_CAG \n",
+ " arrstn_CAI \n",
+ " arrstn_CDG \n",
+ " arrstn_CGN \n",
+ " arrstn_CKY \n",
+ " arrstn_CMN \n",
+ " arrstn_COO \n",
+ " arrstn_CPH \n",
+ " arrstn_CRL \n",
+ " arrstn_CTA \n",
+ " arrstn_CZL \n",
+ " arrstn_DJE \n",
+ " arrstn_DKR \n",
+ " arrstn_DOH \n",
+ " arrstn_DSS \n",
+ " arrstn_DUS \n",
+ " arrstn_EBL \n",
+ " arrstn_EBM \n",
+ " arrstn_ESB \n",
+ " arrstn_FBM \n",
+ " arrstn_FCO \n",
+ " arrstn_FIH \n",
+ " arrstn_FRA \n",
+ " arrstn_GAE \n",
+ " arrstn_GAF \n",
+ " arrstn_GHA \n",
+ " arrstn_GNB \n",
+ " arrstn_GVA \n",
+ " arrstn_HAJ \n",
+ " arrstn_HAM \n",
+ " arrstn_HBE \n",
+ " arrstn_IEV \n",
+ " arrstn_IST \n",
+ " arrstn_JED \n",
+ " arrstn_JIB \n",
+ " arrstn_KBP \n",
+ " arrstn_KGL \n",
+ " arrstn_KRR \n",
+ " arrstn_KRT \n",
+ " arrstn_KSC \n",
+ " arrstn_LAD \n",
+ " arrstn_LBV \n",
+ " arrstn_LED \n",
+ " arrstn_LFW \n",
+ " arrstn_LGW \n",
+ " arrstn_LHR \n",
+ " arrstn_LIL \n",
+ " arrstn_LIS \n",
+ " arrstn_LJU \n",
+ " arrstn_LUX \n",
+ " arrstn_LYS \n",
+ " arrstn_MAD \n",
+ " arrstn_MED \n",
+ " arrstn_MIR \n",
+ " arrstn_MLA \n",
+ " arrstn_MRS \n",
+ " arrstn_MUC \n",
+ " arrstn_MVB \n",
+ " arrstn_MXP \n",
+ " arrstn_NAP \n",
+ " arrstn_NBE \n",
+ " arrstn_NCE \n",
+ " arrstn_NDR \n",
+ " arrstn_NIM \n",
+ " arrstn_NKC \n",
+ " arrstn_NTE \n",
+ " arrstn_OPO \n",
+ " arrstn_ORN \n",
+ " arrstn_ORY \n",
+ " arrstn_OSR \n",
+ " arrstn_OST \n",
+ " arrstn_OTP \n",
+ " arrstn_OUA \n",
+ " arrstn_OUD \n",
+ " arrstn_PMO \n",
+ " arrstn_PRG \n",
+ " arrstn_PSA \n",
+ " arrstn_RAK \n",
+ " arrstn_RTM \n",
+ " arrstn_SFA \n",
+ " arrstn_SJJ \n",
+ " arrstn_SKG \n",
+ " arrstn_SKX \n",
+ " arrstn_SVO \n",
+ " arrstn_SXB \n",
+ " arrstn_SXF \n",
+ " arrstn_TBJ \n",
+ " arrstn_TLS \n",
+ " arrstn_TMR \n",
+ " arrstn_TNG \n",
+ " arrstn_TOE \n",
+ " arrstn_TPS \n",
+ " arrstn_TRN \n",
+ " arrstn_TUN \n",
+ " arrstn_VCE \n",
+ " arrstn_VIE \n",
+ " arrstn_VKO \n",
+ " arrstn_VNO \n",
+ " arrstn_VOG \n",
+ " arrstn_VRN \n",
+ " arrstn_YUL \n",
+ " arrstn_ZRH \n",
+ " airline_1_12 \n",
+ " airline_1_20 \n",
+ " airline_1_6Y \n",
+ " airline_1_A \n",
+ " airline_1_AO \n",
+ " airline_1_AT \n",
+ " airline_1_AU \n",
+ " airline_1_C \n",
+ " airline_1_D4 \n",
+ " airline_1_DA \n",
+ " airline_1_GJ \n",
+ " airline_1_IN \n",
+ " airline_1_PR \n",
+ " airline_1_SG \n",
+ " airline_1_TU \n",
+ " airline_1_UG \n",
+ " airline_1_UH \n",
+ " airline_1_WK \n",
+ " airline_1_X9 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " train_id_0 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 630 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 775 \n",
+ " 260 \n",
+ " 33.3675 \n",
+ " -7.58997 \n",
+ " 36.851002 \n",
+ " 10.227200 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " train_id_0 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 630 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 775 \n",
+ " 260 \n",
+ " 33.3675 \n",
+ " -7.58997 \n",
+ " 36.847685 \n",
+ " 10.217603 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " train_id_1 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 3 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 3 \n",
+ " 905 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 3 \n",
+ " 1015 \n",
+ " 20 \n",
+ " 45.6306 \n",
+ " 8.72811 \n",
+ " 36.851002 \n",
+ " 10.227200 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id datop_year datop_month datop_wd std_year std_month std_wd \n",
+ "0 train_id_0 2016 1 7 2016 1 7 \\\n",
+ "1 train_id_0 2016 1 7 2016 1 7 \n",
+ "2 train_id_1 2016 1 3 2016 1 3 \n",
+ "\n",
+ " std_min sta_year sta_month sta_wd sta_min target latitude_dep \n",
+ "0 630 2016 1 7 775 260 33.3675 \\\n",
+ "1 630 2016 1 7 775 260 33.3675 \n",
+ "2 905 2016 1 3 1015 20 45.6306 \n",
+ "\n",
+ " longitude_dep latitude_arr longitude_arr depstn_AAE depstn_AAL \n",
+ "0 -7.58997 36.851002 10.227200 0 0 \\\n",
+ "1 -7.58997 36.847685 10.217603 0 0 \n",
+ "2 8.72811 36.851002 10.227200 0 0 \n",
+ "\n",
+ " depstn_ABJ depstn_ADB depstn_AHU depstn_ALG depstn_AMM depstn_AMS \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_ARN depstn_ATH depstn_AYT depstn_BCN depstn_BDS depstn_BEG \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_BEY depstn_BJA depstn_BKO depstn_BLL depstn_BLQ depstn_BOD \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_BRI depstn_BRQ depstn_BRU depstn_BTS depstn_BUD depstn_BYJ \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_CAG depstn_CAI depstn_CDG depstn_CGN depstn_CKY depstn_CMN \n",
+ "0 0 0 0 0 0 1 \\\n",
+ "1 0 0 0 0 0 1 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_COO depstn_CPH depstn_CRL depstn_CTA depstn_CZL depstn_DJE \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_DKR depstn_DOH depstn_DSS depstn_DUS depstn_EBL depstn_EBM \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_ESB depstn_FBM depstn_FCO depstn_FIH depstn_FRA depstn_GAE \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_GAF depstn_GHA depstn_GNB depstn_GVA depstn_HAJ depstn_HAM \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_HBE depstn_IEV depstn_IST depstn_JED depstn_JIB depstn_KBP \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_KEF depstn_KGL depstn_KRR depstn_KRT depstn_KSC depstn_KTW \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_LAD depstn_LBV depstn_LED depstn_LFW depstn_LGW depstn_LHR \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_LIL depstn_LIS depstn_LJU depstn_LUX depstn_LYS depstn_MAD \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_MED depstn_MIR depstn_MLA depstn_MRS depstn_MUC depstn_MVB \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_MXP depstn_NAP depstn_NBE depstn_NCE depstn_NDR depstn_NIM \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 1 0 0 0 0 0 \n",
+ "\n",
+ " depstn_NKC depstn_NTE depstn_OPO depstn_ORN depstn_ORY depstn_OSR \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_OST depstn_OTP depstn_OUA depstn_OUD depstn_PMO depstn_PRG \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_PSA depstn_RAK depstn_RTM depstn_SFA depstn_SJJ depstn_SKG \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_SKX depstn_STR depstn_SVO depstn_SXB depstn_SXF depstn_TBJ \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_TLL depstn_TLS depstn_TMR depstn_TNG depstn_TOE depstn_TPS \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_TRN depstn_TUN depstn_VCE depstn_VIE depstn_VKO depstn_VNO \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_VOG depstn_VRN depstn_YUL depstn_ZRH status_ATA status_DEL \n",
+ "0 0 0 0 0 1 0 \\\n",
+ "1 0 0 0 0 1 0 \n",
+ "2 0 0 0 0 1 0 \n",
+ "\n",
+ " status_DEP status_RTR status_SCH arrstn_AAE arrstn_ABJ arrstn_ADB \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_AHU arrstn_ALG arrstn_AMM arrstn_AMS arrstn_ARN arrstn_ATH \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_AYT arrstn_BCN arrstn_BDS arrstn_BEG arrstn_BEY arrstn_BGY \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_BJA arrstn_BKO arrstn_BLL arrstn_BLQ arrstn_BOD arrstn_BRI \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_BRQ arrstn_BRU arrstn_BTS arrstn_BUD arrstn_BYJ arrstn_CAG \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_CAI arrstn_CDG arrstn_CGN arrstn_CKY arrstn_CMN arrstn_COO \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_CPH arrstn_CRL arrstn_CTA arrstn_CZL arrstn_DJE arrstn_DKR \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_DOH arrstn_DSS arrstn_DUS arrstn_EBL arrstn_EBM arrstn_ESB \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_FBM arrstn_FCO arrstn_FIH arrstn_FRA arrstn_GAE arrstn_GAF \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_GHA arrstn_GNB arrstn_GVA arrstn_HAJ arrstn_HAM arrstn_HBE \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_IEV arrstn_IST arrstn_JED arrstn_JIB arrstn_KBP arrstn_KGL \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_KRR arrstn_KRT arrstn_KSC arrstn_LAD arrstn_LBV arrstn_LED \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_LFW arrstn_LGW arrstn_LHR arrstn_LIL arrstn_LIS arrstn_LJU \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_LUX arrstn_LYS arrstn_MAD arrstn_MED arrstn_MIR arrstn_MLA \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_MRS arrstn_MUC arrstn_MVB arrstn_MXP arrstn_NAP arrstn_NBE \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_NCE arrstn_NDR arrstn_NIM arrstn_NKC arrstn_NTE arrstn_OPO \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_ORN arrstn_ORY arrstn_OSR arrstn_OST arrstn_OTP arrstn_OUA \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_OUD arrstn_PMO arrstn_PRG arrstn_PSA arrstn_RAK arrstn_RTM \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_SFA arrstn_SJJ arrstn_SKG arrstn_SKX arrstn_SVO arrstn_SXB \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_SXF arrstn_TBJ arrstn_TLS arrstn_TMR arrstn_TNG arrstn_TOE \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_TPS arrstn_TRN arrstn_TUN arrstn_VCE arrstn_VIE arrstn_VKO \n",
+ "0 0 0 1 0 0 0 \\\n",
+ "1 0 0 1 0 0 0 \n",
+ "2 0 0 1 0 0 0 \n",
+ "\n",
+ " arrstn_VNO arrstn_VOG arrstn_VRN arrstn_YUL arrstn_ZRH airline_1_12 \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " airline_1_20 airline_1_6Y airline_1_A airline_1_AO airline_1_AT \n",
+ "0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 \n",
+ "\n",
+ " airline_1_AU airline_1_C airline_1_D4 airline_1_DA airline_1_GJ \n",
+ "0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 \n",
+ "\n",
+ " airline_1_IN airline_1_PR airline_1_SG airline_1_TU airline_1_UG \n",
+ "0 0 0 0 1 0 \\\n",
+ "1 0 0 0 1 0 \n",
+ "2 0 0 0 1 0 \n",
+ "\n",
+ " airline_1_UH airline_1_WK airline_1_X9 \n",
+ "0 0 0 0 \n",
+ "1 0 0 0 \n",
+ "2 0 0 0 "
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Drop the raw categorical columns; the depstn_*/arrstn_*/status_*/airline_1_* indicator columns remain.\n",
+ "# errors='ignore' keeps this cell re-runnable once the columns are already gone.\n",
+ "df = df.drop(columns=['depstn', 'arrstn', 'status', 'airline_1'], errors='ignore')\n",
+ "# Column labels that occur more than once (every occurrence after the first)\n",
+ "duplicate_columns = df.columns[df.columns.duplicated()]\n",
+ "# Keep only the first occurrence of each column label\n",
+ "df = df.loc[:, ~df.columns.duplicated()]\n",
+ "df.head(3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Splitting data for testing "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Label: the 'target' column; features: every remaining column except 'id'\n",
+ "# NOTE(review): the preview above shows raw target values (260, 20) and train_id_0 on two rows -\n",
+ "# confirm the target is binarised and ids are de-duplicated before fitting a classifier.\n",
+ "y = df['target']\n",
+ "X = df.drop(columns=['target', 'id'])\n",
+ "\n",
+ "# Hold out 20% of the rows for testing; RSEED is passed as the split's random_state\n",
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
+ "    X, y, test_size=0.2, random_state=RSEED\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Training the model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Baseline model: LogisticRegression"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "KeyboardInterrupt",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[38], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Instantiate and train linear regression model\u001b[39;00m\n\u001b[0;32m 2\u001b[0m model_0 \u001b[38;5;241m=\u001b[39m LogisticRegression(max_iter\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1000\u001b[39m)\n\u001b[1;32m----> 3\u001b[0m \u001b[43mmodel_0\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train\u001b[49m\u001b[43m)\u001b[49m \n",
+ "File \u001b[1;32md:\\Github\\flight-delay-prediction\\.venv\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1291\u001b[0m, in \u001b[0;36mLogisticRegression.fit\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m 1288\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1289\u001b[0m n_threads \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m-> 1291\u001b[0m fold_coefs_ \u001b[38;5;241m=\u001b[39m \u001b[43mParallel\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mn_jobs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mprefer\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mprefer\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1292\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath_func\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1293\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1294\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1295\u001b[0m \u001b[43m \u001b[49m\u001b[43mpos_class\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mclass_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1296\u001b[0m \u001b[43m \u001b[49m\u001b[43mCs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mC_\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1297\u001b[0m \u001b[43m \u001b[49m\u001b[43ml1_ratio\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43ml1_ratio\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1298\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mfit_intercept\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_intercept\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1299\u001b[0m \u001b[43m \u001b[49m\u001b[43mtol\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtol\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1300\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1301\u001b[0m \u001b[43m \u001b[49m\u001b[43msolver\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msolver\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1302\u001b[0m \u001b[43m \u001b[49m\u001b[43mmulti_class\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmulti_class\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1303\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_iter\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_iter\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1304\u001b[0m \u001b[43m \u001b[49m\u001b[43mclass_weight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mclass_weight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1305\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheck_input\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 1306\u001b[0m \u001b[43m \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrandom_state\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1307\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mcoef\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwarm_start_coef_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1308\u001b[0m \u001b[43m \u001b[49m\u001b[43mpenalty\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpenalty\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1309\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_squared_sum\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_squared_sum\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1310\u001b[0m \u001b[43m \u001b[49m\u001b[43msample_weight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msample_weight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1311\u001b[0m \u001b[43m \u001b[49m\u001b[43mn_threads\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mn_threads\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1312\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1313\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mclass_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwarm_start_coef_\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mzip\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mclasses_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwarm_start_coef\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1314\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1316\u001b[0m fold_coefs_, _, n_iter_ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mzip\u001b[39m(\u001b[38;5;241m*\u001b[39mfold_coefs_)\n\u001b[0;32m 1317\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_iter_ \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39masarray(n_iter_, dtype\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39mint32)[:, \u001b[38;5;241m0\u001b[39m]\n",
+ "File \u001b[1;32md:\\Github\\flight-delay-prediction\\.venv\\Lib\\site-packages\\sklearn\\utils\\parallel.py:63\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 58\u001b[0m config \u001b[38;5;241m=\u001b[39m get_config()\n\u001b[0;32m 59\u001b[0m iterable_with_config \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 60\u001b[0m (_with_config(delayed_func, config), args, kwargs)\n\u001b[0;32m 61\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m delayed_func, args, kwargs \u001b[38;5;129;01min\u001b[39;00m iterable\n\u001b[0;32m 62\u001b[0m )\n\u001b[1;32m---> 63\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43miterable_with_config\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[1;32md:\\Github\\flight-delay-prediction\\.venv\\Lib\\site-packages\\joblib\\parallel.py:1918\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 1916\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_sequential_output(iterable)\n\u001b[0;32m 1917\u001b[0m \u001b[38;5;28mnext\u001b[39m(output)\n\u001b[1;32m-> 1918\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m output \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_generator \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43moutput\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1920\u001b[0m \u001b[38;5;66;03m# Let's create an ID that uniquely identifies the current call. If the\u001b[39;00m\n\u001b[0;32m 1921\u001b[0m \u001b[38;5;66;03m# call is interrupted early and that the same instance is immediately\u001b[39;00m\n\u001b[0;32m 1922\u001b[0m \u001b[38;5;66;03m# re-used, this id will be used to prevent workers that were\u001b[39;00m\n\u001b[0;32m 1923\u001b[0m \u001b[38;5;66;03m# concurrently finalizing a task from the previous call to run the\u001b[39;00m\n\u001b[0;32m 1924\u001b[0m \u001b[38;5;66;03m# callback.\u001b[39;00m\n\u001b[0;32m 1925\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_lock:\n",
+ "File \u001b[1;32md:\\Github\\flight-delay-prediction\\.venv\\Lib\\site-packages\\joblib\\parallel.py:1847\u001b[0m, in \u001b[0;36mParallel._get_sequential_output\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 1845\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_dispatched_batches \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m 1846\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_dispatched_tasks \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m-> 1847\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1848\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_completed_tasks \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m 1849\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprint_progress()\n",
+ "File \u001b[1;32md:\\Github\\flight-delay-prediction\\.venv\\Lib\\site-packages\\sklearn\\utils\\parallel.py:123\u001b[0m, in \u001b[0;36m_FuncWrapper.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 121\u001b[0m config \u001b[38;5;241m=\u001b[39m {}\n\u001b[0;32m 122\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mconfig):\n\u001b[1;32m--> 123\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunction\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[1;32md:\\Github\\flight-delay-prediction\\.venv\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:450\u001b[0m, in \u001b[0;36m_logistic_regression_path\u001b[1;34m(X, y, pos_class, Cs, fit_intercept, max_iter, tol, verbose, solver, coef, class_weight, dual, penalty, intercept_scaling, multi_class, random_state, check_input, max_squared_sum, sample_weight, l1_ratio, n_threads)\u001b[0m\n\u001b[0;32m 446\u001b[0m l2_reg_strength \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1.0\u001b[39m \u001b[38;5;241m/\u001b[39m C\n\u001b[0;32m 447\u001b[0m iprint \u001b[38;5;241m=\u001b[39m [\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m50\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m100\u001b[39m, \u001b[38;5;241m101\u001b[39m][\n\u001b[0;32m 448\u001b[0m np\u001b[38;5;241m.\u001b[39msearchsorted(np\u001b[38;5;241m.\u001b[39marray([\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m, \u001b[38;5;241m3\u001b[39m]), verbose)\n\u001b[0;32m 449\u001b[0m ]\n\u001b[1;32m--> 450\u001b[0m opt_res \u001b[38;5;241m=\u001b[39m \u001b[43moptimize\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mminimize\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 451\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 452\u001b[0m \u001b[43m \u001b[49m\u001b[43mw0\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 453\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mL-BFGS-B\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 454\u001b[0m \u001b[43m \u001b[49m\u001b[43mjac\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 455\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msample_weight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43ml2_reg_strength\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mn_threads\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 456\u001b[0m \u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43miprint\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43miprint\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgtol\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mtol\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmaxiter\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_iter\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 457\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 458\u001b[0m n_iter_i \u001b[38;5;241m=\u001b[39m _check_optimize_result(\n\u001b[0;32m 459\u001b[0m solver,\n\u001b[0;32m 460\u001b[0m opt_res,\n\u001b[0;32m 461\u001b[0m max_iter,\n\u001b[0;32m 462\u001b[0m extra_warning_msg\u001b[38;5;241m=\u001b[39m_LOGISTIC_SOLVER_CONVERGENCE_MSG,\n\u001b[0;32m 463\u001b[0m )\n\u001b[0;32m 464\u001b[0m w0, loss \u001b[38;5;241m=\u001b[39m opt_res\u001b[38;5;241m.\u001b[39mx, opt_res\u001b[38;5;241m.\u001b[39mfun\n",
+ "File \u001b[1;32md:\\Github\\flight-delay-prediction\\.venv\\Lib\\site-packages\\scipy\\optimize\\_minimize.py:731\u001b[0m, in \u001b[0;36mminimize\u001b[1;34m(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)\u001b[0m\n\u001b[0;32m 728\u001b[0m res \u001b[38;5;241m=\u001b[39m _minimize_newtoncg(fun, x0, args, jac, hess, hessp, callback,\n\u001b[0;32m 729\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39moptions)\n\u001b[0;32m 730\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m meth \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124ml-bfgs-b\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m--> 731\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43m_minimize_lbfgsb\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfun\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mx0\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mjac\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbounds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 732\u001b[0m \u001b[43m \u001b[49m\u001b[43mcallback\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcallback\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 733\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m meth \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtnc\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[0;32m 734\u001b[0m res \u001b[38;5;241m=\u001b[39m _minimize_tnc(fun, x0, args, jac, bounds, callback\u001b[38;5;241m=\u001b[39mcallback,\n\u001b[0;32m 735\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39moptions)\n",
+ "File \u001b[1;32md:\\Github\\flight-delay-prediction\\.venv\\Lib\\site-packages\\scipy\\optimize\\_lbfgsb_py.py:407\u001b[0m, in \u001b[0;36m_minimize_lbfgsb\u001b[1;34m(fun, x0, args, jac, bounds, disp, maxcor, ftol, gtol, eps, maxfun, maxiter, iprint, callback, maxls, finite_diff_rel_step, **unknown_options)\u001b[0m\n\u001b[0;32m 401\u001b[0m task_str \u001b[38;5;241m=\u001b[39m task\u001b[38;5;241m.\u001b[39mtobytes()\n\u001b[0;32m 402\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m task_str\u001b[38;5;241m.\u001b[39mstartswith(\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mFG\u001b[39m\u001b[38;5;124m'\u001b[39m):\n\u001b[0;32m 403\u001b[0m \u001b[38;5;66;03m# The minimization routine wants f and g at the current x.\u001b[39;00m\n\u001b[0;32m 404\u001b[0m \u001b[38;5;66;03m# Note that interruptions due to maxfun are postponed\u001b[39;00m\n\u001b[0;32m 405\u001b[0m \u001b[38;5;66;03m# until the completion of the current minimization iteration.\u001b[39;00m\n\u001b[0;32m 406\u001b[0m \u001b[38;5;66;03m# Overwrite f and g:\u001b[39;00m\n\u001b[1;32m--> 407\u001b[0m f, g \u001b[38;5;241m=\u001b[39m \u001b[43mfunc_and_grad\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 408\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m task_str\u001b[38;5;241m.\u001b[39mstartswith(\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mNEW_X\u001b[39m\u001b[38;5;124m'\u001b[39m):\n\u001b[0;32m 409\u001b[0m \u001b[38;5;66;03m# new iteration\u001b[39;00m\n\u001b[0;32m 410\u001b[0m n_iterations \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n",
+ "File \u001b[1;32md:\\Github\\flight-delay-prediction\\.venv\\Lib\\site-packages\\scipy\\optimize\\_differentiable_functions.py:343\u001b[0m, in \u001b[0;36mScalarFunction.fun_and_grad\u001b[1;34m(self, x)\u001b[0m\n\u001b[0;32m 341\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m np\u001b[38;5;241m.\u001b[39marray_equal(x, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mx):\n\u001b[0;32m 342\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_x(x)\n\u001b[1;32m--> 343\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_update_fun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 344\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_grad()\n\u001b[0;32m 345\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mg\n",
+ "File \u001b[1;32md:\\Github\\flight-delay-prediction\\.venv\\Lib\\site-packages\\scipy\\optimize\\_differentiable_functions.py:294\u001b[0m, in \u001b[0;36mScalarFunction._update_fun\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 292\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_update_fun\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m 293\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf_updated:\n\u001b[1;32m--> 294\u001b[0m fx \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_wrapped_fun\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 295\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m fx \u001b[38;5;241m<\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_lowest_f:\n\u001b[0;32m 296\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_lowest_x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mx\n",
+ "File \u001b[1;32md:\\Github\\flight-delay-prediction\\.venv\\Lib\\site-packages\\scipy\\optimize\\_differentiable_functions.py:20\u001b[0m, in \u001b[0;36m_wrapper_fun..wrapped\u001b[1;34m(x)\u001b[0m\n\u001b[0;32m 16\u001b[0m ncalls[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m 17\u001b[0m \u001b[38;5;66;03m# Send a copy because the user may overwrite it.\u001b[39;00m\n\u001b[0;32m 18\u001b[0m \u001b[38;5;66;03m# Overwriting results in undefined behaviour because\u001b[39;00m\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# fun(self.x) will change self.x, with the two no longer linked.\u001b[39;00m\n\u001b[1;32m---> 20\u001b[0m fx \u001b[38;5;241m=\u001b[39m \u001b[43mfun\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 21\u001b[0m \u001b[38;5;66;03m# Make sure the function returns a true scalar\u001b[39;00m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m np\u001b[38;5;241m.\u001b[39misscalar(fx):\n",
+ "File \u001b[1;32md:\\Github\\flight-delay-prediction\\.venv\\Lib\\site-packages\\scipy\\optimize\\_optimize.py:79\u001b[0m, in \u001b[0;36mMemoizeJac.__call__\u001b[1;34m(self, x, *args)\u001b[0m\n\u001b[0;32m 77\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, x, \u001b[38;5;241m*\u001b[39margs):\n\u001b[0;32m 78\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\" returns the function value \"\"\"\u001b[39;00m\n\u001b[1;32m---> 79\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_compute_if_needed\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 80\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_value\n",
+ "File \u001b[1;32md:\\Github\\flight-delay-prediction\\.venv\\Lib\\site-packages\\scipy\\optimize\\_optimize.py:73\u001b[0m, in \u001b[0;36mMemoizeJac._compute_if_needed\u001b[1;34m(self, x, *args)\u001b[0m\n\u001b[0;32m 71\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m np\u001b[38;5;241m.\u001b[39mall(x \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mx) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_value \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mjac \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 72\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mx \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39masarray(x)\u001b[38;5;241m.\u001b[39mcopy()\n\u001b[1;32m---> 73\u001b[0m fg \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfun\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 74\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mjac \u001b[38;5;241m=\u001b[39m fg[\u001b[38;5;241m1\u001b[39m]\n\u001b[0;32m 75\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_value \u001b[38;5;241m=\u001b[39m fg[\u001b[38;5;241m0\u001b[39m]\n",
+ "File \u001b[1;32md:\\Github\\flight-delay-prediction\\.venv\\Lib\\site-packages\\sklearn\\linear_model\\_linear_loss.py:278\u001b[0m, in \u001b[0;36mLinearModelLoss.loss_gradient\u001b[1;34m(self, coef, X, y, sample_weight, l2_reg_strength, n_threads, raw_prediction)\u001b[0m\n\u001b[0;32m 275\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 276\u001b[0m weights, intercept \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mweight_intercept(coef)\n\u001b[1;32m--> 278\u001b[0m loss, grad_pointwise \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbase_loss\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mloss_gradient\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 279\u001b[0m \u001b[43m \u001b[49m\u001b[43my_true\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 280\u001b[0m \u001b[43m \u001b[49m\u001b[43mraw_prediction\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mraw_prediction\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 281\u001b[0m \u001b[43m \u001b[49m\u001b[43msample_weight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msample_weight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 282\u001b[0m \u001b[43m \u001b[49m\u001b[43mn_threads\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mn_threads\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 283\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 284\u001b[0m loss \u001b[38;5;241m=\u001b[39m loss\u001b[38;5;241m.\u001b[39msum()\n\u001b[0;32m 285\u001b[0m loss \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39ml2_penalty(weights, l2_reg_strength)\n",
+ "File \u001b[1;32md:\\Github\\flight-delay-prediction\\.venv\\Lib\\site-packages\\sklearn\\_loss\\loss.py:257\u001b[0m, in \u001b[0;36mBaseLoss.loss_gradient\u001b[1;34m(self, y_true, raw_prediction, sample_weight, loss_out, gradient_out, n_threads)\u001b[0m\n\u001b[0;32m 255\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m sample_weight \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 256\u001b[0m sample_weight \u001b[38;5;241m=\u001b[39m ReadonlyArrayWrapper(sample_weight)\n\u001b[1;32m--> 257\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcloss\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mloss_gradient\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 258\u001b[0m \u001b[43m \u001b[49m\u001b[43my_true\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43my_true\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 259\u001b[0m \u001b[43m \u001b[49m\u001b[43mraw_prediction\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mraw_prediction\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 260\u001b[0m \u001b[43m \u001b[49m\u001b[43msample_weight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msample_weight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 261\u001b[0m \u001b[43m \u001b[49m\u001b[43mloss_out\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mloss_out\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 262\u001b[0m \u001b[43m \u001b[49m\u001b[43mgradient_out\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgradient_out\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 263\u001b[0m \u001b[43m \u001b[49m\u001b[43mn_threads\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mn_threads\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 264\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
+ "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
+ ]
+ }
+ ],
+ "source": [
+ "# Instantiate and train logistic regression model (max_iter=1000 iteration cap)\n",
+ "model_0 = LogisticRegression(max_iter=1000)\n",
+ "model_0.fit(X=X_train, y=y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Predict: run inference once per split and reuse the test-set result\n",
+ "y_pred_train = model_0.predict(X_train)\n",
+ "y_pred_test = model_0.predict(X_test)\n",
+ "\n",
+ "y_pred = y_pred_test  # the evaluation cell reads y_pred"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### Evaluation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Confusion matrix of the test-set predictions\n",
+ "conf_matrix = confusion_matrix(y_test, y_pred)\n",
+ "print('Confusion Matrix: ', conf_matrix)\n",
+ "\n",
+ "# Wrap the raw array in a DataFrame for plotting\n",
+ "conf_matrixf = pd.DataFrame(conf_matrix)\n",
+ "\n",
+ "# Render the matrix as an annotated heatmap on an explicit figure/axes pair\n",
+ "fig, ax = plt.subplots(figsize=(10, 7))\n",
+ "sns.heatmap(conf_matrixf, annot=True, fmt='d', cmap='YlGnBu', ax=ax)\n",
+ "ax.set_xlabel('Predicted')\n",
+ "ax.set_ylabel('True')\n",
+ "ax.set_title('Confusion Matrix')\n",
+ "plt.show()\n",
+ "\n",
+ "# Accuracy on the test set, reported to two decimals\n",
+ "accuracy = accuracy_score(y_test, y_pred)\n",
+ "print('Accuracy: ', round(accuracy, 2))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Model 1: SGD Classifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Instantiate and train linear classifier fitted with stochastic gradient descent (seeded with RSEED)\n",
+ "model_1 = SGDClassifier(random_state=RSEED)\n",
+ "model_1.fit(X=X_train, y=y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Predict once per split; y_pred (read by the evaluation cell) aliases the test-set predictions\n",
+ "y_pred_train = model_1.predict(X_train)\n",
+ "y_pred_test = model_1.predict(X_test)\n",
+ "y_pred = y_pred_test"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### Evaluation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Confusion matrix of the test-set predictions\n",
+ "conf_matrix = confusion_matrix(y_test, y_pred)\n",
+ "print('Confusion Matrix: ', conf_matrix)\n",
+ "\n",
+ "# Wrap the raw array in a DataFrame for plotting\n",
+ "conf_matrixf = pd.DataFrame(conf_matrix)\n",
+ "\n",
+ "# Render the matrix as an annotated heatmap on an explicit figure/axes pair\n",
+ "fig, ax = plt.subplots(figsize=(10, 7))\n",
+ "sns.heatmap(conf_matrixf, annot=True, fmt='d', cmap='YlGnBu', ax=ax)\n",
+ "ax.set_xlabel('Predicted')\n",
+ "ax.set_ylabel('True')\n",
+ "ax.set_title('Confusion Matrix')\n",
+ "plt.show()\n",
+ "\n",
+ "# Accuracy on the test set, reported to two decimals\n",
+ "accuracy = accuracy_score(y_test, y_pred)\n",
+ "print('Accuracy: ', round(accuracy, 2))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Model 2: KNeighborsClassifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Ridge(alpha=1, random_state=0) In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
+ ],
+ "text/plain": [
+ "Ridge(alpha=1, random_state=0)"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Instantiate and train k-nearest-neighbours classifier (library default hyperparameters)\n",
+ "model_2 = KNeighborsClassifier()\n",
+ "model_2.fit(X=X_train, y=y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Predict once per split; y_pred (read by the evaluation cell) aliases the test-set predictions\n",
+ "y_pred_train = model_2.predict(X_train)\n",
+ "y_pred_test = model_2.predict(X_test)\n",
+ "y_pred = y_pred_test"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### Evaluation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "R-squared (train): 0.078\n",
+ "R-squared (test): 0.08\n",
+ "R-squared adjusted (train): 0.076\n",
+ "R-squared adjusted (test): 0.079\n",
+ "RMSE (train): 108.161\n",
+ "RMSE (test): 104.742\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Confusion matrix of the test-set predictions\n",
+ "conf_matrix = confusion_matrix(y_test, y_pred)\n",
+ "print('Confusion Matrix: ', conf_matrix)\n",
+ "\n",
+ "# Wrap the raw array in a DataFrame for plotting\n",
+ "conf_matrixf = pd.DataFrame(conf_matrix)\n",
+ "\n",
+ "# Render the matrix as an annotated heatmap on an explicit figure/axes pair\n",
+ "fig, ax = plt.subplots(figsize=(10, 7))\n",
+ "sns.heatmap(conf_matrixf, annot=True, fmt='d', cmap='YlGnBu', ax=ax)\n",
+ "ax.set_xlabel('Predicted')\n",
+ "ax.set_ylabel('True')\n",
+ "ax.set_title('Confusion Matrix')\n",
+ "plt.show()\n",
+ "\n",
+ "# Accuracy on the test set, reported to two decimals\n",
+ "accuracy = accuracy_score(y_test, y_pred)\n",
+ "print('Accuracy: ', round(accuracy, 2))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Model 3: Decision Tree Classifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Lasso(alpha=0.5, max_iter=1000000, random_state=0) In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
+ ],
+ "text/plain": [
+ "Lasso(alpha=0.5, max_iter=1000000, random_state=0)"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Instantiate and train decision tree classifier using the Gini impurity criterion ('entropy' is the alternative)\n",
+ "model_3 = DecisionTreeClassifier(criterion='gini', random_state=RSEED)\n",
+ "model_3.fit(X_train, y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Predict once per split; y_pred (read by the evaluation cell) aliases the test-set predictions\n",
+ "y_pred_train = model_3.predict(X_train)\n",
+ "y_pred_test = model_3.predict(X_test)\n",
+ "y_pred = y_pred_test"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### Evaluation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "R-squared (train): 0.058\n",
+ "R-squared (test): 0.06\n",
+ "R-squared adjusted (train): 0.056\n",
+ "R-squared adjusted (test): 0.058\n",
+ "RMSE (train): 109.336\n",
+ "RMSE (test): 105.905\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Confusion matrix of the test-set predictions\n",
+ "conf_matrix = confusion_matrix(y_test, y_pred)\n",
+ "print('Confusion Matrix: ', conf_matrix)\n",
+ "\n",
+ "# Wrap the raw array in a DataFrame for plotting\n",
+ "conf_matrixf = pd.DataFrame(conf_matrix)\n",
+ "\n",
+ "# Render the matrix as an annotated heatmap on an explicit figure/axes pair\n",
+ "fig, ax = plt.subplots(figsize=(10, 7))\n",
+ "sns.heatmap(conf_matrixf, annot=True, fmt='d', cmap='YlGnBu', ax=ax)\n",
+ "ax.set_xlabel('Predicted')\n",
+ "ax.set_ylabel('True')\n",
+ "ax.set_title('Confusion Matrix')\n",
+ "plt.show()\n",
+ "\n",
+ "# Accuracy on the test set, reported to two decimals\n",
+ "accuracy = accuracy_score(y_test, y_pred)\n",
+ "print('Accuracy: ', round(accuracy, 2))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Model 4: Random Forest Classifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "KNeighborsRegressor() In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
+ ],
+ "text/plain": [
+ "KNeighborsRegressor()"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Instantiate and train Random Forest classifier (sqrt feature subsampling, n_jobs=-1 parallelism, verbose progress output)\n",
+ "model_4 = RandomForestClassifier(max_features='sqrt', n_jobs=-1, random_state=RSEED, verbose=1)\n",
+ "model_4.fit(X=X_train, y=y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Predict once per split; y_pred (read by the evaluation cell) aliases the test-set predictions\n",
+ "y_pred_train = model_4.predict(X_train)\n",
+ "y_pred_test = model_4.predict(X_test)\n",
+ "y_pred = y_pred_test"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### Evaluation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "R-squared (train): 0.384\n",
+ "R-squared (test): 0.068\n",
+ "R-squared adjusted (train): 0.382\n",
+ "R-squared adjusted (test): 0.066\n",
+ "RMSE (train): 88.423\n",
+ "RMSE (test): 105.441\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Confusion matrix of the test-set predictions\n",
+ "conf_matrix = confusion_matrix(y_test, y_pred)\n",
+ "print('Confusion Matrix: ', conf_matrix)\n",
+ "\n",
+ "# Wrap the raw array in a DataFrame for plotting\n",
+ "conf_matrixf = pd.DataFrame(conf_matrix)\n",
+ "\n",
+ "# Render the matrix as an annotated heatmap on an explicit figure/axes pair\n",
+ "fig, ax = plt.subplots(figsize=(10, 7))\n",
+ "sns.heatmap(conf_matrixf, annot=True, fmt='d', cmap='YlGnBu', ax=ax)\n",
+ "ax.set_xlabel('Predicted')\n",
+ "ax.set_ylabel('True')\n",
+ "ax.set_title('Confusion Matrix')\n",
+ "plt.show()\n",
+ "\n",
+ "# Accuracy on the test set, reported to two decimals\n",
+ "accuracy = accuracy_score(y_test, y_pred)\n",
+ "print('Accuracy: ', round(accuracy, 2))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Model 5: XGBClassifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[Parallel(n_jobs=1)]: Done 49 tasks | elapsed: 17.7s\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "RandomForestClassifier(max_depth=20, min_samples_leaf=5, n_jobs=1,\n",
+ " random_state=0, verbose=1) In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
+ ],
+ "text/plain": [
+ "RandomForestClassifier(max_depth=20, min_samples_leaf=5, n_jobs=1,\n",
+ " random_state=0, verbose=1)"
+ ]
+ },
+ "execution_count": 51,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Instantiate and train XGBoost classifier; NOTE(review): needs XGBClassifier imported from xgboost - confirm the import cell provides it\n",
+ "model_5 = XGBClassifier(random_state=RSEED)\n",
+ "model_5.fit(X=X_train, y=y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[Parallel(n_jobs=1)]: Done 49 tasks | elapsed: 9.8s\n",
+ "[Parallel(n_jobs=1)]: Done 49 tasks | elapsed: 39.6s\n",
+ "[Parallel(n_jobs=1)]: Done 49 tasks | elapsed: 9.9s\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Predict once per split; y_pred (read by the evaluation cell) aliases the test-set predictions\n",
+ "y_pred_train = model_5.predict(X_train)\n",
+ "y_pred_test = model_5.predict(X_test)\n",
+ "y_pred = y_pred_test"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### Evaluation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "R-squared (train): -0.173\n",
+ "R-squared (test): -0.183\n",
+ "R-squared adjusted (train): -0.175\n",
+ "R-squared adjusted (test): -0.185\n",
+ "RMSE (train): 121.987\n",
+ "RMSE (test): 118.787\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Confusion matrix of the test-set predictions\n",
+ "conf_matrix = confusion_matrix(y_test, y_pred)\n",
+ "print('Confusion Matrix: ', conf_matrix)\n",
+ "\n",
+ "# Wrap the raw array in a DataFrame for plotting\n",
+ "conf_matrixf = pd.DataFrame(conf_matrix)\n",
+ "\n",
+ "# Render the matrix as an annotated heatmap on an explicit figure/axes pair\n",
+ "fig, ax = plt.subplots(figsize=(10, 7))\n",
+ "sns.heatmap(conf_matrixf, annot=True, fmt='d', cmap='YlGnBu', ax=ax)\n",
+ "ax.set_xlabel('Predicted')\n",
+ "ax.set_ylabel('True')\n",
+ "ax.set_title('Confusion Matrix')\n",
+ "plt.show()\n",
+ "\n",
+ "# Accuracy on the test set, reported to two decimals\n",
+ "accuracy = accuracy_score(y_test, y_pred)\n",
+ "print('Accuracy: ', round(accuracy, 2))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Model 6: AdaBoost Classifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.\n",
+ "[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 2.5s\n",
+ "[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 14.3s\n",
+ "[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 31.9s finished\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "RandomForestRegressor(max_depth=30, max_features='sqrt', min_samples_leaf=5,\n",
+ " n_estimators=400, n_jobs=-1, random_state=0, verbose=1) In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
+ ],
+ "text/plain": [
+ "RandomForestRegressor(max_depth=30, max_features='sqrt', min_samples_leaf=5,\n",
+ " n_estimators=400, n_jobs=-1, random_state=0, verbose=1)"
+ ]
+ },
+ "execution_count": 70,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Instantiate and train AdaBoost classifier model (this section previously re-trained the XGBoost model from Model 5)\n",
+ "model_6 = AdaBoostClassifier(random_state=RSEED)\n",
+ "model_6.fit(X_train, y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.\n",
+ "[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s\n",
+ "[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.1s\n",
+ "[Parallel(n_jobs=16)]: Done 400 out of 400 | elapsed: 0.2s finished\n",
+ "[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.\n",
+ "[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s\n",
+ "[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.6s\n",
+ "[Parallel(n_jobs=16)]: Done 400 out of 400 | elapsed: 1.6s finished\n",
+ "[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.\n",
+ "[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s\n",
+ "[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s\n",
+ "[Parallel(n_jobs=16)]: Done 400 out of 400 | elapsed: 0.2s finished\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Predict once per split; y_pred (read by the evaluation cell) aliases the test-set predictions\n",
+ "y_pred_train = model_6.predict(X_train)\n",
+ "y_pred_test = model_6.predict(X_test)\n",
+ "y_pred = y_pred_test"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "R-squared (train): 0.227\n",
+ "R-squared (test): 0.159\n",
+ "R-squared adjusted (train): 0.225\n",
+ "R-squared adjusted (test): 0.157\n",
+ "RMSE (train): 99.051\n",
+ "RMSE (test): 100.176\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Confusion matrix of the test-set predictions\n",
+ "conf_matrix = confusion_matrix(y_test, y_pred)\n",
+ "print('Confusion Matrix: ', conf_matrix)\n",
+ "\n",
+ "# Wrap the raw array in a DataFrame for plotting\n",
+ "conf_matrixf = pd.DataFrame(conf_matrix)\n",
+ "\n",
+ "# Render the matrix as an annotated heatmap on an explicit figure/axes pair\n",
+ "fig, ax = plt.subplots(figsize=(10, 7))\n",
+ "sns.heatmap(conf_matrixf, annot=True, fmt='d', cmap='YlGnBu', ax=ax)\n",
+ "ax.set_xlabel('Predicted')\n",
+ "ax.set_ylabel('True')\n",
+ "ax.set_title('Confusion Matrix')\n",
+ "plt.show()\n",
+ "\n",
+ "# Accuracy on the test set, reported to two decimals\n",
+ "accuracy = accuracy_score(y_test, y_pred)\n",
+ "print('Accuracy: ', round(accuracy, 2))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Evaluation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Validation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Packaging"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/project_regression.ipynb b/project_regression.ipynb
new file mode 100644
index 0000000..0b097f3
--- /dev/null
+++ b/project_regression.ipynb
@@ -0,0 +1,5494 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Import"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 83,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Import of basic packages\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import operator\n",
+ "import warnings\n",
+ "\n",
+ "warnings.filterwarnings('ignore')\n",
+ "\n",
+ "# Import of chart packages\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "import plotly.express as px\n",
+ "import altair as alt\n",
+ "\n",
+ "# Import of machine learning metric packages\n",
+ "from sklearn.metrics import f1_score, classification_report, confusion_matrix, mean_squared_error, r2_score, accuracy_score, recall_score, precision_score, roc_auc_score, roc_curve, fbeta_score\n",
+ "from sklearn import metrics\n",
+ "\n",
+ "# Import of preprocessor packages\n",
+ "from sklearn import preprocessing\n",
+ "from sklearn.impute import SimpleImputer\n",
+ "from sklearn.compose import ColumnTransformer\n",
+ "from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, LabelBinarizer, PolynomialFeatures\n",
+ "\n",
+ "# Import of machine learning packages\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict, RandomizedSearchCV\n",
+ "from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression, SGDClassifier\n",
+ "from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier\n",
+ "from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree\n",
+ "from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, VotingClassifier, StackingRegressor, StackingClassifier, AdaBoostClassifier\n",
+ "#from xgboost import XGBClassifier\n",
+ "from sklearn.svm import SVC\n",
+ "\n",
+ "# Set random seed \n",
+ "RSEED = 0"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Style"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pd.set_option('display.max_columns', None)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ID \n",
+ " DATOP \n",
+ " FLTID \n",
+ " DEPSTN \n",
+ " ARRSTN \n",
+ " STD \n",
+ " STA \n",
+ " STATUS \n",
+ " AC \n",
+ " target \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " train_id_0 \n",
+ " 2016-01-03 \n",
+ " TU 0712 \n",
+ " CMN \n",
+ " TUN \n",
+ " 2016-01-03 10:30:00 \n",
+ " 2016-01-03 12.55.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 260.0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " train_id_1 \n",
+ " 2016-01-13 \n",
+ " TU 0757 \n",
+ " MXP \n",
+ " TUN \n",
+ " 2016-01-13 15:05:00 \n",
+ " 2016-01-13 16.55.00 \n",
+ " ATA \n",
+ " TU 31BIMO \n",
+ " 20.0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " train_id_2 \n",
+ " 2016-01-16 \n",
+ " TU 0214 \n",
+ " TUN \n",
+ " IST \n",
+ " 2016-01-16 04:10:00 \n",
+ " 2016-01-16 06.45.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ID DATOP FLTID DEPSTN ARRSTN STD \n",
+ "0 train_id_0 2016-01-03 TU 0712 CMN TUN 2016-01-03 10:30:00 \\\n",
+ "1 train_id_1 2016-01-13 TU 0757 MXP TUN 2016-01-13 15:05:00 \n",
+ "2 train_id_2 2016-01-16 TU 0214 TUN IST 2016-01-16 04:10:00 \n",
+ "\n",
+ " STA STATUS AC target \n",
+ "0 2016-01-03 12.55.00 ATA TU 32AIMN 260.0 \n",
+ "1 2016-01-13 16.55.00 ATA TU 31BIMO 20.0 \n",
+ "2 2016-01-16 06.45.00 ATA TU 32AIMN 0.0 "
+ ]
+ },
+ "execution_count": 85,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Load csv\n",
+ "df = pd.read_csv('data/train.csv')\n",
+ "df.head(3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Exploratory data analysis"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 86,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " statistic \n",
+ " ID \n",
+ " DATOP \n",
+ " FLTID \n",
+ " DEPSTN \n",
+ " ARRSTN \n",
+ " STD \n",
+ " STA \n",
+ " STATUS \n",
+ " AC \n",
+ " target \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " dtype \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " float64 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " mean \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 48.7 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " std \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 117.1 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " min \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 25% \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " 50% \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 14.0 \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " 75% \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 43.0 \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " max \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 3451.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " statistic ID DATOP FLTID DEPSTN ARRSTN STD STA STATUS \n",
+ "0 dtype object object object object object object object object \\\n",
+ "1 mean NaN NaN NaN NaN NaN NaN NaN NaN \n",
+ "2 std NaN NaN NaN NaN NaN NaN NaN NaN \n",
+ "3 min NaN NaN NaN NaN NaN NaN NaN NaN \n",
+ "4 25% NaN NaN NaN NaN NaN NaN NaN NaN \n",
+ "5 50% NaN NaN NaN NaN NaN NaN NaN NaN \n",
+ "6 75% NaN NaN NaN NaN NaN NaN NaN NaN \n",
+ "7 max NaN NaN NaN NaN NaN NaN NaN NaN \n",
+ "\n",
+ " AC target \n",
+ "0 object float64 \n",
+ "1 NaN 48.7 \n",
+ "2 NaN 117.1 \n",
+ "3 NaN 0.0 \n",
+ "4 NaN 0.0 \n",
+ "5 NaN 14.0 \n",
+ "6 NaN 43.0 \n",
+ "7 NaN 3451.0 "
+ ]
+ },
+ "execution_count": 86,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# EDA info & describe\n",
+ "info = pd.concat([\n",
+ "df.dtypes.to_frame().T,\n",
+ "df.mean(numeric_only=True).to_frame().T,\n",
+ "df.std(numeric_only=True).to_frame().T,\n",
+ "df.min(numeric_only=True).to_frame().T,\n",
+ "df.quantile(0.25, numeric_only=True).to_frame().T,\n",
+ "df.quantile(0.5, numeric_only=True).to_frame().T, \n",
+ "df.quantile(0.75, numeric_only=True).to_frame().T,\n",
+ "df.max(numeric_only=True).to_frame().T,], ignore_index=True).applymap(lambda x: round(x, 1) if isinstance(x, (int, float)) else x)\n",
+ "\n",
+ "info.insert(0, 'statistic', ['dtype', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])\n",
+ "info"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 87,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ID \n",
+ " DATOP \n",
+ " FLTID \n",
+ " DEPSTN \n",
+ " ARRSTN \n",
+ " STD \n",
+ " STA \n",
+ " STATUS \n",
+ " AC \n",
+ " target \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Empty DataFrame\n",
+ "Columns: [ID, DATOP, FLTID, DEPSTN, ARRSTN, STD, STA, STATUS, AC, target]\n",
+ "Index: []"
+ ]
+ },
+ "execution_count": 87,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# EDA duplicates\n",
+ "duplicates = df.duplicated().sum()\n",
+ "duplicate_percentage = round((duplicates / df.shape[0]) * 100, 1)\n",
+ "df[df.duplicated(keep=False)].head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 88,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Amount \n",
+ " Percentage \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Empty DataFrame\n",
+ "Columns: [Amount, Percentage]\n",
+ "Index: []"
+ ]
+ },
+ "execution_count": 88,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# EDA NaNs\n",
+ "missing = pd.DataFrame(df.isnull().sum(), columns=['Amount'])\n",
+ "missing['Percentage'] = round((missing['Amount']/df.shape[0]) * 100, 1)\n",
+ "missing[missing['Amount'] != 0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 89,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of rows and columns: (107833, 10)\n",
+ "--------------------------------------------------\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ID \n",
+ " DATOP \n",
+ " FLTID \n",
+ " DEPSTN \n",
+ " ARRSTN \n",
+ " STD \n",
+ " STA \n",
+ " STATUS \n",
+ " AC \n",
+ " target \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " train_id_0 \n",
+ " 2016-01-03 \n",
+ " TU 0712 \n",
+ " CMN \n",
+ " TUN \n",
+ " 2016-01-03 10:30:00 \n",
+ " 2016-01-03 12.55.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 260.0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " train_id_1 \n",
+ " 2016-01-13 \n",
+ " TU 0757 \n",
+ " MXP \n",
+ " TUN \n",
+ " 2016-01-13 15:05:00 \n",
+ " 2016-01-13 16.55.00 \n",
+ " ATA \n",
+ " TU 31BIMO \n",
+ " 20.0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " train_id_2 \n",
+ " 2016-01-16 \n",
+ " TU 0214 \n",
+ " TUN \n",
+ " IST \n",
+ " 2016-01-16 04:10:00 \n",
+ " 2016-01-16 06.45.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " train_id_107830 \n",
+ " 2018-11-07 \n",
+ " SGT 0000 \n",
+ " TUN \n",
+ " TUN \n",
+ " 2018-11-07 05:00:00 \n",
+ " 2018-11-07 12.50.00 \n",
+ " SCH \n",
+ " TU 736IOK \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " train_id_107831 \n",
+ " 2018-01-23 \n",
+ " UG 0010 \n",
+ " TUN \n",
+ " DJE \n",
+ " 2018-01-23 18:00:00 \n",
+ " 2018-01-23 18.45.00 \n",
+ " ATA \n",
+ " TU CR9ISA \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " train_id_107832 \n",
+ " 2018-11-13 \n",
+ " UG 0002 \n",
+ " TUN \n",
+ " DJE \n",
+ " 2018-11-13 06:15:00 \n",
+ " 2018-11-13 07.05.00 \n",
+ " SCH \n",
+ " TU CR9ISA \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ID DATOP FLTID DEPSTN ARRSTN STD \n",
+ "0 train_id_0 2016-01-03 TU 0712 CMN TUN 2016-01-03 10:30:00 \\\n",
+ "1 train_id_1 2016-01-13 TU 0757 MXP TUN 2016-01-13 15:05:00 \n",
+ "2 train_id_2 2016-01-16 TU 0214 TUN IST 2016-01-16 04:10:00 \n",
+ "3 train_id_107830 2018-11-07 SGT 0000 TUN TUN 2018-11-07 05:00:00 \n",
+ "4 train_id_107831 2018-01-23 UG 0010 TUN DJE 2018-01-23 18:00:00 \n",
+ "5 train_id_107832 2018-11-13 UG 0002 TUN DJE 2018-11-13 06:15:00 \n",
+ "\n",
+ " STA STATUS AC target \n",
+ "0 2016-01-03 12.55.00 ATA TU 32AIMN 260.0 \n",
+ "1 2016-01-13 16.55.00 ATA TU 31BIMO 20.0 \n",
+ "2 2016-01-16 06.45.00 ATA TU 32AIMN 0.0 \n",
+ "3 2018-11-07 12.50.00 SCH TU 736IOK 0.0 \n",
+ "4 2018-01-23 18.45.00 ATA TU CR9ISA 0.0 \n",
+ "5 2018-11-13 07.05.00 SCH TU CR9ISA 0.0 "
+ ]
+ },
+ "execution_count": 89,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# EDA shape\n",
+ "print('Number of rows and columns: ',df.shape)\n",
+ "print('-'*50)\n",
+ "pd.concat([df.head(3), df.tail(3)]).reset_index(drop=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 90,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ID \n",
+ " STA \n",
+ " STD \n",
+ " FLTID \n",
+ " DATOP \n",
+ " target \n",
+ " DEPSTN \n",
+ " ARRSTN \n",
+ " AC \n",
+ " STATUS \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " Amount \n",
+ " 107833 \n",
+ " 85136 \n",
+ " 81697 \n",
+ " 1861 \n",
+ " 1011 \n",
+ " 968 \n",
+ " 132 \n",
+ " 128 \n",
+ " 68 \n",
+ " 5 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ID STA STD FLTID DATOP target DEPSTN ARRSTN AC STATUS\n",
+ "Amount 107833 85136 81697 1861 1011 968 132 128 68 5"
+ ]
+ },
+ "execution_count": 90,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# EDA Uniques\n",
+ "unique_counts = pd.DataFrame(df.nunique(), columns=['Amount']).sort_values('Amount', ascending=False).T\n",
+ "unique_counts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 91,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ID \n",
+ " DATOP \n",
+ " FLTID \n",
+ " DEPSTN \n",
+ " ARRSTN \n",
+ " STD \n",
+ " STA \n",
+ " STATUS \n",
+ " AC \n",
+ " target \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " train_id_0 \n",
+ " 2016-01-03 \n",
+ " TU 0712 \n",
+ " CMN \n",
+ " TUN \n",
+ " 2016-01-03 10:30:00 \n",
+ " 2016-01-03 12.55.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 260.0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " train_id_1 \n",
+ " 2016-01-13 \n",
+ " TU 0757 \n",
+ " MXP \n",
+ " TUN \n",
+ " 2016-01-13 15:05:00 \n",
+ " 2016-01-13 16.55.00 \n",
+ " ATA \n",
+ " TU 31BIMO \n",
+ " 20.0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " train_id_2 \n",
+ " 2016-01-16 \n",
+ " TU 0214 \n",
+ " TUN \n",
+ " IST \n",
+ " 2016-01-16 04:10:00 \n",
+ " 2016-01-16 06.45.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " train_id_3 \n",
+ " 2016-01-17 \n",
+ " TU 0480 \n",
+ " DJE \n",
+ " NTE \n",
+ " 2016-01-17 14:10:00 \n",
+ " 2016-01-17 17.00.00 \n",
+ " ATA \n",
+ " TU 736IOK \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " train_id_4 \n",
+ " 2016-01-17 \n",
+ " TU 0338 \n",
+ " TUN \n",
+ " ALG \n",
+ " 2016-01-17 14:30:00 \n",
+ " 2016-01-17 15.50.00 \n",
+ " ATA \n",
+ " TU 320IMU \n",
+ " 22.0 \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " train_id_5 \n",
+ " 2016-01-17 \n",
+ " TU 0283 \n",
+ " TLS \n",
+ " TUN \n",
+ " 2016-01-17 16:20:00 \n",
+ " 2016-01-17 18.15.00 \n",
+ " ATA \n",
+ " TU 736IOP \n",
+ " 53.0 \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " train_id_6 \n",
+ " 2016-01-18 \n",
+ " TU 0514 \n",
+ " TUN \n",
+ " BCN \n",
+ " 2016-01-18 07:15:00 \n",
+ " 2016-01-18 09.00.00 \n",
+ " ATA \n",
+ " TU 32AIMH \n",
+ " 10.0 \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " train_id_7 \n",
+ " 2016-01-18 \n",
+ " TU 0716 \n",
+ " TUN \n",
+ " ORY \n",
+ " 2016-01-18 07:35:00 \n",
+ " 2016-01-18 09.55.00 \n",
+ " ATA \n",
+ " TU 32AIMI \n",
+ " 15.0 \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " train_id_8 \n",
+ " 2016-01-18 \n",
+ " TU 0752 \n",
+ " TUN \n",
+ " FCO \n",
+ " 2016-01-18 07:40:00 \n",
+ " 2016-01-18 09.00.00 \n",
+ " ATA \n",
+ " TU 32AIMC \n",
+ " 16.0 \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " train_id_9 \n",
+ " 2016-01-18 \n",
+ " TU 0996 \n",
+ " TUN \n",
+ " NCE \n",
+ " 2016-01-18 07:45:00 \n",
+ " 2016-01-18 09.15.00 \n",
+ " ATA \n",
+ " TU 31AIMK \n",
+ " 21.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ID DATOP FLTID DEPSTN ARRSTN STD \n",
+ "0 train_id_0 2016-01-03 TU 0712 CMN TUN 2016-01-03 10:30:00 \\\n",
+ "1 train_id_1 2016-01-13 TU 0757 MXP TUN 2016-01-13 15:05:00 \n",
+ "2 train_id_2 2016-01-16 TU 0214 TUN IST 2016-01-16 04:10:00 \n",
+ "3 train_id_3 2016-01-17 TU 0480 DJE NTE 2016-01-17 14:10:00 \n",
+ "4 train_id_4 2016-01-17 TU 0338 TUN ALG 2016-01-17 14:30:00 \n",
+ "5 train_id_5 2016-01-17 TU 0283 TLS TUN 2016-01-17 16:20:00 \n",
+ "6 train_id_6 2016-01-18 TU 0514 TUN BCN 2016-01-18 07:15:00 \n",
+ "7 train_id_7 2016-01-18 TU 0716 TUN ORY 2016-01-18 07:35:00 \n",
+ "8 train_id_8 2016-01-18 TU 0752 TUN FCO 2016-01-18 07:40:00 \n",
+ "9 train_id_9 2016-01-18 TU 0996 TUN NCE 2016-01-18 07:45:00 \n",
+ "\n",
+ " STA STATUS AC target \n",
+ "0 2016-01-03 12.55.00 ATA TU 32AIMN 260.0 \n",
+ "1 2016-01-13 16.55.00 ATA TU 31BIMO 20.0 \n",
+ "2 2016-01-16 06.45.00 ATA TU 32AIMN 0.0 \n",
+ "3 2016-01-17 17.00.00 ATA TU 736IOK 0.0 \n",
+ "4 2016-01-17 15.50.00 ATA TU 320IMU 22.0 \n",
+ "5 2016-01-17 18.15.00 ATA TU 736IOP 53.0 \n",
+ "6 2016-01-18 09.00.00 ATA TU 32AIMH 10.0 \n",
+ "7 2016-01-18 09.55.00 ATA TU 32AIMI 15.0 \n",
+ "8 2016-01-18 09.00.00 ATA TU 32AIMC 16.0 \n",
+ "9 2016-01-18 09.15.00 ATA TU 31AIMK 21.0 "
+ ]
+ },
+ "execution_count": 91,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['CMN', 'MXP', 'TUN', 'DJE', 'TLS', 'IST', 'ORY', 'MIR', 'BRU',\n",
+ " 'ABJ', 'VCE', 'AMS', 'FRA', 'BCN', 'JED', 'ALG', 'LIS', 'SXB',\n",
+ " 'LYS', 'OUA', 'LGW', 'BEY', 'NCE', 'OPO', 'MRS', 'DUS', 'SFA',\n",
+ " 'FCO', 'CDG', 'NKC', 'NTE', 'ZRH', 'GVA', 'OUD', 'MUC', 'SXF',\n",
+ " 'HAM', 'NDR', 'NBE', 'CAI', 'BEG', 'VIE', 'ORN', 'MAD', 'TOE',\n",
+ " 'BKO', 'DKR', 'KGL', 'BLQ', 'MLA', 'AHU', 'LHR', 'BOD', 'PRG',\n",
+ " 'LJU', 'SVO', 'MED', 'BUD', 'ARN', 'CPH', 'CRL', 'OST', 'TNG',\n",
+ " 'GAF', 'NAP', 'BRQ', 'OSR', 'YUL', 'NIM', 'TMR', 'JIB', 'CGN',\n",
+ " 'EBL', 'GAE', 'BJA', 'AYT', 'RAK', 'LFW', 'LIL', 'PMO', 'FBM',\n",
+ " 'TBJ', 'PSA', 'KRT', 'GNB', 'CZL', 'MVB', 'VNO', 'ESB', 'LBV',\n",
+ " 'CKY', 'LED', 'KSC', 'BTS', 'AMM', 'OTP', 'TRN', 'IEV', 'HBE',\n",
+ " 'CAG', 'KBP', 'ATH', 'SKG', 'ADB', 'DSS', 'DOH', 'COO', 'LUX',\n",
+ " 'FIH', 'BYJ', 'KEF', 'EBM', 'BDS', 'AAL', 'VKO', 'AAE', 'BRI',\n",
+ " 'VRN', 'SKX', 'HAJ', 'BLL', 'TLL', 'VOG', 'LAD', 'GHA', 'KTW',\n",
+ " 'SJJ', 'KRR', 'RTM', 'STR', 'TPS', 'CTA'], dtype=object)"
+ ]
+ },
+ "execution_count": 92,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Get unique departure airports\n",
+ "df[\"DEPSTN\"].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 93,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['TUN', 'IST', 'NTE', 'ALG', 'BCN', 'ORY', 'FCO', 'NCE', 'MRS',\n",
+ " 'MED', 'FRA', 'BRU', 'DJE', 'LYS', 'CMN', 'BEG', 'OUA', 'GVA',\n",
+ " 'MXP', 'BEY', 'MAD', 'JED', 'ABJ', 'VIE', 'MLA', 'BLQ', 'SFA',\n",
+ " 'LIS', 'LHR', 'CDG', 'MIR', 'CAI', 'DUS', 'HAM', 'NBE', 'ZRH',\n",
+ " 'AMS', 'NDR', 'TLS', 'VCE', 'SXB', 'MUC', 'LGW', 'CRL', 'ORN',\n",
+ " 'DKR', 'BOD', 'SXF', 'LJU', 'OST', 'NKC', 'BKO', 'TOE', 'AHU',\n",
+ " 'YUL', 'PRG', 'CPH', 'ARN', 'OUD', 'BRQ', 'GAF', 'JIB', 'BUD',\n",
+ " 'OPO', 'KGL', 'NIM', 'SVO', 'LIL', 'OSR', 'EBL', 'TNG', 'PSA',\n",
+ " 'CGN', 'AYT', 'GAE', 'NAP', 'BJA', 'KRT', 'LFW', 'TBJ', 'PMO',\n",
+ " 'TMR', 'FBM', 'RAK', 'GNB', 'ESB', 'CZL', 'LBV', 'KSC', 'CKY',\n",
+ " 'AMM', 'LED', 'BTS', 'MVB', 'HBE', 'OTP', 'CAG', 'VNO', 'TRN',\n",
+ " 'ATH', 'ADB', 'SKG', 'BYJ', 'DSS', 'COO', 'IEV', 'LUX', 'KBP',\n",
+ " 'DOH', 'FIH', 'EBM', 'BDS', 'VKO', 'AAE', 'BLL', 'HAJ', 'BRI',\n",
+ " 'CTA', 'VRN', 'SKX', 'VOG', 'BGY', 'LAD', 'KRR', 'SJJ', 'GHA',\n",
+ " 'RTM', 'TPS'], dtype=object)"
+ ]
+ },
+ "execution_count": 93,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Get unique arrival airports\n",
+ "df[\"ARRSTN\"].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 94,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Index: 69665 entries, 0 to 107675\n",
+ "Data columns (total 10 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 ID 69665 non-null object \n",
+ " 1 DATOP 69665 non-null object \n",
+ " 2 FLTID 69665 non-null object \n",
+ " 3 DEPSTN 69665 non-null object \n",
+ " 4 ARRSTN 69665 non-null object \n",
+ " 5 STD 69665 non-null object \n",
+ " 6 STA 69665 non-null object \n",
+ " 7 STATUS 69665 non-null object \n",
+ " 8 AC 69665 non-null object \n",
+ " 9 target 69665 non-null float64\n",
+ "dtypes: float64(1), object(9)\n",
+ "memory usage: 5.8+ MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Get dataframe for delayed flights\n",
+ "df_delayed = df.copy()\n",
+ "df_delayed = df_delayed[df_delayed[\"target\"] > 0]\n",
+ "df_delayed.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 95,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "General statistics of target:\n",
+ " count 107833.000000\n",
+ "mean 48.733013\n",
+ "std 117.135562\n",
+ "min 0.000000\n",
+ "25% 0.000000\n",
+ "50% 14.000000\n",
+ "75% 43.000000\n",
+ "max 3451.000000\n",
+ "Name: target, dtype: float64\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Overview of target distribution\n",
+ "print(\"General statistics of target:\\n\", (df['target']).describe())\n",
+ "\n",
+ "# Creating a figure composed of two matplotlib.Axes objects (ax_box and ax_hist)\n",
+ "f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={\"height_ratios\": (.15, .85)})\n",
+ " \n",
+ "# Assigning a graph to each ax\n",
+ "sns.boxplot(df[\"target\"], orient=\"h\", ax=ax_box, color='lightblue')\n",
+ "sns.histplot(data=df, x=\"target\", ax=ax_hist, color='lightblue')\n",
+ "\n",
+ "# Remove x axis name for the boxplot\n",
+ "ax_box.set(xlabel='')\n",
+ "ax_box.set_xlim([-100, 500])\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 96,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "General statistics of target (of delayed flights):\n",
+ " count 69665.000000\n",
+ "mean 75.432814\n",
+ "std 138.650946\n",
+ "min 1.000000\n",
+ "25% 15.000000\n",
+ "50% 30.000000\n",
+ "75% 74.000000\n",
+ "max 3451.000000\n",
+ "Name: target, dtype: float64\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Overview of target distribution (of delayed flights)\n",
+ "print(\"General statistics of target (of delayed flights):\\n\", (df_delayed['target']).describe())\n",
+ "\n",
+ "# Creating a figure composed of two matplotlib.Axes objects (ax_box and ax_hist)\n",
+ "f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={\"height_ratios\": (.15, .85)})\n",
+ " \n",
+ "# Assigning a graph to each ax\n",
+ "sns.boxplot(df_delayed[\"target\"], orient=\"h\", ax=ax_box, color='lightblue')\n",
+ "sns.histplot(data=df_delayed, x=\"target\", ax=ax_hist, color='lightblue')\n",
+ "\n",
+ "# Remove x axis name for the boxplot\n",
+ "ax_box.set(xlabel='')\n",
+ "ax_box.set_xlim([-20, 500])\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Data cleaning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 97,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Clean column names\n",
+ "df.columns = df.columns.str.replace(' ','_')\n",
+ "df.columns = df.columns.str.lower()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Feature 'ac' holds information about the model of the airplane. The model code is extracted from it into a new 'airplane_model' column."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 98,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " datop \n",
+ " fltid \n",
+ " depstn \n",
+ " arrstn \n",
+ " std \n",
+ " sta \n",
+ " status \n",
+ " ac \n",
+ " target \n",
+ " airplane_model \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " train_id_0 \n",
+ " 2016-01-03 \n",
+ " TU 0712 \n",
+ " CMN \n",
+ " TUN \n",
+ " 2016-01-03 10:30:00 \n",
+ " 2016-01-03 12.55.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 260.0 \n",
+ " 32A \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " train_id_1 \n",
+ " 2016-01-13 \n",
+ " TU 0757 \n",
+ " MXP \n",
+ " TUN \n",
+ " 2016-01-13 15:05:00 \n",
+ " 2016-01-13 16.55.00 \n",
+ " ATA \n",
+ " TU 31BIMO \n",
+ " 20.0 \n",
+ " 31B \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " train_id_2 \n",
+ " 2016-01-16 \n",
+ " TU 0214 \n",
+ " TUN \n",
+ " IST \n",
+ " 2016-01-16 04:10:00 \n",
+ " 2016-01-16 06.45.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 0.0 \n",
+ " 32A \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " train_id_3 \n",
+ " 2016-01-17 \n",
+ " TU 0480 \n",
+ " DJE \n",
+ " NTE \n",
+ " 2016-01-17 14:10:00 \n",
+ " 2016-01-17 17.00.00 \n",
+ " ATA \n",
+ " TU 736IOK \n",
+ " 0.0 \n",
+ " 736 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " train_id_4 \n",
+ " 2016-01-17 \n",
+ " TU 0338 \n",
+ " TUN \n",
+ " ALG \n",
+ " 2016-01-17 14:30:00 \n",
+ " 2016-01-17 15.50.00 \n",
+ " ATA \n",
+ " TU 320IMU \n",
+ " 22.0 \n",
+ " 320 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id datop fltid depstn arrstn std \n",
+ "0 train_id_0 2016-01-03 TU 0712 CMN TUN 2016-01-03 10:30:00 \\\n",
+ "1 train_id_1 2016-01-13 TU 0757 MXP TUN 2016-01-13 15:05:00 \n",
+ "2 train_id_2 2016-01-16 TU 0214 TUN IST 2016-01-16 04:10:00 \n",
+ "3 train_id_3 2016-01-17 TU 0480 DJE NTE 2016-01-17 14:10:00 \n",
+ "4 train_id_4 2016-01-17 TU 0338 TUN ALG 2016-01-17 14:30:00 \n",
+ "\n",
+ " sta status ac target airplane_model \n",
+ "0 2016-01-03 12.55.00 ATA TU 32AIMN 260.0 32A \n",
+ "1 2016-01-13 16.55.00 ATA TU 31BIMO 20.0 31B \n",
+ "2 2016-01-16 06.45.00 ATA TU 32AIMN 0.0 32A \n",
+ "3 2016-01-17 17.00.00 ATA TU 736IOK 0.0 736 \n",
+ "4 2016-01-17 15.50.00 ATA TU 320IMU 22.0 320 "
+ ]
+ },
+ "execution_count": 98,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['airplane_model'] = df['ac'].str[3:6]\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 99,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " sta \n",
+ " std \n",
+ " fltid \n",
+ " datop \n",
+ " target \n",
+ " depstn \n",
+ " arrstn \n",
+ " ac \n",
+ " airplane_model \n",
+ " status \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " Amount \n",
+ " 107833 \n",
+ " 85136 \n",
+ " 81697 \n",
+ " 1861 \n",
+ " 1011 \n",
+ " 968 \n",
+ " 132 \n",
+ " 128 \n",
+ " 68 \n",
+ " 16 \n",
+ " 5 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id sta std fltid datop target depstn arrstn ac \n",
+ "Amount 107833 85136 81697 1861 1011 968 132 128 68 \\\n",
+ "\n",
+ " airplane_model status \n",
+ "Amount 16 5 "
+ ]
+ },
+ "execution_count": 99,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# EDA Uniques\n",
+ "unique_counts = pd.DataFrame(df.nunique(), columns=['Amount']).sort_values('Amount', ascending=False).T\n",
+ "unique_counts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 100,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'319',\n",
+ " '31A',\n",
+ " '31B',\n",
+ " '320',\n",
+ " '321',\n",
+ " '32A',\n",
+ " '332',\n",
+ " '343',\n",
+ " '345',\n",
+ " '733',\n",
+ " '734',\n",
+ " '736',\n",
+ " '738',\n",
+ " 'AT7',\n",
+ " 'CR9',\n",
+ " 'M87'}"
+ ]
+ },
+ "execution_count": 100,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "set(df['airplane_model'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 101,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "replacement_dict = {  # aircraft-type code -> manufacturer; covers every code in set(df['airplane_model'])\n",
+ " '319': 'Airbus',  # previously unmapped\n",
+ " '31A': 'Airbus',\n",
+ " '31B': 'Airbus',\n",
+ " '320': 'Airbus',\n",
+ " '321': 'Airbus',\n",
+ " '32A': 'Airbus',\n",
+ " '332': 'Airbus',\n",
+ " '343': 'Airbus',\n",
+ " '345': 'Airbus',\n",
+ " '733': 'Boeing',\n",
+ " '734': 'Boeing',\n",
+ " '736': 'Boeing',\n",
+ " '738': 'Boeing',  # previously unmapped\n",
+ " 'AT7': 'ATR',\n",
+ " 'CR9': 'Bombardier',\n",
+ " 'M87': 'McDonnell Douglas'  # previously unmapped\n",
+ "}\n",
+ "# Derive 'producer' by replacing each model code with its manufacturer\n",
+ "df['producer'] = df['airplane_model'].replace(replacement_dict)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 102,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " datop \n",
+ " fltid \n",
+ " depstn \n",
+ " arrstn \n",
+ " std \n",
+ " sta \n",
+ " status \n",
+ " ac \n",
+ " target \n",
+ " airplane_model \n",
+ " producer \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " train_id_0 \n",
+ " 2016-01-03 \n",
+ " TU 0712 \n",
+ " CMN \n",
+ " TUN \n",
+ " 2016-01-03 10:30:00 \n",
+ " 2016-01-03 12.55.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 260.0 \n",
+ " 32A \n",
+ " Airbus \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " train_id_1 \n",
+ " 2016-01-13 \n",
+ " TU 0757 \n",
+ " MXP \n",
+ " TUN \n",
+ " 2016-01-13 15:05:00 \n",
+ " 2016-01-13 16.55.00 \n",
+ " ATA \n",
+ " TU 31BIMO \n",
+ " 20.0 \n",
+ " 31B \n",
+ " Airbus \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " train_id_2 \n",
+ " 2016-01-16 \n",
+ " TU 0214 \n",
+ " TUN \n",
+ " IST \n",
+ " 2016-01-16 04:10:00 \n",
+ " 2016-01-16 06.45.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 0.0 \n",
+ " 32A \n",
+ " Airbus \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " train_id_3 \n",
+ " 2016-01-17 \n",
+ " TU 0480 \n",
+ " DJE \n",
+ " NTE \n",
+ " 2016-01-17 14:10:00 \n",
+ " 2016-01-17 17.00.00 \n",
+ " ATA \n",
+ " TU 736IOK \n",
+ " 0.0 \n",
+ " 736 \n",
+ " Boeing \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " train_id_4 \n",
+ " 2016-01-17 \n",
+ " TU 0338 \n",
+ " TUN \n",
+ " ALG \n",
+ " 2016-01-17 14:30:00 \n",
+ " 2016-01-17 15.50.00 \n",
+ " ATA \n",
+ " TU 320IMU \n",
+ " 22.0 \n",
+ " 320 \n",
+ " Airbus \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id datop fltid depstn arrstn std \n",
+ "0 train_id_0 2016-01-03 TU 0712 CMN TUN 2016-01-03 10:30:00 \\\n",
+ "1 train_id_1 2016-01-13 TU 0757 MXP TUN 2016-01-13 15:05:00 \n",
+ "2 train_id_2 2016-01-16 TU 0214 TUN IST 2016-01-16 04:10:00 \n",
+ "3 train_id_3 2016-01-17 TU 0480 DJE NTE 2016-01-17 14:10:00 \n",
+ "4 train_id_4 2016-01-17 TU 0338 TUN ALG 2016-01-17 14:30:00 \n",
+ "\n",
+ " sta status ac target airplane_model producer \n",
+ "0 2016-01-03 12.55.00 ATA TU 32AIMN 260.0 32A Airbus \n",
+ "1 2016-01-13 16.55.00 ATA TU 31BIMO 20.0 31B Airbus \n",
+ "2 2016-01-16 06.45.00 ATA TU 32AIMN 0.0 32A Airbus \n",
+ "3 2016-01-17 17.00.00 ATA TU 736IOK 0.0 736 Boeing \n",
+ "4 2016-01-17 15.50.00 ATA TU 320IMU 22.0 320 Airbus "
+ ]
+ },
+ "execution_count": 102,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 103,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " datop \n",
+ " fltid \n",
+ " depstn \n",
+ " arrstn \n",
+ " std \n",
+ " sta \n",
+ " status \n",
+ " ac \n",
+ " target \n",
+ " airplane_model \n",
+ " producer \n",
+ " airline_1 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " train_id_0 \n",
+ " 2016-01-03 \n",
+ " TU 0712 \n",
+ " CMN \n",
+ " TUN \n",
+ " 2016-01-03 10:30:00 \n",
+ " 2016-01-03 12.55.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 260.0 \n",
+ " 32A \n",
+ " Airbus \n",
+ " TU \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " train_id_1 \n",
+ " 2016-01-13 \n",
+ " TU 0757 \n",
+ " MXP \n",
+ " TUN \n",
+ " 2016-01-13 15:05:00 \n",
+ " 2016-01-13 16.55.00 \n",
+ " ATA \n",
+ " TU 31BIMO \n",
+ " 20.0 \n",
+ " 31B \n",
+ " Airbus \n",
+ " TU \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " train_id_2 \n",
+ " 2016-01-16 \n",
+ " TU 0214 \n",
+ " TUN \n",
+ " IST \n",
+ " 2016-01-16 04:10:00 \n",
+ " 2016-01-16 06.45.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 0.0 \n",
+ " 32A \n",
+ " Airbus \n",
+ " TU \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " train_id_3 \n",
+ " 2016-01-17 \n",
+ " TU 0480 \n",
+ " DJE \n",
+ " NTE \n",
+ " 2016-01-17 14:10:00 \n",
+ " 2016-01-17 17.00.00 \n",
+ " ATA \n",
+ " TU 736IOK \n",
+ " 0.0 \n",
+ " 736 \n",
+ " Boeing \n",
+ " TU \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " train_id_4 \n",
+ " 2016-01-17 \n",
+ " TU 0338 \n",
+ " TUN \n",
+ " ALG \n",
+ " 2016-01-17 14:30:00 \n",
+ " 2016-01-17 15.50.00 \n",
+ " ATA \n",
+ " TU 320IMU \n",
+ " 22.0 \n",
+ " 320 \n",
+ " Airbus \n",
+ " TU \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id datop fltid depstn arrstn std \n",
+ "0 train_id_0 2016-01-03 TU 0712 CMN TUN 2016-01-03 10:30:00 \\\n",
+ "1 train_id_1 2016-01-13 TU 0757 MXP TUN 2016-01-13 15:05:00 \n",
+ "2 train_id_2 2016-01-16 TU 0214 TUN IST 2016-01-16 04:10:00 \n",
+ "3 train_id_3 2016-01-17 TU 0480 DJE NTE 2016-01-17 14:10:00 \n",
+ "4 train_id_4 2016-01-17 TU 0338 TUN ALG 2016-01-17 14:30:00 \n",
+ "\n",
+ " sta status ac target airplane_model producer \n",
+ "0 2016-01-03 12.55.00 ATA TU 32AIMN 260.0 32A Airbus \\\n",
+ "1 2016-01-13 16.55.00 ATA TU 31BIMO 20.0 31B Airbus \n",
+ "2 2016-01-16 06.45.00 ATA TU 32AIMN 0.0 32A Airbus \n",
+ "3 2016-01-17 17.00.00 ATA TU 736IOK 0.0 736 Boeing \n",
+ "4 2016-01-17 15.50.00 ATA TU 320IMU 22.0 320 Airbus \n",
+ "\n",
+ " airline_1 \n",
+ "0 TU \n",
+ "1 TU \n",
+ "2 TU \n",
+ "3 TU \n",
+ "4 TU "
+ ]
+ },
+ "execution_count": 103,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Airline code candidate: the first two characters of the flight id\n",
+ "df['airline_1'] = df['fltid'].str.slice(0, 2)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 104,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'12',\n",
+ " '20',\n",
+ " '6Y',\n",
+ " 'A ',\n",
+ " 'AO',\n",
+ " 'AT',\n",
+ " 'AU',\n",
+ " 'C ',\n",
+ " 'D4',\n",
+ " 'DA',\n",
+ " 'GJ',\n",
+ " 'IN',\n",
+ " 'PR',\n",
+ " 'SG',\n",
+ " 'TU',\n",
+ " 'UG',\n",
+ " 'UH',\n",
+ " 'WK',\n",
+ " 'X9'}"
+ ]
+ },
+ "execution_count": 104,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "set(df['airline_1'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 105,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " datop \n",
+ " fltid \n",
+ " depstn \n",
+ " arrstn \n",
+ " std \n",
+ " sta \n",
+ " status \n",
+ " ac \n",
+ " target \n",
+ " airplane_model \n",
+ " producer \n",
+ " airline_1 \n",
+ " airline_2 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " train_id_0 \n",
+ " 2016-01-03 \n",
+ " TU 0712 \n",
+ " CMN \n",
+ " TUN \n",
+ " 2016-01-03 10:30:00 \n",
+ " 2016-01-03 12.55.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 260.0 \n",
+ " 32A \n",
+ " Airbus \n",
+ " TU \n",
+ " TU \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " train_id_1 \n",
+ " 2016-01-13 \n",
+ " TU 0757 \n",
+ " MXP \n",
+ " TUN \n",
+ " 2016-01-13 15:05:00 \n",
+ " 2016-01-13 16.55.00 \n",
+ " ATA \n",
+ " TU 31BIMO \n",
+ " 20.0 \n",
+ " 31B \n",
+ " Airbus \n",
+ " TU \n",
+ " TU \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " train_id_2 \n",
+ " 2016-01-16 \n",
+ " TU 0214 \n",
+ " TUN \n",
+ " IST \n",
+ " 2016-01-16 04:10:00 \n",
+ " 2016-01-16 06.45.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 0.0 \n",
+ " 32A \n",
+ " Airbus \n",
+ " TU \n",
+ " TU \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " train_id_3 \n",
+ " 2016-01-17 \n",
+ " TU 0480 \n",
+ " DJE \n",
+ " NTE \n",
+ " 2016-01-17 14:10:00 \n",
+ " 2016-01-17 17.00.00 \n",
+ " ATA \n",
+ " TU 736IOK \n",
+ " 0.0 \n",
+ " 736 \n",
+ " Boeing \n",
+ " TU \n",
+ " TU \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " train_id_4 \n",
+ " 2016-01-17 \n",
+ " TU 0338 \n",
+ " TUN \n",
+ " ALG \n",
+ " 2016-01-17 14:30:00 \n",
+ " 2016-01-17 15.50.00 \n",
+ " ATA \n",
+ " TU 320IMU \n",
+ " 22.0 \n",
+ " 320 \n",
+ " Airbus \n",
+ " TU \n",
+ " TU \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id datop fltid depstn arrstn std \n",
+ "0 train_id_0 2016-01-03 TU 0712 CMN TUN 2016-01-03 10:30:00 \\\n",
+ "1 train_id_1 2016-01-13 TU 0757 MXP TUN 2016-01-13 15:05:00 \n",
+ "2 train_id_2 2016-01-16 TU 0214 TUN IST 2016-01-16 04:10:00 \n",
+ "3 train_id_3 2016-01-17 TU 0480 DJE NTE 2016-01-17 14:10:00 \n",
+ "4 train_id_4 2016-01-17 TU 0338 TUN ALG 2016-01-17 14:30:00 \n",
+ "\n",
+ " sta status ac target airplane_model producer \n",
+ "0 2016-01-03 12.55.00 ATA TU 32AIMN 260.0 32A Airbus \\\n",
+ "1 2016-01-13 16.55.00 ATA TU 31BIMO 20.0 31B Airbus \n",
+ "2 2016-01-16 06.45.00 ATA TU 32AIMN 0.0 32A Airbus \n",
+ "3 2016-01-17 17.00.00 ATA TU 736IOK 0.0 736 Boeing \n",
+ "4 2016-01-17 15.50.00 ATA TU 320IMU 22.0 320 Airbus \n",
+ "\n",
+ " airline_1 airline_2 \n",
+ "0 TU TU \n",
+ "1 TU TU \n",
+ "2 TU TU \n",
+ "3 TU TU \n",
+ "4 TU TU "
+ ]
+ },
+ "execution_count": 105,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Second airline code candidate: the first two characters of the aircraft field\n",
+ "df['airline_2'] = df['ac'].str.slice(0, 2)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 106,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'5K',\n",
+ " '5M',\n",
+ " '6P',\n",
+ " 'BJ',\n",
+ " 'D4',\n",
+ " 'GJ',\n",
+ " 'GW',\n",
+ " 'OL',\n",
+ " 'PS',\n",
+ " 'QS',\n",
+ " 'TU',\n",
+ " 'UG',\n",
+ " 'UJ',\n",
+ " 'X9'}"
+ ]
+ },
+ "execution_count": 106,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "set(df['airline_2'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 107,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " datop \n",
+ " fltid \n",
+ " depstn \n",
+ " arrstn \n",
+ " std \n",
+ " sta \n",
+ " status \n",
+ " ac \n",
+ " target \n",
+ " airplane_model \n",
+ " producer \n",
+ " airline_1 \n",
+ " airline_2 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " train_id_0 \n",
+ " 2016-01-03 \n",
+ " TU 0712 \n",
+ " CMN \n",
+ " TUN \n",
+ " 2016-01-03 10:30:00 \n",
+ " 2016-01-03 12.55.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 260.0 \n",
+ " 32A \n",
+ " Airbus \n",
+ " TU \n",
+ " TU \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " train_id_1 \n",
+ " 2016-01-13 \n",
+ " TU 0757 \n",
+ " MXP \n",
+ " TUN \n",
+ " 2016-01-13 15:05:00 \n",
+ " 2016-01-13 16.55.00 \n",
+ " ATA \n",
+ " TU 31BIMO \n",
+ " 20.0 \n",
+ " 31B \n",
+ " Airbus \n",
+ " TU \n",
+ " TU \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " train_id_2 \n",
+ " 2016-01-16 \n",
+ " TU 0214 \n",
+ " TUN \n",
+ " IST \n",
+ " 2016-01-16 04:10:00 \n",
+ " 2016-01-16 06.45.00 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 0.0 \n",
+ " 32A \n",
+ " Airbus \n",
+ " TU \n",
+ " TU \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " train_id_3 \n",
+ " 2016-01-17 \n",
+ " TU 0480 \n",
+ " DJE \n",
+ " NTE \n",
+ " 2016-01-17 14:10:00 \n",
+ " 2016-01-17 17.00.00 \n",
+ " ATA \n",
+ " TU 736IOK \n",
+ " 0.0 \n",
+ " 736 \n",
+ " Boeing \n",
+ " TU \n",
+ " TU \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " train_id_4 \n",
+ " 2016-01-17 \n",
+ " TU 0338 \n",
+ " TUN \n",
+ " ALG \n",
+ " 2016-01-17 14:30:00 \n",
+ " 2016-01-17 15.50.00 \n",
+ " ATA \n",
+ " TU 320IMU \n",
+ " 22.0 \n",
+ " 320 \n",
+ " Airbus \n",
+ " TU \n",
+ " TU \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id datop fltid depstn arrstn std \n",
+ "0 train_id_0 2016-01-03 TU 0712 CMN TUN 2016-01-03 10:30:00 \\\n",
+ "1 train_id_1 2016-01-13 TU 0757 MXP TUN 2016-01-13 15:05:00 \n",
+ "2 train_id_2 2016-01-16 TU 0214 TUN IST 2016-01-16 04:10:00 \n",
+ "3 train_id_3 2016-01-17 TU 0480 DJE NTE 2016-01-17 14:10:00 \n",
+ "4 train_id_4 2016-01-17 TU 0338 TUN ALG 2016-01-17 14:30:00 \n",
+ "\n",
+ " sta status ac target airplane_model producer \n",
+ "0 2016-01-03 12.55.00 ATA TU 32AIMN 260.0 32A Airbus \\\n",
+ "1 2016-01-13 16.55.00 ATA TU 31BIMO 20.0 31B Airbus \n",
+ "2 2016-01-16 06.45.00 ATA TU 32AIMN 0.0 32A Airbus \n",
+ "3 2016-01-17 17.00.00 ATA TU 736IOK 0.0 736 Boeing \n",
+ "4 2016-01-17 15.50.00 ATA TU 320IMU 22.0 320 Airbus \n",
+ "\n",
+ " airline_1 airline_2 \n",
+ "0 TU TU \n",
+ "1 TU TU \n",
+ "2 TU TU \n",
+ "3 TU TU \n",
+ "4 TU TU "
+ ]
+ },
+ "execution_count": 107,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Load the airports data set and clean it."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Merge data sets based on airport short handle. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 108,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Add calendar features next to each date column: year, month, weekday and minutes since midnight\n",
+ "\n",
+ "def add_date_parts(frame, column, fmt, with_minutes=True):\n",
+ "    \"\"\"Parse `column` as datetime and insert derived integer columns right after it.\n",
+ "\n",
+ "    Inserted columns, in this order: `{column}_year`, `{column}_month`,\n",
+ "    `{column}_wd` (1 = Monday ... 7 = Sunday) and, when `with_minutes` is True,\n",
+ "    `{column}_min` (minutes since midnight).\n",
+ "\n",
+ "    The values come from the `.dt` accessors rather than from strftime with the\n",
+ "    '#' flag, which is platform specific, so the cell runs the same on every OS.\n",
+ "    `frame` is modified in place and nothing is returned.\n",
+ "    \"\"\"\n",
+ "    frame[column] = pd.to_datetime(frame[column], format=fmt)\n",
+ "    stamps = frame[column].dt\n",
+ "    parts = {\n",
+ "        '_year': stamps.year,\n",
+ "        '_month': stamps.month,\n",
+ "        # dayofweek counts from 0 = Monday, so +1 yields the EU numbering 1 = Monday ... 7 = Sunday\n",
+ "        '_wd': stamps.dayofweek + 1,\n",
+ "    }\n",
+ "    if with_minutes:\n",
+ "        parts['_min'] = 60 * stamps.hour + stamps.minute\n",
+ "\n",
+ "    position = frame.columns.get_loc(column)\n",
+ "    for offset, (suffix, values) in enumerate(parts.items(), start=1):\n",
+ "        frame.insert(loc=position + offset, column=column + suffix, value=values.astype(int))\n",
+ "\n",
+ "\n",
+ "# std uses ':' as time separator, sta uses '.', datop carries no time of day\n",
+ "add_date_parts(df, 'std', '%Y-%m-%d %H:%M:%S')\n",
+ "add_date_parts(df, 'sta', '%Y-%m-%d %H.%M.%S')\n",
+ "add_date_parts(df, 'datop', '%Y-%m-%d', with_minutes=False)\n",
+ "\n",
+ "# the target is stored as float; convert it to integer like the new columns\n",
+ "df['target'] = df['target'].astype(int)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Geo-encoding of airports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 109,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " city \n",
+ " country \n",
+ " short \n",
+ " latitude \n",
+ " longitude \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " Pitt Meadows \n",
+ " Canada \n",
+ " \\N \n",
+ " 49.216099 \n",
+ " -122.709999 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " Blida \n",
+ " Algeria \n",
+ " \\N \n",
+ " 36.503601 \n",
+ " 2.814170 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " Bou Saada \n",
+ " Algeria \n",
+ " \\N \n",
+ " 35.332500 \n",
+ " 4.206390 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " city country short latitude longitude\n",
+ "0 Pitt Meadows Canada \\N 49.216099 -122.709999\n",
+ "1 Blida Algeria \\N 36.503601 2.814170\n",
+ "2 Bou Saada Algeria \\N 35.332500 4.206390"
+ ]
+ },
+ "execution_count": 109,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Load the airport lookup table and keep only city, country, code and coordinates\n",
+ "df_airports = pd.read_csv('data/airports.csv')\n",
+ "df_airports.columns = ['id', 'name', 'city', 'country', 'short', 'rubbish_6', 'latitude', 'longitude', 'rubbish_1', 'rubbish_2', 'rubbish_3', 'rubbish_4', 'type', 'rubbish_5']\n",
+ "df_airports = df_airports.drop(['id', 'name', 'rubbish_1', 'rubbish_2', 'rubbish_3', 'rubbish_4', 'rubbish_5', 'rubbish_6', 'type'], axis=1)\n",
+ "df_airports = df_airports.dropna(subset=['short'])\n",
+ "# dropna only removes real NaN values; some rows carry a placeholder string instead of a code.\n",
+ "# Keep only three-character codes, the format used by depstn / arrstn in the flight data.\n",
+ "df_airports = df_airports[df_airports['short'].str.fullmatch('[A-Z0-9]{3}', na=False)]\n",
+ "df_airports.head(3)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 110,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " datop \n",
+ " datop_year \n",
+ " datop_month \n",
+ " datop_wd \n",
+ " fltid \n",
+ " depstn \n",
+ " arrstn \n",
+ " std \n",
+ " std_year \n",
+ " std_month \n",
+ " std_wd \n",
+ " std_min \n",
+ " sta \n",
+ " sta_year \n",
+ " sta_month \n",
+ " sta_wd \n",
+ " sta_min \n",
+ " status \n",
+ " ac \n",
+ " target \n",
+ " airplane_model \n",
+ " producer \n",
+ " airline_1 \n",
+ " airline_2 \n",
+ " city_dep \n",
+ " country_dep \n",
+ " short \n",
+ " latitude_dep \n",
+ " longitude_dep \n",
+ " city_arr \n",
+ " country_arr \n",
+ " short_arr \n",
+ " latitude_arr \n",
+ " longitude_arr \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " train_id_0 \n",
+ " 2016-01-03 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " TU 0712 \n",
+ " CMN \n",
+ " TUN \n",
+ " 2016-01-03 10:30:00 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 630 \n",
+ " 2016-01-03 12:55:00 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 775 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 260 \n",
+ " 32A \n",
+ " Airbus \n",
+ " TU \n",
+ " TU \n",
+ " Casablanca \n",
+ " Morocco \n",
+ " CMN \n",
+ " 33.3675 \n",
+ " -7.58997 \n",
+ " Tunis \n",
+ " Tunisia \n",
+ " TUN \n",
+ " 36.851002 \n",
+ " 10.227200 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " train_id_0 \n",
+ " 2016-01-03 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " TU 0712 \n",
+ " CMN \n",
+ " TUN \n",
+ " 2016-01-03 10:30:00 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 630 \n",
+ " 2016-01-03 12:55:00 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 775 \n",
+ " ATA \n",
+ " TU 32AIMN \n",
+ " 260 \n",
+ " 32A \n",
+ " Airbus \n",
+ " TU \n",
+ " TU \n",
+ " Casablanca \n",
+ " Morocco \n",
+ " CMN \n",
+ " 33.3675 \n",
+ " -7.58997 \n",
+ " Tunis \n",
+ " Tunesia \n",
+ " TUN \n",
+ " 36.847685 \n",
+ " 10.217603 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " train_id_1 \n",
+ " 2016-01-13 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 3 \n",
+ " TU 0757 \n",
+ " MXP \n",
+ " TUN \n",
+ " 2016-01-13 15:05:00 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 3 \n",
+ " 905 \n",
+ " 2016-01-13 16:55:00 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 3 \n",
+ " 1015 \n",
+ " ATA \n",
+ " TU 31BIMO \n",
+ " 20 \n",
+ " 31B \n",
+ " Airbus \n",
+ " TU \n",
+ " TU \n",
+ " Milano \n",
+ " Italy \n",
+ " MXP \n",
+ " 45.6306 \n",
+ " 8.72811 \n",
+ " Tunis \n",
+ " Tunisia \n",
+ " TUN \n",
+ " 36.851002 \n",
+ " 10.227200 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id datop datop_year datop_month datop_wd fltid depstn \n",
+ "0 train_id_0 2016-01-03 2016 1 7 TU 0712 CMN \\\n",
+ "1 train_id_0 2016-01-03 2016 1 7 TU 0712 CMN \n",
+ "2 train_id_1 2016-01-13 2016 1 3 TU 0757 MXP \n",
+ "\n",
+ " arrstn std std_year std_month std_wd std_min \n",
+ "0 TUN 2016-01-03 10:30:00 2016 1 7 630 \\\n",
+ "1 TUN 2016-01-03 10:30:00 2016 1 7 630 \n",
+ "2 TUN 2016-01-13 15:05:00 2016 1 3 905 \n",
+ "\n",
+ " sta sta_year sta_month sta_wd sta_min status ac \n",
+ "0 2016-01-03 12:55:00 2016 1 7 775 ATA TU 32AIMN \\\n",
+ "1 2016-01-03 12:55:00 2016 1 7 775 ATA TU 32AIMN \n",
+ "2 2016-01-13 16:55:00 2016 1 3 1015 ATA TU 31BIMO \n",
+ "\n",
+ " target airplane_model producer airline_1 airline_2 city_dep country_dep \n",
+ "0 260 32A Airbus TU TU Casablanca Morocco \\\n",
+ "1 260 32A Airbus TU TU Casablanca Morocco \n",
+ "2 20 31B Airbus TU TU Milano Italy \n",
+ "\n",
+ " short latitude_dep longitude_dep city_arr country_arr short_arr \n",
+ "0 CMN 33.3675 -7.58997 Tunis Tunisia TUN \\\n",
+ "1 CMN 33.3675 -7.58997 Tunis Tunesia TUN \n",
+ "2 MXP 45.6306 8.72811 Tunis Tunisia TUN \n",
+ "\n",
+ " latitude_arr longitude_arr \n",
+ "0 36.851002 10.227200 \n",
+ "1 36.847685 10.217603 \n",
+ "2 36.851002 10.227200 "
+ ]
+ },
+ "execution_count": 110,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Use one row per airport code: a code that appears more than once in the lookup\n",
+ "# would otherwise duplicate every matching flight in the left joins below\n",
+ "airport_lookup = df_airports.drop_duplicates(subset='short', keep='first')\n",
+ "n_flights = len(df)\n",
+ "\n",
+ "# Merge based on departure station\n",
+ "df = df.merge(airport_lookup, left_on='depstn', right_on='short', how='left', suffixes=('', '_dep'), validate='many_to_one')\n",
+ "\n",
+ "# Merge based on arrival station\n",
+ "df = df.merge(airport_lookup, left_on='arrstn', right_on='short', how='left', suffixes=('', '_arr'), validate='many_to_one')\n",
+ "\n",
+ "# The departure merge adds its columns without a suffix; rename them for clarity\n",
+ "df = df.rename(columns={\n",
+ "    'city': 'city_dep',\n",
+ "    'country': 'country_dep',\n",
+ "    'latitude': 'latitude_dep',\n",
+ "    'longitude': 'longitude_dep'\n",
+ "})\n",
+ "\n",
+ "# Enriching flights with airport data must never add or remove flights\n",
+ "assert len(df) == n_flights, 'airport merge changed the number of flights'\n",
+ "\n",
+ "df.head(3)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 125,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(197247, 35)"
+ ]
+ },
+ "execution_count": 125,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Check if datum datop matches with datum std\n",
+ "df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 126,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(197247, 35)"
+ ]
+ },
+ "execution_count": 126,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Rows whose datop year, month and weekday all equal the std ones\n",
+ "same_year = df['datop_year'] == df['std_year']\n",
+ "same_month = df['datop_month'] == df['std_month']\n",
+ "same_weekday = df['datop_wd'] == df['std_wd']\n",
+ "df[same_year & same_month & same_weekday].shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " statistic \n",
+ " id \n",
+ " datop \n",
+ " datop_year \n",
+ " datop_month \n",
+ " datop_wd \n",
+ " fltid \n",
+ " depstn \n",
+ " arrstn \n",
+ " std \n",
+ " std_year \n",
+ " std_month \n",
+ " std_wd \n",
+ " std_min \n",
+ " sta \n",
+ " sta_year \n",
+ " sta_month \n",
+ " sta_wd \n",
+ " sta_min \n",
+ " status \n",
+ " ac \n",
+ " target \n",
+ " airplane_model \n",
+ " producer \n",
+ " airline_1 \n",
+ " airline_2 \n",
+ " city_dep \n",
+ " country_dep \n",
+ " short \n",
+ " latitude_dep \n",
+ " longitude_dep \n",
+ " city_arr \n",
+ " country_arr \n",
+ " short_arr \n",
+ " latitude_arr \n",
+ " longitude_arr \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " dtype \n",
+ " object \n",
+ " datetime64[ns] \n",
+ " int32 \n",
+ " int32 \n",
+ " int32 \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " datetime64[ns] \n",
+ " int32 \n",
+ " int32 \n",
+ " int32 \n",
+ " int32 \n",
+ " datetime64[ns] \n",
+ " int32 \n",
+ " int32 \n",
+ " int32 \n",
+ " int32 \n",
+ " object \n",
+ " object \n",
+ " int32 \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " float64 \n",
+ " float64 \n",
+ " object \n",
+ " object \n",
+ " object \n",
+ " float64 \n",
+ " float64 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " mean \n",
+ " NaN \n",
+ " NaN \n",
+ " 2017.0 \n",
+ " 6.6 \n",
+ " 4.1 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 2017.0 \n",
+ " 6.6 \n",
+ " 4.1 \n",
+ " 759.7 \n",
+ " NaN \n",
+ " 2017.0 \n",
+ " 6.6 \n",
+ " 4.1 \n",
+ " 793.3 \n",
+ " NaN \n",
+ " NaN \n",
+ " 47.2 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 38.4 \n",
+ " 8.9 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 38.4 \n",
+ " 8.9 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " std \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.8 \n",
+ " 3.3 \n",
+ " 2.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.8 \n",
+ " 3.3 \n",
+ " 2.0 \n",
+ " 315.7 \n",
+ " NaN \n",
+ " 0.8 \n",
+ " 3.3 \n",
+ " 2.0 \n",
+ " 334.4 \n",
+ " NaN \n",
+ " NaN \n",
+ " 112.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 7.2 \n",
+ " 8.4 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 7.1 \n",
+ " 8.4 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " min \n",
+ " NaN \n",
+ " NaN \n",
+ " 2016.0 \n",
+ " 1.0 \n",
+ " 1.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 2016.0 \n",
+ " 1.0 \n",
+ " 1.0 \n",
+ " 0.0 \n",
+ " NaN \n",
+ " 2016.0 \n",
+ " 1.0 \n",
+ " 1.0 \n",
+ " 0.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " -11.6 \n",
+ " -73.7 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " -11.6 \n",
+ " -73.7 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 25% \n",
+ " NaN \n",
+ " NaN \n",
+ " 2016.0 \n",
+ " 4.0 \n",
+ " 2.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 2016.0 \n",
+ " 4.0 \n",
+ " 2.0 \n",
+ " 500.0 \n",
+ " NaN \n",
+ " 2016.0 \n",
+ " 4.0 \n",
+ " 2.0 \n",
+ " 580.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 36.8 \n",
+ " 6.8 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 36.8 \n",
+ " 6.8 \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " 50% \n",
+ " NaN \n",
+ " NaN \n",
+ " 2017.0 \n",
+ " 7.0 \n",
+ " 4.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 2017.0 \n",
+ " 7.0 \n",
+ " 4.0 \n",
+ " 760.0 \n",
+ " NaN \n",
+ " 2017.0 \n",
+ " 7.0 \n",
+ " 4.0 \n",
+ " 795.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 13.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 36.9 \n",
+ " 10.2 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 36.9 \n",
+ " 10.2 \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " 75% \n",
+ " NaN \n",
+ " NaN \n",
+ " 2018.0 \n",
+ " 9.0 \n",
+ " 6.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 2018.0 \n",
+ " 9.0 \n",
+ " 6.0 \n",
+ " 985.0 \n",
+ " NaN \n",
+ " 2018.0 \n",
+ " 9.0 \n",
+ " 6.0 \n",
+ " 1050.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 42.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 43.4 \n",
+ " 10.2 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 43.4 \n",
+ " 10.2 \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " max \n",
+ " NaN \n",
+ " NaN \n",
+ " 2018.0 \n",
+ " 12.0 \n",
+ " 7.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 2018.0 \n",
+ " 12.0 \n",
+ " 7.0 \n",
+ " 1439.0 \n",
+ " NaN \n",
+ " 2019.0 \n",
+ " 12.0 \n",
+ " 7.0 \n",
+ " 1439.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 3451.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 64.0 \n",
+ " 51.6 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 59.8 \n",
+ " 51.6 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " statistic id datop datop_year datop_month datop_wd fltid \n",
+ "0 dtype object datetime64[ns] int32 int32 int32 object \\\n",
+ "1 mean NaN NaN 2017.0 6.6 4.1 NaN \n",
+ "2 std NaN NaN 0.8 3.3 2.0 NaN \n",
+ "3 min NaN NaN 2016.0 1.0 1.0 NaN \n",
+ "4 25% NaN NaN 2016.0 4.0 2.0 NaN \n",
+ "5 50% NaN NaN 2017.0 7.0 4.0 NaN \n",
+ "6 75% NaN NaN 2018.0 9.0 6.0 NaN \n",
+ "7 max NaN NaN 2018.0 12.0 7.0 NaN \n",
+ "\n",
+ " depstn arrstn std std_year std_month std_wd std_min \n",
+ "0 object object datetime64[ns] int32 int32 int32 int32 \\\n",
+ "1 NaN NaN NaN 2017.0 6.6 4.1 759.7 \n",
+ "2 NaN NaN NaN 0.8 3.3 2.0 315.7 \n",
+ "3 NaN NaN NaN 2016.0 1.0 1.0 0.0 \n",
+ "4 NaN NaN NaN 2016.0 4.0 2.0 500.0 \n",
+ "5 NaN NaN NaN 2017.0 7.0 4.0 760.0 \n",
+ "6 NaN NaN NaN 2018.0 9.0 6.0 985.0 \n",
+ "7 NaN NaN NaN 2018.0 12.0 7.0 1439.0 \n",
+ "\n",
+ " sta sta_year sta_month sta_wd sta_min status ac target \n",
+ "0 datetime64[ns] int32 int32 int32 int32 object object int32 \\\n",
+ "1 NaN 2017.0 6.6 4.1 793.3 NaN NaN 47.2 \n",
+ "2 NaN 0.8 3.3 2.0 334.4 NaN NaN 112.0 \n",
+ "3 NaN 2016.0 1.0 1.0 0.0 NaN NaN 0.0 \n",
+ "4 NaN 2016.0 4.0 2.0 580.0 NaN NaN 0.0 \n",
+ "5 NaN 2017.0 7.0 4.0 795.0 NaN NaN 13.0 \n",
+ "6 NaN 2018.0 9.0 6.0 1050.0 NaN NaN 42.0 \n",
+ "7 NaN 2019.0 12.0 7.0 1439.0 NaN NaN 3451.0 \n",
+ "\n",
+ " airplane_model producer airline_1 airline_2 city_dep country_dep short \n",
+ "0 object object object object object object object \\\n",
+ "1 NaN NaN NaN NaN NaN NaN NaN \n",
+ "2 NaN NaN NaN NaN NaN NaN NaN \n",
+ "3 NaN NaN NaN NaN NaN NaN NaN \n",
+ "4 NaN NaN NaN NaN NaN NaN NaN \n",
+ "5 NaN NaN NaN NaN NaN NaN NaN \n",
+ "6 NaN NaN NaN NaN NaN NaN NaN \n",
+ "7 NaN NaN NaN NaN NaN NaN NaN \n",
+ "\n",
+ " latitude_dep longitude_dep city_arr country_arr short_arr latitude_arr \n",
+ "0 float64 float64 object object object float64 \\\n",
+ "1 38.4 8.9 NaN NaN NaN 38.4 \n",
+ "2 7.2 8.4 NaN NaN NaN 7.1 \n",
+ "3 -11.6 -73.7 NaN NaN NaN -11.6 \n",
+ "4 36.8 6.8 NaN NaN NaN 36.8 \n",
+ "5 36.9 10.2 NaN NaN NaN 36.9 \n",
+ "6 43.4 10.2 NaN NaN NaN 43.4 \n",
+ "7 64.0 51.6 NaN NaN NaN 59.8 \n",
+ "\n",
+ " longitude_arr \n",
+ "0 float64 \n",
+ "1 8.9 \n",
+ "2 8.4 \n",
+ "3 -73.7 \n",
+ "4 6.8 \n",
+ "5 10.2 \n",
+ "6 10.2 \n",
+ "7 51.6 "
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# EDA overview: one row with the dtypes, then summary statistics of the numeric columns\n",
+ "def _round_number(value):\n",
+ "    \"\"\"Round plain numbers to one decimal; leave everything else (dtypes, NaN placeholders) as is.\"\"\"\n",
+ "    if isinstance(value, (int, float)):\n",
+ "        return round(value, 1)\n",
+ "    return value\n",
+ "\n",
+ "summary_rows = [\n",
+ "    df.dtypes,\n",
+ "    df.mean(numeric_only=True),\n",
+ "    df.std(numeric_only=True),\n",
+ "    df.min(numeric_only=True),\n",
+ "    df.quantile(0.25, numeric_only=True),\n",
+ "    df.quantile(0.5, numeric_only=True),\n",
+ "    df.quantile(0.75, numeric_only=True),\n",
+ "    df.max(numeric_only=True),\n",
+ "]\n",
+ "\n",
+ "# each Series becomes a single-row frame so the statistics stack vertically\n",
+ "info = pd.concat([row.to_frame().T for row in summary_rows], ignore_index=True).applymap(_round_number)\n",
+ "\n",
+ "info.insert(0, 'statistic', ['dtype', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])\n",
+ "info"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "## Feature engineering"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " datop_year \n",
+ " datop_month \n",
+ " datop_wd \n",
+ " depstn \n",
+ " arrstn \n",
+ " std_year \n",
+ " std_month \n",
+ " std_wd \n",
+ " std_min \n",
+ " sta_year \n",
+ " sta_month \n",
+ " sta_wd \n",
+ " sta_min \n",
+ " status \n",
+ " target \n",
+ " airline_1 \n",
+ " latitude_dep \n",
+ " longitude_dep \n",
+ " latitude_arr \n",
+ " longitude_arr \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " train_id_0 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " CMN \n",
+ " TUN \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 630 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 775 \n",
+ " ATA \n",
+ " 260 \n",
+ " TU \n",
+ " 33.3675 \n",
+ " -7.58997 \n",
+ " 36.851002 \n",
+ " 10.227200 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " train_id_0 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " CMN \n",
+ " TUN \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 630 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 775 \n",
+ " ATA \n",
+ " 260 \n",
+ " TU \n",
+ " 33.3675 \n",
+ " -7.58997 \n",
+ " 36.847685 \n",
+ " 10.217603 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " train_id_1 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 3 \n",
+ " MXP \n",
+ " TUN \n",
+ " 2016 \n",
+ " 1 \n",
+ " 3 \n",
+ " 905 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 3 \n",
+ " 1015 \n",
+ " ATA \n",
+ " 20 \n",
+ " TU \n",
+ " 45.6306 \n",
+ " 8.72811 \n",
+ " 36.851002 \n",
+ " 10.227200 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id datop_year datop_month datop_wd depstn arrstn std_year \n",
+ "0 train_id_0 2016 1 7 CMN TUN 2016 \\\n",
+ "1 train_id_0 2016 1 7 CMN TUN 2016 \n",
+ "2 train_id_1 2016 1 3 MXP TUN 2016 \n",
+ "\n",
+ " std_month std_wd std_min sta_year sta_month sta_wd sta_min status \n",
+ "0 1 7 630 2016 1 7 775 ATA \\\n",
+ "1 1 7 630 2016 1 7 775 ATA \n",
+ "2 1 3 905 2016 1 3 1015 ATA \n",
+ "\n",
+ " target airline_1 latitude_dep longitude_dep latitude_arr longitude_arr \n",
+ "0 260 TU 33.3675 -7.58997 36.851002 10.227200 \n",
+ "1 260 TU 33.3675 -7.58997 36.847685 10.217603 \n",
+ "2 20 TU 45.6306 8.72811 36.851002 10.227200 "
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Columns already expanded into derived features or not used for modelling\n",
+ "replaced_columns = ['datop', 'fltid', 'std', 'sta', 'ac', 'short', 'short_arr', 'city_dep', 'country_dep', 'city_arr', 'country_arr']\n",
+ "\n",
+ "# aggressive feature drop\n",
+ "aggressive_drop = ['airplane_model', 'producer', 'airline_2']\n",
+ "\n",
+ "df = df.drop(columns=replaced_columns + aggressive_drop)\n",
+ "\n",
+ "df.head(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# List of columns to encode\n",
+ "columns_to_encode = ['depstn', 'status', 'arrstn', 'airline_1'] # reduced by aggressive feature drop\n",
+ "\n",
+ "# Binarize each categorical column into its own block of indicator columns\n",
+ "encoded_frames = []\n",
+ "for column in columns_to_encode:\n",
+ "    lb = LabelBinarizer()\n",
+ "    encoded = lb.fit_transform(df[column])\n",
+ "\n",
+ "    if len(lb.classes_) == 2:\n",
+ "        # Two classes collapse into a single 0/1 column\n",
+ "        encoded_names = [f'{column}_encoded']\n",
+ "    else:\n",
+ "        # Otherwise one indicator column per class\n",
+ "        encoded_names = [f'{column}_{cls}' for cls in lb.classes_]\n",
+ "\n",
+ "    encoded_frames.append(pd.DataFrame(encoded, columns=encoded_names, index=df.index))\n",
+ "\n",
+ "# Fully encoded frame: every column in columns_to_encode is replaced by its indicator columns.\n",
+ "# (Previously the drop ran once after the loop and removed only the last encoded column.)\n",
+ "df_encoded = pd.concat([df.drop(columns=columns_to_encode)] + encoded_frames, axis=1)\n",
+ "\n",
+ "# Append the indicator columns to df exactly once. Concatenating df with df_encoded\n",
+ "# repeated every original column, leaving duplicate column names in df.\n",
+ "# The raw categorical columns stay in df, as before, so later cells can still drop them.\n",
+ "df = pd.concat([df] + encoded_frames, axis=1)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " datop_year \n",
+ " datop_month \n",
+ " datop_wd \n",
+ " std_year \n",
+ " std_month \n",
+ " std_wd \n",
+ " std_min \n",
+ " sta_year \n",
+ " sta_month \n",
+ " sta_wd \n",
+ " sta_min \n",
+ " target \n",
+ " latitude_dep \n",
+ " longitude_dep \n",
+ " latitude_arr \n",
+ " longitude_arr \n",
+ " depstn_AAE \n",
+ " depstn_AAL \n",
+ " depstn_ABJ \n",
+ " depstn_ADB \n",
+ " depstn_AHU \n",
+ " depstn_ALG \n",
+ " depstn_AMM \n",
+ " depstn_AMS \n",
+ " depstn_ARN \n",
+ " depstn_ATH \n",
+ " depstn_AYT \n",
+ " depstn_BCN \n",
+ " depstn_BDS \n",
+ " depstn_BEG \n",
+ " depstn_BEY \n",
+ " depstn_BJA \n",
+ " depstn_BKO \n",
+ " depstn_BLL \n",
+ " depstn_BLQ \n",
+ " depstn_BOD \n",
+ " depstn_BRI \n",
+ " depstn_BRQ \n",
+ " depstn_BRU \n",
+ " depstn_BTS \n",
+ " depstn_BUD \n",
+ " depstn_BYJ \n",
+ " depstn_CAG \n",
+ " depstn_CAI \n",
+ " depstn_CDG \n",
+ " depstn_CGN \n",
+ " depstn_CKY \n",
+ " depstn_CMN \n",
+ " depstn_COO \n",
+ " depstn_CPH \n",
+ " depstn_CRL \n",
+ " depstn_CTA \n",
+ " depstn_CZL \n",
+ " depstn_DJE \n",
+ " depstn_DKR \n",
+ " depstn_DOH \n",
+ " depstn_DSS \n",
+ " depstn_DUS \n",
+ " depstn_EBL \n",
+ " depstn_EBM \n",
+ " depstn_ESB \n",
+ " depstn_FBM \n",
+ " depstn_FCO \n",
+ " depstn_FIH \n",
+ " depstn_FRA \n",
+ " depstn_GAE \n",
+ " depstn_GAF \n",
+ " depstn_GHA \n",
+ " depstn_GNB \n",
+ " depstn_GVA \n",
+ " depstn_HAJ \n",
+ " depstn_HAM \n",
+ " depstn_HBE \n",
+ " depstn_IEV \n",
+ " depstn_IST \n",
+ " depstn_JED \n",
+ " depstn_JIB \n",
+ " depstn_KBP \n",
+ " depstn_KEF \n",
+ " depstn_KGL \n",
+ " depstn_KRR \n",
+ " depstn_KRT \n",
+ " depstn_KSC \n",
+ " depstn_KTW \n",
+ " depstn_LAD \n",
+ " depstn_LBV \n",
+ " depstn_LED \n",
+ " depstn_LFW \n",
+ " depstn_LGW \n",
+ " depstn_LHR \n",
+ " depstn_LIL \n",
+ " depstn_LIS \n",
+ " depstn_LJU \n",
+ " depstn_LUX \n",
+ " depstn_LYS \n",
+ " depstn_MAD \n",
+ " depstn_MED \n",
+ " depstn_MIR \n",
+ " depstn_MLA \n",
+ " depstn_MRS \n",
+ " depstn_MUC \n",
+ " depstn_MVB \n",
+ " depstn_MXP \n",
+ " depstn_NAP \n",
+ " depstn_NBE \n",
+ " depstn_NCE \n",
+ " depstn_NDR \n",
+ " depstn_NIM \n",
+ " depstn_NKC \n",
+ " depstn_NTE \n",
+ " depstn_OPO \n",
+ " depstn_ORN \n",
+ " depstn_ORY \n",
+ " depstn_OSR \n",
+ " depstn_OST \n",
+ " depstn_OTP \n",
+ " depstn_OUA \n",
+ " depstn_OUD \n",
+ " depstn_PMO \n",
+ " depstn_PRG \n",
+ " depstn_PSA \n",
+ " depstn_RAK \n",
+ " depstn_RTM \n",
+ " depstn_SFA \n",
+ " depstn_SJJ \n",
+ " depstn_SKG \n",
+ " depstn_SKX \n",
+ " depstn_STR \n",
+ " depstn_SVO \n",
+ " depstn_SXB \n",
+ " depstn_SXF \n",
+ " depstn_TBJ \n",
+ " depstn_TLL \n",
+ " depstn_TLS \n",
+ " depstn_TMR \n",
+ " depstn_TNG \n",
+ " depstn_TOE \n",
+ " depstn_TPS \n",
+ " depstn_TRN \n",
+ " depstn_TUN \n",
+ " depstn_VCE \n",
+ " depstn_VIE \n",
+ " depstn_VKO \n",
+ " depstn_VNO \n",
+ " depstn_VOG \n",
+ " depstn_VRN \n",
+ " depstn_YUL \n",
+ " depstn_ZRH \n",
+ " status_ATA \n",
+ " status_DEL \n",
+ " status_DEP \n",
+ " status_RTR \n",
+ " status_SCH \n",
+ " arrstn_AAE \n",
+ " arrstn_ABJ \n",
+ " arrstn_ADB \n",
+ " arrstn_AHU \n",
+ " arrstn_ALG \n",
+ " arrstn_AMM \n",
+ " arrstn_AMS \n",
+ " arrstn_ARN \n",
+ " arrstn_ATH \n",
+ " arrstn_AYT \n",
+ " arrstn_BCN \n",
+ " arrstn_BDS \n",
+ " arrstn_BEG \n",
+ " arrstn_BEY \n",
+ " arrstn_BGY \n",
+ " arrstn_BJA \n",
+ " arrstn_BKO \n",
+ " arrstn_BLL \n",
+ " arrstn_BLQ \n",
+ " arrstn_BOD \n",
+ " arrstn_BRI \n",
+ " arrstn_BRQ \n",
+ " arrstn_BRU \n",
+ " arrstn_BTS \n",
+ " arrstn_BUD \n",
+ " arrstn_BYJ \n",
+ " arrstn_CAG \n",
+ " arrstn_CAI \n",
+ " arrstn_CDG \n",
+ " arrstn_CGN \n",
+ " arrstn_CKY \n",
+ " arrstn_CMN \n",
+ " arrstn_COO \n",
+ " arrstn_CPH \n",
+ " arrstn_CRL \n",
+ " arrstn_CTA \n",
+ " arrstn_CZL \n",
+ " arrstn_DJE \n",
+ " arrstn_DKR \n",
+ " arrstn_DOH \n",
+ " arrstn_DSS \n",
+ " arrstn_DUS \n",
+ " arrstn_EBL \n",
+ " arrstn_EBM \n",
+ " arrstn_ESB \n",
+ " arrstn_FBM \n",
+ " arrstn_FCO \n",
+ " arrstn_FIH \n",
+ " arrstn_FRA \n",
+ " arrstn_GAE \n",
+ " arrstn_GAF \n",
+ " arrstn_GHA \n",
+ " arrstn_GNB \n",
+ " arrstn_GVA \n",
+ " arrstn_HAJ \n",
+ " arrstn_HAM \n",
+ " arrstn_HBE \n",
+ " arrstn_IEV \n",
+ " arrstn_IST \n",
+ " arrstn_JED \n",
+ " arrstn_JIB \n",
+ " arrstn_KBP \n",
+ " arrstn_KGL \n",
+ " arrstn_KRR \n",
+ " arrstn_KRT \n",
+ " arrstn_KSC \n",
+ " arrstn_LAD \n",
+ " arrstn_LBV \n",
+ " arrstn_LED \n",
+ " arrstn_LFW \n",
+ " arrstn_LGW \n",
+ " arrstn_LHR \n",
+ " arrstn_LIL \n",
+ " arrstn_LIS \n",
+ " arrstn_LJU \n",
+ " arrstn_LUX \n",
+ " arrstn_LYS \n",
+ " arrstn_MAD \n",
+ " arrstn_MED \n",
+ " arrstn_MIR \n",
+ " arrstn_MLA \n",
+ " arrstn_MRS \n",
+ " arrstn_MUC \n",
+ " arrstn_MVB \n",
+ " arrstn_MXP \n",
+ " arrstn_NAP \n",
+ " arrstn_NBE \n",
+ " arrstn_NCE \n",
+ " arrstn_NDR \n",
+ " arrstn_NIM \n",
+ " arrstn_NKC \n",
+ " arrstn_NTE \n",
+ " arrstn_OPO \n",
+ " arrstn_ORN \n",
+ " arrstn_ORY \n",
+ " arrstn_OSR \n",
+ " arrstn_OST \n",
+ " arrstn_OTP \n",
+ " arrstn_OUA \n",
+ " arrstn_OUD \n",
+ " arrstn_PMO \n",
+ " arrstn_PRG \n",
+ " arrstn_PSA \n",
+ " arrstn_RAK \n",
+ " arrstn_RTM \n",
+ " arrstn_SFA \n",
+ " arrstn_SJJ \n",
+ " arrstn_SKG \n",
+ " arrstn_SKX \n",
+ " arrstn_SVO \n",
+ " arrstn_SXB \n",
+ " arrstn_SXF \n",
+ " arrstn_TBJ \n",
+ " arrstn_TLS \n",
+ " arrstn_TMR \n",
+ " arrstn_TNG \n",
+ " arrstn_TOE \n",
+ " arrstn_TPS \n",
+ " arrstn_TRN \n",
+ " arrstn_TUN \n",
+ " arrstn_VCE \n",
+ " arrstn_VIE \n",
+ " arrstn_VKO \n",
+ " arrstn_VNO \n",
+ " arrstn_VOG \n",
+ " arrstn_VRN \n",
+ " arrstn_YUL \n",
+ " arrstn_ZRH \n",
+ " airline_1_12 \n",
+ " airline_1_20 \n",
+ " airline_1_6Y \n",
+ " airline_1_A \n",
+ " airline_1_AO \n",
+ " airline_1_AT \n",
+ " airline_1_AU \n",
+ " airline_1_C \n",
+ " airline_1_D4 \n",
+ " airline_1_DA \n",
+ " airline_1_GJ \n",
+ " airline_1_IN \n",
+ " airline_1_PR \n",
+ " airline_1_SG \n",
+ " airline_1_TU \n",
+ " airline_1_UG \n",
+ " airline_1_UH \n",
+ " airline_1_WK \n",
+ " airline_1_X9 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " train_id_0 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 630 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 775 \n",
+ " 260 \n",
+ " 33.3675 \n",
+ " -7.58997 \n",
+ " 36.851002 \n",
+ " 10.227200 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " train_id_0 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 630 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 7 \n",
+ " 775 \n",
+ " 260 \n",
+ " 33.3675 \n",
+ " -7.58997 \n",
+ " 36.847685 \n",
+ " 10.217603 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " train_id_1 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 3 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 3 \n",
+ " 905 \n",
+ " 2016 \n",
+ " 1 \n",
+ " 3 \n",
+ " 1015 \n",
+ " 20 \n",
+ " 45.6306 \n",
+ " 8.72811 \n",
+ " 36.851002 \n",
+ " 10.227200 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id datop_year datop_month datop_wd std_year std_month std_wd \n",
+ "0 train_id_0 2016 1 7 2016 1 7 \\\n",
+ "1 train_id_0 2016 1 7 2016 1 7 \n",
+ "2 train_id_1 2016 1 3 2016 1 3 \n",
+ "\n",
+ " std_min sta_year sta_month sta_wd sta_min target latitude_dep \n",
+ "0 630 2016 1 7 775 260 33.3675 \\\n",
+ "1 630 2016 1 7 775 260 33.3675 \n",
+ "2 905 2016 1 3 1015 20 45.6306 \n",
+ "\n",
+ " longitude_dep latitude_arr longitude_arr depstn_AAE depstn_AAL \n",
+ "0 -7.58997 36.851002 10.227200 0 0 \\\n",
+ "1 -7.58997 36.847685 10.217603 0 0 \n",
+ "2 8.72811 36.851002 10.227200 0 0 \n",
+ "\n",
+ " depstn_ABJ depstn_ADB depstn_AHU depstn_ALG depstn_AMM depstn_AMS \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_ARN depstn_ATH depstn_AYT depstn_BCN depstn_BDS depstn_BEG \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_BEY depstn_BJA depstn_BKO depstn_BLL depstn_BLQ depstn_BOD \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_BRI depstn_BRQ depstn_BRU depstn_BTS depstn_BUD depstn_BYJ \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_CAG depstn_CAI depstn_CDG depstn_CGN depstn_CKY depstn_CMN \n",
+ "0 0 0 0 0 0 1 \\\n",
+ "1 0 0 0 0 0 1 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_COO depstn_CPH depstn_CRL depstn_CTA depstn_CZL depstn_DJE \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_DKR depstn_DOH depstn_DSS depstn_DUS depstn_EBL depstn_EBM \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_ESB depstn_FBM depstn_FCO depstn_FIH depstn_FRA depstn_GAE \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_GAF depstn_GHA depstn_GNB depstn_GVA depstn_HAJ depstn_HAM \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_HBE depstn_IEV depstn_IST depstn_JED depstn_JIB depstn_KBP \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_KEF depstn_KGL depstn_KRR depstn_KRT depstn_KSC depstn_KTW \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_LAD depstn_LBV depstn_LED depstn_LFW depstn_LGW depstn_LHR \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_LIL depstn_LIS depstn_LJU depstn_LUX depstn_LYS depstn_MAD \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_MED depstn_MIR depstn_MLA depstn_MRS depstn_MUC depstn_MVB \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_MXP depstn_NAP depstn_NBE depstn_NCE depstn_NDR depstn_NIM \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 1 0 0 0 0 0 \n",
+ "\n",
+ " depstn_NKC depstn_NTE depstn_OPO depstn_ORN depstn_ORY depstn_OSR \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_OST depstn_OTP depstn_OUA depstn_OUD depstn_PMO depstn_PRG \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_PSA depstn_RAK depstn_RTM depstn_SFA depstn_SJJ depstn_SKG \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_SKX depstn_STR depstn_SVO depstn_SXB depstn_SXF depstn_TBJ \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_TLL depstn_TLS depstn_TMR depstn_TNG depstn_TOE depstn_TPS \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_TRN depstn_TUN depstn_VCE depstn_VIE depstn_VKO depstn_VNO \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " depstn_VOG depstn_VRN depstn_YUL depstn_ZRH status_ATA status_DEL \n",
+ "0 0 0 0 0 1 0 \\\n",
+ "1 0 0 0 0 1 0 \n",
+ "2 0 0 0 0 1 0 \n",
+ "\n",
+ " status_DEP status_RTR status_SCH arrstn_AAE arrstn_ABJ arrstn_ADB \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_AHU arrstn_ALG arrstn_AMM arrstn_AMS arrstn_ARN arrstn_ATH \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_AYT arrstn_BCN arrstn_BDS arrstn_BEG arrstn_BEY arrstn_BGY \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_BJA arrstn_BKO arrstn_BLL arrstn_BLQ arrstn_BOD arrstn_BRI \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_BRQ arrstn_BRU arrstn_BTS arrstn_BUD arrstn_BYJ arrstn_CAG \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_CAI arrstn_CDG arrstn_CGN arrstn_CKY arrstn_CMN arrstn_COO \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_CPH arrstn_CRL arrstn_CTA arrstn_CZL arrstn_DJE arrstn_DKR \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_DOH arrstn_DSS arrstn_DUS arrstn_EBL arrstn_EBM arrstn_ESB \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_FBM arrstn_FCO arrstn_FIH arrstn_FRA arrstn_GAE arrstn_GAF \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_GHA arrstn_GNB arrstn_GVA arrstn_HAJ arrstn_HAM arrstn_HBE \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_IEV arrstn_IST arrstn_JED arrstn_JIB arrstn_KBP arrstn_KGL \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_KRR arrstn_KRT arrstn_KSC arrstn_LAD arrstn_LBV arrstn_LED \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_LFW arrstn_LGW arrstn_LHR arrstn_LIL arrstn_LIS arrstn_LJU \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_LUX arrstn_LYS arrstn_MAD arrstn_MED arrstn_MIR arrstn_MLA \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_MRS arrstn_MUC arrstn_MVB arrstn_MXP arrstn_NAP arrstn_NBE \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_NCE arrstn_NDR arrstn_NIM arrstn_NKC arrstn_NTE arrstn_OPO \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_ORN arrstn_ORY arrstn_OSR arrstn_OST arrstn_OTP arrstn_OUA \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_OUD arrstn_PMO arrstn_PRG arrstn_PSA arrstn_RAK arrstn_RTM \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_SFA arrstn_SJJ arrstn_SKG arrstn_SKX arrstn_SVO arrstn_SXB \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_SXF arrstn_TBJ arrstn_TLS arrstn_TMR arrstn_TNG arrstn_TOE \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " arrstn_TPS arrstn_TRN arrstn_TUN arrstn_VCE arrstn_VIE arrstn_VKO \n",
+ "0 0 0 1 0 0 0 \\\n",
+ "1 0 0 1 0 0 0 \n",
+ "2 0 0 1 0 0 0 \n",
+ "\n",
+ " arrstn_VNO arrstn_VOG arrstn_VRN arrstn_YUL arrstn_ZRH airline_1_12 \n",
+ "0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 \n",
+ "\n",
+ " airline_1_20 airline_1_6Y airline_1_A airline_1_AO airline_1_AT \n",
+ "0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 \n",
+ "\n",
+ " airline_1_AU airline_1_C airline_1_D4 airline_1_DA airline_1_GJ \n",
+ "0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 \n",
+ "\n",
+ " airline_1_IN airline_1_PR airline_1_SG airline_1_TU airline_1_UG \n",
+ "0 0 0 0 1 0 \\\n",
+ "1 0 0 0 1 0 \n",
+ "2 0 0 0 1 0 \n",
+ "\n",
+ " airline_1_UH airline_1_WK airline_1_X9 \n",
+ "0 0 0 0 \n",
+ "1 0 0 0 \n",
+ "2 0 0 0 "
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Drop the raw categorical columns; their one-hot encoded counterparts stay in the frame\n",
+ "raw_categoricals = ['depstn', 'arrstn', 'status', 'airline_1']\n",
+ "df = df.drop(columns=raw_categoricals)\n",
+ "\n",
+ "# Remember which column labels occur more than once, then keep only the first occurrence of each\n",
+ "is_repeated_label = df.columns.duplicated()\n",
+ "duplicate_columns = df.columns[is_repeated_label]\n",
+ "df = df.loc[:, ~is_repeated_label]\n",
+ "df.head(3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Splitting data for testing "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Separate the label from the predictors; 'id' is excluded from the features as well\n",
+ "y = df['target']\n",
+ "X = df.drop(columns=['target', 'id'])\n",
+ "\n",
+ "# Hold out 20% of the rows as the test set (seeded with RSEED)\n",
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
+ "    X, y, test_size=0.2, random_state=RSEED\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Training the model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Baseline model: linear regression"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "LinearRegression() In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
+ ],
+ "text/plain": [
+ "LinearRegression()"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Baseline: ordinary least squares linear regression fitted on the training split\n",
+ "model_0 = LinearRegression().fit(X_train, y_train)\n",
+ "model_0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Predict on both splits; the test-set prediction is computed only once\n",
+ "y_pred_train = model_0.predict(X_train)\n",
+ "y_pred_test = model_0.predict(X_test)\n",
+ "\n",
+ "# `y_pred` holds the same test-set values as an independent copy\n",
+ "y_pred = y_pred_test.copy()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### Evaluation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "R-squared (train): 0.078\n",
+ "R-squared (test): -406677.45\n",
+ "R-squared adjusted (train): 0.076\n",
+ "R-squared adjusted (test): -407449.509\n",
+ "RMSE (train): 108.156\n",
+ "RMSE (test): 69656.206\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Calculate r-squared\n",
+ "r_squared_train = r2_score(y_train, y_pred_train)\n",
+ "r_squared_test = r2_score(y_test, y_pred_test)\n",
+ "print('R-squared (train): ', round(r_squared_train, 3))\n",
+ "print('R-squared (test): ', round(r_squared_test, 3))\n",
+ "\n",
+ "# Calculate adjusted r-squared: 1 - (1 - R^2) * (n - 1) / (n - p - 1), where n is the\n",
+ "# number of rows in the split being scored and p is the number of features\n",
+ "n_features = X_train.shape[1]\n",
+ "n_train, n_test = X_train.shape[0], X_test.shape[0]\n",
+ "r_squared_adjusted_train = 1 - ((1 - r_squared_train) * (n_train - 1) / (n_train - n_features - 1))\n",
+ "# The test score is corrected with the test-set row count, not the train-set one\n",
+ "r_squared_adjusted_test = 1 - ((1 - r_squared_test) * (n_test - 1) / (n_test - n_features - 1))\n",
+ "print('R-squared adjusted (train): ', round(r_squared_adjusted_train, 3))\n",
+ "print('R-squared adjusted (test): ', round(r_squared_adjusted_test, 3))\n",
+ "\n",
+ "# Calculate RMSE\n",
+ "rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))\n",
+ "rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n",
+ "print('RMSE (train): ', round(rmse_train, 3))\n",
+ "print('RMSE (test): ', round(rmse_test, 3))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Model 1: Polynomial"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Instantiate and train degree-8 polynomial model\n",
+ "#model_1 = PolynomialFeatures(degree=8)\n",
+ "#model_1.fit_transform(X, y)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#y_pred = model_0.predict(X_test)\n",
+ "\n",
+ "#y_pred_train = model_0.predict(X_train)\n",
+ "#y_pred_test = model_0.predict(X_test)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Model 2: Ridge"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Ridge(alpha=1, random_state=0) In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
+ ],
+ "text/plain": [
+ "Ridge(alpha=1, random_state=0)"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Ridge regression with penalty strength alpha=1, fitted on the training split\n",
+ "model_2 = Ridge(alpha=1, random_state=RSEED).fit(X_train, y_train)\n",
+ "model_2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Predict on both splits; the test-set prediction is computed only once\n",
+ "y_pred_train = model_2.predict(X_train)\n",
+ "y_pred_test = model_2.predict(X_test)\n",
+ "\n",
+ "# `y_pred` holds the same test-set values as an independent copy\n",
+ "y_pred = y_pred_test.copy()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### Evaluation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "R-squared (train): 0.078\n",
+ "R-squared (test): 0.08\n",
+ "R-squared adjusted (train): 0.076\n",
+ "R-squared adjusted (test): 0.079\n",
+ "RMSE (train): 108.161\n",
+ "RMSE (test): 104.742\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Calculate r-squared\n",
+ "r_squared_train = r2_score(y_train, y_pred_train)\n",
+ "r_squared_test = r2_score(y_test, y_pred_test)\n",
+ "print('R-squared (train): ', round(r_squared_train, 3))\n",
+ "print('R-squared (test): ', round(r_squared_test, 3))\n",
+ "\n",
+ "# Calculate adjusted r-squared: 1 - (1 - R^2) * (n - 1) / (n - p - 1), where n is the\n",
+ "# number of rows in the split being scored and p is the number of features\n",
+ "n_features = X_train.shape[1]\n",
+ "n_train, n_test = X_train.shape[0], X_test.shape[0]\n",
+ "r_squared_adjusted_train = 1 - ((1 - r_squared_train) * (n_train - 1) / (n_train - n_features - 1))\n",
+ "# The test score is corrected with the test-set row count, not the train-set one\n",
+ "r_squared_adjusted_test = 1 - ((1 - r_squared_test) * (n_test - 1) / (n_test - n_features - 1))\n",
+ "print('R-squared adjusted (train): ', round(r_squared_adjusted_train, 3))\n",
+ "print('R-squared adjusted (test): ', round(r_squared_adjusted_test, 3))\n",
+ "\n",
+ "# Calculate RMSE\n",
+ "rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))\n",
+ "rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n",
+ "print('RMSE (train): ', round(rmse_train, 3))\n",
+ "print('RMSE (test): ', round(rmse_test, 3))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Model 3: Lasso"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Lasso(alpha=0.5, max_iter=1000000, random_state=0) In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
+ ],
+ "text/plain": [
+ "Lasso(alpha=0.5, max_iter=1000000, random_state=0)"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Lasso regression with alpha=0.5 and an iteration cap of one million, fitted on the training split\n",
+ "model_3 = Lasso(alpha=0.5, max_iter=1_000_000, random_state=RSEED).fit(X_train, y_train)\n",
+ "model_3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Predict on both splits; the test-set prediction is computed only once\n",
+ "y_pred_train = model_3.predict(X_train)\n",
+ "y_pred_test = model_3.predict(X_test)\n",
+ "\n",
+ "# `y_pred` holds the same test-set values as an independent copy\n",
+ "y_pred = y_pred_test.copy()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### Evaluation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "R-squared (train): 0.058\n",
+ "R-squared (test): 0.06\n",
+ "R-squared adjusted (train): 0.056\n",
+ "R-squared adjusted (test): 0.058\n",
+ "RMSE (train): 109.336\n",
+ "RMSE (test): 105.905\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Calculate r-squared\n",
+ "r_squared_train = r2_score(y_train, y_pred_train)\n",
+ "r_squared_test = r2_score(y_test, y_pred_test)\n",
+ "print('R-squared (train): ', round(r_squared_train, 3))\n",
+ "print('R-squared (test): ', round(r_squared_test, 3))\n",
+ "\n",
+ "# Calculate adjusted r-squared: 1 - (1 - R^2) * (n - 1) / (n - p - 1), where n is the\n",
+ "# number of rows in the split being scored and p is the number of features\n",
+ "n_features = X_train.shape[1]\n",
+ "n_train, n_test = X_train.shape[0], X_test.shape[0]\n",
+ "r_squared_adjusted_train = 1 - ((1 - r_squared_train) * (n_train - 1) / (n_train - n_features - 1))\n",
+ "# The test score is corrected with the test-set row count, not the train-set one\n",
+ "r_squared_adjusted_test = 1 - ((1 - r_squared_test) * (n_test - 1) / (n_test - n_features - 1))\n",
+ "print('R-squared adjusted (train): ', round(r_squared_adjusted_train, 3))\n",
+ "print('R-squared adjusted (test): ', round(r_squared_adjusted_test, 3))\n",
+ "\n",
+ "# Calculate RMSE\n",
+ "rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))\n",
+ "rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n",
+ "print('RMSE (train): ', round(rmse_train, 3))\n",
+ "print('RMSE (test): ', round(rmse_test, 3))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Model 4: KNN"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "KNeighborsRegressor() In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
+ ],
+ "text/plain": [
+ "KNeighborsRegressor()"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# k-nearest-neighbours regression with the estimator's default settings, fitted on the training split\n",
+ "model_4 = KNeighborsRegressor().fit(X_train, y_train)\n",
+ "model_4"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Predict on both splits; the test-set prediction is computed only once\n",
+ "y_pred_train = model_4.predict(X_train)\n",
+ "y_pred_test = model_4.predict(X_test)\n",
+ "\n",
+ "# `y_pred` holds the same test-set values as an independent copy\n",
+ "y_pred = y_pred_test.copy()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### Evaluation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "R-squared (train): 0.384\n",
+ "R-squared (test): 0.068\n",
+ "R-squared adjusted (train): 0.382\n",
+ "R-squared adjusted (test): 0.066\n",
+ "RMSE (train): 88.423\n",
+ "RMSE (test): 105.441\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Calculate r-squared\n",
+ "r_squared_train = r2_score(y_train, y_pred_train)\n",
+ "r_squared_test = r2_score(y_test, y_pred_test)\n",
+ "print('R-squared (train): ', round(r_squared_train, 3))\n",
+ "print('R-squared (test): ', round(r_squared_test, 3))\n",
+ "\n",
+ "# Calculate adjusted r-squared: 1 - (1 - R2) * (n - 1) / (n - p - 1), where n is the\n",
+ "# number of rows in the split being scored and p is the number of feature columns.\n",
+ "# The test score uses the test split's own row count, not the training one.\n",
+ "r_squared_adjusted_train = 1 - ((1 - r_squared_train) * (X_train.shape[0] - 1) / (X_train.shape[0] - X_train.shape[1] - 1))\n",
+ "r_squared_adjusted_test = 1 - ((1 - r_squared_test) * (X_test.shape[0] - 1) / (X_test.shape[0] - X_test.shape[1] - 1))\n",
+ "print('R-squared adjusted (train): ', round(r_squared_adjusted_train, 3))\n",
+ "print('R-squared adjusted (test): ', round(r_squared_adjusted_test, 3))\n",
+ "\n",
+ "# Calculate RMSE (square root of the mean squared error, in the target's units)\n",
+ "rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))\n",
+ "rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n",
+ "print('RMSE (train): ', round(rmse_train, 3))\n",
+ "print('RMSE (test): ', round(rmse_test, 3))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Model 5: Random Forest Classifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[Parallel(n_jobs=1)]: Done 49 tasks | elapsed: 17.7s\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "RandomForestClassifier(max_depth=20, min_samples_leaf=5, n_jobs=1,\n",
+ " random_state=0, verbose=1) In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
+ ],
+ "text/plain": [
+ "RandomForestClassifier(max_depth=20, min_samples_leaf=5, n_jobs=1,\n",
+ " random_state=0, verbose=1)"
+ ]
+ },
+ "execution_count": 51,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Instantiate and train Random Forest Classifier model\n",
+ "# NOTE(review): this is a classifier, yet the evaluation cell below scores its\n",
+ "# predictions with regression metrics (R-squared, RMSE) - confirm whether\n",
+ "# RandomForestRegressor was intended here\n",
+ "model_5 = RandomForestClassifier(\n",
+ "    n_estimators=100,\n",
+ "    max_depth=20,\n",
+ "    min_samples_leaf=5,\n",
+ "    max_features='sqrt',\n",
+ "    random_state=RSEED,\n",
+ "    n_jobs=1,\n",
+ "    verbose=1,\n",
+ ")\n",
+ "model_5.fit(X=X_train, y=y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[Parallel(n_jobs=1)]: Done 49 tasks | elapsed: 9.8s\n",
+ "[Parallel(n_jobs=1)]: Done 49 tasks | elapsed: 39.6s\n",
+ "[Parallel(n_jobs=1)]: Done 49 tasks | elapsed: 9.9s\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Predict each split once; y_pred reuses the test-set predictions instead of\n",
+ "# running a second, identical predict call on X_test\n",
+ "y_pred_train = model_5.predict(X_train)\n",
+ "y_pred_test = model_5.predict(X_test)\n",
+ "\n",
+ "y_pred = y_pred_test"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### Evaluation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "R-squared (train): -0.173\n",
+ "R-squared (test): -0.183\n",
+ "R-squared adjusted (train): -0.175\n",
+ "R-squared adjusted (test): -0.185\n",
+ "RMSE (train): 121.987\n",
+ "RMSE (test): 118.787\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Calculate r-squared\n",
+ "r_squared_train = r2_score(y_train, y_pred_train)\n",
+ "r_squared_test = r2_score(y_test, y_pred_test)\n",
+ "print('R-squared (train): ', round(r_squared_train, 3))\n",
+ "print('R-squared (test): ', round(r_squared_test, 3))\n",
+ "\n",
+ "# Calculate adjusted r-squared: 1 - (1 - R2) * (n - 1) / (n - p - 1), where n is the\n",
+ "# number of rows in the split being scored and p is the number of feature columns.\n",
+ "# The test score uses the test split's own row count, not the training one.\n",
+ "r_squared_adjusted_train = 1 - ((1 - r_squared_train) * (X_train.shape[0] - 1) / (X_train.shape[0] - X_train.shape[1] - 1))\n",
+ "r_squared_adjusted_test = 1 - ((1 - r_squared_test) * (X_test.shape[0] - 1) / (X_test.shape[0] - X_test.shape[1] - 1))\n",
+ "print('R-squared adjusted (train): ', round(r_squared_adjusted_train, 3))\n",
+ "print('R-squared adjusted (test): ', round(r_squared_adjusted_test, 3))\n",
+ "\n",
+ "# Calculate RMSE (square root of the mean squared error, in the target's units)\n",
+ "rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))\n",
+ "rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n",
+ "print('RMSE (train): ', round(rmse_train, 3))\n",
+ "print('RMSE (test): ', round(rmse_test, 3))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Model 6: Random Forest"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.\n",
+ "[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 2.5s\n",
+ "[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 14.3s\n",
+ "[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 31.9s finished\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "RandomForestRegressor(max_depth=30, max_features='sqrt', min_samples_leaf=5,\n",
+ " n_estimators=400, n_jobs=-1, random_state=0, verbose=1) In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
+ ],
+ "text/plain": [
+ "RandomForestRegressor(max_depth=30, max_features='sqrt', min_samples_leaf=5,\n",
+ " n_estimators=400, n_jobs=-1, random_state=0, verbose=1)"
+ ]
+ },
+ "execution_count": 70,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Instantiate and train Random Forest Regressor model\n",
+ "model_6 = RandomForestRegressor(\n",
+ "    n_estimators=300,\n",
+ "    max_depth=30,\n",
+ "    min_samples_leaf=5,\n",
+ "    max_features='sqrt',\n",
+ "    random_state=RSEED,\n",
+ "    n_jobs=-1,\n",
+ "    verbose=1,\n",
+ ")\n",
+ "model_6.fit(X=X_train, y=y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.\n",
+ "[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s\n",
+ "[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.1s\n",
+ "[Parallel(n_jobs=16)]: Done 400 out of 400 | elapsed: 0.2s finished\n",
+ "[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.\n",
+ "[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s\n",
+ "[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.6s\n",
+ "[Parallel(n_jobs=16)]: Done 400 out of 400 | elapsed: 1.6s finished\n",
+ "[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.\n",
+ "[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s\n",
+ "[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s\n",
+ "[Parallel(n_jobs=16)]: Done 400 out of 400 | elapsed: 0.2s finished\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Predict each split once; y_pred reuses the test-set predictions instead of\n",
+ "# running a second, identical predict call on X_test\n",
+ "y_pred_train = model_6.predict(X_train)\n",
+ "y_pred_test = model_6.predict(X_test)\n",
+ "\n",
+ "y_pred = y_pred_test"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "R-squared (train): 0.227\n",
+ "R-squared (test): 0.159\n",
+ "R-squared adjusted (train): 0.225\n",
+ "R-squared adjusted (test): 0.157\n",
+ "RMSE (train): 99.051\n",
+ "RMSE (test): 100.176\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Calculate r-squared\n",
+ "r_squared_train = r2_score(y_train, y_pred_train)\n",
+ "r_squared_test = r2_score(y_test, y_pred_test)\n",
+ "print('R-squared (train): ', round(r_squared_train, 3))\n",
+ "print('R-squared (test): ', round(r_squared_test, 3))\n",
+ "\n",
+ "# Calculate adjusted r-squared: 1 - (1 - R2) * (n - 1) / (n - p - 1), where n is the\n",
+ "# number of rows in the split being scored and p is the number of feature columns.\n",
+ "# The test score uses the test split's own row count, not the training one.\n",
+ "r_squared_adjusted_train = 1 - ((1 - r_squared_train) * (X_train.shape[0] - 1) / (X_train.shape[0] - X_train.shape[1] - 1))\n",
+ "r_squared_adjusted_test = 1 - ((1 - r_squared_test) * (X_test.shape[0] - 1) / (X_test.shape[0] - X_test.shape[1] - 1))\n",
+ "print('R-squared adjusted (train): ', round(r_squared_adjusted_train, 3))\n",
+ "print('R-squared adjusted (test): ', round(r_squared_adjusted_test, 3))\n",
+ "\n",
+ "# Calculate RMSE (square root of the mean squared error, in the target's units)\n",
+ "rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))\n",
+ "rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n",
+ "print('RMSE (train): ', round(rmse_train, 3))\n",
+ "print('RMSE (test): ', round(rmse_test, 3))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Evaluation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Validation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Packaging"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}