From f4856c5db2965bf203ca8ea8fc50e56f34090744 Mon Sep 17 00:00:00 2001 From: Remi Tschupp Date: Tue, 25 Jun 2024 17:08:54 +0200 Subject: [PATCH] refactoring for data --- titanic/titanic_data_handling.ipynb | 39 +++++++++++++++++------------ 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/titanic/titanic_data_handling.ipynb b/titanic/titanic_data_handling.ipynb index c787457..7ac919f 100644 --- a/titanic/titanic_data_handling.ipynb +++ b/titanic/titanic_data_handling.ipynb @@ -26,14 +26,21 @@ "## EDA" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Only the labelled training file will be used to train our model. The other, unlabelled file can be used for inference tests to demonstrate the capabilities of the model." + ] + }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ - "train = pd.read_csv(os.path.join(gen_dirname,r\"data\train.csv\"))\n", - "test = pd.read_csv(os.path.join(gen_dirname,r\"data\test.csv\"))" + "labelled_train = pd.read_csv(os.path.join(gen_dirname,r\"data\labelled_train.csv\"))\n", + "unlabelled_test = pd.read_csv(os.path.join(gen_dirname,r\"data\unlabelled_test.csv\"))" ] }, { @@ -185,7 +192,7 @@ } ], "source": [ - "train.head()" + "labelled_train.head()" ] }, { @@ -238,8 +245,8 @@ } ], "source": [ - "train.info()\n", - "test.info()" + "labelled_train.info()\n", + "unlabelled_test.info()" ] }, { @@ -279,8 +286,8 @@ "metadata": {}, "outputs": [], "source": [ - "train_rough = train.drop(features_to_drop_rough,axis=\"columns\")\n", - "test_rough = test.drop(features_to_drop_rough,axis=\"columns\")" + "train_rough = labelled_train.drop(features_to_drop_rough,axis=\"columns\")\n", + "test_rough = unlabelled_test.drop(features_to_drop_rough,axis=\"columns\")" ] }, { @@ -340,7 +347,7 @@ } ], "source": [ - "test_rough.loc[test[\"Fare\"].isnull()]" + "test_rough.loc[unlabelled_test[\"Fare\"].isnull()]" ] }, { @@ -356,7 +363,7 @@ "metadata": {}, "outputs": [], "source": [ - "test_rough[\"Fare\"] = 
test_rough[\"Fare\"].fillna(test[\"Fare\"].mean())" + "test_rough[\"Fare\"] = test_rough[\"Fare\"].fillna(unlabelled_test[\"Fare\"].mean())" ] }, { @@ -372,8 +379,8 @@ "metadata": {}, "outputs": [], "source": [ - "train_gentle = train.drop(features_to_drop_gentle,axis=\"columns\")\n", - "test_gentle = test.drop(features_to_drop_gentle,axis=\"columns\")" + "train_gentle = labelled_train.drop(features_to_drop_gentle,axis=\"columns\")\n", + "test_gentle = unlabelled_test.drop(features_to_drop_gentle,axis=\"columns\")" ] }, { @@ -393,7 +400,7 @@ "metadata": {}, "outputs": [], "source": [ - "test_gentle[\"Fare\"] = test_gentle[\"Fare\"].fillna(test[\"Fare\"].mean())" + "test_gentle[\"Fare\"] = test_gentle[\"Fare\"].fillna(unlabelled_test[\"Fare\"].mean())" ] }, { @@ -443,8 +450,8 @@ "source": [ "save_folder = os.path.join(gen_dirname,r\"data\\rough\")\n", "os.makedirs(save_folder,exist_ok = True)\n", - "train_rough.to_csv(os.path.join(save_folder,\"train.csv\"),index=False)\n", - "test_rough.to_csv(os.path.join(save_folder,\"test.csv\"),index=False)" + "train_rough.to_csv(os.path.join(save_folder,\"labelled.csv\"),index=False)\n", + "test_rough.to_csv(os.path.join(save_folder,\"unlabelled.csv\"),index=False)" ] }, { @@ -455,8 +462,8 @@ "source": [ "save_folder = os.path.join(gen_dirname,r\"data\\gentle\")\n", "os.makedirs(save_folder,exist_ok = True)\n", - "train_gentle.to_csv(os.path.join(save_folder,\"train.csv\"),index=False)\n", - "test_gentle.to_csv(os.path.join(save_folder,\"test.csv\"),index=False)" + "train_gentle.to_csv(os.path.join(save_folder,\"labelled.csv\"),index=False)\n", + "test_gentle.to_csv(os.path.join(save_folder,\"unlabelled.csv\"),index=False)" ] } ],