
Commit

KoppAlexander committed Jul 12, 2024
2 parents 910603d + 7a40b58 commit 0227b27
Showing 5 changed files with 116 additions and 129 deletions.
12 changes: 11 additions & 1 deletion README.md
@@ -2,7 +2,17 @@

## Flight Prediction Test on Airport Data from a Tunisian Airline

Based on several machine learning classifiers, this project tries to predict delays of individual airplanes.

### Set up the Presentation

- The presentation can be started with Streamlit. Make sure Streamlit is installed in your environment, as listed in the requirements.
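
If Streamlit is not installed yet, the pinned dependencies can be installed from the requirements file first (a minimal sketch, assuming the command is run from the repository root):

```BASH
pip install -r requirements.txt
```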

```BASH
streamlit run app.py
```
After that, a local server starts and the app opens in your default browser.



## Set up your Environment
3 changes: 2 additions & 1 deletion app.py
@@ -32,6 +32,7 @@

# Sidebar for navigation
page = st.sidebar.radio("Overview", ["Case Study", "Data", "Analysis", "Model"])

# Introduction page
if page == "Case Study":
@@ -159,7 +160,7 @@ def load_data():


image = Image.open('images/delay_categories_distribution.png')
st.image(image, caption='Different categories for target', use_column_width=True)
st.image(image, use_column_width=True)



39 changes: 37 additions & 2 deletions example_files/train.py
@@ -194,7 +194,6 @@
duplicate_columns = df.columns[df.columns.duplicated()]
df = df.loc[:, ~df.columns.duplicated()]

# Target engineering
# Convert the target into category intervals

def target_interval(row):
@@ -212,7 +211,7 @@ def target_interval(row):
return 6

df['target_cat'] = df.apply(target_interval, axis=1)

# Standardization

# Create a StandardScaler object
@@ -231,7 +230,43 @@ def target_interval(row):
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=RSEED)

# Train model
# Define the parameter distribution for random search
param_dist = {
'n_estimators': randint(50, 100), # Reduced upper bound
'learning_rate': uniform(0.01, 0.5), # Reduced upper bound
'base_estimator__max_depth': randint(1, 5), # Reduced upper bound
'base_estimator__min_samples_split': randint(2, 10), # Reduced upper bound
'base_estimator__min_samples_leaf': randint(1, 10), # Reduced upper bound
'algorithm': ['SAMME', 'SAMME.R']
}

# Create a base model
base_estimator = DecisionTreeClassifier(random_state=RSEED)
ada = AdaBoostClassifier(base_estimator=base_estimator, random_state=RSEED)

# Create a custom scorer (you can change this to other metrics if needed)
scorer = make_scorer(f1_score)

# Instantiate RandomizedSearchCV object
random_search = RandomizedSearchCV(
estimator=ada,
param_distributions=param_dist,
n_iter=50, # Reduced number of iterations
cv=3, # Reduced number of cross-validation folds
scoring=scorer,
random_state=RSEED,
n_jobs=-1 # use all available cores
)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)

# Get the best model
model = random_search.best_estimator_

# Save the model
dump(model, 'models/model.joblib')
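
For completeness, a minimal sketch of how the persisted model could be loaded again (e.g. in the Streamlit app); the `models/model.joblib` path matches the `dump` call above, while the inspection step is only illustrative:

```python
from joblib import load

# Restore the AdaBoostClassifier fitted and saved by train.py
model = load('models/model.joblib')

# Inspect the hyperparameters chosen by the randomized search
print(model.get_params())
```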
188 changes: 64 additions & 124 deletions project_classification.ipynb
@@ -222701,57 +222701,60 @@
"#### Model 2: KNeighborsClassifier"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### GridSearchCV"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"param_grid = {\n",
" 'n_neighbors': [3, 5, 7],\n",
" 'weights': ['uniform', 'distance'],\n",
" 'metric': ['euclidean', 'manhattan']\n",
"}\n",
"\n",
"Best parameters: {'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'uniform'}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Instantiate and train KNeighborsClassifier\n",
"#model_2 = KNeighborsClassifier(**grid_search.best_params_) # Get best parameters from grid search\n",
"model_2 = KNeighborsClassifier(n_neighbors=7, weights='uniform', metric='manhattan')\n",
"model_2.fit(X_train, y_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### GridSearchCV"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Definition of hyperparameters\n",
"param_grid = {\n",
"param_dist = {\n",
" 'n_neighbors': [3, 5, 7],\n",
" 'weights': ['uniform', 'distance'],\n",
" 'metric': ['euclidean', 'manhattan']\n",
"}\n",
"\n",
"# GridSearchCV initiation\n",
"grid_search = GridSearchCV(model_2, param_grid, cv=5)\n",
"# Create a custom scorer (you can change this to other metrics if needed)\n",
"scorer = make_scorer(f1_score)\n",
"\n",
"# Instantiate RandomizedSearchCV object\n",
"random_search = RandomizedSearchCV(\n",
" estimator=model_2,\n",
" param_distributions=param_dist,\n",
" n_iter=100, # number of parameter settings that are sampled\n",
" cv=5, # number of cross-validation folds\n",
" scoring=scorer,\n",
" random_state=RSEED,\n",
" n_jobs=-1 # use all available cores\n",
")\n",
"\n",
"# Fit RandomizedSearchCV\n",
"random_search.fit(X_train, y_train)\n",
"\n",
"# Search for best parameters\n",
"grid_search.fit(X_train, y_train)\n",
"print(\"Best parameters:\", grid_search.best_params_)"
"# Print the best parameters and score\n",
"print(\"Best parameters:\", random_search.best_params_)\n",
"print(\"Best cross-validation score:\", random_search.best_score_)\n",
"\n",
"# Get the best model\n",
"model_2 = random_search.best_estimator_"
]
},
{
@@ -222945,7 +222948,7 @@
"source": [
"# Define the parameter distribution for random search\n",
"param_dist = {\n",
" #'criterion': ['gini', 'entropy'],\n",
" 'criterion': ['gini', 'entropy'],\n",
" 'max_depth': randint(5, 20),\n",
" 'min_samples_split': randint(2, 20),\n",
" 'min_samples_leaf': randint(2, 20),\n",
@@ -223170,31 +223173,35 @@
"outputs": [],
"source": [
"# Definition of hyperparameters\n",
"param_grid = {\n",
" 'n_estimators': [100, 200, 700],\n",
" 'max_depth': [10, 20, 30],\n",
" 'max_features': ['auto', 'sqrt', 'log2']\n",
"param_dist = {\n",
" 'n_neighbors': [3, 5, 7],\n",
" 'weights': ['uniform', 'distance'],\n",
" 'metric': ['euclidean', 'manhattan']\n",
"}\n",
"\n",
"# GridSearchCV initiation\n",
"grid_search = GridSearchCV(model_4, param_grid, cv=5)\n",
"# Create a custom scorer (you can change this to other metrics if needed)\n",
"scorer = make_scorer(f1_score)\n",
"\n",
"# Search for best parameters\n",
"grid_search.fit(X_train, y_train)\n",
"print(\"Best parameters:\", grid_search.best_params_)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"param_grid = {\n",
" 'n_estimators': [100, 200, 700],\n",
" 'max_depth': [10, 20, 30],\n",
" 'max_features': ['auto', 'sqrt', 'log2']\n",
"}\n",
"# Instantiate RandomizedSearchCV object\n",
"random_search = RandomizedSearchCV(\n",
" estimator=model_4,\n",
" param_distributions=param_dist,\n",
" n_iter=100, # number of parameter settings that are sampled\n",
" cv=5, # number of cross-validation folds\n",
" scoring=scorer,\n",
" random_state=RSEED,\n",
" n_jobs=-1 # use all available cores\n",
")\n",
"\n",
"Best parameters: "
"# Fit RandomizedSearchCV\n",
"random_search.fit(X_train, y_train)\n",
"\n",
"# Print the best parameters and score\n",
"print(\"Best parameters:\", random_search.best_params_)\n",
"print(\"Best cross-validation score:\", random_search.best_score_)\n",
"\n",
"# Get the best model\n",
"model_4 = random_search.best_estimator_"
]
},
{
@@ -225850,88 +225857,21 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Ideas for model improvement:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# - Distribution of variables\n",
"# - Plot prediction on target\n",
"# - Poisson regressor? Just positive predictions (only positive delay!)\n",
"# - SMOTE for inbalanced classes?\n",
"# - Run code externally (aws = not free, https://stats.stackexchange.com/questions/12900/when-is-r-squared-negative, https://colab.research.google.com/, kaggle.com)\n",
"# - Windows: setup nvidia to use GPU\n",
"# - time series with sin, cos so that Mon = 1 und Sun = 7 with big difference but continously distributed and End and Start are close!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Validation"
"## Ideas for model improvement:"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Packaging"
"- Distribution of variables\n",
"- Plot prediction on target\n",
"- Poisson regressor? Just positive predictions (only positive delay!)\n",
"- SMOTE for inbalanced classes?\n",
"- Run code externally (aws = not free, https://stats.stackexchange.com/questions/12900/when-is-r-squared-negative, https://colab.research.google.com/, kaggle.com)\n",
"- Windows: setup nvidia to use GPU\n",
"- time series with sin, cos so that Mon = 1 und Sun = 7 with big difference but continously distributed and End and Start are close!"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
}
],
"metadata": {
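
One idea in the improvement list above is to encode cyclical time features with sine and cosine so that the end and the start of the week land next to each other. A minimal sketch of that encoding (the `day_of_week` column with Monday = 1 … Sunday = 7 is an assumption and does not appear in the diff):

```python
import numpy as np
import pandas as pd

# Hypothetical day-of-week feature: Monday = 1 ... Sunday = 7
df = pd.DataFrame({'day_of_week': [1, 2, 3, 4, 5, 6, 7]})

# Map the weekly cycle onto the unit circle so that Sunday (7) and Monday (1) end up adjacent
angle = 2 * np.pi * (df['day_of_week'] - 1) / 7
df['dow_sin'] = np.sin(angle)
df['dow_cos'] = np.cos(angle)

print(df[['day_of_week', 'dow_sin', 'dow_cos']])
```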
3 changes: 2 additions & 1 deletion requirements.txt
@@ -9,4 +9,5 @@ jupyterlab-dash==0.1.0a3
scikit-learn==1.2.2
statsmodels==0.13.5
pytest==7.3.1
xgboost==1.24.3
streamlit==1.36.0

