From 29bb1b822927d785e06128dab5a72c851ed4b2da Mon Sep 17 00:00:00 2001 From: Claudio <122022571+claudio-tw@users.noreply.github.com> Date: Tue, 2 May 2023 09:40:22 +0100 Subject: [PATCH] Study notebooks - Comparison with Boruta (#25) * Dependence on ensemble - failure of HSIC * Update notebook * Study notebooks - Comparison iwth boruta * hselstudy conda env file * Notebook cosmetics --- hiselstudy.yml | 17 + notebooks/study/ensemble.ipynb | 404 +++++++++++++++++ notebooks/study/nonlinear.ipynb | 758 ++++++++++++++++++++++++++++++++ 3 files changed, 1179 insertions(+) create mode 100644 hiselstudy.yml create mode 100644 notebooks/study/ensemble.ipynb create mode 100644 notebooks/study/nonlinear.ipynb diff --git a/hiselstudy.yml b/hiselstudy.yml new file mode 100644 index 0000000..48d8fbe --- /dev/null +++ b/hiselstudy.yml @@ -0,0 +1,17 @@ +name: hiselstudy +channels: + - conda-forge + - nodefaults +dependencies: + - python=3.9 + - ipython + - ipykernel + - numpy + - pandas + - scipy + - scikit-learn + - shap + - lightgbm + - tqdm + - matplotlib + - pip diff --git a/notebooks/study/ensemble.ipynb b/notebooks/study/ensemble.ipynb new file mode 100644 index 0000000..20d1837 --- /dev/null +++ b/notebooks/study/ensemble.ipynb @@ -0,0 +1,404 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "802e8c73", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "import itertools\n", + "from sklearn.metrics import adjusted_mutual_info_score\n", + "\n", + "\n", + "from hisel import select, hsic\n", + "from hisel.select import FeatureType, HSICSelector as Selector" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "798f7c6d", + "metadata": {}, + "outputs": [], + "source": [ + "k = 5\n", + "n = 10000\n", + "d = 30" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50b99be8", + "metadata": {}, + "outputs": [], + "source": [ + "x0 = 
np.random.randint(k, size=(n, 1))\n", + "x1 = np.random.randint(k, size=(n, 1))\n", + "ms = np.random.randint(low=2, high=20, size = d-2)\n", + "others = [np.random.choice(m, size=(n, 1)) for m in ms]\n", + "all_ = np.concatenate(\n", + " [x0, x1] + others,\n", + " axis=1\n", + ")\n", + "y = np.asarray(x0 == x1, dtype=int) # k + x0 - x1 # np.asarray(x0 == x1, dtype=int)\n", + "permuter = np.random.permutation(np.eye(d, dtype=int).T).T\n", + "x = np.array(all_ @ permuter, dtype=int)\n", + "expected_features = [np.argmax(permuter[0, :]), np.argmax(permuter[1, :])]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6236e9e", + "metadata": {}, + "outputs": [], + "source": [ + "assert np.all(x[:, expected_features[0]] == x0[:, 0])\n", + "assert np.all(x[:, expected_features[1]] == x1[:, 0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f83edaef", + "metadata": {}, + "outputs": [], + "source": [ + "sns.scatterplot(x = x0[:, 0] - x1[:, 0], y = y[:, 0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "140b9f88", + "metadata": {}, + "outputs": [], + "source": [ + "xdf = pd.DataFrame(x, columns = [f'x{i}' for i in range(d)])\n", + "ydf = pd.Series(y[:, 0], name='y')" + ] + }, + { + "cell_type": "markdown", + "id": "e37502d7", + "metadata": {}, + "source": [ + "### Selection with marginal 1D ksg mutual info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "139b18ff", + "metadata": {}, + "outputs": [], + "source": [ + "ksgselection, mis = select.ksgmi(xdf, ydf, threshold=0.01)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ffca204", + "metadata": {}, + "outputs": [], + "source": [ + "print(f'Expected features: {sorted(expected_features)}')\n", + "print(f'Marginal KSG selection: {sorted(ksgselection)}')" + ] + }, + { + "cell_type": "markdown", + "id": "c8906000", + "metadata": {}, + "source": [ + "### Selection with HSIC Lasso" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "1487ff0e", + "metadata": {}, + "outputs": [], + "source": [ + "selector = Selector(x, y, xfeattype=FeatureType.DISCR, yfeattype=FeatureType.DISCR)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "afab6f16", + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = n // 10\n", + "minibatch_size = 200\n", + "number_of_epochs = 3\n", + "threshold = .0\n", + "device = None # run on CPU" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01efe57c", + "metadata": {}, + "outputs": [], + "source": [ + "hsiclasso_selection = selector.select(\n", + " number_of_features=2,\n", + " batch_size=batch_size,\n", + " minibatch_size=minibatch_size,\n", + " number_of_epochs=number_of_epochs,\n", + " device=device\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97929ada", + "metadata": {}, + "outputs": [], + "source": [ + "print(f'Expected features: {sorted(expected_features)}')\n", + "print(f'HSIC Lasso selection: {sorted(hsiclasso_selection)}')" + ] + }, + { + "cell_type": "markdown", + "id": "d88d85c5", + "metadata": {}, + "source": [ + "### Confirm that HSIC_b correctly assigns highest dependence to the correct selection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38056f04", + "metadata": {}, + "outputs": [], + "source": [ + "correct_dependence = n * n * hsic.hsic_b(\n", + " x[:, list(expected_features)],\n", + " y\n", + ")\n", + "nsel = np.random.randint(low=1, high=d)\n", + "random_selection = np.random.choice(list(range(d)), replace=False, size=nsel)\n", + "random_dependence = n * n * hsic.hsic_b(\n", + " x[:, list(random_selection)],\n", + " y\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92bc809f", + "metadata": {}, + "outputs": [], + "source": [ + "print(f'HSIC-estimated dependence between correct selection and target: {correct_dependence}')\n", + "print(f'HSIC-estimated dependence 
between random selection and target: {random_dependence}')" + ] + }, + { + "cell_type": "markdown", + "id": "beb34ecd", + "metadata": {}, + "source": [ + "### Selection with 2D discrete mutual information" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d1459fb", + "metadata": {}, + "outputs": [], + "source": [ + "def onedimlabel(x):\n", + " assert x.ndim == 2\n", + " ns = np.amax(x, axis=0)\n", + " res = np.array(x[:, 0], copy=True)\n", + " m = 1\n", + " for i in range(1, x.shape[1]):\n", + " m *= max(1, ns[i-1])\n", + " res += (1+m) * x[:, i]\n", + " return res" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16a8e7f5", + "metadata": {}, + "outputs": [], + "source": [ + "l = 2\n", + "miscores = {subset: \n", + " adjusted_mutual_info_score(onedimlabel(x[:, list(subset)]), y[:, 0])\n", + " for subset in itertools.combinations(list(range(d)), l)\n", + " \n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "168eb38b", + "metadata": {}, + "outputs": [], + "source": [ + "s = (0,1)\n", + "mi = 0\n", + "for k, v in miscores.items():\n", + " if v > mi:\n", + " s = k\n", + " mi = v\n", + "twod_mi_selection = s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a14eb4e9", + "metadata": {}, + "outputs": [], + "source": [ + "print(f'Expected features: {sorted(expected_features)}')\n", + "print(f'2D discrete MI selection: {sorted(twod_mi_selection)}')" + ] + }, + { + "cell_type": "markdown", + "id": "6776b78e", + "metadata": {}, + "source": [ + "### Selection with Boruta" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "511abe4a", + "metadata": {}, + "outputs": [], + "source": [ + "from arfs.feature_selection import allrelevant\n", + "from arfs.feature_selection.allrelevant import Leshy\n", + "from sklearn.ensemble import RandomForestClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a02901e1", + "metadata": {}, + 
"outputs": [], + "source": [ + "n_estimators = 'auto'\n", + "perc = 95\n", + "alpha = 0.05\n", + "importance = \"shap\"\n", + "two_step = True\n", + "max_iter = 100\n", + "random_state = None\n", + "verbose = 0\n", + "keep_weak = False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "00081320", + "metadata": {}, + "outputs": [], + "source": [ + "xdf = pd.DataFrame(x, columns = [f'f{i}' for i in range(d)])\n", + "yser = pd.Series(y[:, 0], name='y')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f456422", + "metadata": {}, + "outputs": [], + "source": [ + "rf = RandomForestClassifier(n_jobs=-1, max_depth=8)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a36700a6", + "metadata": {}, + "outputs": [], + "source": [ + "leshy = Leshy(\n", + " rf,\n", + " n_estimators=n_estimators,\n", + " perc=perc,\n", + " alpha=alpha,\n", + " importance=importance,\n", + " two_step=two_step,\n", + " max_iter=max_iter,\n", + " random_state=random_state,\n", + " verbose=verbose,\n", + " keep_weak=keep_weak,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9613c874", + "metadata": {}, + "outputs": [], + "source": [ + "leshy.fit(xdf, yser)\n", + "leshy_selection = [int(col.replace('f', '')) for col in leshy.selected_features_]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9056e56", + "metadata": {}, + "outputs": [], + "source": [ + "print(f'Expected features: {sorted(expected_features)}')\n", + "print(f'Boruta selection: {sorted(leshy_selection)}')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "hiselstudy", + "language": "python", + "name": "hiselstudy" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + 
"nbformat_minor": 5 +} diff --git a/notebooks/study/nonlinear.ipynb b/notebooks/study/nonlinear.ipynb new file mode 100644 index 0000000..a9e5e8b --- /dev/null +++ b/notebooks/study/nonlinear.ipynb @@ -0,0 +1,758 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "fd61a5c3", + "metadata": {}, + "outputs": [], + "source": [ + "import datetime\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from scipy.stats import special_ortho_group\n", + "from hisel.select import HSICSelector as Selector" + ] + }, + { + "cell_type": "markdown", + "id": "c2559eae", + "metadata": {}, + "source": [ + "# Sin transform " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b761492", + "metadata": {}, + "outputs": [], + "source": [ + "dim_x = 10\n", + "dim_y = 1 # has to be one-dimensional\n", + "dim_z = 1\n", + "\n", + "batch_size = int(1e+4)\n", + "minibatch_size = 250\n", + "num_of_samples = int(1e+4)\n", + "number_of_epochs = 3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d4f2545", + "metadata": {}, + "outputs": [], + "source": [ + "transform_tilde = np.eye(dim_z)[:dim_y]\n", + "A = np.random.permutation(np.concatenate((np.eye(dim_z), np.zeros((dim_z, dim_x - dim_z))), axis=1).T).T\n", + "transform = transform_tilde @ A" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bde18951", + "metadata": {}, + "outputs": [], + "source": [ + "x_samples = np.random.uniform(size=(num_of_samples, dim_x))\n", + "tt = np.repeat(np.expand_dims(transform, axis=0), repeats=num_of_samples, axis=0)\n", + "prey = (tt @ np.expand_dims(x_samples, axis=2))[:, :, 0]\n", + "y_samples = np.random.normal(0, 3e-1, size=prey.shape) \n", + "y_samples[:, 0] += np.sin(2*np.pi*prey[:, 0])" + ] + }, + { + "cell_type": "markdown", + "id": "1d0b9a75", + "metadata": {}, + "source": [ + "### Viz of relations between target and features" + ] + },
+ { + "cell_type": "markdown", + "id": "9f2e819f", + "metadata": {}, + "source": [ + "Relation between $y$ and the correct feature" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fc8c15a", + "metadata": {}, + "outputs": [], + "source": [ + "expected_features = np.argsort(np.sum(A, axis=0))[::-1][:dim_z]\n", + "sns.scatterplot(x=x_samples[:, expected_features[0]], y=y_samples[:, 0])" + ] + }, + { + "cell_type": "markdown", + "id": "f4305414", + "metadata": {}, + "source": [ + "Relation between $y$ and a wrong feature" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0661e9a", + "metadata": {}, + "outputs": [], + "source": [ + "nonrelevant = set(range(dim_x)).difference(set(expected_features))\n", + "featureidx = np.random.choice(list(nonrelevant))\n", + "sns.scatterplot(x=x_samples[:, featureidx], y=y_samples[:, 0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c8caf8a", + "metadata": {}, + "outputs": [], + "source": [ + "projector = Selector(x_samples, y_samples)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e198c95", + "metadata": {}, + "outputs": [], + "source": [ + "curve = projector.regularization_curve(\n", + " batch_size=batch_size,\n", + " minibatch_size=minibatch_size,\n", + " number_of_epochs=number_of_epochs\n", + ")\n", + "paths = projector.lasso_path()" + ] + }, + { + "cell_type": "markdown", + "id": "6551e522", + "metadata": {}, + "source": [ + "#### Sorted features by decreasing importance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a503fa32", + "metadata": {}, + "outputs": [], + "source": [ + "print(f'Sorted features by decreasing importance: {projector.ordered_features}')" + ] + }, + { + "cell_type": "markdown", + "id": "3b6679bf", + "metadata": {}, + "source": [ + "### Test selection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65f990cd", + "metadata": {}, + "outputs": [], + "source": [ + 
"expected_features = np.argsort(np.sum(A, axis=0))[::-1][:dim_z]\n", + "noise_features = set(range(dim_x)).difference(set(expected_features))\n", + "selected_features = np.argsort(paths.iloc[-1, :])[::-1][:dim_z]\n", + "print(f'Expected features: {sorted(list(expected_features))}')\n", + "print(f'Selected features: {sorted(list(selected_features))}')" + ] + }, + { + "cell_type": "markdown", + "id": "a8bf88af", + "metadata": {}, + "source": [ + "## Comparison with sklearn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "332ba768", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_selection import f_regression, mutual_info_regression" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b76ba28", + "metadata": {}, + "outputs": [], + "source": [ + "fstats, _ = f_regression(x_samples, np.linalg.norm(y_samples, axis=1))\n", + "fstats /= np.max(fstats)\n", + "f_selection = np.argmax(fstats)\n", + "print(f'f_selection: {f_selection}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ceec08f5", + "metadata": {}, + "outputs": [], + "source": [ + "mi = mutual_info_regression(x_samples, np.linalg.norm(y_samples, axis=1))\n", + "mi /= np.max(mi)\n", + "mi_selection = np.argmax(mi)\n", + "print(f'mi_selection: {mi_selection}')" + ] + }, + { + "cell_type": "markdown", + "id": "b455ad3a", + "metadata": {}, + "source": [ + "## Comparison with Boruta" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96b8a9cb", + "metadata": {}, + "outputs": [], + "source": [ + "from arfs.feature_selection import allrelevant\n", + "from arfs.feature_selection.allrelevant import Leshy\n", + "from sklearn.ensemble import RandomForestRegressor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "221af1e1", + "metadata": {}, + "outputs": [], + "source": [ + "n_estimators = 'auto'\n", + "perc = 90\n", + "alpha = 0.05\n", + "importance = \"shap\"\n", + "two_step = True\n", 
+ "max_iter = 100\n", + "random_state = None\n", + "verbose = 0\n", + "keep_weak = False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b585adf9", + "metadata": {}, + "outputs": [], + "source": [ + "xdf = pd.DataFrame(x_samples, columns = [f'f{i}' for i in range(dim_x)])\n", + "yser = pd.Series(y_samples[:, 0], name='y')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ce26143", + "metadata": {}, + "outputs": [], + "source": [ + "rf = RandomForestRegressor(n_jobs=-1, max_depth=8)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a996e76c", + "metadata": {}, + "outputs": [], + "source": [ + "leshy = Leshy(\n", + " rf,\n", + " n_estimators=n_estimators,\n", + " perc=perc,\n", + " alpha=alpha,\n", + " importance=importance,\n", + " two_step=two_step,\n", + " max_iter=max_iter,\n", + " random_state=random_state,\n", + " verbose=verbose,\n", + " keep_weak=keep_weak,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70b33240", + "metadata": {}, + "outputs": [], + "source": [ + "leshy.fit(xdf, yser)" + ] + }, + { + "cell_type": "markdown", + "id": "fb4528e0", + "metadata": {}, + "source": [ + "### Test selection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93b73a26", + "metadata": {}, + "outputs": [], + "source": [ + "expected_features = np.argsort(np.sum(A, axis=0))[::-1][:dim_z]\n", + "leshy_selection = [int(col.replace('f', '')) for col in leshy.selected_features_]\n", + "print(f'Expected features: {sorted(list(expected_features))}')\n", + "print(f'Leshy-selected features: {sorted(list(leshy_selection))}')" + ] + }, + { + "cell_type": "markdown", + "id": "7771b83e", + "metadata": {}, + "source": [ + "# Linear and non-linear transformation in high dimension" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10edf512", + "metadata": {}, + "outputs": [], + "source": [ + "dim_x = 20\n", + "dim_y = 1 # has to be 
one-dimensional\n", + "dim_z = 5\n", + "\n", + "batch_size = int(1e+4)\n", + "minibatch_size = 250\n", + "num_of_samples = int(1e+4)\n", + "number_of_epochs = 3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ac11521", + "metadata": {}, + "outputs": [], + "source": [ + "transform_tilde = special_ortho_group.rvs(dim_z)[:dim_y]\n", + "A = np.random.permutation(np.concatenate((np.eye(dim_z), np.zeros((dim_z, dim_x - dim_z))), axis=1).T).T\n", + "transform = transform_tilde @ A" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15eb3b4c", + "metadata": {}, + "outputs": [], + "source": [ + "x_samples = np.random.uniform(size=(num_of_samples, dim_x))\n", + "tt = np.repeat(np.expand_dims(transform, axis=0), repeats=num_of_samples, axis=0)\n", + "prey = (tt @ np.expand_dims(x_samples, axis=2))[:, :, 0]\n", + "y_samples = np.random.normal(0, 1e-2, size=prey.shape) # np.zeros_like(prey)\n", + "y_samples[:, 0] = np.sin(2*np.pi*prey[:, 0])" + ] + }, + { + "cell_type": "markdown", + "id": "742b8c90", + "metadata": {}, + "source": [ + "### Viz of relations between target and features" + ] + }, + { + "cell_type": "markdown", + "id": "286a7886", + "metadata": {}, + "source": [ + "Because of the initial rotation, visual inspection of the relation between features and target does not give insight into which features should be selected." + ] + }, + { + "cell_type": "markdown", + "id": "dbc33430", + "metadata": {}, + "source": [ + "Relation between $y$ and a correct feature" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cbb24917", + "metadata": {}, + "outputs": [], + "source": [ + "expected_features = np.argsort(np.sum(A, axis=0))[::-1][:dim_z]\n", + "sns.scatterplot(x=x_samples[:, expected_features[0]], y=y_samples[:, 0])" + ] + }, + { + "cell_type": "markdown", + "id": "dfad4947", + "metadata": {}, + "source": [ + "Relation between $y$ and a wrong feature" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "id": "12f86383", + "metadata": {}, + "outputs": [], + "source": [ + "nonrelevant = set(range(dim_x)).difference(set(expected_features))\n", + "featureidx = np.random.choice(list(nonrelevant))\n", + "sns.scatterplot(x=x_samples[:, featureidx], y=y_samples[:, 0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49701075", + "metadata": {}, + "outputs": [], + "source": [ + "projector = Selector(x_samples, y_samples)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1af2d5e4", + "metadata": {}, + "outputs": [], + "source": [ + "curve = projector.regularization_curve(\n", + " batch_size=batch_size,\n", + " minibatch_size=minibatch_size,\n", + " number_of_epochs=number_of_epochs\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c96e444f", + "metadata": {}, + "outputs": [], + "source": [ + "paths = projector.lasso_path()" + ] + }, + { + "cell_type": "markdown", + "id": "bd1a2150", + "metadata": {}, + "source": [ + "#### Sorted features by decreasing importance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59656d81", + "metadata": {}, + "outputs": [], + "source": [ + "print(f'Sorted features by decreasing importance: {projector.ordered_features}')" + ] + }, + { + "cell_type": "markdown", + "id": "85ff9d3c", + "metadata": {}, + "source": [ + "### Test selection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55d268d9", + "metadata": {}, + "outputs": [], + "source": [ + "expected_features = np.argsort(np.sum(A, axis=0))[::-1][:dim_z]\n", + "noise_features = set(range(dim_x)).difference(set(expected_features))\n", + "selected_features = np.argsort(paths.iloc[-1, :])[::-1][:dim_z]\n", + "print(f'Expected features: {sorted(list(expected_features))}')\n", + "print(f'Selected features: {sorted(list(selected_features))}')" + ] + }, + { + "cell_type": "markdown", + "id": "4e5280cf", + "metadata": {}, + "source": [ + "## Regularisation curve" + 
] + }, + { + "cell_type": "markdown", + "id": "b578c3c6", + "metadata": {}, + "source": [ + "#### Cumulative beta" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71c90034", + "metadata": {}, + "outputs": [], + "source": [ + "plt.plot(np.arange(1, 1+len(curve)), curve)" + ] + }, + { + "cell_type": "markdown", + "id": "64d50369", + "metadata": {}, + "source": [ + "#### Absolute beta" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4ae8aab", + "metadata": {}, + "outputs": [], + "source": [ + "plt.plot(np.arange(1, len(curve)), np.abs(np.diff(curve)))" + ] + }, + { + "cell_type": "markdown", + "id": "791889d0", + "metadata": {}, + "source": [ + "## Lasso paths" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df408f64", + "metadata": {}, + "outputs": [], + "source": [ + "paths.plot(figsize=(10, 5))" + ] + }, + { + "cell_type": "markdown", + "id": "87b7675f", + "metadata": {}, + "source": [ + "## Comparison with sklearn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30f6f83b", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_selection import f_regression, mutual_info_regression" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24df734c", + "metadata": {}, + "outputs": [], + "source": [ + "fstats, _ = f_regression(x_samples, np.linalg.norm(y_samples, axis=1))\n", + "fstats /= np.max(fstats)\n", + "f_selection = np.argsort(fstats)[::-1][:dim_z]\n", + "print(f'Expected features: {sorted(list(expected_features))}')\n", + "print(f'f_selection: {sorted(f_selection)}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d84477ee", + "metadata": {}, + "outputs": [], + "source": [ + "mi = mutual_info_regression(x_samples, y_samples)\n", + "mi /= np.max(mi)\n", + "mi_selection = np.argsort(mi)[::-1][:dim_z]\n", + "print(f'Expected features: {sorted(list(expected_features))}')\n", + "print(f'mi_selection: 
{sorted(mi_selection)}')" + ] + }, + { + "cell_type": "markdown", + "id": "f5ada8be", + "metadata": {}, + "source": [ + "## Comparison with Boruta" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "511abe4a", + "metadata": {}, + "outputs": [], + "source": [ + "from arfs.feature_selection import allrelevant\n", + "from arfs.feature_selection.allrelevant import Leshy\n", + "from sklearn.ensemble import RandomForestRegressor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a02901e1", + "metadata": {}, + "outputs": [], + "source": [ + "n_estimators = 'auto'\n", + "perc = 90\n", + "alpha = 0.05\n", + "importance = \"shap\"\n", + "two_step = True\n", + "max_iter = 100\n", + "random_state = None\n", + "verbose = 0\n", + "keep_weak = False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "00081320", + "metadata": {}, + "outputs": [], + "source": [ + "xdf = pd.DataFrame(x_samples, columns = [f'f{i}' for i in range(dim_x)])\n", + "yser = pd.Series(y_samples[:, 0], name='y')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f456422", + "metadata": {}, + "outputs": [], + "source": [ + "rf = RandomForestRegressor(n_jobs=-1, max_depth=8)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a36700a6", + "metadata": {}, + "outputs": [], + "source": [ + "leshy = Leshy(\n", + " rf,\n", + " n_estimators=n_estimators,\n", + " perc=perc,\n", + " alpha=alpha,\n", + " importance=importance,\n", + " two_step=two_step,\n", + " max_iter=max_iter,\n", + " random_state=random_state,\n", + " verbose=verbose,\n", + " keep_weak=keep_weak,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9613c874", + "metadata": {}, + "outputs": [], + "source": [ + "leshy.fit(xdf, yser)" + ] + }, + { + "cell_type": "markdown", + "id": "7b37b316", + "metadata": {}, + "source": [ + "### Test selection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"cf586f09", + "metadata": {}, + "outputs": [], + "source": [ + "expected_features = np.argsort(np.sum(A, axis=0))[::-1][:dim_z]\n", + "leshy_selection = [int(col.replace('f', '')) for col in leshy.selected_features_]\n", + "print(f'Expected features: {sorted(list(expected_features))}')\n", + "print(f'Leshy-selected features: {sorted(list(leshy_selection))}')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "hiselstudy", + "language": "python", + "name": "hiselstudy" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}