From 165c92b331162f4d6f6a4de5cb453bbeac4c9cc4 Mon Sep 17 00:00:00 2001 From: Winston Date: Thu, 15 Aug 2019 11:06:13 -0700 Subject: [PATCH 1/7] initial commit for WIP pr & issue notebook addition; currently seeing parse_dates arg error on first read_csv; new issue as of yesterday; otherwise notebook was/should be running fine as labeled/noted --- .../zillow_kaggle_zestimate_comp.ipynb | 3046 +++++++++++++++++ 1 file changed, 3046 insertions(+) create mode 100644 colab_notebooks/zillow_kaggle_zestimate_comp.ipynb diff --git a/colab_notebooks/zillow_kaggle_zestimate_comp.ipynb b/colab_notebooks/zillow_kaggle_zestimate_comp.ipynb new file mode 100644 index 00000000..24a1849f --- /dev/null +++ b/colab_notebooks/zillow_kaggle_zestimate_comp.ipynb @@ -0,0 +1,3046 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "zillow_kaggle_zestimate_comp.ipynb", + "version": "0.3.2", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "scfLT2i0MLyD", + "colab_type": "text" + }, + "source": [ + "# Environment Sanity Check #\n", + "\n", + "Click the _Runtime_ dropdown at the top of the page, then _Change Runtime Type_ and confirm the instance type is _GPU_.\n", + "\n", + "Check the output of `!nvidia-smi` to make sure you've been allocated a Tesla T4.\n", + "\n", + "#Setup:\n", + "\n", + "1. Install most recent Miniconda release compatible with Google Colab's Python install (3.6.7)\n", + "2. Install RAPIDS libraries\n", + "3. Set necessary environment variables\n", + "4. 
Copy RAPIDS .so files into current working directory, a workaround for conda/colab interactions\n", + "- **TLDR**\n", + " - Hit `Shift` + `Enter`" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "W-um5d-x7o46", + "colab_type": "code", + "outputId": "37bf77fb-7f83-49fc-b5e5-514cd049e32d", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 329 + } + }, + "source": [ + "\"\"\"make sure we have the right GPU\n", + "> column 1 row 3 == Tesla T4\n", + "\"\"\"\n", + "# display gpu specs\n", + "!nvidia-smi" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Thu Aug 15 03:12:33 2019 \n", + "+-----------------------------------------------------------------------------+\n", + "| NVIDIA-SMI 418.67 Driver Version: 410.79 CUDA Version: 10.0 |\n", + "|-------------------------------+----------------------+----------------------+\n", + "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", + "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. 
|\n", + "|===============================+======================+======================|\n", + "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", + "| N/A 60C P8 16W / 70W | 0MiB / 15079MiB | 0% Default |\n", + "+-------------------------------+----------------------+----------------------+\n", + " \n", + "+-----------------------------------------------------------------------------+\n", + "| Processes: GPU Memory |\n", + "| GPU PID Type Process name Usage |\n", + "|=============================================================================|\n", + "| No running processes found |\n", + "+-----------------------------------------------------------------------------+\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kkEdr1VmigyU", + "colab_type": "text" + }, + "source": [ + "### Install RAPIDS AI" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "p129YxxnihcV", + "colab_type": "code", + "colab": {} + }, + "source": [ + "!wget -nc https://raw.githubusercontent.com/rapidsai/notebooks-contrib/master/utils/rapids-colab.sh\n", + "!bash rapids-colab.sh\n", + "\n", + "import sys, os\n", + "\n", + "sys.path.append('/usr/local/lib/python3.6/site-packages/')\n", + "os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'\n", + "os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1CsdVW7SU9Li", + "colab_type": "text" + }, + "source": [ + "# Zillow Kaggle Competition RAPIDS Conversion\n", + "- initially based off eswar3's [Zillow prediction models](https://github.com/eswar3/Zillow-prediction-models) repo\n", + "## Download Data\n", + "- to download the data, please plug in your kaggle api username & key\n", + " - you can set up your kaggle api at `https://www.kaggle.com/YOUR USERNAME HERE/account`" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "x1dLRTm168Tk", + "colab_type": 
"code", + "colab": {} + }, + "source": [ + "# Info on how to get your api key (kaggle.json) here: https://github.com/Kaggle/kaggle-api#api-credentials\n", + "!pip install kaggle\n", + "!mkdir -p /root/.kaggle\n", + "# plug api -- replace both placeholders with your own kaggle username & API key (never commit a real key)\n", + "!echo '{\"username\":\"YOUR_KAGGLE_USERNAME\",\"key\":\"YOUR_KAGGLE_KEY\"}' > /root/.kaggle/kaggle.json\n", + "!chmod 600 /root/.kaggle/kaggle.json\n", + "# !kaggle datasets download\n", + "!kaggle competitions download -c zillow-prize-1\n", + "\n", + "# unzip kaggle data (-o: overwrite without prompting so re-runs do not stall)\n", + "!unzip -q -o \"/content/sample_submission.csv.zip\"\n", + "!unzip -q -o \"/content/train_2016_v2.csv.zip\"\n", + "!unzip -q -o \"/content/properties_2016.csv.zip\"\n", + "!unzip -q -o \"/content/train_2017.csv.zip\"\n", + "!unzip -q -o \"/content/properties_2017.csv.zip\"" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LICr9uz8do9K", + "colab_type": "text" + }, + "source": [ + "#### How is the data saved?\n", + "- inside content directory " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "6n75DyJ-dm4B", + "colab_type": "code", + "outputId": "fbd949ae-aa45-4c67-c6e2-74553239623e", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 182 + } + }, + "source": [ + "# display content folder contents\n", + "!ls \"/content/\"" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "0.9\t\t\t\t sample_data\n", + "env-check.py\t\t\t sample_submission.csv\n", + "__MACOSX\t\t\t sample_submission.csv.zip\n", + "Miniconda3-4.5.4-Linux-x86_64.sh train_2016_v2.csv\n", + "properties_2016.csv\t\t train_2016_v2.csv.zip\n", + "properties_2016.csv.zip\t\t train_2017.csv\n", + "properties_2017.csv\t\t train_2017.csv.zip\n", + "properties_2017.csv.zip\t\t zillow_data_dictionary.xlsx.zip\n", + "rapids-colab.sh\n" + ], + "name": "stdout" + } + 
] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lpa1b4edIXuT", + "colab_type": "text" + }, + "source": [ + "# Imports\n", + "### 
RAPIDS\n", + "* `cuDF`\n", + " - GPU DataFrame library with a pandas-like API\n", + "* `cuML`\n", + " - GPU-accelerated machine learning algorithms\n", + "* `CuPy`\n", + " - NumPy-compatible arrays on the GPU\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "_Tvf2biLAA9r", + "colab": {} + }, + "source": [ + "# rapids imports\n", + "import cudf, cuml, cupy\n", + "# general imports \n", + "import io, requests " + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YJeywzd2efw7", + "colab_type": "text" + }, + "source": [ + "## Data\n", + "* `properties_2016`\n", + " - approx. 27,000,000 residential properties \n", + " - 58 attributes each\n", + "* `train_2016_v2`\n", + " - 90,000 transaction records for closings in the year 2016\n", + " * Merge datasets on `parcelid`" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "uynoUxpx8Xsn", + "colab_type": "code", + "outputId": "545d3b69-741a-4f23-86df-62ec7f19fb7d", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 227 + } + }, + "source": [ + "# import train 2016 data\n", + "train2016 = cudf.read_csv('/content/train_2016_v2.csv',\n", + " parse_dates=[\"transactiondate\"])\n", + "# peek display 2016 train\n", + "print(train2016.head())" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "error", + "ename": "TypeError", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m train2016 = cudf.read_csv('/content/train_2016_v2.csv',\n\u001b[0;32m----> 2\u001b[0;31m parse_dates=[\"transactiondate\"])\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;31m# peek display 2016 train\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m 
\u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain2016\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mTypeError\u001b[0m: read_csv() got an unexpected keyword argument 'parse_dates'" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "2EfApIzCfEtr", + "colab_type": "code", + "outputId": "eabb1351-f4f9-499c-9aea-2fa2953c11a7", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 146 + } + }, + "source": [ + "# import 2016 properties\n", + "prop2016 = cudf.read_csv('/content/properties_2016.csv')\n", + "# peek display 2016 properties\n", + "print(prop2016.head())" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + " parcelid airconditioningtypeid architecturalstyletypeid basementsqft bathroomcnt bedroomcnt buildingclasstypeid ... censustractandblock\n", + "0 10754147 0.0 0.0 ... \n", + "1 10759547 0.0 0.0 ... \n", + "2 10843547 0.0 0.0 ... \n", + "3 10859147 0.0 0.0 3.0 ... \n", + "4 10879947 0.0 0.0 4.0 ... \n", + "[50 more columns]\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gGiscxESJDrl", + "colab_type": "text" + }, + "source": [ + "## [Zillow Prediction Model](https://github.com/eswar3/Zillow-prediction-models/blob/master/Step%202a-Approach1.ipynb)\n", + "\n", + " In this approach the properties data and transaction data are merged together before adressing any missing values\n", + "\n", + "\n", + "#### Merging Data \n", + " - we will start by merging the two dataframes\n", + " - then rename the new dataframe's attributes to be meaningful \n", + " - e.g. 
from `pooltypeid7` to `pool_with_spa_tub_no` and `structuretaxvaluedollarcnt` to `structure_tax`" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "o4CvSIcwm4B2", + "colab_type": "code", + "outputId": "6db5ec53-8522-4483-e2fa-d79d9d9d75e8", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 146 + } + }, + "source": [ + "# merge 2016 train and property dataframes by parcel id\n", + "train = train2016.merge(prop2016, how='left', on='parcelid')\n", + "\n", + "# work on a copy\n", + "df_train = train.copy() # [:int(0.5*len(train))]\n", + "\n", + "# add column inidcaticating month of transaction\n", + "df_train['transaction_month'] = df_train['transactiondate'].dt.month\n", + "\n", + "# set colums to be renamed for general english understandability \n", + "rename_these = {\"bathroomcnt\": \"total_bath\",\n", + " \"fullbathcnt\": \"full_bath\",\n", + " \"threequarterbathnbr\": \"half_bath\",\n", + " \"yardbuildingsqft17\": \"patio_sqft\",\n", + " \"yardbuildingsqft26\":\"storage_sqft\",\n", + " \"decktypeid\": \"deck_flag\",\n", + " \"pooltypeid7\": \"pool_with_spa_tub_no\", \n", + " \"pooltypeid2\": \"pool_with_spa_tub_yes\",\n", + " \"hashottuborspa\": \"has_hottub_or_spa\", \n", + " \"pooltypeid10\": \"just_hottub_or_spa\",\n", + " \"calculatedfinishedsquarefeet\":\"total_finished_living_area_sqft\", \n", + " \"finishedsquarefeet12\": \"finished_living_area_sqft\",\n", + " \"lotsizesquarefeet\": \"lot_area_sqft\",\n", + " \"finishedsquarefeet50\":\"finished_living_area_entryfloor_sqft1\",\n", + " \"finishedfloor1squarefeet\":\"finished_living_area_entryfloor_sqft2\",\n", + " \"finishedsquarefeet6\": \"base_unfinished_and_finished_area_sqft\",\n", + " \"finishedsquarefeet15\": \"total_area_sqft\",\n", + " \"finishedsquarefeet13\": \"preimeter_living_area_sqft\",\n", + " \"taxvaluedollarcnt\":\"total_parcel_tax\",\n", + " \"landtaxvaluedollarcnt\":\"land_tax\",\n", + " \"taxamount\":\"total_property_tax_2016\",\n", + " 
\"structuretaxvaluedollarcnt\":\"structure_tax\",\n", + " \"garagetotalsqft\":\"garage_sqft\",\n", + " \"fireplacecnt\":\"fireplace_count\",\n", + " \"buildingqualitytypeid \":\"building_quality_id\",\n", + " \"heatingorsystemtypeid\":\"heating_system_id\",\n", + " \"airconditioningtypeid\":\"ac_id\",\n", + " \"storytypeid\": \"basement_flag\",\n", + " \"basementsqft\": \"basement_sqft\",\n", + " \"poolsizesum\": \"pool_sqft\",\n", + " \"poolcnt\": \"pool_count\"}\n", + "# rename columns \n", + "df_train = df_train.rename(columns = rename_these)\n", + "\n", + "# what's the data frame look like?\n", + "print(df_train.head())" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + " parcelid logerror transactiondate ac_id architecturalstyletypeid basement_sqft total_bath ... transaction_month\n", + "0 11827818 0.0402 2016-03-15T00:00:00.000 4.0 ... 3\n", + "1 12123024 0.0296 2016-03-15T00:00:00.000 3.0 ... 3\n", + "2 13867327 0.0344 2016-03-15T00:00:00.000 2.0 ... 3\n", + "3 12681894 0.006 2016-03-15T00:00:00.000 3.0 ... 3\n", + "4 12848541 0.06949999999999999 2016-03-15T00:00:00.000 1.0 4.0 ... 
3\n", + "[53 more columns]\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YdtyBI2jFnJv", + "colab_type": "text" + }, + "source": [ + "## Conforming Attribute Values\n", + "### #0 boolean columns & null = 0s cases \n", + "* `pool_count`, `pool_with_spa_tub_no` and `pool_with_spa_tub_yes` are all binary variables, replace all NULL values with zero\n", + "* `basement_flag` has values 7 & `Null` but is supposed to be bool, convert the `7`s to `1`s and the `Null`s to `0`s \n", + "* patio and shed variables with null values are assumed to have none\n", + "* deck_flag has only 2 values, `66` and `null`\n", + " - convert it into binary flag\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "z3bPdNONHTYI", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# replace missing pool count values so we booling\n", + "the_bool_club = ['pool_count','pool_with_spa_tub_no','pool_with_spa_tub_yes',\n", + " 'basement_flag','patio_sqft','storage_sqft', 'deck_flag']\n", + "for col in the_bool_club:\n", + " # convert null values to 0\n", + " df_train[col]=df_train[col].fillna(0)\n", + "# convert 7s and 66s to 1s\n", + "df_train['basement_flag'] = df_train['basement_flag'].replace(7, 1)\n", + "df_train['deck_flag'] = df_train['deck_flag'].replace(66, 1)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5MbGy6r7JLLD", + "colab_type": "text" + }, + "source": [ + "### #1 The pool\n", + "* When pool is present and if it has tub/spa then `just_hottub_or_spa` = 0" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "B3-1V93smA9A", + "colab_type": "code", + "outputId": "66d7335e-bc42-4108-a1c1-80f1afb06a4b", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 380 + } + }, + "source": [ + "# when poolcnt=1 and has_hottub_or_spa=1 and just_hottub_or_spa is null, then just_hottub_or_spa =0\n", + "conditions = ((df_train['pool_count'] == 1) \n", + " & 
(df_train['has_hottub_or_spa'] == 1) \n", + " & (df_train['just_hottub_or_spa'].isna() == True))\n", + "df_train['just_hottub_or_spa'] = df_train['just_hottub_or_spa'].masked_assign(0, conditions) " + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "error", + "ename": "TypeError", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m conditions = ((df_train['pool_count'] == 1) \n\u001b[1;32m 2\u001b[0m \u001b[0;34m&\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'has_hottub_or_spa'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m & (df_train['just_hottub_or_spa'].isna() == True))\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'just_hottub_or_spa'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'just_hottub_or_spa'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmasked_assign\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconditions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/series.py\u001b[0m in \u001b[0;36m__eq__\u001b[0;34m(self, other)\u001b[0m\n\u001b[1;32m 811\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 812\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__eq__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mother\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 813\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_unordered_compare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mother\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'eq'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 814\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 815\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mequals\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mother\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/series.py\u001b[0m in \u001b[0;36m_unordered_compare\u001b[0;34m(self, other, cmpops)\u001b[0m\n\u001b[1;32m 781\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_unordered_compare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mother\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcmpops\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 782\u001b[0m \u001b[0mnvtx_range_push\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"CUDF_UNORDERED_COMP\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"orange\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 783\u001b[0;31m \u001b[0mother\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_normalize_binop_value\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mother\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 784\u001b[0m \u001b[0moutcol\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munordered_compare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcmpops\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mother\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 785\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_copy_construct\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutcol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/series.py\u001b[0m in \u001b[0;36m_normalize_binop_value\u001b[0;34m(self, other)\u001b[0m\n\u001b[1;32m 777\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mother\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 778\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 779\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnormalize_binop_value\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mother\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 780\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 781\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_unordered_compare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mother\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcmpops\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/string.py\u001b[0m in \u001b[0;36mnormalize_binop_value\u001b[0;34m(self, 
other)\u001b[0m\n\u001b[1;32m 703\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 704\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 705\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'cannot broadcast {}'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mother\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 706\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 707\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdefault_na_value\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mTypeError\u001b[0m: cannot broadcast " + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v6E3-_XlSGBs", + "colab_type": "text" + }, + "source": [ + "- when `has_hottub_or_spa` is null and `just_hottub_or_spa` is null\n", + " - both should be zero\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Xa12WFccSGM6", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# if both has hottub and just hottub are null\n", + "conditions = ((df_train['has_hottub_or_spa'].isna() == True) \n", + " & (df_train['just_hottub_or_spa'].isna() == True))\n", + "# just hottub or spa = 0 \n", + "df_train['just_hottub_or_spa'] = df_train['just_hottub_or_spa'].masked_assign(0, conditions) \n", + "\n", + "# now, if has hottub is null and just hottub is 0 \n", + "conditions = 
((df_train['has_hottub_or_spa'].isna() == True) \n", + " & (df_train['just_hottub_or_spa'] == 0))\n", + "# has hottub or spa = 0 \n", + "df_train['has_hottub_or_spa'] = df_train['has_hottub_or_spa'].masked_assign(0, conditions) " + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5umCCWN73qxw", + "colab_type": "text" + }, + "source": [ + "- when there is no pool\n", + " - if there is tub/spa \n", + " - then `just_hottub_or_spa` = 1" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "FBgs7zJm3qk-", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# when poolcnt=0, has_hottub_or_spa=1\n", + "conditions = ((df_train['pool_count'] == 0) \n", + " & (df_train['has_hottub_or_spa'] == 1))\n", + "# just_hottub_or_spa=1\n", + "df_train['just_hottub_or_spa'] = df_train['just_hottub_or_spa'].masked_assign(1, conditions) \n" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3LsRr1aoSCVx", + "colab_type": "text" + }, + "source": [ + "* When there is no pool, set pool size to zero instead of na" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "NtdyXCbx0TKx", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# where there is no pool\n", + "conditions = df_train['pool_count']==0\n", + "# square footage of non existant pool is 0 \n", + "df_train['pool_sqft'] = df_train['pool_sqft'].masked_assign(0, conditions)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3hQFkXmAgQPY", + "colab_type": "text" + }, + "source": [ + "### #2 The basement\n", + "* Where `basement_flag` is zero, `basement_sqft` should also be zero\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "kMuCOqAmLTmY", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# where there is no basement\n", + "conditions = df_train['basement_flag'] == 0\n", + "# fun fact: we just did this with the 
pool\n", + "df_train['basement_sqft'] = df_train['basement_sqft'].masked_assign(0, conditions) " + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wU6Uohb-PDYB", + "colab_type": "text" + }, + "source": [ + "### #3 The fireplace\n", + "There seems to be inconsistency between the `fireplace_flag` and `fireplace_count`\n", + "- 90,053 flag values are null\n", + "- 80,688 `fireplace_count` values are null\n", + " * 9,385 (-11.5%) difference, but a boatload either way" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "OZM6lXmmpj5k", + "colab_type": "code", + "colab": {} + }, + "source": [ + "print(f\"there are {df_train['fireplace_count'].isna().sum()} fireplace_count \\\n", + "nulls\\nthere are {df_train['fireplaceflag'].isna().sum()} fireplaceflag nulls\")" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v9ZAzFoIpkSF", + "colab_type": "text" + }, + "source": [ + "* context driven solutions\n", + " * where neither flag nor count exists, `fireplaceflag == False`\n", + " * when `fireplace_count` is more than zero `fireplaceflag` should be `True`\n", + " * if `fireplaceflag == False`, the `fireplace_count` is logically `0`" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "i3YRZgU_qZhA", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# null flags with null counts are zero\n", + "conditions = ((df_train['fireplace_count'].isna()==True) \n", + " & (df_train['fireplaceflag'].isna()==True))\n", + "df_train['fireplaceflag'] = df_train['fireplaceflag'].masked_assign(False, conditions)\n", + "\n", + "# true flags for positive fireplace counts\n", + "conditions = df_train['fireplace_count'] > 0\n", + "df_train['fireplaceflag'] = df_train['fireplaceflag'].masked_assign(True, conditions)\n", + "\n", + "# set fireplace count nulls to 0 where false flags are\n", + "conditions = ((df_train['fireplace_count'].isna()==True) \n", + " & 
(df_train['fireplaceflag']==False))\n", + "df_train['fireplace_count'] = df_train['fireplace_count'].masked_assign(0, conditions)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "pYntUejosOn3" + }, + "source": [ + "### #4 The garage\n", + "* Properties with no garages would have NA values for both " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "L9mGs-mK9E0Q", + "colab_type": "code", + "colab": {} + }, + "source": [ + "garage = ['garagecarcnt', 'garage_sqft']\n", + "# where garage car count and garage square feet are null, set both to 0\n", + "conditions = ((df_train['garagecarcnt'].isna()==True) \n", + " & (df_train['garage_sqft'].isna()==True))\n", + "for i in garage:\n", + " df_train[i] = df_train[i].masked_assign(0, conditions)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0uV115W6-ohW", + "colab_type": "text" + }, + "source": [ + "Exploring the data farther, we see\n", + "- `garage_sqft` holds over 8,900 measurements of 0 despite the garage's car count being 1 or more \n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "gbbUIbwJ-ouS", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# show rows where garage count and square feet don't add up\n", + "conditions = (df_train.garagecarcnt > 0) & (df_train.garage_sqft == 0)\n", + "print(df_train.loc[conditions][garage])" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5I1O76QKA8Cb", + "colab_type": "text" + }, + "source": [ + "- these 0 values need to be null\n", + " - because no garage holding 1 or more cars in 2016 measured 0sqft" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "eWVtoty0A9Jt", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# where garage count and square feet don't add up\n", + "conditions = (df_train.garagecarcnt>0) & 
(df_train.garage_sqft==0)\n", + "# insert a NaN value\n", + "df_train['garage_sqft'] = df_train['garage_sqft'].masked_assign(cupy.nan, conditions)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "seb6r5wx5Bbz" + }, + "source": [ + "### #5 The bath\n", + "* `total_bath` & `calculatedbathnbr` are near-duplicates w/ `calculated` having more nulls\n", + " - let's drop it\n", + "* if `full_bath` is null and `half_bath` is also null\n", + " - let's make `total_bath` = 0 \n", + " - because we can't truthfully assume it's any more " + ] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "EgMNToed5BMu", + "colab": {} + }, + "source": [ + "# drop calculated bath column\n", + "df_train = df_train.drop('calculatedbathnbr', axis=1)\n", + "\n", + "# if full_bath is null & half_bath is null\n", + "conditions = ((df_train['full_bath'].isnull()==True) \n", + " & (df_train['half_bath'].isnull()==True))\n", + "# total_bath=0\n", + "df_train['total_bath'] = df_train['total_bath'].masked_assign(0, conditions)\n", + "\n", + "# when full_bath==total_bath\n", + "conditions = df_train.full_bath == df_train.total_bath\n", + "# half_bath=0 \n", + "df_train['half_bath'] = df_train['half_bath'].masked_assign(0, conditions)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Sh8cG0pr4_hl" + }, + "source": [ + "### #6 Mode Imputation \n", + "* scaling down the latitude and longitude\n", + " - knn imputation takes more time due to the larger numbers\n", + " - standardizing gives better results on most algorithms\n", + " - this is a competition, we came to win" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "kitrNxKgLWUd", + "colab_type": "code", + "colab": {} + }, + "source": [ + "df_train['latitude'] = df_train['latitude'] / 100000\n", + "df_train['longitude'] = 
df_train['longitude'] / 100000" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "y6bhRhu5YZ1d", + "colab_type": "text" + }, + "source": [ + "### #7 numberofstories & unitcnt & roomcnt\n", + "* we can devise unit count based on property land type\n", + " - so we can now go ahead and correct the unit counts for each given property" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "yHZH4rMNLfBA", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# where room count is 0, go ahead and NaN it\n", + "conditions = df_train['roomcnt'] == 0\n", + "df_train['roomcnt'] = df_train['roomcnt'].masked_assign(cupy.nan, conditions)\n", + "\n", + "\"\"\"\n", + "propertylandusetypeid & unitcnt are related \n", + " these are the propertylandusetypeid codes & their definitions\n", + " \n", + "#246 -Duplex (2 Units, Any Combination)\n", + "#247 -Triplex (3 Units, Any Combination)\n", + "#248 -Quadruplex (4 Units, Any Combination)\n", + "#260 -Residential General\n", + "#261 -Single Family Residential\n", + "#263 -Mobile Home\n", + "#264 -Townhouse\n", + "#266 -Condominium\n", + "#267 -Cooperative\n", + "#269 -Planned Unit Development\n", + "#275 -Residential Common Area \n", + "#31 - Commercial/Office/Residential Mixed Used\n", + "#47 -Store/Office (Mixed Use)\n", + "#265 -Cluster Home\n", + "\"\"\"\n", + "\n", + "# one unit \n", + "ones = [260,261,263,264,266,267,269,275]\n", + "for one in ones:\n", + " # adjust conditions to one unit indicator\n", + " conditions = ((df_train['propertylandusetypeid'] == one) \n", + " & (df_train['unitcnt'].isnull()))\n", + " df_train['unitcnt'] = df_train['unitcnt'].masked_assign(1, conditions)\n", + "\n", + "# two units \n", + "twos = [31,47,246]\n", + "for two in twos:\n", + " # adjust conditions to two unit indicator\n", + " conditions = ((df_train['propertylandusetypeid'] == two) \n", + " & (df_train['unitcnt'].isnull()))\n", + " df_train['unitcnt'] = 
df_train['unitcnt'].masked_assign(2, conditions)\n", + "\n", + "# three units\n", + "conditions = ((df_train['propertylandusetypeid'] == 247) \n", + " & (df_train['unitcnt'].isnull()))\n", + "df_train['unitcnt'] = df_train['unitcnt'].masked_assign(3, conditions)\n", + "\n", + "# four units\n", + "conditions = ((df_train['propertylandusetypeid'] == 248) \n", + " & (df_train['unitcnt'].isnull()))\n", + "df_train['unitcnt'] = df_train['unitcnt'].masked_assign(4, conditions)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "02yLicmxLs3C", + "colab_type": "text" + }, + "source": [ + "### #8 Time to Cut\n", + "**Because of the adjustments made so far a number of columns are no longer needed**\n", + "* transaction date column is no longer of use\n", + " - and can be dropped \n", + "* `preimeter_living_area_sqft` and `total_finished_living_area_sqft` have the same values \n", + " - except that `preimeter_living_area_sqft` has more duplicates\n", + "* `total_area_sqft` and `total_finished_living_area_sqft` have the same values \n", + " - except that `total_area_sqft` has more duplicates\n", + "* `total_finished_living_area_sqft` and `finished_living_area_sqft` have the same values \n", + " - except that `finished_living_area_sqft` has more duplicates\n", + "* `base_unfinished_and_finished_area_sqft` and `total_finished_living_area_sqft` have the same values \n", + " - except that `base_unfinished_and_finished_area_sqft` has more duplicates\n", + "* different counties follow different land use codes\n", + " - to compare different counties, zillow has created its own `propertylandusetypeid`\n", + " - hence we can drop `propertycountylandusecode`\n", + " - the same applies to `propertyzoningdesc`\n", + "* Most zip IDs are either invalid or out of city\n", + " - since enough information about location is given in latitude and longitude \n", + " - let's drop other location related fields\n", + " - `regionidcity`\n", + " - 
`regionidzip`\n", + " - `regionidneighborhood`\n", + "* `assessmentyear` has a constant value for all rows\n", + " - let's drop it" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "OtOgzOqHLyid", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# collect columns to drop\n", + "cut = ['propertyzoningdesc','propertycountylandusecode',\n", + " 'base_unfinished_and_finished_area_sqft','finished_living_area_sqft',\n", + " 'total_area_sqft','preimeter_living_area_sqft','regionidzip',\n", + " 'regionidcity','regionidneighborhood','assessmentyear','transactiondate',\n", + " 'censustractandblock']\n", + "# cut columns form dataframe\n", + "df_train = df_train.drop(cut, axis=1)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "icDvpvSD6BSb", + "colab_type": "text" + }, + "source": [ + "### #9 Tax, Year, & Census\n", + "- if tax delinquency flag is null, assume there is no unpaid tax on the property\n", + " - an issue arises here because `taxdelinquencyflag` is a `StringColumn`\n", + " - i.e. null values indicate no tax delinquency, all other values are `Y` for yes\n", + " - because of this, the normal method of.." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "8lYcO_T5XKNN", + "colab_type": "code", + "outputId": "0b77457e-0eed-4e21-be79-1df380432abc", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 303 + } + }, + "source": [ + "# how we'd normally take care of this\n", + "df_train['taxdelinquencyflag'].fillna(0)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "error", + "ename": "TypeError", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'taxdelinquencyflag'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/series.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, value, method, axis, inplace, limit)\u001b[0m\n\u001b[1;32m 1135\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mNotImplementedError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"The axis keyword is not supported\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1136\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1137\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minplace\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1138\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1139\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/string.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, fill_value, inplace)\u001b[0m\n\u001b[1;32m 709\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfill_value\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStringColumn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 710\u001b[0m ):\n\u001b[0;32m--> 711\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"fill_value must be a string or a string series\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 712\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 713\u001b[0m \u001b[0;31m# replace fill_value with nvstrings\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mTypeError\u001b[0m: fill_value must be a string or a string series" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tA6xG6h59rLi", + "colab_type": "text" + }, + "source": [ + "- ...comes with error. \n", + " - Why?\n", + " - the series we are trying to fill the null values of is a string series\n", + " - because of this `.fillna()` requires a sting value (e.g. '0') instead of an int value (e.g. 0)\n", + " - So, what now?\n", + " - there is an easy and straightforward solution with masked assigning!! 
\n", + " - First\n", + " - switch 1 (current True, actual False) to -1\n", + " - Then\n", + " - switch 0 (current False, actual True) to 1 to reflect True status\n", + " - Finally\n", + " - switch -1 (old True, actual False) to 0 to reflect False status" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Svp6J0cJ5dL0", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# if bool 'Y'/None is already set, change string to int bool column via .isna()\n", + "df_train['taxdelinquencyflag'] = df_train['taxdelinquencyflag'].isna()\n", + "\n", + "# next we must correct the values, with 1 (True) for 'Y' and 0 for no\n", + "switcharoo = [(1,-1),(0,1),(-1,0)]\n", + "# switch values in order\n", + "for pair in switcharoo:\n", + " # tag old value and new value it will be replaced with\n", + " old, new = pair\n", + " # replace old value with new value\n", + " df_train['taxdelinquencyflag'] = df_train['taxdelinquencyflag'].replace(old, \n", + " new)\n", + "# display values in tax delinquency flag column\n", + "print(df_train['taxdelinquencyflag'].value_counts())" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "w5EAdWXaCTRU", + "colab_type": "text" + }, + "source": [ + "- Convert years\n", + " - from yy\n", + " - to 2016 - yyyy \n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "6Bic66I9LfGC", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# set year pairs -- e.g. 
from 5 to 2016 - 2005\n", + "year_pairs = [(99,2016-1999),(6,2016-2006),(7,2016-2007),(8,2016-2008),\n", + " (9,2016-2009),(10,2016-2010),(11,2016-2011),(12,2016-2012),\n", + " (13,2016-2013),(14,2016-2014),(15,2016-2015)]\n", + "# split the pairs into parallel lists of old and new values\n", + "old_years = [old for old, _ in year_pairs]\n", + "new_years = [new for _, new in year_pairs]\n", + "# replace every old value in a single pass: replacing pair by pair chains the\n", + "# swaps, e.g. 6 -> 10 is turned back into 6 by the later (10, 6) pair (same for 7 and 9)\n", + "df_train['taxdelinquencyyear'] = df_train['taxdelinquencyyear'].replace(old_years, new_years)\n", + "# what're we lookin at?\n", + "print(df_train['taxdelinquencyyear'].value_counts())" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ya7xLHzdGVcs", + "colab_type": "text" + }, + "source": [ + "- values in `rawcensustractandblock` represent multiple fields concatenated together as float values\n", + " - by converting those values to string we can split each and build new columns:\n", + " - `census_tractnumber`\n", + " - `block_number`" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "b3sh8aGovTLT", + "colab_type": "code", + "colab": {} + }, + "source": [ + "print(df_train['rawcensustractandblock'].head())" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "AJrFMIuvvqUr", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# using series instead of dataframe\n", + "# NOTE(review): s_rawcensustractandblock is not defined in this cell -- presumably the\n", + "# rawcensustractandblock column as a Series; confirm it is created earlier in the notebook\n", + "tractnumber = s_rawcensustractandblock.values_to_string()\n", + "# adjust tract number\n", + "for i in range(len(tractnumber)):\n", + " funct = slice(4,11)\n", + " tractnumber[i] = tractnumber[i][funct]\n", + "# set new tract number column (the values built in the loop above)\n", + "df_train['census_tractnumber'] = tractnumber\n", + "\n", + "# using series instead of dataframe\n", + "block_number = s_rawcensustractandblock.values_to_string()\n", + "# set/adjust block number\n", + "for i in range(len(block_number)):\n", + " funct = slice(11, None)\n", + " block_number[i] = 
block_number[i][funct]\n", + " block_number[i] = block_number[i][:4]+'.'+block_number[i][4:]+'0'\n", + " block_number[i] = int(round(float(block_number[i]), 0))\n", + " block_number[i] = str(block_number[i]).ljust(4,'0')\n", + "# add block number column to dataframe\n", + "df_train['block_number'] = block_number\n", + "\n", + "# rawcensustractandblock values have been converted\n", + "df_train = df_train.drop('rawcensustractandblock', axis=1)\n", + "# let's see what we've got\n", + "print(df_train[['census_tractnumber', 'block_number']].head(3))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T71orw51lpTN", + "colab_type": "text" + }, + "source": [ + "## Dealing with Missing Values\n", + "### #1 Setting standards\n", + "- Despite correcting and adjusting the data to this point, there are still some columns holding a large majority of null values\n", + "- For some columns, this majority represents over 95% of values\n", + " - Let's identify those columns\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "xhCosNpXvTVU", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# calculate null value % for each column & frame it\n", + "missingvalues_prop = (df_train.isnull().sum()/len(df_train)).reset_index()\n", + "missingvalues_prop.columns = ['field','percentage']\n", + "\n", + "# sort by null values percentage, from highest % to lowest\n", + "missingvalues_prop = missingvalues_prop.sort_values(by='percentage', \n", + " ascending=False)\n", + "# identify columns with > 95% of values null\n", + "missingvaluescols = missingvalues_prop.loc[missingvalues_prop['percentage'] > 0.95]\n", + "\n", + "# display columns with highest % null values\n", + "print(missingvaluescols)\n", + "\n", + "# drop columns with more than 95% null values\n", + "df_train = df_train.drop(missingvaluescols['field'], axis=1)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + 
"id": "8eBIDWEUBHwz", + "colab_type": "text" + }, + "source": [ + "- and drop columns with more than 95% null values" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "az6t2ntBCMRe", + "colab_type": "text" + }, + "source": [ + "### #2 Working with Remaining Values\n", + "- the majority of values still missing in unitcnt are rows where `propertylandusetypeid` = 265, \n", + " - which is Cluster Home (i.e. group of houses with shared walls)\n", + " - each cluster is anywhere between 5 to 25 units\n", + " - here we will assume 10 units as a reasonable count" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "yB2lzAyopS_S", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# highly related propertylandusetypeid\n", + "conditions = df_train['propertylandusetypeid'] == 265\n", + "# unitcnt 360\n", + "df_train['unitcnt'] = df_train['unitcnt'].masked_assign(10, conditions)\n", + "# let's see what we've got\n", + "print(df_train['unitcnt'].value_counts())" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ofZIC0EdKJ0Y", + "colab_type": "text" + }, + "source": [ + "# -----current: test ready-----" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "c8Zfn-YhlSBO", + "colab_type": "code", + "outputId": "2087fa66-8683-4040-a3e1-7654942367b7", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "source": [ + "poolsizesum_mean = df_train.loc[df_train['pool_count'] > 0].pool_sqft.mean()\n", + "\"\"\"\n", + "NEEDS TO BE CONFIRMED WITH OG\n", + "> is this supposed to only consider if pool_sqft > 0 as well?\n", + "\"\"\"\n", + "poolsizesum_mean" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "28.13881906038769" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 86 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "cA30ozCWo5x3", + "colab_type": "code", + "outputId": 
"fda7011f-6bee-4b60-e137-ec04d05e440b", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 118 + } + }, + "source": [ + "print(df_train.loc[df_train['pool_count'] > 0].pool_sqft.head())" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "5 0.0\n", + "8 0.0\n", + "11 0.0\n", + "13 0.0\n", + "23 0.0\n", + "Name: pool_sqft, dtype: float64\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "-icFDeLSoJwl", + "colab_type": "code", + "outputId": "9c5035bd-b766-4509-c5a8-f3a475093dd4", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 907 + } + }, + "source": [ + "print(df_train.loc[df_train.pool_count>0].pool_sqft.value_counts())\n", + "print(df_train.pool_sqft.value_counts())\n", + "print(df_train.loc[df_train.pool_count>0].pool_sqft.isna().sum())\n", + "print(df_train.pool_sqft.isna().sum())\n", + "\n", + "\n", + "\n", + "# calculate the average pool square footage for properties with a pool(s)\n", + "new_value = df_train.loc[df_train['pool_count'] > 0, 'pool_sqft'].mean()\n", + "\n", + "# where the property has a pool(s) but pool square feet is 0\n", + "conditions = ((df_train['pool_count'] > 0) \n", + " & (df_train['pool_sqft'] == 0))\n", + "\n", + "# set pool square feet to the average pool square footage of pool properties\n", + "df_train['pool_sqft'] = df_train['pool_sqft'].masked_assign(new_value, conditions)\n", + "\n", + "\n", + "print(df_train.loc[df_train.pool_count>0].pool_sqft.value_counts())\n", + "print(df_train.pool_sqft.value_counts())\n", + "print()\n", + "print(df_train.loc[df_train.pool_count>0].pool_sqft.isna().sum())\n", + "print(df_train.pool_sqft.isna().sum())" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "0.0 16932\n", + "450.0 105\n", + "400.0 41\n", + "800.0 39\n", + "500.0 36\n", + "600.0 35\n", + "512.0 30\n", + "480.0 27\n", + "648.0 18\n", + "420.0 17\n", + "[264 more rows]\n", + "dtype: 
int64\n", + "0.0 89306\n", + "450.0 105\n", + "400.0 41\n", + "800.0 39\n", + "500.0 36\n", + "600.0 35\n", + "512.0 30\n", + "480.0 27\n", + "648.0 18\n", + "420.0 17\n", + "[264 more rows]\n", + "dtype: int64\n", + "0\n", + "0\n", + "28.13881906038769 16932\n", + "450.0 105\n", + "400.0 41\n", + "800.0 39\n", + "500.0 36\n", + "600.0 35\n", + "512.0 30\n", + "480.0 27\n", + "648.0 18\n", + "420.0 17\n", + "[264 more rows]\n", + "dtype: int64\n", + "0.0 72374\n", + "28.13881906038769 16932\n", + "450.0 105\n", + "400.0 41\n", + "800.0 39\n", + "500.0 36\n", + "600.0 35\n", + "512.0 30\n", + "480.0 27\n", + "648.0 18\n", + "[265 more rows]\n", + "dtype: int64\n", + "\n", + "0\n", + "0\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "3pVABkZTYK9F", + "colab_type": "code", + "outputId": "42a0b5cc-42e2-41c5-8fdd-11485c45c933", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 774 + } + }, + "source": [ + "# where total tax and land tax are both greater than 0\n", + "\n", + "# TESTING (SWITCH TO OG)\n", + "# test = df_train.copy()\n", + "# test.loc[(test.total_parcel_tax>0) & (test.land_tax>0),'structure_tax']=test['total_parcel_tax']-test['land_tax']\n", + "hmm = df_train.loc[(df_train.total_parcel_tax>0) & (df_train.land_tax>0)]\n", + "print(f'{len(hmm)} rows where total and land are greater than 0')\n", + "print(f'{len(df_train)} total rows, hopefully the same as above number')\n", + "print()\n", + "print(len(hmm.loc[hmm.structure_tax!=hmm['total_parcel_tax']-hmm['land_tax']]))\n", + "print()\n", + "print(hmm.loc[hmm.structure_tax!=hmm['total_parcel_tax']-hmm['land_tax']])\n", + "print()\n", + "any_neg = hmm.loc[hmm.total_parcel_tax < hmm.land_tax]\n", + "# if this comes back as 0, setting all structures to total - land should work\n", + "print(f'{len(any_neg)} total taxes are less than same rows land tax\\n')\n", + "print(any_neg)\n", + "# SWITCH TO RAPIDS \n", + "\"\"\"current concern\n", + "are there 
places where total and land are not greater than 0 \n", + "and setting structure to their difference is not the best move\"\"\"\n", + "\n", + "\n", + "# # structure tax should be equal to total tax minus land tax\n", + "# df_train['structure_tax'] = df_train['total_parcel_tax'] - df_train['land_tax']\n", + "new_value = df_train['total_parcel_tax'] - df_train['land_tax']\n", + "conditions = (df_train.total_parcel_tax>0) & (df_train.land_tax>0)\n", + "df_train['structure_tax'] = df_train['structure_tax'].masked_assign(new_value, conditions)\n", + "\n", + "# # where structure tax is 0\n", + "conditions = df_train['structure_tax'] == 0\n", + "# # we do not know the structure tax, so insert a Nan value\n", + "df_train['structure_tax'] = df_train['structure_tax'].masked_assign(cupy.nan, conditions)\n", + "\n", + "# print(test.isna().sum())\n", + "# print(test.value_counts().head())\n", + "# print(test_1.isna().sum())\n", + "# print(test_1.value_counts().head())\n", + "\n", + "\n", + "# SWITCH TO OG \n", + "\"\"\"\n", + "#total_parcel_tax\n", + "#structure_tax\n", + "#land_tax\n", + "#total_property_tax_2016\n", + "#2)recalculate total_parcel_tax =structure_tax + land_tax\n", + "\n", + "# total_parcel_tax =structure_tax + land_tax\n", + "#->structure_tax=total_parcel_tax -land_tax\n", + "\n", + "df_train.loc[(df_train.total_parcel_tax>0) & (df_train.land_tax>0),'structure_tax']=df_train['total_parcel_tax']-df_train['land_tax']\n", + "\n", + "#structure_tax, i see a lot of structure tax is 0's, those must be NA's\n", + "\n", + "df_train.loc[df_train.structure_tax==0,'structure_tax']=np.nan\n", + "\"\"\"\n", + "print(df_train.total_property_tax_2016.isnull().sum())\n", + "print(df_train.structure_tax.isnull().sum())\n", + "print(df_train.total_parcel_tax.isnull().sum())\n", + "print(df_train.land_tax.isnull().sum())\n", + "\n", + "# SWITCH TO RAPIDS\n", + "# print(test[['structure_tax','land_tax','total_parcel_tax']])" + ], + "execution_count": 0, + "outputs": [ + { + 
"output_type": "stream", + "text": [ + "90274 rows where total and land are greater than 0\n", + "90275 total rows, hopefully the same as above number\n", + "\n", + "379\n", + "\n", + " parcelid logerror ac_id basement_sqft total_bath bedroomcnt buildingqualitytypeid ... census_tractnumber\n", + "266 17188959 0.0944 0.0 0.0 0.0 ... 0056.00\n", + "297 12956410 -0.14850000000000002 0.0 0.0 0.0 ... 4080.05\n", + "336 12966610 0.0488 0.0 6.0 9.0 7.0 ... 4303.01\n", + "454 17188961 0.003 0.0 0.0 0.0 ... 0056.00\n", + "474 17188974 0.10260000000000001 0.0 0.0 0.0 ... 0056.00\n", + "555 17266056 -0.5175 0.0 0.0 0.0 ... 0059.08\n", + "601 17205423 0.0733 0.0 0.0 0.0 ... 0076.06\n", + "790 10858080 0.05450000000000001 0.0 2.0 3.0 7.0 ... 1412.01\n", + "791 10858080 0.08620000000000001 0.0 2.0 3.0 7.0 ... 1412.01\n", + "976 11325190 -0.024300000000000002 0.0 0.0 0.0 ... 9102.06\n", + "[369 more rows]\n", + "[38 more columns]\n", + "\n", + "0 total taxes are less than same rows land tax\n", + "\n", + "Empty DataFrame\n", + "Columns: ['parcelid', 'logerror', 'ac_id', 'basement_sqft', 'total_bath', 'bedroomcnt', 'buildingqualitytypeid', 'census_tractnumber']\n", + "Index: []\n" + ], + "name": "stdout" + }, + { + "output_type": "error", + "ename": "ValueError", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mnew_value\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'total_parcel_tax'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'land_tax'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0mconditions\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0;34m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtotal_parcel_tax\u001b[0m\u001b[0;34m>\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m&\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mland_tax\u001b[0m\u001b[0;34m>\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 23\u001b[0;31m \u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'structure_tax'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'structure_tax'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmasked_assign\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnew_value\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconditions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 24\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0;31m# # where structure tax is 0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/series.py\u001b[0m in \u001b[0;36mmasked_assign\u001b[0;34m(self, value, mask)\u001b[0m\n\u001b[1;32m 1073\u001b[0m \"\"\"\n\u001b[1;32m 1074\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1075\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmasked_assign\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1076\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_copy_construct\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1077\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/column.py\u001b[0m in \u001b[0;36mmasked_assign\u001b[0;34m(self, value, mask)\u001b[0m\n\u001b[1;32m 494\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_gpu_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 495\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmask_invert\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_mask\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 496\u001b[0;31m \u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 497\u001b[0m )\n\u001b[1;32m 498\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mBuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnull_count\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/utils/cudautils.py\u001b[0m in \u001b[0;36mfill_mask\u001b[0;34m(data, mask, value)\u001b[0m\n\u001b[1;32m 235\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 236\u001b[0m \u001b[0mconfigured\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgpu_fill_masked\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 237\u001b[0;31m \u001b[0mconfigured\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 238\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 239\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/numba/cuda/compiler.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 222\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 223\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkernel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mAutoJitCUDAKernel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 224\u001b[0;31m \u001b[0mkernel\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkernel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mspecialize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 225\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 226\u001b[0m \u001b[0mkernel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/numba/cuda/compiler.py\u001b[0m in \u001b[0;36mspecialize\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 761\u001b[0m '''\n\u001b[1;32m 762\u001b[0m argtypes = tuple(\n\u001b[0;32m--> 763\u001b[0;31m [self.typingctx.resolve_argument_type(a) for a in args])\n\u001b[0m\u001b[1;32m 764\u001b[0m \u001b[0mkernel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margtypes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 765\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mkernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/numba/cuda/compiler.py\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 761\u001b[0m '''\n\u001b[1;32m 762\u001b[0m argtypes = tuple(\n\u001b[0;32m--> 763\u001b[0;31m [self.typingctx.resolve_argument_type(a) for a in args])\n\u001b[0m\u001b[1;32m 764\u001b[0m \u001b[0mkernel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margtypes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 765\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0mkernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/numba/typing/context.py\u001b[0m in \u001b[0;36mresolve_argument_type\u001b[0;34m(self, val)\u001b[0m\n\u001b[1;32m 296\u001b[0m \"\"\"\n\u001b[1;32m 297\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 298\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mtypeof\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mval\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mPurpose\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margument\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 299\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 300\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnumba\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_cuda_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mval\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/numba/typing/typeof.py\u001b[0m in \u001b[0;36mtypeof\u001b[0;34m(val, purpose)\u001b[0m\n\u001b[1;32m 32\u001b[0m msg = _termcolor.errmsg(\n\u001b[1;32m 33\u001b[0m \"cannot determine Numba type of %r\") % (type(val),)\n\u001b[0;32m---> 34\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 35\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: cannot determine Numba type of " + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": 
"8SID48LOpYvu", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# regionidcounty is exact copy of fips code, dropping the dulicate column\n", + "df_train = df_train.drop(['regionidcounty'], axis=1)\n", + "df_train.shape" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "tWmM2J8_pkg1", + "colab_type": "code", + "outputId": "2393cbab-218f-4849-c32c-700495dfb18e", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 622 + } + }, + "source": [ + "#*******************************\n", + "#bedroomcnt #1421 zero bed room houses ??, observed it's missing all other room count also missing\n", + "print(df_train.bedroomcnt.value_counts())\n", + "\n", + "conditions = df_train['bedroomcnt'] == 0\n", + "df_train['bedroomcnt'] = df_train['bedroomcnt'].masked_assign(cupy.nan, conditions)\n", + "\n", + "\n", + "print(df_train.bedroomcnt.value_counts())\n", + "print(df_train.bedroomcnt.isnull().sum())" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "3.0 35447\n", + "2.0 22357\n", + "4.0 20279\n", + "5.0 5077\n", + "1.0 3897\n", + "0.0 1421\n", + "6.0 1120\n", + "8.0 274\n", + "7.0 234\n", + "9.0 91\n", + "[7 more rows]\n", + "dtype: int64\n" + ], + "name": "stdout" + }, + { + "output_type": "error", + "ename": "RuntimeError", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m 
\u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbedroomcnt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue_counts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbedroomcnt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misnull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/series.py\u001b[0m in \u001b[0;36mvalue_counts\u001b[0;34m(self, method, sort)\u001b[0m\n\u001b[1;32m 1827\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnull_count\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1828\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mint64\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1829\u001b[0;31m \u001b[0mvals\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcnts\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue_counts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1830\u001b[0m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcnts\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mas_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvals\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1831\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0msort\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/numerical.py\u001b[0m in \u001b[0;36mvalue_counts\u001b[0;34m(self, method)\u001b[0m\n\u001b[1;32m 215\u001b[0m \u001b[0mmsg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"non sort based value_count() not implemented yet\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 216\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mNotImplementedError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 217\u001b[0;31m \u001b[0msegs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msortedvals\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_unique_segments\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 218\u001b[0m \u001b[0;31m# Return both values and their counts\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 219\u001b[0m \u001b[0mout_vals\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mcpp_copying\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_gather_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msortedvals\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msegs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/column.py\u001b[0m in \u001b[0;36m_unique_segments\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 684\u001b[0m \u001b[0mdensecol\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_dense_buffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 685\u001b[0m \u001b[0;31m# sort the column\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 686\u001b[0;31m \u001b[0msortcol\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdensecol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msort_by_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 687\u001b[0m \u001b[0;31m# find segments\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 688\u001b[0m \u001b[0msortedvals\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msortcol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmem\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/numerical.py\u001b[0m in \u001b[0;36msort_by_values\u001b[0;34m(self, ascending, na_position)\u001b[0m\n\u001b[1;32m 161\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 162\u001b[0m 
\u001b[0;32mdef\u001b[0m \u001b[0msort_by_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mascending\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mna_position\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"last\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 163\u001b[0;31m \u001b[0msort_inds\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_sorted_inds\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mascending\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mna_position\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 164\u001b[0m \u001b[0mcol_keys\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcpp_copying\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_gather_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msort_inds\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 165\u001b[0m col_inds = self.replace(\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/_sort.py\u001b[0m in \u001b[0;36mget_sorted_inds\u001b[0;34m(by, ascending, na_position)\u001b[0m\n\u001b[1;32m 77\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Must use a boolean or list of booleans\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 78\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 79\u001b[0;31m \u001b[0mcpp_sort\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_order_by\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mby\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcol_inds\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mascending\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mna_position\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 80\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 81\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcol_inds\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32mcudf/bindings/sort.pyx\u001b[0m in \u001b[0;36mcudf.bindings.sort.apply_order_by\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32mcudf/bindings/sort.pyx\u001b[0m in \u001b[0;36mcudf.bindings.sort.apply_order_by\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mRuntimeError\u001b[0m: merge_sort: failed to synchronize: an illegal memory access was encountered" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "3qnP2L9LpmeJ", + "colab_type": "code", + "outputId": "bc0119de-0644-414f-bf59-bd132c7c0e15", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 387 + } + }, + "source": [ + "# propertylandusetypeid & total living area\n", + "# total_bath 1165\n", + "# full_bath 1182\n", + "# half_bath 1182\n", + "# bedroomcnt 1421\n", + "# roomcnt 1416\n", + "\n", + "\n", + "# roomcnt=(full_bath+half_bath)+ bedroomcnt\n", + "# total_bath=fullbath+ 0.5(half_bath)\n", + "\n", + "#caluculate full bath and half bath again from total bath as, it has few extra columns, (fixes 500 missing values in roomcnt )\n", + "\n", + "# where full & half bath and bedroom count are not null, but room count is null\n", + "conditions = ((df_train['full_bath'].isna() == False) \n", + " & (df_train['half_bath'].isna() == False) \n", + " & (df_train['bedroomcnt'].isna() == False) \n", + " & (df_train['roomcnt'].isna() == True))\n", + "# calculate room count including all full & half baths along with bedroom count\n", + "new_values = df_train.full_bath + df_train.half_bath + df_train.bedroomcnt\n", + "df_train['roomcnt'] = df_train['roomcnt'].masked_assign(new_values, conditions)\n", + "\n", + "\"\"\"df_train.loc[(df_train.full_bath.notnull()) \n", + " & 
(df_train.half_bath.notnull()) \n", + " & (df_train.bedroomcnt.notnull()) \n", + " & (df_train.roomcnt.isnull()),['roomcnt']]=df_train.full_bath + df_train.half_bath + df_train.bedroomcnt\"\"\"\n", + "\n", + "\n", + "# most bedroom count and roomcount null are in same place\n", + "# all column null count 1133 all columns are null\n", + "\n", + "print(df_train.total_bath.isnull().sum())\n", + "print(df_train.full_bath.isnull().sum())\n", + "print(df_train.half_bath.isnull().sum())\n", + "print(df_train.bedroomcnt.isnull().sum())\n", + "print(df_train.roomcnt.isnull().sum())" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "ERROR:Call to cuOccupancyMaxPotentialBlockSize results in UNKNOWN_CUDA_ERROR\n" + ], + "name": "stderr" + }, + { + "output_type": "error", + "ename": "CudaAPIError", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mCudaAPIError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m&\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'half_bath'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m&\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'bedroomcnt'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m & (df_train['roomcnt'].isna() == True))\n\u001b[0m\u001b[1;32m 5\u001b[0m 
\u001b[0;31m# calculate room count including all full & half baths along with bedroom count\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mnew_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfull_bath\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhalf_bath\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbedroomcnt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/series.py\u001b[0m in \u001b[0;36misna\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1238\u001b[0m \"\"\"Identify missing values in a Series. Alias for isnull.\n\u001b[1;32m 1239\u001b[0m \"\"\"\n\u001b[0;32m-> 1240\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misnull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1241\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1242\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mnotna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/series.py\u001b[0m in \u001b[0;36misnull\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1232\u001b[0m )\n\u001b[1;32m 1233\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1234\u001b[0;31m \u001b[0mmask\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcudautils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misnull_mask\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnullmask\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1235\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1236\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/utils/cudautils.py\u001b[0m in \u001b[0;36misnull_mask\u001b[0;34m(data, mask)\u001b[0m\n\u001b[1;32m 432\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 433\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0moutput_dary\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 434\u001b[0;31m \u001b[0mgpu_isnull\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput_dary\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput_dary\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 435\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0moutput_dary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/numba/cuda/compiler.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 226\u001b[0m \u001b[0mkernel\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 227\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 228\u001b[0;31m \u001b[0mtpb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compute_thread_per_block\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkernel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 229\u001b[0m \u001b[0mtpbm1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtpb\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 230\u001b[0m \u001b[0mblkct\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mntasks\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mtpbm1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m//\u001b[0m \u001b[0mtpb\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/numba/cuda/compiler.py\u001b[0m in \u001b[0;36m_compute_thread_per_block\u001b[0;34m(self, kernel)\u001b[0m\n\u001b[1;32m 249\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 250\u001b[0m \u001b[0;31m# Raises from the driver if the feature is unavailable\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 251\u001b[0;31m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtpb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mctx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_max_potential_block_size\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 252\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mAttributeError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
253\u001b[0m \u001b[0;31m# Fallback to table-based approach.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/numba/cuda/cudadrv/driver.py\u001b[0m in \u001b[0;36mget_max_potential_block_size\u001b[0;34m(self, func, b2d_func, memsize, blocksizelimit, flags)\u001b[0m\n\u001b[1;32m 646\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhandle\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 647\u001b[0m \u001b[0mb2d_cb\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 648\u001b[0;31m memsize, blocksizelimit)\n\u001b[0m\u001b[1;32m 649\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 650\u001b[0m driver.cuOccupancyMaxPotentialBlockSizeWithFlags(byref(gridsize), byref(blocksize),\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/numba/cuda/cudadrv/driver.py\u001b[0m in \u001b[0;36msafe_cuda_api_call\u001b[0;34m(*args)\u001b[0m\n\u001b[1;32m 288\u001b[0m \u001b[0m_logger\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'call driver api: %s'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlibfn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 289\u001b[0m \u001b[0mretcode\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlibfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 290\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretcode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 291\u001b[0m 
\u001b[0;32mreturn\u001b[0m \u001b[0msafe_cuda_api_call\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 292\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/numba/cuda/cudadrv/driver.py\u001b[0m in \u001b[0;36m_check_error\u001b[0;34m(self, fname, retcode)\u001b[0m\n\u001b[1;32m 323\u001b[0m \u001b[0m_logger\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcritical\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_getpid\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 324\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mCudaDriverError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"CUDA initialized before forking\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 325\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mCudaAPIError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mretcode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 326\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 327\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget_device\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevnum\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mCudaAPIError\u001b[0m: [700] Call to cuOccupancyMaxPotentialBlockSize results in UNKNOWN_CUDA_ERROR" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Mvy51Ckev9CX", + "colab_type": "text" + }, + "source": [ + "- correct number of stories by Zillow's `propertylandusetypeid` indicator\n", + " - where null values are not\n", + " - number of 
stories can be set to mode\n", + " - where there are null values\n", + " - number of stories can be set to the generally accepted number of stories" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "IW4CG2InpolD", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# before\n", + "print(df_train.numberofstories.isnull().sum())\n", + "print(df_train.numberofstories.value_counts())\n", + "\n", + "#numberofstories\t69705\n", + "\n", + "# store ids and general number of stories \n", + "zillow_type_ids = [(31,2), (246,2), (247,2), (248,2), (260,2), (261,1), \n", + " (263,1), (266,1), (267,1), (269, 2), (275,1)]\n", + "\n", + "# go through each (type id, general number of stories) pair;\n", + "# unpacked directly so the builtin `id` is not shadowed\n", + "for land_use_id, n_stories in zillow_type_ids:\n", + "\n", + " # when type id matches and story count is not null\n", + " conditions = ((df_train['propertylandusetypeid'] == land_use_id) \n", + " & (df_train['numberofstories'].isna() == False))\n", + " # calculate the mode story count for matching id properties\n", + " mode_stories = df_train.loc[conditions, 'numberofstories'].mode()\n", + " # NOTE(review): the pandas reference kept in the string below only computes this mode;\n", + " # assigning it here replaces every observed story count for the type id - confirm this is intended\n", + " # and set those non null values to the most common value seen\n", + " df_train['numberofstories'] = df_train['numberofstories'].masked_assign(mode_stories, \n", + " conditions)\n", + " \n", + " # when type id matches and story count is null\n", + " # (the mask previously used `== False`, which re-selected the non-null rows:\n", + " # observed values were overwritten and the nulls were never filled)\n", + " conditions = ((df_train['propertylandusetypeid'] == land_use_id) \n", + " & (df_train['numberofstories'].isna() == True))\n", + " # set null values to the common number of stories seen in that type id\n", + " df_train['numberofstories'] = df_train['numberofstories'].masked_assign(n_stories, \n", + " conditions)\n", + " \n", + "# TO BE ADDRESSED\n", + "# #https://en.wikipedia.org/wiki/Townhouse , typical townhouses are usually large and have at least 6 rooms\n", + "# df_train.loc[(df_train.propertylandusetypeid==264) & (df_train.numberofstories.isnull()),'numberofstories']=2\n", + "\n", + "\"\"\"\n", + 
"df_train.loc[(df_train.propertylandusetypeid==246) & (df_train.numberofstories.notnull()),'numberofstories'].mode()\n", + "df_train.loc[(df_train.propertylandusetypeid==246) & (df_train.numberofstories.isnull()),'numberofstories']=2\n", + "\n", + "df_train.loc[(df_train.propertylandusetypeid==247) & (df_train.numberofstories.notnull()),'numberofstories'].mode()\n", + "df_train.loc[(df_train.propertylandusetypeid==247) & (df_train.numberofstories.isnull()),'numberofstories']=2\n", + "\n", + "df_train.loc[(df_train.propertylandusetypeid==248) & (df_train.numberofstories.notnull()),'numberofstories'].mode()\n", + "df_train.loc[(df_train.propertylandusetypeid==248) & (df_train.numberofstories.isnull()),'numberofstories']=2\n", + "\n", + "df_train.loc[(df_train.propertylandusetypeid==260) & (df_train.numberofstories.notnull()),'numberofstories'].mode()\n", + "df_train.loc[(df_train.propertylandusetypeid==260) & (df_train.numberofstories.isnull()),'numberofstories']=2\n", + "\n", + "df_train.loc[(df_train.propertylandusetypeid==261) & (df_train.numberofstories.notnull()),'numberofstories'].mode()\n", + "df_train.loc[(df_train.propertylandusetypeid==261) & (df_train.numberofstories.isnull()),'numberofstories']=1\n", + "\n", + "df_train.loc[(df_train.propertylandusetypeid==263) & (df_train.numberofstories.notnull()),'numberofstories'].mode()\n", + "df_train.loc[(df_train.propertylandusetypeid==263) & (df_train.numberofstories.isnull()),'numberofstories']=1\n", + "\n", + "df_train.loc[(df_train.propertylandusetypeid==266) & (df_train.numberofstories.notnull()),'numberofstories'].mode()\n", + "df_train.loc[(df_train.propertylandusetypeid==266) & (df_train.numberofstories.isnull()),'numberofstories']=1\n", + "\n", + "df_train.loc[(df_train.propertylandusetypeid==269) & (df_train.numberofstories.notnull()),'numberofstories'].mode()\n", + "df_train.loc[(df_train.propertylandusetypeid==269) & (df_train.numberofstories.isnull()),'numberofstories']=2\n", + "\n", + 
"prop2016.loc[(prop2016.propertylandusetypeid==275) & (prop2016.numberofstories.notnull()),'numberofstories'].mode()\n", + "df_train.loc[(df_train.propertylandusetypeid==275) & (df_train.numberofstories.isnull()),'numberofstories']=1\n", + "\n", + "prop2016.loc[(prop2016.propertylandusetypeid==267) & (prop2016.numberofstories.notnull()),'numberofstories'].mode()\n", + "df_train.loc[(df_train.propertylandusetypeid==267) & (df_train.numberofstories.isnull()),'numberofstories']=1\n", + "\n", + "#https://en.wikipedia.org/wiki/Townhouse , typical townhouses are usually large and have at least 6 rooms\n", + "df_train.loc[(df_train.propertylandusetypeid==264) & (df_train.numberofstories.isnull()),'numberofstories']=2\n", + "\n", + "prop2016.loc[(prop2016.propertylandusetypeid==31) & (prop2016.numberofstories.notnull()),'numberofstories'].mode()\n", + "df_train.loc[(df_train.propertylandusetypeid==31) & (df_train.numberofstories.isnull()),'numberofstories']=2\"\"\"\n", + "\n", + "# after\n", + "print(df_train.numberofstories.isnull().sum())\n", + "print(df_train.numberofstories.value_counts())" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "AHcMsDCxprd4", + "colab_type": "code", + "colab": {} + }, + "source": [ + "\"\"\"skeptical of this cell (and the one above)..\n", + "author provides no explanation for using the mode\"\"\"\n", + "\n", + "# before\n", + "print(df_train.fireplace_count.isnull().sum())\n", + "print(df_train.fireplace_count.value_counts())\n", + "\n", + "# where there is a fireplace, and count is not null\n", + "conditions = ((df_train.fireplaceflag==1) \n", + " & (df_train.fireplace_count.isna() == False))\n", + "# calculate the mode fireplace count \n", + "mode_fire_count = df_train.loc[conditions, 'fireplace_count'].mode()\n", + "# and set those non null values to the most common fireplace count\n", + "df_train['fireplace_count'] = df_train['fireplace_count'].masked_assign(mode_fire_count, \n", + " 
conditions)\n", + "\n", + "# where there is a fireplace, and count is null\n", + "conditions = ((df_train.fireplaceflag==1) \n", + " & (df_train.fireplace_count.isna() == True))\n", + "# set null values to the most common fireplace count\n", + "df_train['fireplace_count'] = df_train['fireplace_count'].masked_assign(1, \n", + " conditions)\n", + "\n", + "# df_train.loc[(df_train.fireplaceflag==1) & (df_train.fireplace_count.notnull()),'fireplace_count'].mode()\n", + "# df_train.loc[(df_train.fireplaceflag==1) & (df_train.fireplace_count.isnull()),'fireplace_count']=1\n", + "\n", + "# after\n", + "print(df_train.fireplace_count.isnull().sum())\n", + "print(df_train.fireplace_count.value_counts())" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DVgF1c_p_bN1", + "colab_type": "text" + }, + "source": [ + "# -----current: break-----" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "FIuSWoJspt3H", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import seaborn as sns\n", + "color = sns.color_palette()\n", + "sns.set(style=\"darkgrid\")\n", + "\n", + "\n", + "# count of properties per building quality type id\n", + "ax = sns.countplot(x=\"buildingqualitytypeid\", data=df_train)\n", + "\n", + "plt.xticks(rotation='vertical')\n", + "# title names the column actually plotted (it previously read \"Bathroom count\")\n", + "plt.title(\"Frequency of building quality type\", fontsize=15)\n", + "plt.show()" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "KOHPCFRSp5y9", + "colab_type": "code", + "colab": {} + }, + "source": [ + "plt.plot(df_train.yearbuilt,df_train.buildingqualitytypeid , 'ro')\n", + "plt.show()" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_647tI5Lp94v", + "colab_type": "text" + }, + "source": [ + "### Final adjustments\n", + "- filling nans" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "-4A3-sjRp8AE", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#location seems to be related to building 
quality, (knnclassifier)\n", + "\n", + "def fillna_knn( df, base, target):\n", + " \"\"\"Predict the missing values of `target` with a KNN classifier fit on the `base` columns.\n", + "\n", + " Rows where `target` is present are split 80/20 (stratified on `target`); a 3-fold\n", + " grid search over n_neighbors selects the classifier on the 80% part, and the refit\n", + " best estimator predicts `target` for the rows where it is null.\n", + "\n", + " df: frame holding the `base` and `target` columns; it is only read, never modified.\n", + " base: list of predictor column names.\n", + " target: name of the column whose nulls are predicted.\n", + " Returns the predictions for the null-`target` rows (an empty array when none are null).\n", + " \"\"\"\n", + " data_colnames = [ target ] + base\n", + " #print(\"data_colnames\",data_colnames)\n", + " missing_values_boolflag = df[target].isnull() #true for missing rows, false for rows with values\n", + " #print(\"miss\",missing_values_boolflag.head())\n", + " not_missing_boolflag = ~missing_values_boolflag \n", + " #print(\"not miss\",not_missing_boolflag.head())\n", + " number_of_missing_val = missing_values_boolflag.sum()\n", + " print(\"# of miss\",number_of_missing_val)\n", + " if number_of_missing_val == 0:\n", + "  # nothing to impute: skip the grid search, and avoid calling predict on zero rows\n", + "  return df.loc[missing_values_boolflag, target].values\n", + " not_missing_rows = df.loc[ not_missing_boolflag, data_colnames ]\n", + " #print(not_missing_rows.head())\n", + " Y = not_missing_rows[target]\n", + " X = not_missing_rows[base]\n", + " X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=3192,stratify=Y)\n", + " metrics = ['euclidean'] \n", + " weights = ['distance'] \n", + " numNeighbors = [5,10,15,20,25]\n", + " param_grid = dict(metric=metrics,weights=weights,n_neighbors=numNeighbors)\n", + " # no random_state: it has no effect when shuffle=False, and newer scikit-learn raises on the combination\n", + " cv = StratifiedKFold(n_splits=3,shuffle=False)\n", + " grid = GridSearchCV(neighbors.KNeighborsClassifier(n_jobs=-1),param_grid=param_grid,cv=cv,scoring='f1_weighted',refit=True,return_train_score=True,verbose=1,n_jobs=-1,pre_dispatch='n_jobs')\n", + " grid.fit(X_train ,Y_train)\n", + " #print(\"grid.cv_results_\",grid.cv_results_)\n", + " print(\"grid.best_estimator_\",grid.best_estimator_)\n", + " print(\"grid.best_params_\",grid.best_params_)\n", + " print(\"grid.scorer_\",grid.scorer_)\n", + " #print(\"grid.n_splits_\",grid.n_splits_)\n", + " # report the score on the 20% hold-out (it was previously predicted on but never evaluated)\n", + " print(\"held-out score\",grid.score(X_test, Y_test))\n", + " \n", + " Z = grid.predict(df.loc[missing_values_boolflag, base])\n", + " #df.loc[ missing_values_boolflag, target ] = Z\n", + " return Z" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "uCyRxp-7qEXf", + "colab_type": "code", + "colab": {} + }, + "source": [ + 
"print(df_train.buildingqualitytypeid.isnull().sum())\n", + "print(df_train.shape)\n", + "temp=df_train.copy()\n", + "temp['buildingqualitytypeid']=temp['buildingqualitytypeid'].fillna(-1)\n", + "temp=temp.groupby(\"buildingqualitytypeid\").filter(lambda x: x.buildingqualitytypeid.size > 3)\n", + "temp['buildingqualitytypeid'] = temp['buildingqualitytypeid'].replace(-1,np.nan)\n", + "print(temp.buildingqualitytypeid.isnull().sum())\n", + "print(temp.shape)\n", + "\n", + "missing_values=fillna_knn(temp,\n", + " base = [ 'latitude', 'longitude' ] ,\n", + " target = 'buildingqualitytypeid')\n", + "\n", + "print(\"predicted output shape\",missing_values.shape)\n", + "missing_values_boolflag = df_train['buildingqualitytypeid'].isnull()\n", + "df_train.loc[ missing_values_boolflag, 'buildingqualitytypeid' ] = missing_values\n", + "\n", + "print(df_train.buildingqualitytypeid.isnull().sum())" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "oTh_XPErqkHf", + "colab_type": "code", + "colab": {} + }, + "source": [ + "print(df_train.heating_system_id.isnull().sum())\n", + "print(df_train.shape)\n", + "temp=df_train.copy()\n", + "temp['heating_system_id']=temp['heating_system_id'].fillna(-1)\n", + "temp=temp.groupby(\"heating_system_id\").filter(lambda x: x.heating_system_id.size > 3)\n", + "temp['heating_system_id'] = temp['heating_system_id'].replace(-1,np.nan)\n", + "print(temp.heating_system_id.isnull().sum())\n", + "print(temp.shape)\n", + "\n", + "missing_values=fillna_knn(temp,\n", + " base = [ 'latitude', 'longitude' ] ,\n", + " target = 'heating_system_id')\n", + "\n", + "print(\"predicted output shape\",missing_values.shape)\n", + "missing_values_boolflag = df_train['heating_system_id'].isnull()\n", + "df_train.loc[ missing_values_boolflag, 'heating_system_id' ] = missing_values\n", + "\n", + "\n", + "print(df_train.heating_system_id.isnull().sum())" + ], + "execution_count": 0, + "outputs": [] + }, + { + 
"cell_type": "code", + "metadata": { + "id": "oVjNSkUYqnCt", + "colab_type": "code", + "colab": {} + }, + "source": [ + "print(df_train.ac_id.isnull().sum())\n", + "print(df_train.shape)\n", + "temp=df_train.copy()\n", + "temp['ac_id']=temp['ac_id'].fillna(-1)\n", + "temp=temp.groupby(\"ac_id\").filter(lambda x: x.ac_id.size > 3)\n", + "temp['ac_id'] = temp['ac_id'].replace(-1,np.nan)\n", + "print(temp.ac_id.isnull().sum())\n", + "print(temp.shape)\n", + "\n", + "missing_values=fillna_knn(temp,\n", + " base = [ 'latitude', 'longitude' ] ,\n", + " target = 'ac_id')\n", + "\n", + "print(\"predicted output shape\",missing_values.shape)\n", + "missing_values_boolflag = df_train['ac_id'].isnull()\n", + "df_train.loc[ missing_values_boolflag, 'ac_id' ] = missing_values\n", + "\n", + "print(df_train.ac_id.isnull().sum())" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "qTbcYbexqr0Y", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#yearbuilt\n", + "print(df_train.yearbuilt.isnull().sum())\n", + "print(df_train.shape)\n", + "temp=df_train.copy()\n", + "temp['yearbuilt']=temp['yearbuilt'].fillna(-1)\n", + "temp=temp.groupby(\"yearbuilt\").filter(lambda x: x.yearbuilt.size > 3)\n", + "temp['yearbuilt'] = temp['yearbuilt'].replace(-1,np.nan)\n", + "print(temp.yearbuilt.isnull().sum())\n", + "print(temp.shape)\n", + "\n", + "missing_values=fillna_knn(temp,\n", + " base = [ 'latitude', 'longitude','buildingqualitytypeid','propertylandusetypeid' ] ,\n", + " target = 'yearbuilt')\n", + "\n", + "print(\"predicted output shape\",missing_values.shape)\n", + "missing_values_boolflag = df_train['yearbuilt'].isnull()\n", + "df_train.loc[ missing_values_boolflag, 'yearbuilt' ] = missing_values\n", + "print(df_train.yearbuilt.isnull().sum())" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Gx1LYGmfqxLk", + "colab_type": "code", + "colab": {} + }, + "source": [ + 
"#location seems to be related to building quality, (knnregressor)\n", + "from sklearn.model_selection import KFold\n", + "\n", + "def fillna_knnr( df, base, target):\n", + " data_colnames = [ target ] + base\n", + " #print(\"data_colnames\",data_colnames)\n", + " missing_values_boolflag = df[target].isnull() #true for missing rows, false for columns with values\n", + " #print(\"miss\",missing_values_boolflag.head())\n", + " not_missing_boolflag = ~missing_values_boolflag \n", + " #print(\"not miss\",not_missing_boolflag.head())\n", + " number_of_missing_val = missing_values_boolflag.sum()\n", + " print(\"# of miss\",number_of_missing_val)\n", + " not_missing_rows = df.loc[ not_missing_boolflag, data_colnames]\n", + " #print(not_missing_rows.head())\n", + " Y = not_missing_rows[target]\n", + " X = not_missing_rows[base]\n", + " X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=3192)\n", + " metrics = ['euclidean'] \n", + " weights = ['distance'] \n", + " numNeighbors = [5,10,15,20,25]\n", + " param_grid = dict(metric=metrics,weights=weights,n_neighbors=numNeighbors)\n", + " cv = KFold(n_splits=3,random_state=3192,shuffle=False) \n", + " grid = GridSearchCV(neighbors.KNeighborsRegressor(n_jobs=-1),param_grid=param_grid,cv=cv,scoring='neg_mean_absolute_error',refit=True,return_train_score=True,verbose=1,n_jobs=-1,pre_dispatch='n_jobs')\n", + " grid.fit(X_train ,Y_train)\n", + " #print(\"grid.cv_results_\",grid.cv_results_)\n", + " print(\"grid.best_estimator_\",grid.best_estimator_)\n", + " print(\"grid.best_params_\",grid.best_params_)\n", + " print(\"grid.scorer_\",grid.scorer_)\n", + " #print(\"grid.n_splits_\",grid.n_splits_)\n", + " y_true, y_pred = Y_test, grid.predict(X_test) \n", + " Z = grid.predict(df.loc[missing_values_boolflag, base])\n", + " #df.loc[ missing_values_boolflag, target ] = Z\n", + " return Z" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": 
"pj5PXm7ozg5l", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#garage_sqft\n", + "print(df_train.garage_sqft.isnull().sum())\n", + "print(df_train.shape)\n", + "temp=df_train.loc[df_train.garagecarcnt>0,df_train.columns].copy()\n", + "\n", + "print(temp.garage_sqft.isnull().sum())\n", + "print(temp.shape)\n", + "\n", + "missing_values=fillna_knnr(temp,\n", + " base = [ 'latitude', 'longitude','garagecarcnt'] ,\n", + " target = 'garage_sqft')\n", + "\n", + "print(\"predicted output shape\",missing_values.shape)\n", + "missing_values_boolflag = df_train['garage_sqft'].isnull()\n", + "df_train.loc[ missing_values_boolflag, 'garage_sqft' ] = missing_values\n", + "print(df_train.garage_sqft.isnull().sum())" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "b7e5CFTyzg_M", + "colab_type": "code", + "colab": {} + }, + "source": [ + "df_train = df_train.drop('parcelid', axis=1)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "YxGquCOOzhD7", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#All the other columns with missing values seems to be integer, will need regression to be imputed,\n", + "#time to get categorical variables hot encoded\n", + "\n", + "#Identify numerical columns to produce a heatmap\n", + "catcols = ['ac_id','buildingqualitytypeid','deck_flag','fips', 'heating_system_id','has_hottub_or_spa',\n", + " 'just_hottub_or_spa', 'pool_with_spa_tub_yes','pool_with_spa_tub_no','propertylandusetypeid','basement_flag'\n", + " ,'fireplaceflag','taxdelinquencyflag']\n", + "numcols = [x for x in df_train.columns if x not in catcols]" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "uVZkszJEzhHj", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#total_finished_living_area_sqft\n", + "\n", + "print(df_train.total_finished_living_area_sqft.isnull().sum())\n", + 
"print(df_train.shape)\n", + "temp=df_train.copy()\n", + "print(temp.total_finished_living_area_sqft.isnull().sum())\n", + "print(temp.shape)\n", + "missing_values=fillna_knnr(temp,\n", + " base = [ 'latitude', 'longitude','basementsqft','numberofstories','poolcnt','garagecarcnt','garage_sqft','propertylandusetypeid'] ,\n", + " target = 'total_finished_living_area_sqft')\n", + "\n", + "print(\"predicted output shape\",missing_values.shape)\n", + "missing_values_boolflag = df_train['total_finished_living_area_sqft'].isnull()\n", + "df_train.loc[ missing_values_boolflag, 'total_finished_living_area_sqft' ] = missing_values\n", + "print(df_train.total_finished_living_area_sqft.isnull().sum())" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "CVrTMb92zhLX", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#total_bath\t1165\n", + "#full_bath\t1182\n", + "#half_bath\t1182\n", + "#roomcnt\t1416\n", + "#bedroomcnt\t1421\n", + "\n", + "#total_finished_living_area_sqft\n", + "\n", + "print(df_train.total_bath.isnull().sum())\n", + "print(df_train.shape)\n", + "temp=df_train.copy()\n", + "print(temp.total_bath.isnull().sum())\n", + "print(temp.shape)\n", + "missing_values=fillna_knnr(temp,\n", + " base = ['propertylandusetypeid','total_finished_living_area_sqft' ] ,\n", + " target = 'total_bath')\n", + "\n", + "print(\"predicted output shape\",missing_values.shape)\n", + "missing_values_boolflag = df_train['total_bath'].isnull()\n", + "df_train.loc[ missing_values_boolflag, 'total_bath' ] = missing_values\n", + "print(df_train.total_bath.isnull().sum())#total_bath\t1165\n", + "#full_bath\t1182\n", + "#half_bath\t1182\n", + "#roomcnt\t1416\n", + "#bedroomcnt\t1421\n", + "\n", + "#total_finished_living_area_sqft\n", + "\n", + "print(df_train.total_bath.isnull().sum())\n", + "print(df_train.shape)\n", + "temp=df_train.copy()\n", + "print(temp.total_bath.isnull().sum())\n", + "print(temp.shape)\n", + 
"missing_values=fillna_knnr(temp,\n", + " base = ['propertylandusetypeid','total_finished_living_area_sqft' ] ,\n", + " target = 'total_bath')\n", + "\n", + "print(\"predicted output shape\",missing_values.shape)\n", + "missing_values_boolflag = df_train['total_bath'].isnull()\n", + "df_train.loc[ missing_values_boolflag, 'total_bath' ] = missing_values\n", + "print(df_train.total_bath.isnull().sum())" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "BjIKlu-tzhPI", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# rop half_bath and full bath, as there are only redundant values of total_bath\n", + "df_train = df_train.drop(['full_bath','half_bath'], axis=1)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "02X1y6EBzhT9", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#bedroomcnt\t1421\n", + "\n", + "print(df_train.bedroomcnt.isnull().sum())\n", + "print(df_train.shape)\n", + "temp=df_train.copy()\n", + "print(temp.bedroomcnt.isnull().sum())\n", + "print(temp.shape)\n", + "missing_values=fillna_knnr(temp,\n", + " base = ['propertylandusetypeid','total_finished_living_area_sqft','total_bath' ] ,\n", + " target = 'bedroomcnt')\n", + "\n", + "print(\"predicted output shape\",missing_values.shape)\n", + "missing_values_boolflag = df_train['bedroomcnt'].isnull()\n", + "df_train.loc[ missing_values_boolflag, 'bedroomcnt' ] = missing_values\n", + "print(df_train.bedroomcnt.isnull().sum())" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "WzkZ_qeHzhXP", + "colab_type": "code", + "colab": {} + }, + "source": [ + "df_train['total_bath']=df_train.total_bath.round(1)\n", + "df_train['bedroomcnt']=df_train.bedroomcnt.round(1)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "QF9DtDAczhaW", + "colab_type": "code", + "colab": {} + }, + "source": [ + 
"#recalculate roomcnt\t1416 as we have used imputation for total_bath and bedroomcnt\n", + "\n", + "df_train.loc[(df_train.roomcnt.isnull()),['roomcnt']]=df_train.total_bath + df_train.bedroomcnt" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "U5N41TBlz60W", + "colab_type": "code", + "colab": {} + }, + "source": [ + "print(df_train.shape)\n", + "df_train =df_train.loc[(df_train.total_parcel_tax.notnull()) & (df_train.land_tax.notnull()),df_train.columns]\n", + "\n", + "print(df_train.shape)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "kv9h5yL3z64Q", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#lot_area_sqft\n", + "print(df_train.lot_area_sqft.isnull().sum())\n", + "print(df_train.shape)\n", + "temp=df_train.copy()\n", + "print(temp.lot_area_sqft.isnull().sum())\n", + "print(temp.shape)\n", + "missing_values=fillna_knnr(temp,\n", + " base = ['latitude','longitude','propertylandusetypeid','total_finished_living_area_sqft','roomcnt','numberofstories' ] ,\n", + " target = 'lot_area_sqft')\n", + "\n", + "print(\"predicted output shape\",missing_values.shape)\n", + "missing_values_boolflag = df_train['lot_area_sqft'].isnull()\n", + "df_train.loc[ missing_values_boolflag, 'lot_area_sqft' ] = missing_values.round(2)\n", + "print(df_train.lot_area_sqft.isnull().sum())" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "GYJLHrR4z68f", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# predict structure_tax and recalculate total_parcel_tax = land_tax + structure_tax\n", + "\n", + "\n", + "print(df_train.structure_tax.isnull().sum())\n", + "print(df_train.shape)\n", + "temp=df_train.copy()\n", + "print(temp.structure_tax.isnull().sum())\n", + "print(temp.shape)\n", + "missing_values=fillna_knnr(temp,\n", + " base = 
['latitude','longitude','lot_area_sqft','propertylandusetypeid','total_finished_living_area_sqft','roomcnt','numberofstories' ] ,\n", + " target = 'structure_tax')\n", + "\n", + "print(\"predicted output shape\",missing_values.shape)\n", + "missing_values_boolflag = df_train['structure_tax'].isnull()\n", + "df_train.loc[ missing_values_boolflag, 'structure_tax' ] = missing_values.round(2)\n", + "print(df_train.structure_tax.isnull().sum())" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Ya-3K06Zz6_y", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#36 total_property_tax_2016 \n", + "\n", + "#total_parcel_tax = land_tax + structure_tax\n", + " \n", + "df_train['total_parcel_tax']=df_train['structure_tax']+df_train['land_tax']" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "8Fvr7voVz7DX", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#age of the property\n", + "df_train['age'] = 2016 - df_train['yearbuilt']\n", + "df_train=df_train.drop(['yearbuilt'],axis=1)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "xl0EOIT-z7Gl", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#total_property_tax_2016\n", + "\n", + "\n", + "print(df_train.total_property_tax_2016.isnull().sum())\n", + "print(df_train.shape)\n", + "temp=df_train.copy()\n", + "print(temp.total_property_tax_2016.isnull().sum())\n", + "print(temp.shape)\n", + "missing_values=fillna_knnr(temp,\n", + " base = ['latitude','longitude','lot_area_sqft','propertylandusetypeid','total_finished_living_area_sqft','roomcnt','numberofstories' ] ,\n", + " target = 'total_property_tax_2016')\n", + "\n", + "print(\"predicted output shape\",missing_values.shape)\n", + "missing_values_boolflag = df_train['total_property_tax_2016'].isnull()\n", + "df_train.loc[ missing_values_boolflag, 'total_property_tax_2016' ] = 
missing_values.round(2)\n", + "print(df_train.total_property_tax_2016.isnull().sum())" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "YlaxWegqz7I-", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#check missing values\n", + "\n", + "missing_df = df_train.isnull().sum(axis=0).reset_index()\n", + "missing_df.columns = ['column_name', 'missing_count']\n", + "missing_df = missing_df.loc[missing_df['missing_count']>0]\n", + "missing_df = missing_df.sort_values(by='missing_count')\n", + "print(missing_df)\n", + "print(missing_df.shape)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "dIl_nqKVz7NQ", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#both the columns above miss 92% of the data, there is no related varibale to impute it, hence dropping them at this point\n", + "\n", + "df_train = df_train.drop(['finished_living_area_entryfloor_sqft2','finished_living_area_entryfloor_sqft1'], axis=1)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "HQJd7rgKz7Qq", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#Identify numerical columns to produce a heatmap\n", + "catcols = ['ac_id','buildingqualitytypeid','deck_flag','fips','pool_with_spa_tub_no','pool_with_spa_tub_yes','has_hottub_or_spa',\n", + " 'just_hottub_or_spa','heating_system_id','propertylandusetypeid','basement_flag','fireplaceflag','taxdelinquencyflag']\n", + "numcols = [x for x in df_train.columns if x not in catcols]" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "VUN3a6uJz7Ut", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# 2 variables are in object datatype, coverting into numeric\n", + "df_train[['census_tractnumber','block_number']] = df_train[['census_tractnumber','block_number']].apply(pd.to_numeric)" + ], + "execution_count": 0, + 
"outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "zGx77rRAz7ZZ", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# dropping categorical columns as xgboost feature selection cannot hadle it\n", + "\n", + "train_x = df_train.drop(catcols+['logerror'], axis=1)\n", + "\n", + "train_y=df_train['logerror']\n", + "\n", + "train_x = train_x.astype(float) \n", + "train_y = train_y.astype(float)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "es_Ew2YJz7dT", + "colab_type": "code", + "colab": {} + }, + "source": [ + "pd.options.display.max_rows = 65\n", + "\n", + "dtype_df = train_x.dtypes.reset_index()\n", + "dtype_df.columns = [\"Count\", \"Column Type\"]\n", + "#dtype_df" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "bvWIhR38z7fW", + "colab_type": "code", + "colab": {} + }, + "source": [ + "df_train.loc[df_train.has_hottub_or_spa==True,'has_hottub_or_spa']=\"Yes\"\n", + "df_train.loc[df_train.has_hottub_or_spa==0,'has_hottub_or_spa']=\"No\"\n", + "\n", + "df_train.loc[df_train.just_hottub_or_spa==0,'just_hottub_or_spa']=\"No\"\n", + "df_train.loc[df_train.just_hottub_or_spa==1,'just_hottub_or_spa']=\"Yes\"\n", + "\n", + "df_train.loc[df_train.deck_flag==0,'deck_flag']=\"No\"\n", + "df_train.loc[df_train.deck_flag==1,'deck_flag']=\"Yes\"\n", + "\n", + "df_train.loc[df_train.basement_flag==0,'basement_flag']=\"No\"\n", + "df_train.loc[df_train.basement_flag==1,'basement_flag']=\"Yes\"\n", + "\n", + "df_train.loc[df_train.fireplaceflag==False,'fireplaceflag']=\"No\"\n", + "df_train.loc[df_train.fireplaceflag==True,'fireplaceflag']=\"Yes\"\n", + "#" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Ef9JjrmMz7jw", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#ac_id,heating_system_id,propertylandusetypeid\n", + "dummieslist=['has_hottub_or_spa','just_hottub_or_spa',\n", + 
" 'deck_flag','fips','basement_flag','fireplaceflag','taxdelinquencyflag']" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Z51Zrt2Uz7oD", + "colab_type": "code", + "colab": {} + }, + "source": [ + "df_train[dummieslist] = df_train[dummieslist].astype(object)\n", + "dummies = pd.get_dummies(df_train[dummieslist], prefix= dummieslist)\n", + "dummies.shape" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "VHBi5Gg6z7tu", + "colab_type": "code", + "colab": {} + }, + "source": [ + "dummies2=['pool_with_spa_tub_no','pool_with_spa_tub_yes']\n", + "df_train[dummies2] = df_train[dummies2].astype(int)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "oocTPKI9z7rk", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import MySQLdb\n", + "from sqlalchemy import create_engine\n", + "engineString = 'mysql+mysqldb://root:MyNewPass@localhost/sakila'\n", + "engine = create_engine(engineString)\n", + "con=engine.connect()\n", + "\n", + "with engine.connect() as con, con.begin():\n", + " df_train.to_sql('df_train_f1', engine, chunksize=10000, index =False,if_exists ='replace')" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "zj5ZLSPlz7XC", + "colab_type": "code", + "colab": {} + }, + "source": [ + "numcols2=['basementsqft','total_bath','bedroomcnt','total_finished_living_area_sqft','fireplace_count','garagecarcnt',\n", + " 'garage_sqft','latitude','longitude','lot_area_sqft','poolcnt','pool_sqft','roomcnt','unitcnt','patio_sqft','storage_sqft',\n", + " 'numberofstories','structure_tax','total_parcel_tax','land_tax','total_property_tax_2016','taxdelinquencyyear','transaction_month',\n", + " 'census_tractnumber','block_number','age']" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "fp53dotszhgA", + "colab_type": 
"code", + "colab": {} + }, + "source": [ + "Y=df_train['logerror']" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "O0Uaei4rzhj6", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#buildingqualitytypeid ->has order\n", + "le = LabelEncoder()\n", + "df_train['buildingqualitytypeid']=le.fit_transform(df_train.buildingqualitytypeid)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "g4-g-uvtzhds", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#df_train.ac_id.value_counts()\n", + "#df_train.propertylandusetypeid.value_counts()\n", + "#'buildingqualitytypeid','ac_id','heating_system_id','propertylandusetypeid'" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "SzliXafdzhRd", + "colab_type": "code", + "colab": {} + }, + "source": [ + "X=pd.concat([dummies,df_train[dummies2],df_train[numcols2],df_train[['buildingqualitytypeid','ac_id','heating_system_id','propertylandusetypeid']]],axis=1)\n", + "X.shape" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "DBsZjyQd0W1N", + "colab_type": "code", + "colab": {} + }, + "source": [ + "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.10, random_state=3192)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "ihXFZWcn0W5D", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# top features\n", + "import xgboost as xgb\n", + "xgb_params = {\n", + " 'eta': 0.05,\n", + " 'max_depth': 8,\n", + " 'subsample': 0.7,\n", + " 'colsample_bytree': 0.7,\n", + " 'objective': 'reg:linear',\n", + " 'silent': 1,\n", + " 'seed' : 0\n", + "}\n", + "dtrain = xgb.DMatrix(X_train, Y_train, feature_names=X_train.columns.values)\n", + "model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=50)\n", + "# plot the important features 
#\n", + "fig, ax = plt.subplots(figsize=(12,18))\n", + "#max_num_features=50, error for no reason \n", + "xgb.plot_importance(model, height=0.8, ax=ax)\n", + "plt.show()" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "TQEEzNkX0W9w", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#top features\n", + "xgboost_selection=['total_finished_living_area_sqft','latitude','structure_tax','total_property_tax_2016',\n", + "'total_parcel_tax','land_tax','longitude','lot_area_sqft','census_tractnumber','age','total_bath','bedroomcnt',\n", + "'block_number','transaction_month','roomcnt','taxdelinquencyyear','unitcnt','taxdelinquencyflag_No',\n", + "'fips_LA','garage_sqft','pool_with_spa_tub_no','has_hottub_or_spa_No','garagecarcnt','deck_flag_No',\n", + "'poolcnt','pool_sqft'\n", + "]" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Rr_6EO4G0XEj", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# feature selection\n", + "#c_id,heating_system_id,propertylandusetypeid\n", + "from sklearn.ensemble import ExtraTreesRegressor\n", + "from sklearn.feature_selection import SelectFromModel\n", + "reg = ExtraTreesRegressor(n_estimators=500, max_depth=8, max_features='sqrt',\n", + " min_samples_split=100 ,min_samples_leaf=10, bootstrap=True,n_jobs=-1, random_state=3192)\n", + "reg = reg.fit(X_train, Y_train)\n", + "#print(\"importance\",reg.feature_importances_) \n", + "model = SelectFromModel(reg, prefit=True)\n", + "X_new = model.transform(X_train)\n", + "print(X_train.shape)\n", + "print(X_new.shape) \n", + "\n", + "feat_names = X.columns.values\n", + "importances = reg.feature_importances_\n", + "std = np.std([tree.feature_importances_ for tree in reg.estimators_], axis=0)\n", + "indices = np.argsort(importances)[::-1][:26]\n", + "plt.figure(figsize=(12,12))\n", + "plt.title(\"Feature importances\")\n", + "plt.bar(range(len(indices)), 
importances[indices], color=\"r\", yerr=std[indices], align=\"center\")\n", + "plt.xticks(range(len(indices)), feat_names[indices], rotation='vertical')\n", + "plt.xlim([-1, len(indices)])\n", + "plt.show()" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "i4FCNOG70XIU", + "colab_type": "code", + "colab": {} + }, + "source": [ + "tree_selection=[\n", + " 'total_finished_living_area_sqft','structure_tax','total_property_tax_2016','total_bath','total_parcel_tax',\n", + " 'age','latitude','census_tractnumber','bedroomcnt','longitude','land_tax','propertylandusetypeid','block_number',\n", + " 'buildingqualitytypeid','numberofstories','heating_system_id','unitcnt','transaction_month','lot_area_sqft','roomcnt',\n", + " 'garage_sqft','garagecarcnt','pool_with_spa_tub_no','poolcnt','fips_LA','taxdelinquencyyear','patio_sqft',\n", + " 'taxdelinquencyflag_No','taxdelinquencyflag_Yes'\n", + "]" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "TmIS1WAS0XMW", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import matplotlib.pyplot as plt\n", + "from sklearn.model_selection import KFold\n", + "from sklearn.linear_model import Ridge,Lasso\n", + "from sklearn.feature_selection import RFECV\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.metrics import r2_score,mean_absolute_error,make_scorer\n", + "\n", + "#model=Lasso(alpha=0.2, fit_intercept=True, normalize=True, precompute=False, copy_X=True,\n", + " # max_iter=1000, \n", + " # tol=0.0001, warm_start=False, positive=False, random_state=3192, selection='cyclic')\n", + "\n", + "#Ridge(random_state=3192,solver='auto',fit_intercept=True,normalize=True,alpha=0.1)\n", + "#LinearRegression(n_jobs=-1,fit_intercept=True, normalize=True, copy_X=True)\n", + "\n", + "\n", + "rfecv = RFECV(estimator=LinearRegression(n_jobs=-1,fit_intercept=True, normalize=True, copy_X=True), step=2, 
cv=KFold(4),scoring='neg_mean_absolute_error')\n", + "rfecv.fit(X_train, Y_train)\n", + "\n", + "print(\"Optimal number of features : %d\" % rfecv.n_features_)\n", + "\n", + "# Plot number of features VS. cross-validation scores\n", + "plt.figure()\n", + "plt.xlabel(\"Number of features selected\")\n", + "\n", + "plt.ylabel(\"Cross validation score (nb of correct classifications)\")\n", + "plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)\n", + "plt.show()\n" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "DIw8O00U0XPR", + "colab_type": "code", + "colab": {} + }, + "source": [ + "rfe_selection = [i for indx,i in enumerate(X.columns) if rfecv.support_[indx] == True]" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "gHA0x5_80XWy", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#Linear regression with rfe_selection selection\n", + "#rfe_selection, tree_selection, xgboost_selection\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import r2_score,mean_absolute_error,make_scorer,mean_squared_error\n", + "\n", + "# just to check whether normalized /not normalized data gives better results\n", + "parameters = {'fit_intercept':[True], 'normalize':[True,False], 'copy_X':[True]}\n", + "scoring = {'MAE':'neg_mean_absolute_error','MSE': make_scorer(mean_squared_error,greater_is_better=False)}\n", + "\n", + "grid1 = GridSearchCV(LinearRegression(n_jobs=-1),param_grid=parameters, scoring=scoring,cv=5,refit='MAE',\n", + " return_train_score=True,\n", + " verbose=0,n_jobs=-1,pre_dispatch='n_jobs')\n", + "\n", + "grid1.fit(X_train[rfe_selection], Y_train)\n", + "#print(\"5. 
grid best_score_\",abs(grid.best_score_))\n", + "Y_pred = grid1.predict(X_test[rfe_selection])\n", + "print(\"MAE on test data\",mean_absolute_error(Y_test,Y_pred))\n", + "print(\"MSE on test data\",mean_squared_error(Y_test,Y_pred))\n", + "print(\"R Squared data \",r2_score(Y_test,Y_pred))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "ekn4pBs60XcT", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#pca selection\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.preprocessing import scale\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.preprocessing import scale\n", + "%matplotlib inline\n", + "scaled_x = scale(X)\n", + "pca = PCA(n_components=None, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None)\n", + "pca.fit(scaled_x)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "yFuT-wUN0XfV", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# The amount of variance that each PC explains\n", + "var= pca.explained_variance_ratio_\n", + "#Cumulative Variance explains\n", + "var1=np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)\n", + "print(var1)\n", + "plt.plot(var1)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "iPN4OBUe0XlD", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#Looking at above plot I'm taking 28 variables\n", + "\n", + "pca = PCA(n_components=28, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None)\n", + "pca.fit(scaled_x)\n", + "\n", + "pca1=pca.fit_transform(scaled_x)\n", + "\n", + "pca = PCA(n_components=28, copy=True, whiten=True, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None)\n", + "pca.fit(scaled_x)\n", + "pca2=pca.fit_transform(scaled_x)" + ], + "execution_count": 0, + "outputs": [] + }, + { + 
"cell_type": "code", + "metadata": { + "id": "EE4ednPC0XjX", + "colab_type": "code", + "colab": {} + }, + "source": [ + "pcaX_train, pcaX_test, pcaY_train, pcaY_test = train_test_split(pca1, Y, test_size=0.10, random_state=3192)\n", + "pca2X_train, pca2X_test, pca2Y_train, pca2Y_test = train_test_split(pca2, Y, test_size=0.10, random_state=3192)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "erYMXvTG0XaK", + "colab_type": "code", + "colab": {} + }, + "source": [ + "from sklearn.ensemble import GradientBoostingRegressor\n", + "from sklearn.metrics import mean_absolute_error,make_scorer\n", + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "# just to check whether normalized /not normalized data gives better results\n", + "\n", + " # 0.005 for 1200 trees.\n", + "param_grid={'n_estimators':[1200],'max_features':[22]}\n", + "\n", + " \n", + "grid13 = GridSearchCV(GradientBoostingRegressor(subsample=0.8,min_samples_leaf=50,min_samples_split=50,max_depth=9,loss='ls',criterion='friedman_mse',learning_rate=0.005,random_state=3192),\n", + " param_grid=param_grid, cv=5,refit='MAE',\n", + " return_train_score=True,\n", + " verbose=2,n_jobs=-1,pre_dispatch='n_jobs')\n", + "\n", + "grid13.fit(pcaX_train, pcaY_train)\n", + "print(\"5. 
grid best_score_\",abs(grid13.best_score_))\n", + "print(\"best params\",grid13.best_params_)\n", + "print(\"best score\",grid13.best_score_)\n", + "Y_pred = grid13.predict(pcaX_test)\n", + "print(\"MAE on test data\",mean_absolute_error(pcaY_test,Y_pred))\n", + "print(\"MSE on test data\",mean_squared_error(pcaY_test,Y_pred))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "BgtbLCcR0XUx", + "colab_type": "code", + "colab": {} + }, + "source": [ + "" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "FjdSCEFP0XCM", + "colab_type": "code", + "colab": {} + }, + "source": [ + "" + ], + "execution_count": 0, + "outputs": [] + } + ] +} \ No newline at end of file From 601d9855ea912d9ee2198f0d3477a44371491ae5 Mon Sep 17 00:00:00 2001 From: Winston Date: Wed, 21 Aug 2019 17:27:26 -0700 Subject: [PATCH 2/7] updated install script; further progress into conversion; on final (3/3) section -- current break is on replacing pd.series.filter after grouping by buildingtypeid (dropping building types represented 3 or fewer times in data) --- .../zillow_kaggle_zestimate_comp.ipynb | 1812 ++++++++++++----- 1 file changed, 1249 insertions(+), 563 deletions(-) diff --git a/colab_notebooks/zillow_kaggle_zestimate_comp.ipynb b/colab_notebooks/zillow_kaggle_zestimate_comp.ipynb index 24a1849f..cda0658e 100644 --- a/colab_notebooks/zillow_kaggle_zestimate_comp.ipynb +++ b/colab_notebooks/zillow_kaggle_zestimate_comp.ipynb @@ -43,10 +43,10 @@ "metadata": { "id": "W-um5d-x7o46", "colab_type": "code", - "outputId": "37bf77fb-7f83-49fc-b5e5-514cd049e32d", + "outputId": "35d83399-515c-4172-e915-3886511baba2", "colab": { "base_uri": "https://localhost:8080/", - "height": 329 + "height": 302 } }, "source": [ @@ -61,15 +61,15 @@ { "output_type": "stream", "text": [ - "Thu Aug 15 03:12:33 2019 \n", + "Wed Aug 21 22:49:26 2019 \n", 
"+-----------------------------------------------------------------------------+\n", - "| NVIDIA-SMI 418.67 Driver Version: 410.79 CUDA Version: 10.0 |\n", + "| NVIDIA-SMI 430.40 Driver Version: 410.79 CUDA Version: 10.0 |\n", "|-------------------------------+----------------------+----------------------+\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", "|===============================+======================+======================|\n", "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", - "| N/A 60C P8 16W / 70W | 0MiB / 15079MiB | 0% Default |\n", + "| N/A 49C P8 16W / 70W | 0MiB / 15079MiB | 0% Default |\n", "+-------------------------------+----------------------+----------------------+\n", " \n", "+-----------------------------------------------------------------------------+\n", @@ -98,11 +98,16 @@ "metadata": { "id": "p129YxxnihcV", "colab_type": "code", - "colab": {} + "outputId": "a7de3ee2-b456-45d7-ab54-03eb1d72a956", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + } }, "source": [ - "!wget -nc https://github.com/rapidsai/notebooks-contrib/blob/master/utils/rapids-colab.sh\n", - "!bash rapids-colab.sh\n", + "!wget -nc https://raw.githubusercontent.com/randerzander/notebooks-contrib/master/utils/rapids-colab.sh\n", + "# RAPIDS 0.9 nightly\n", + "!bash rapids-colab.sh 0.9\n", "\n", "import sys, os\n", "\n", @@ -111,7 +116,256 @@ "os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'" ], "execution_count": 0, - "outputs": [] + "outputs": [ + { + "output_type": "stream", + "text": [ + "--2019-08-21 22:49:32-- https://raw.githubusercontent.com/randerzander/notebooks-contrib/master/utils/rapids-colab.sh\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... 
connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 1606 (1.6K) [text/plain]\n", + "Saving to: ‘rapids-colab.sh’\n", + "\n", + "\rrapids-colab.sh 0%[ ] 0 --.-KB/s \rrapids-colab.sh 100%[===================>] 1.57K --.-KB/s in 0s \n", + "\n", + "2019-08-21 22:49:33 (231 MB/s) - ‘rapids-colab.sh’ saved [1606/1606]\n", + "\n", + "--2019-08-21 22:49:33-- https://github.com/rapidsai/notebooks-extended/raw/master/utils/env-check.py\n", + "Resolving github.com (github.com)... 140.82.113.3\n", + "Connecting to github.com (github.com)|140.82.113.3|:443... connected.\n", + "HTTP request sent, awaiting response... 301 Moved Permanently\n", + "Location: https://github.com/rapidsai/notebooks-contrib/raw/master/utils/env-check.py [following]\n", + "--2019-08-21 22:49:33-- https://github.com/rapidsai/notebooks-contrib/raw/master/utils/env-check.py\n", + "Reusing existing connection to github.com:443.\n", + "HTTP request sent, awaiting response... 302 Found\n", + "Location: https://raw.githubusercontent.com/rapidsai/notebooks-contrib/master/utils/env-check.py [following]\n", + "--2019-08-21 22:49:33-- https://raw.githubusercontent.com/rapidsai/notebooks-contrib/master/utils/env-check.py\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 783 [text/plain]\n", + "Saving to: ‘env-check.py’\n", + "\n", + "env-check.py 100%[===================>] 783 --.-KB/s in 0s \n", + "\n", + "2019-08-21 22:49:33 (125 MB/s) - ‘env-check.py’ saved [783/783]\n", + "\n", + "Checking for GPU type:\n", + "*********************************************\n", + "Woo! 
Your instance has the right kind of GPU!\n", + "*********************************************\n", + "\n", + "Removing conflicting packages, will replace with RAPIDS compatible versions\n", + "Uninstalling xgboost-0.90:\n", + " Successfully uninstalled xgboost-0.90\n", + "Uninstalling dask-1.1.5:\n", + " Successfully uninstalled dask-1.1.5\n", + "Uninstalling distributed-1.25.3:\n", + " Successfully uninstalled distributed-1.25.3\n", + "Installing conda\n", + "--2019-08-21 22:49:38-- https://repo.continuum.io/miniconda/Miniconda3-4.5.4-Linux-x86_64.sh\n", + "Resolving repo.continuum.io (repo.continuum.io)... 104.18.200.79, 104.18.201.79, 2606:4700::6812:c94f, ...\n", + "Connecting to repo.continuum.io (repo.continuum.io)|104.18.200.79|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 58468498 (56M) [application/x-sh]\n", + "Saving to: ‘Miniconda3-4.5.4-Linux-x86_64.sh’\n", + "\n", + "Miniconda3-4.5.4-Li 100%[===================>] 55.76M 151MB/s in 0.4s \n", + "\n", + "2019-08-21 22:49:38 (151 MB/s) - ‘Miniconda3-4.5.4-Linux-x86_64.sh’ saved [58468498/58468498]\n", + "\n", + "PREFIX=/usr/local\n", + "installing: python-3.6.5-hc3d631a_2 ...\n", + "Python 3.6.5 :: Anaconda, Inc.\n", + "installing: ca-certificates-2018.03.07-0 ...\n", + "installing: conda-env-2.6.0-h36134e3_1 ...\n", + "installing: libgcc-ng-7.2.0-hdf63c60_3 ...\n", + "installing: libstdcxx-ng-7.2.0-hdf63c60_3 ...\n", + "installing: libffi-3.2.1-hd88cf55_4 ...\n", + "installing: ncurses-6.1-hf484d3e_0 ...\n", + "installing: openssl-1.0.2o-h20670df_0 ...\n", + "installing: tk-8.6.7-hc745277_3 ...\n", + "installing: xz-5.2.4-h14c3975_4 ...\n", + "installing: yaml-0.1.7-had09818_2 ...\n", + "installing: zlib-1.2.11-ha838bed_2 ...\n", + "installing: libedit-3.1.20170329-h6b74fdf_2 ...\n", + "installing: readline-7.0-ha6073c6_4 ...\n", + "installing: sqlite-3.23.1-he433501_0 ...\n", + "installing: asn1crypto-0.24.0-py36_0 ...\n", + "installing: certifi-2018.4.16-py36_0 
...\n", + "installing: chardet-3.0.4-py36h0f667ec_1 ...\n", + "installing: idna-2.6-py36h82fb2a8_1 ...\n", + "installing: pycosat-0.6.3-py36h0a5515d_0 ...\n", + "installing: pycparser-2.18-py36hf9f622e_1 ...\n", + "installing: pysocks-1.6.8-py36_0 ...\n", + "installing: ruamel_yaml-0.15.37-py36h14c3975_2 ...\n", + "installing: six-1.11.0-py36h372c433_1 ...\n", + "installing: cffi-1.11.5-py36h9745a5d_0 ...\n", + "installing: setuptools-39.2.0-py36_0 ...\n", + "installing: cryptography-2.2.2-py36h14c3975_0 ...\n", + "installing: wheel-0.31.1-py36_0 ...\n", + "installing: pip-10.0.1-py36_0 ...\n", + "installing: pyopenssl-18.0.0-py36_0 ...\n", + "installing: urllib3-1.22-py36hbe7ace6_0 ...\n", + "installing: requests-2.18.4-py36he2e5f8d_1 ...\n", + "installing: conda-4.5.4-py36_0 ...\n", + "installation finished.\n", + "WARNING:\n", + " You currently have a PYTHONPATH environment variable set. This may cause\n", + " unexpected behavior when running the Python interpreter in Miniconda3.\n", + " For best results, please verify that your PYTHONPATH only points to\n", + " directories of packages that are compatible with the Python interpreter\n", + " in Miniconda3: /usr/local\n", + "Installing RAPIDS packages\n", + "Please standby, this will take a few minutes...\n", + "\n", + "\n", + "==> WARNING: A newer version of conda exists. 
<==\n", + " current version: 4.5.4\n", + " latest version: 4.7.11\n", + "\n", + "Please update conda by running\n", + "\n", + " $ conda update -n base conda\n", + "\n", + "\n", + "bzip2-1.0.8 | 396 KB | : 100% 1.0/1 [00:00<00:00, 6.99it/s] \n", + "requests-2.22.0 | 84 KB | : 100% 1.0/1 [00:00<00:00, 6.56it/s] \n", + "olefile-0.46 | 31 KB | : 100% 1.0/1 [00:00<00:00, 23.03it/s]\n", + "yaml-0.1.7 | 78 KB | : 100% 1.0/1 [00:00<00:00, 16.84it/s]\n", + "zlib-1.2.11 | 105 KB | : 100% 1.0/1 [00:00<00:00, 15.03it/s]\n", + "llvmlite-0.29.0 | 19.9 MB | : 100% 1.0/1 [00:03<00:00, 3.64s/it] \n", + "pyopenssl-19.0.0 | 81 KB | : 100% 1.0/1 [00:00<00:00, 16.66it/s]\n", + "thrift-cpp-0.12.0 | 2.4 MB | : 100% 1.0/1 [00:00<00:00, 1.76it/s] \n", + "toolz-0.10.0 | 46 KB | : 100% 1.0/1 [00:00<00:00, 17.97it/s]\n", + "libevent-2.1.10 | 1.3 MB | : 100% 1.0/1 [00:00<00:00, 2.23it/s] \n", + "libffi-3.2.1 | 46 KB | : 100% 1.0/1 [00:00<00:00, 18.49it/s]\n", + "cudf-0.10.0a | 4.8 MB | : 100% 1.0/1 [00:01<00:00, 1.50s/it] \n", + "snappy-1.1.7 | 39 KB | : 100% 1.0/1 [00:00<00:00, 19.74it/s]\n", + "cloudpickle-1.2.1 | 22 KB | : 100% 1.0/1 [00:00<00:00, 17.29it/s]\n", + "re2-2019.08.01 | 420 KB | : 100% 1.0/1 [00:00<00:00, 6.36it/s] \n", + "pyjwt-1.7.1 | 17 KB | : 100% 1.0/1 [00:00<00:00, 23.11it/s]\n", + "libstdcxx-ng-9.1.0 | 4.0 MB | : 100% 1.0/1 [00:00<00:00, 1.44it/s] \n", + "libgfortran-ng-7.3.0 | 1.3 MB | : 100% 1.0/1 [00:00<00:00, 3.53it/s] \n", + "cython-0.29.13 | 2.2 MB | : 100% 1.0/1 [00:00<00:00, 1.72it/s] \n", + "pyparsing-2.4.2 | 57 KB | : 100% 1.0/1 [00:00<00:00, 19.30it/s]\n", + "chardet-3.0.4 | 190 KB | : 100% 1.0/1 [00:00<00:00, 9.45it/s]\n", + "rsa-3.4.2 | 31 KB | : 100% 1.0/1 [00:00<00:00, 19.23it/s]\n", + "libxgboost-0.90.rapi | 33.2 MB | : 100% 1.0/1 [00:08<00:00, 8.58s/it] \n", + "pyasn1-modules-0.2.6 | 47 KB | : 100% 1.0/1 [00:00<00:00, 12.11it/s]\n", + "lz4-c-1.8.3 | 187 KB | : 100% 1.0/1 [00:00<00:00, 12.18it/s]\n", + "freetype-2.10.0 | 884 KB | : 100% 1.0/1 [00:00<00:00, 
4.76it/s] \n", + "arrow-cpp-0.14.1 | 17.3 MB | : 100% 1.0/1 [00:03<00:00, 3.36s/it] \n", + "oauthlib-3.0.1 | 82 KB | : 100% 1.0/1 [00:00<00:00, 12.63it/s]\n", + "libcumlprims-0.9.0 | 3.9 MB | : 100% 1.0/1 [00:01<00:00, 1.55s/it] \n", + "libcugraph-0.10.0a | 11.2 MB | : 100% 1.0/1 [00:02<00:00, 2.33s/it] \n", + "dask-cuml-0.8.0a | 30 KB | : 100% 1.0/1 [00:00<00:00, 3.87it/s] \n", + "fastavro-0.22.3 | 408 KB | : 100% 1.0/1 [00:00<00:00, 6.77it/s] \n", + "scipy-1.3.1 | 18.1 MB | : 100% 1.0/1 [00:03<00:00, 3.52s/it] \n", + "certifi-2019.6.16 | 149 KB | : 100% 1.0/1 [00:00<00:00, 15.17it/s]\n", + "decorator-4.4.0 | 11 KB | : 100% 1.0/1 [00:00<00:00, 20.06it/s]\n", + "google-auth-1.6.3 | 45 KB | : 100% 1.0/1 [00:00<00:00, 16.56it/s]\n", + "parquet-cpp-1.5.1 | 3 KB | : 100% 1.0/1 [00:00<00:00, 27.43it/s]\n", + "rmm-0.10.0a | 14 KB | : 100% 1.0/1 [00:00<00:00, 3.98it/s] \n", + "glog-0.4.0 | 104 KB | : 100% 1.0/1 [00:00<00:00, 15.00it/s]\n", + "wheel-0.33.6 | 35 KB | : 100% 1.0/1 [00:00<00:00, 17.29it/s]\n", + "bokeh-1.3.4 | 4.0 MB | : 100% 1.0/1 [00:01<00:00, 1.56s/it] \n", + "scikit-learn-0.21.3 | 6.7 MB | : 100% 1.0/1 [00:01<00:00, 1.60s/it] \n", + "libtiff-4.0.10 | 587 KB | : 100% 1.0/1 [00:00<00:00, 6.63it/s] \n", + "idna-2.8 | 132 KB | : 100% 1.0/1 [00:00<00:00, 15.63it/s]\n", + "pillow-6.1.0 | 634 KB | : 100% 1.0/1 [00:00<00:00, 4.86it/s] \n", + "_libgcc_mutex-0.1 | 3 KB | : 100% 1.0/1 [00:00<00:00, 43.53it/s]\n", + "nccl-2.4.6.1 | 66.6 MB | : 100% 1.0/1 [00:10<00:00, 10.59s/it] \n", + "pyyaml-5.1.2 | 184 KB | : 100% 1.0/1 [00:00<00:00, 10.61it/s]\n", + "blinker-1.4 | 13 KB | : 100% 1.0/1 [00:00<00:00, 20.08it/s]\n", + "librmm-0.10.0a | 44 KB | : 100% 1.0/1 [00:00<00:00, 3.31it/s] \n", + "sortedcontainers-2.1 | 25 KB | : 100% 1.0/1 [00:00<00:00, 14.67it/s]\n", + "cytoolz-0.10.0 | 429 KB | : 100% 1.0/1 [00:00<00:00, 7.83it/s] \n", + "dask-cuda-0.10.0a | 911 KB | : 100% 1.0/1 [00:00<00:00, 1.66it/s] \n", + "libblas-3.8.0 | 10 KB | : 100% 1.0/1 [00:00<00:00, 5.23it/s] 
\n", + "distributed-2.3.0 | 366 KB | : 100% 1.0/1 [00:00<00:00, 5.36it/s] \n", + "libpng-1.6.37 | 343 KB | : 100% 1.0/1 [00:00<00:00, 8.59it/s] \n", + "jinja2-2.10.1 | 91 KB | : 100% 1.0/1 [00:00<00:00, 15.90it/s]\n", + "msgpack-python-0.6.1 | 89 KB | : 100% 1.0/1 [00:00<00:00, 17.11it/s]\n", + "numpy-1.17.0 | 5.2 MB | : 100% 1.0/1 [00:01<00:00, 1.30s/it] \n", + "gflags-2.2.2 | 177 KB | : 100% 1.0/1 [00:00<00:00, 11.98it/s]\n", + "tk-8.6.9 | 3.2 MB | : 100% 1.0/1 [00:00<00:00, 1.35it/s] \n", + "ca-certificates-2019 | 145 KB | : 100% 1.0/1 [00:00<00:00, 15.40it/s]\n", + "cffi-1.12.3 | 218 KB | : 100% 1.0/1 [00:00<00:00, 11.34it/s]\n", + "asn1crypto-0.24.0 | 154 KB | : 100% 1.0/1 [00:00<00:00, 11.99it/s]\n", + "dlpack-0.2 | 12 KB | : 100% 1.0/1 [00:00<00:00, 24.28it/s]\n", + "boost-cpp-1.70.0 | 21.1 MB | : 100% 1.0/1 [00:09<00:00, 9.52s/it] \n", + "pyarrow-0.14.1 | 2.8 MB | : 100% 1.0/1 [00:00<00:00, 1.14it/s] \n", + "markupsafe-1.1.1 | 26 KB | : 100% 1.0/1 [00:00<00:00, 21.22it/s]\n", + "six-1.12.0 | 22 KB | : 100% 1.0/1 [00:00<00:00, 17.89it/s]\n", + "python-3.6.7 | 34.6 MB | : 100% 1.0/1 [00:05<00:00, 5.94s/it] \n", + "icu-64.2 | 12.6 MB | : 100% 1.0/1 [00:02<00:00, 2.19s/it] \n", + "libopenblas-0.3.7 | 7.6 MB | : 100% 1.0/1 [00:01<00:00, 1.52s/it] \n", + "c-ares-1.15.0 | 100 KB | : 100% 1.0/1 [00:00<00:00, 17.03it/s]\n", + "numba-0.45.1 | 3.1 MB | : 100% 1.0/1 [00:00<00:00, 1.00it/s] \n", + "zstd-1.4.0 | 928 KB | : 100% 1.0/1 [00:00<00:00, 5.27it/s] \n", + "pycparser-2.19 | 173 KB | : 100% 1.0/1 [00:00<00:00, 11.22it/s]\n", + "openssl-1.1.1c | 2.1 MB | : 100% 1.0/1 [00:00<00:00, 2.22it/s] \n", + "dask-cudf-0.10.0a | 63 KB | : 100% 1.0/1 [00:00<00:00, 2.84it/s] \n", + "sqlite-3.29.0 | 2.0 MB | : 100% 1.0/1 [00:00<00:00, 2.75it/s] \n", + "readline-8.0 | 441 KB | : 100% 1.0/1 [00:00<00:00, 7.41it/s] \n", + "tblib-1.4.0 | 12 KB | : 100% 1.0/1 [00:00<00:00, 25.51it/s]\n", + "locket-0.2.0 | 6 KB | : 100% 1.0/1 [00:00<00:00, 29.95it/s]\n", + "pyasn1-0.4.6 | 52 KB | : 
100% 1.0/1 [00:00<00:00, 15.07it/s]\n", + "pytz-2019.2 | 228 KB | : 100% 1.0/1 [00:00<00:00, 4.22it/s] \n", + "libcudf-0.10.0a | 26.0 MB | : 100% 1.0/1 [00:05<00:00, 5.98s/it] \n", + "double-conversion-3. | 85 KB | : 100% 1.0/1 [00:00<00:00, 15.44it/s]\n", + "fsspec-0.4.1 | 39 KB | : 100% 1.0/1 [00:00<00:00, 19.96it/s]\n", + "uriparser-0.9.3 | 49 KB | : 100% 1.0/1 [00:00<00:00, 19.50it/s]\n", + "requests-oauthlib-1. | 19 KB | : 100% 1.0/1 [00:00<00:00, 19.66it/s]\n", + "cryptography-2.7 | 607 KB | : 100% 1.0/1 [00:00<00:00, 3.52it/s] \n", + "cachetools-2.1.0 | 10 KB | : 100% 1.0/1 [00:00<00:00, 24.47it/s]\n", + "ncurses-6.1 | 1.3 MB | : 100% 1.0/1 [00:01<00:00, 1.02s/it] \n", + "gcsfs-0.3.0 | 19 KB | : 100% 1.0/1 [00:00<00:00, 15.81it/s]\n", + "libnvstrings-0.10.0a | 16.8 MB | : 100% 1.0/1 [00:07<00:00, 7.28s/it] \n", + "cudatoolkit-10.0.130 | 380.0 MB | : 100% 1.0/1 [00:56<00:00, 57.00s/it] \n", + "pip-19.2.2 | 1.9 MB | : 100% 1.0/1 [00:00<00:00, 1.62it/s] \n", + "liblapack-3.8.0 | 10 KB | : 100% 1.0/1 [00:00<00:00, 18.78it/s]\n", + "click-7.0 | 61 KB | : 100% 1.0/1 [00:00<00:00, 18.70it/s]\n", + "cuml-0.10.0a | 6.0 MB | : 100% 1.0/1 [00:01<00:00, 1.69s/it] \n", + "grpc-cpp-1.23.0 | 4.5 MB | : 100% 1.0/1 [00:01<00:00, 1.10s/it] \n", + "dask-2.3.0 | 4 KB | : 100% 1.0/1 [00:00<00:00, 27.57it/s]\n", + "brotli-1.0.7 | 1.0 MB | : 100% 1.0/1 [00:00<00:00, 5.00it/s] \n", + "nvstrings-0.10.0a | 124 KB | : 100% 1.0/1 [00:00<00:00, 3.47it/s] \n", + "tornado-6.0.3 | 636 KB | : 100% 1.0/1 [00:00<00:00, 4.58it/s] \n", + "pynvml-8.0.2 | 30 KB | : 100% 1.0/1 [00:00<00:00, 21.55it/s]\n", + "libgcc-ng-9.1.0 | 8.1 MB | : 100% 1.0/1 [00:01<00:00, 1.40s/it] \n", + "libcblas-3.8.0 | 10 KB | : 100% 1.0/1 [00:00<00:00, 22.83it/s]\n", + "joblib-0.13.2 | 180 KB | : 100% 1.0/1 [00:00<00:00, 8.76it/s]\n", + "pandas-0.24.2 | 11.1 MB | : 100% 1.0/1 [00:02<00:00, 2.68s/it] \n", + "psutil-5.6.3 | 322 KB | : 100% 1.0/1 [00:00<00:00, 7.88it/s] \n", + "heapdict-1.0.0 | 7 KB | : 100% 1.0/1 
[00:00<00:00, 21.63it/s]\n", + "jpeg-9c | 251 KB | : 100% 1.0/1 [00:00<00:00, 10.08it/s]\n", + "zict-1.0.0 | 10 KB | : 100% 1.0/1 [00:00<00:00, 20.76it/s]\n", + "libprotobuf-3.8.0 | 4.7 MB | : 100% 1.0/1 [00:01<00:00, 1.06s/it] \n", + "packaging-19.0 | 23 KB | : 100% 1.0/1 [00:00<00:00, 20.95it/s]\n", + "xgboost-0.90.rapidsd | 12 KB | : 100% 1.0/1 [00:00<00:00, 2.77it/s] \n", + "cugraph-0.10.0a | 1.3 MB | : 100% 1.0/1 [00:00<00:00, 1.74it/s] \n", + "urllib3-1.25.3 | 187 KB | : 100% 1.0/1 [00:00<00:00, 9.23it/s]\n", + "py-xgboost-0.90.rapi | 87 KB | : 100% 1.0/1 [00:00<00:00, 3.59it/s] \n", + "dask-core-2.3.0 | 574 KB | : 100% 1.0/1 [00:00<00:00, 4.29it/s] \n", + "setuptools-41.2.0 | 634 KB | : 100% 1.0/1 [00:00<00:00, 4.25it/s] \n", + "pysocks-1.7.0 | 26 KB | : 100% 1.0/1 [00:00<00:00, 21.18it/s]\n", + "libcuml-0.10.0a | 29.7 MB | : 100% 1.0/1 [00:07<00:00, 7.44s/it] \n", + "partd-1.0.0 | 16 KB | : 100% 1.0/1 [00:00<00:00, 21.76it/s]\n", + "google-auth-oauthlib | 18 KB | : 100% 1.0/1 [00:00<00:00, 23.67it/s]\n", + "python-dateutil-2.8. 
| 219 KB | : 100% 1.0/1 [00:00<00:00, 11.17it/s]\n", + "xz-5.2.4 | 366 KB | : 100% 1.0/1 [00:00<00:00, 7.94it/s] \n", + "Copying shared object files to /usr/lib\n", + "\n", + "*********************************************\n", + "Your Google Colab instance is RAPIDS ready!\n", + "*********************************************\n" + ], + "name": "stdout" + } + ] }, { "cell_type": "markdown", @@ -132,7 +386,11 @@ "metadata": { "id": "x1dLRTm168Tk", "colab_type": "code", - "colab": {} + "outputId": "e4ee4a4e-64f3-4e87-8b87-472b02f84325", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 958 + } }, "source": [ "# Info on how to get your api key (kaggle.json) here: https://github.com/Kaggle/kaggle-api#api-credentials\n", @@ -152,7 +410,86 @@ "!unzip -q \"/content/properties_2017.csv.zip\"" ], "execution_count": 0, - "outputs": [] + "outputs": [ + { + "output_type": "stream", + "text": [ + "Collecting kaggle\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e9/fc/0de659ea1f2096563204925b6660ae141f3d85bbe9e8a1571c3eb6cc1fdd/kaggle-1.5.5.tar.gz (56kB)\n", + "\u001b[K |████████████████████████████████| 61kB 2.9MB/s \n", + "\u001b[?25hCollecting urllib3<1.25,>=1.21.1 (from kaggle)\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/01/11/525b02e4acc0c747de8b6ccdab376331597c569c42ea66ab0a1dbd36eca2/urllib3-1.24.3-py2.py3-none-any.whl (118kB)\n", + "\u001b[K |████████████████████████████████| 122kB 9.7MB/s \n", + "\u001b[?25hRequirement already satisfied: six>=1.10 in /usr/local/lib/python3.6/site-packages (from kaggle) (1.12.0)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.6/site-packages (from kaggle) (2019.6.16)\n", + "Requirement already satisfied: python-dateutil in /usr/local/lib/python3.6/site-packages (from kaggle) (2.8.0)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.6/site-packages (from kaggle) (2.22.0)\n", + "Collecting tqdm (from kaggle)\n", + "\u001b[?25l 
Downloading https://files.pythonhosted.org/packages/a5/83/06029af22fe06b8a7be013aeae5e104b3ed26867e5d4ca91408b30aa602e/tqdm-4.34.0-py2.py3-none-any.whl (50kB)\n", + "\u001b[K |████████████████████████████████| 51kB 12.9MB/s \n", + "\u001b[?25hCollecting python-slugify (from kaggle)\n", + " Downloading https://files.pythonhosted.org/packages/a2/5d/bd30413c00bbed3945558aca07c55944073e1e30abeee1f06515281f9811/python-slugify-3.0.3.tar.gz\n", + "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/site-packages (from requests->kaggle) (2.8)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/site-packages (from requests->kaggle) (3.0.4)\n", + "Collecting text-unidecode==1.2 (from python-slugify->kaggle)\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/79/42/d717cc2b4520fb09e45b344b1b0b4e81aa672001dd128c180fabc655c341/text_unidecode-1.2-py2.py3-none-any.whl (77kB)\n", + "\u001b[K |████████████████████████████████| 81kB 28.8MB/s \n", + "\u001b[?25hBuilding wheels for collected packages: kaggle, python-slugify\n", + " Building wheel for kaggle (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for kaggle: filename=kaggle-1.5.5-cp36-none-any.whl size=71896 sha256=ee79b8c43069539b819caedf251aae4360d5dd43aec6a5bc2734275442177e60\n", + " Stored in directory: /root/.cache/pip/wheels/db/6a/80/6cd1892eb9b9b136333db3c74e16cba4e17e2c700f51541f06\n", + " Building wheel for python-slugify (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for python-slugify: filename=python_slugify-3.0.3-py2.py3-none-any.whl size=4789 sha256=a8f8df8b4a56a8db4fc841f6b6ff5f89a9a3c7e641ff4fc8c41d5e7a5c1ec087\n", + " Stored in directory: /root/.cache/pip/wheels/0f/96/ca/85f5b01165975402d1e37f8dd346df00dc39be1d0761bd17bb\n", + "Successfully built kaggle python-slugify\n", + "Installing collected packages: urllib3, tqdm, text-unidecode, python-slugify, kaggle\n", + " Found existing installation: urllib3 1.25.3\n", + " Uninstalling urllib3-1.25.3:\n", + " Successfully uninstalled urllib3-1.25.3\n", + "Successfully installed kaggle-1.5.5 python-slugify-3.0.3 text-unidecode-1.2 tqdm-4.34.0 urllib3-1.24.3\n" + ], + "name": "stdout" + }, + { + "output_type": "display_data", + "data": { + "application/vnd.colab-display-data+json": { + "pip_warning": { + "packages": [ + "urllib3" + ] + } + } + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "stream", + "text": [ + "Downloading sample_submission.csv.zip to /content\n", + " 91% 9.00M/9.86M [00:00<00:00, 17.1MB/s]\n", + "100% 9.86M/9.86M [00:00<00:00, 22.0MB/s]\n", + "Downloading properties_2016.csv.zip to /content\n", + " 98% 156M/159M [00:01<00:00, 103MB/s] \n", + "100% 159M/159M [00:01<00:00, 92.1MB/s]\n", + "Downloading zillow_data_dictionary.xlsx.zip to /content\n", + " 0% 0.00/15.7k [00:00\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m train2016 = cudf.read_csv('/content/train_2016_v2.csv',\n\u001b[0;32m----> 2\u001b[0;31m parse_dates=[\"transactiondate\"])\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;31m# peek display 2016 train\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain2016\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - 
"\u001b[0;31mTypeError\u001b[0m: read_csv() got an unexpected keyword argument 'parse_dates'" - ] + "output_type": "stream", + "text": [ + " parcelid logerror transactiondate\n", + "0 11016594 0.0276 2016-01-01\n", + "1 14366692 -0.1684 2016-01-01\n", + "2 12098116 -0.0040 2016-01-01\n", + "3 12643413 0.0218 2016-01-02\n", + "4 14432541 -0.0050 2016-01-02\n" + ], + "name": "stdout" } ] }, @@ -287,10 +626,10 @@ "metadata": { "id": "2EfApIzCfEtr", "colab_type": "code", - "outputId": "eabb1351-f4f9-499c-9aea-2fa2953c11a7", + "outputId": "7e91f5f7-7b76-410a-b700-0380b29bd982", "colab": { "base_uri": "https://localhost:8080/", - "height": 146 + "height": 151 } }, "source": [ @@ -304,13 +643,14 @@ { "output_type": "stream", "text": [ - " parcelid airconditioningtypeid architecturalstyletypeid basementsqft bathroomcnt bedroomcnt buildingclasstypeid ... censustractandblock\n", - "0 10754147 0.0 0.0 ... \n", - "1 10759547 0.0 0.0 ... \n", - "2 10843547 0.0 0.0 ... \n", - "3 10859147 0.0 0.0 3.0 ... \n", - "4 10879947 0.0 0.0 4.0 ... \n", - "[50 more columns]\n" + " parcelid airconditioningtypeid ... taxdelinquencyyear censustractandblock\n", + "0 10754147 null ... null null\n", + "1 10759547 null ... null null\n", + "2 10843547 null ... null null\n", + "3 10859147 null ... null null\n", + "4 10879947 null ... null null\n", + "\n", + "[5 rows x 58 columns]\n" ], "name": "stdout" } @@ -339,10 +679,10 @@ "metadata": { "id": "o4CvSIcwm4B2", "colab_type": "code", - "outputId": "6db5ec53-8522-4483-e2fa-d79d9d9d75e8", + "outputId": "327cc4dd-bad3-40f2-9d09-41105b532abb", "colab": { "base_uri": "https://localhost:8080/", - "height": 146 + "height": 151 } }, "source": [ @@ -398,13 +738,14 @@ { "output_type": "stream", "text": [ - " parcelid logerror transactiondate ac_id architecturalstyletypeid basement_sqft total_bath ... transaction_month\n", - "0 11827818 0.0402 2016-03-15T00:00:00.000 4.0 ... 3\n", - "1 12123024 0.0296 2016-03-15T00:00:00.000 3.0 ... 
3\n", - "2 13867327 0.0344 2016-03-15T00:00:00.000 2.0 ... 3\n", - "3 12681894 0.006 2016-03-15T00:00:00.000 3.0 ... 3\n", - "4 12848541 0.06949999999999999 2016-03-15T00:00:00.000 1.0 4.0 ... 3\n", - "[53 more columns]\n" + " parcelid logerror ... censustractandblock transaction_month\n", + "0 11827818 0.0402 ... 6.037532e+13 3\n", + "1 12123024 0.0296 ... 6.037463e+13 3\n", + "2 13867327 0.0344 ... 6.059011e+13 3\n", + "3 12681894 0.0060 ... 6.037651e+13 3\n", + "4 12848541 0.0695 ... 6.037409e+13 3\n", + "\n", + "[5 rows x 61 columns]\n" ], "name": "stdout" } @@ -463,35 +804,32 @@ "metadata": { "id": "B3-1V93smA9A", "colab_type": "code", - "outputId": "66d7335e-bc42-4108-a1c1-80f1afb06a4b", + "outputId": "28a73c5c-abf2-4325-a575-b654c9ddd9f4", "colab": { "base_uri": "https://localhost:8080/", - "height": 380 + "height": 67 } }, "source": [ - "# when poolcnt=1 and has_hottub_or_spa=1 and just_hottub_or_spa is null, then just_hottub_or_spa =0\n", + "# if poolcnt=1 and has_hottub_or_spa=1 and just_hottub_or_spa is null\n", "conditions = ((df_train['pool_count'] == 1) \n", " & (df_train['has_hottub_or_spa'] == 1) \n", " & (df_train['just_hottub_or_spa'].isna() == True))\n", - "df_train['just_hottub_or_spa'] = df_train['just_hottub_or_spa'].masked_assign(0, conditions) " + "# then just_hottub_or_spa = 0\n", + "df_train.just_hottub_or_spa.loc[conditions] = 0\n", + "\n", + "print(df_train.just_hottub_or_spa.value_counts())" ], "execution_count": 0, "outputs": [ { - "output_type": "error", - "ename": "TypeError", - "evalue": "ignored", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m conditions = ((df_train['pool_count'] == 1) \n\u001b[1;32m 2\u001b[0m \u001b[0;34m&\u001b[0m 
\u001b[0;34m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'has_hottub_or_spa'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m & (df_train['just_hottub_or_spa'].isna() == True))\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'just_hottub_or_spa'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'just_hottub_or_spa'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmasked_assign\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconditions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/series.py\u001b[0m in \u001b[0;36m__eq__\u001b[0;34m(self, other)\u001b[0m\n\u001b[1;32m 811\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 812\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__eq__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mother\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 813\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_unordered_compare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mother\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'eq'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 814\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 815\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mequals\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mother\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - 
"\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/series.py\u001b[0m in \u001b[0;36m_unordered_compare\u001b[0;34m(self, other, cmpops)\u001b[0m\n\u001b[1;32m 781\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_unordered_compare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mother\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcmpops\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 782\u001b[0m \u001b[0mnvtx_range_push\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"CUDF_UNORDERED_COMP\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"orange\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 783\u001b[0;31m \u001b[0mother\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_normalize_binop_value\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mother\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 784\u001b[0m \u001b[0moutcol\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munordered_compare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcmpops\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mother\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 785\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_copy_construct\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutcol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/series.py\u001b[0m in \u001b[0;36m_normalize_binop_value\u001b[0;34m(self, other)\u001b[0m\n\u001b[1;32m 777\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mother\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 778\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 779\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnormalize_binop_value\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mother\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 780\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 781\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_unordered_compare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mother\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcmpops\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/string.py\u001b[0m in \u001b[0;36mnormalize_binop_value\u001b[0;34m(self, other)\u001b[0m\n\u001b[1;32m 703\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 704\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 705\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'cannot broadcast 
{}'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mother\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 706\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 707\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdefault_na_value\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mTypeError\u001b[0m: cannot broadcast " - ] + "output_type": "stream", + "text": [ + "0.0 1204\n", + "1.0 1161\n", + "Name: just_hottub_or_spa, dtype: int32\n" + ], + "name": "stdout" } ] }, @@ -502,6 +840,7 @@ "colab_type": "text" }, "source": [ + "\n", "- when `has_hottub_or_spa` is null and `just_hottub_or_spa` is null\n", " - both should be zero\n" ] @@ -518,13 +857,13 @@ "conditions = ((df_train['has_hottub_or_spa'].isna() == True) \n", " & (df_train['just_hottub_or_spa'].isna() == True))\n", "# just hottub or spa = 0 \n", - "df_train['just_hottub_or_spa'] = df_train['just_hottub_or_spa'].masked_assign(0, conditions) \n", + "df_train.just_hottub_or_spa.loc[conditions] = 0\n", "\n", "# now, if has hottub is null and just hottub is 0 \n", "conditions = ((df_train['has_hottub_or_spa'].isna() == True) \n", " & (df_train['just_hottub_or_spa'] == 0))\n", "# has hottub or spa = 0 \n", - "df_train['has_hottub_or_spa'] = df_train['has_hottub_or_spa'].masked_assign(0, conditions) " + "df_train.has_hottub_or_spa.loc[conditions] = 0" ], "execution_count": 0, "outputs": [] @@ -546,17 +885,33 @@ "metadata": { "id": "FBgs7zJm3qk-", "colab_type": "code", - "colab": {} + "outputId": "3c3935ec-9d5e-4806-c701-1191f563ccdd", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 67 + } }, "source": [ "# when poolcnt=0, has_hottub_or_spa=1\n", "conditions = ((df_train['pool_count'] == 0) \n", " & 
(df_train['has_hottub_or_spa'] == 1))\n", "# just_hottub_or_spa=1\n", - "df_train['just_hottub_or_spa'] = df_train['just_hottub_or_spa'].masked_assign(1, conditions) \n" + "df_train.just_hottub_or_spa.loc[conditions] = 1\n", + "\n", + "print(df_train.just_hottub_or_spa.value_counts())" ], "execution_count": 0, - "outputs": [] + "outputs": [ + { + "output_type": "stream", + "text": [ + "0.0 89114\n", + "1.0 1161\n", + "Name: just_hottub_or_spa, dtype: int32\n" + ], + "name": "stdout" + } + ] }, { "cell_type": "markdown", @@ -579,7 +934,7 @@ "# where there is no pool\n", "conditions = df_train['pool_count']==0\n", "# square footage of non existant pool is 0 \n", - "df_train['pool_sqft'] = df_train['pool_sqft'].masked_assign(0, conditions)" + "df_train.pool_sqft.loc[conditions] = 0" ], "execution_count": 0, "outputs": [] @@ -606,7 +961,7 @@ "# where there is no basement\n", "conditions = df_train['basement_flag'] == 0\n", "# fun fact: we just did this with the pool\n", - "df_train['basement_sqft'] = df_train['basement_sqft'].masked_assign(0, conditions) " + "df_train.basement_sqft.loc[conditions] = 0" ], "execution_count": 0, "outputs": [] @@ -630,14 +985,27 @@ "metadata": { "id": "OZM6lXmmpj5k", "colab_type": "code", - "colab": {} + "outputId": "1d5124b4-31fa-43ae-ae0b-712ac79fde3b", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 50 + } }, "source": [ "print(f\"there are {df_train['fireplace_count'].isna().sum()} fireplace_count \\\n", "nulls\\nthere are {df_train['fireplaceflag'].isna().sum()} fireplaceflag nulls\")" ], "execution_count": 0, - "outputs": [] + "outputs": [ + { + "output_type": "stream", + "text": [ + "there are 80668 fireplace_count nulls\n", + "there are 90053 fireplaceflag nulls\n" + ], + "name": "stdout" + } + ] }, { "cell_type": "markdown", @@ -657,25 +1025,41 @@ "metadata": { "id": "i3YRZgU_qZhA", "colab_type": "code", - "colab": {} + "outputId": "a6231c9e-37cd-4766-9743-c85f3aa61654", + "colab": { + "base_uri": 
"https://localhost:8080/", + "height": 50 + } }, "source": [ "# null flags with null counts are zero\n", "conditions = ((df_train['fireplace_count'].isna()==True) \n", " & (df_train['fireplaceflag'].isna()==True))\n", - "df_train['fireplaceflag'] = df_train['fireplaceflag'].masked_assign(False, conditions)\n", + "df_train.fireplaceflag.loc[conditions] = False\n", "\n", "# true flags for positive fireplace counts\n", "conditions = df_train['fireplace_count'] > 0\n", - "df_train['fireplaceflag'] = df_train['fireplaceflag'].masked_assign(True, conditions)\n", + "df_train.fireplaceflag.loc[conditions] = True\n", "\n", "# set fireplace count nulls to 0 where false flags are\n", "conditions = ((df_train['fireplace_count'].isna()==True) \n", " & (df_train['fireplaceflag']==False))\n", - "df_train['fireplace_count'] = df_train['fireplace_count'].masked_assign(0, conditions)" + "df_train.fireplace_count.loc[conditions] = 0\n", + "\n", + "print(f\"there are {df_train['fireplace_count'].isna().sum()} fireplace_count \\\n", + "nulls\\nthere are {df_train['fireplaceflag'].isna().sum()} fireplaceflag nulls\")" ], "execution_count": 0, - "outputs": [] + "outputs": [ + { + "output_type": "stream", + "text": [ + "there are 222 fireplace_count nulls\n", + "there are 0 fireplaceflag nulls\n" + ], + "name": "stdout" + } + ] }, { "cell_type": "markdown", @@ -697,11 +1081,11 @@ }, "source": [ "garage = ['garagecarcnt', 'garage_sqft']\n", - "# where garage car count and garage square feet are null, set both to 0\n", + "# where garage car count and garage square feet are null\n", "conditions = ((df_train['garagecarcnt'].isna()==True) \n", " & (df_train['garage_sqft'].isna()==True))\n", - "for i in garage:\n", - " df_train[i] = df_train[i].masked_assign(0, conditions)" + "# set both to 0\n", + "df_train[garage].loc[conditions] = 0" ], "execution_count": 0, "outputs": [] @@ -722,15 +1106,32 @@ "metadata": { "id": "gbbUIbwJ-ouS", "colab_type": "code", - "colab": {} + "outputId": 
"115cac03-580c-477e-b5c3-0d191c333b2d", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 118 + } }, "source": [ "# show rows where garage count and square feet don't add up\n", "conditions = (df_train.garagecarcnt > 0) & (df_train.garage_sqft == 0)\n", - "print(df_train.loc[conditions][garage])" + "print(df_train.loc[conditions][garage].head())" ], "execution_count": 0, - "outputs": [] + "outputs": [ + { + "output_type": "stream", + "text": [ + " garagecarcnt garage_sqft\n", + "16 2.0 0.0\n", + "29 1.0 0.0\n", + "32 1.0 0.0\n", + "35 1.0 0.0\n", + "36 2.0 0.0\n" + ], + "name": "stdout" + } + ] }, { "cell_type": "markdown", @@ -754,7 +1155,7 @@ "# where garage count and square feet don't add up\n", "conditions = (df_train.garagecarcnt>0) & (df_train.garage_sqft==0)\n", "# insert a NaN value\n", - "df_train['garage_sqft'] = df_train['garage_sqft'].masked_assign(cupy.nan, conditions)" + "df_train.garage_sqft.loc[conditions] = np.nan" ], "execution_count": 0, "outputs": [] @@ -787,14 +1188,13 @@ "\n", "# if full_bath is null & half_bath is null\n", "conditions = ((df_train['full_bath'].isnull()==True) \n", - " & (df_train['half_bath'].isnull()==True))\n", + " & (df_train['half_bath'].isnull()==True) \n", + " & (df_train['total_bath']==0))\n", "# total_bath=0\n", - "df_train['total_bath'] = df_train['total_bath'].masked_assign(0, conditions)\n", + "df_train.total_bath.loc[conditions] = np.nan\n", "\n", - "# when full_bath==total_bath\n", - "conditions = df_train.full_bath == df_train.total_bath\n", - "# half_bath=0 \n", - "df_train['half_bath'] = df_train['half_bath'].masked_assign(0, conditions)" + "# when full_bath==total_bath, half_bath=0 \n", + "df_train.half_bath.loc[df_train.full_bath == df_train.total_bath] = 0" ], "execution_count": 0, "outputs": [] @@ -821,8 +1221,8 @@ "colab": {} }, "source": [ - "df_train['latitude'] = [lat/100000 for lat in df_train['latitude']]\n", - "df_train['longitude'] = [long/100000 for long in df_train['longitude']]" 
+ "df_train['latitude'] = df_train.latitude / 100000\n", + "df_train['longitude'] = df_train.longitude / 100000" ], "execution_count": 0, "outputs": [] @@ -844,12 +1244,15 @@ "metadata": { "id": "yHZH4rMNLfBA", "colab_type": "code", - "colab": {} + "outputId": "6ba5f661-caa5-44b8-b492-b9f5708181db", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 202 + } }, "source": [ "# where room count is 0, go ahead and NaN it\n", - "conditions = df_train['roomcnt'] == 0\n", - "df_train['roomcnt'] = df_train['roomcnt'].masked_assign(cupy.nan, conditions)\n", + "df_train.roomcnt.loc[df_train['roomcnt'] == 0] = np.nan\n", "\n", "\"\"\"\n", "propertylandusetypeid & unitcnt are related \n", @@ -876,29 +1279,50 @@ "for one in ones:\n", " # adjust conditions to one unit indicator\n", " conditions = ((df_train['propertylandusetypeid'] == one) \n", - " & (df_train['unitcnt'].isnull()))\n", - " df_train['unitcnt'] = df_train['unitcnt'].masked_assign(1, conditions)\n", + " & (df_train['unitcnt'].isna()))\n", + " df_train.unitcnt.loc[conditions] = 1\n", "\n", "# two units \n", "twos = [31,47,246]\n", "for two in twos:\n", " # adjust conditions to two unit indicator\n", " conditions = ((df_train['propertylandusetypeid'] == two) \n", - " & (df_train['unitcnt'].isnull()))\n", - " df_train['unitcnt'] = df_train['unitcnt'].masked_assign(2, conditions)\n", + " & (df_train['unitcnt'].isna()))\n", + " df_train.unitcnt.loc[conditions] = 2\n", "\n", "# three units\n", "conditions = ((df_train['propertylandusetypeid'] == 247) \n", - " & (df_train['unitcnt'].isnull()))\n", - "df_train['unitcnt'] = df_train['unitcnt'].masked_assign(3, conditions)\n", + " & (df_train['unitcnt'].isna()))\n", + "df_train.unitcnt.loc[conditions] = 3\n", "\n", "# four units\n", "conditions = ((df_train['propertylandusetypeid'] == 248) \n", - " & (df_train['unitcnt'].isnull()))\n", - "df_train['unitcnt'] = df_train['unitcnt'].masked_assign(4, conditions)" + " & (df_train['unitcnt'].isna()))\n", + 
"df_train.unitcnt.loc[conditions] = 4\n", + "\n", + "# let's see how out unit counts look\n", + "print(df_train.unitcnt.value_counts())" ], "execution_count": 0, - "outputs": [] + "outputs": [ + { + "output_type": "stream", + "text": [ + "1.0 86035\n", + "2.0 2372\n", + "4.0 884\n", + "3.0 622\n", + "5.0 1\n", + "6.0 1\n", + "9.0 1\n", + "11.0 1\n", + "70.0 1\n", + "143.0 1\n", + "Name: unitcnt, dtype: int32\n" + ], + "name": "stdout" + } + ] }, { "cell_type": "markdown", @@ -972,7 +1396,7 @@ "metadata": { "id": "8lYcO_T5XKNN", "colab_type": "code", - "outputId": "0b77457e-0eed-4e21-be79-1df380432abc", + "outputId": "2440dccb-bc7d-459c-ae1a-cc31388be45e", "colab": { "base_uri": "https://localhost:8080/", "height": 303 @@ -991,9 +1415,9 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'taxdelinquencyflag'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/series.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, value, method, axis, inplace, limit)\u001b[0m\n\u001b[1;32m 1135\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mNotImplementedError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"The axis keyword is not supported\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1136\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1137\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minplace\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1138\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1139\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/string.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, fill_value, inplace)\u001b[0m\n\u001b[1;32m 709\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfill_value\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStringColumn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 710\u001b[0m ):\n\u001b[0;32m--> 711\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"fill_value must be a string or a string series\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 712\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 713\u001b[0m \u001b[0;31m# replace fill_value with nvstrings\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'taxdelinquencyflag'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + 
"\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/series.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, value, method, axis, inplace, limit)\u001b[0m\n\u001b[1;32m 1141\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mNotImplementedError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"The axis keyword is not supported\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1142\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1143\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minplace\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1144\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1145\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/string.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, fill_value, inplace)\u001b[0m\n\u001b[1;32m 717\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfill_value\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStringColumn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 718\u001b[0m ):\n\u001b[0;32m--> 719\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"fill_value must be a string or a string series\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 720\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 721\u001b[0m \u001b[0;31m# replace 
fill_value with nvstrings\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mTypeError\u001b[0m: fill_value must be a string or a string series" ] } @@ -1025,7 +1449,11 @@ "metadata": { "id": "Svp6J0cJ5dL0", "colab_type": "code", - "colab": {} + "outputId": "352d2f36-658f-4698-bfdb-5c748b67f0d7", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 67 + } }, "source": [ "# if bool 'Y'/None is already set, change string to int bool column via .isna()\n", @@ -1044,7 +1472,17 @@ "print(df_train['taxdelinquencyflag'].value_counts())" ], "execution_count": 0, - "outputs": [] + "outputs": [ + { + "output_type": "stream", + "text": [ + "0 88492\n", + "1 1783\n", + "Name: taxdelinquencyflag, dtype: int32\n" + ], + "name": "stdout" + } + ] }, { "cell_type": "markdown", @@ -1058,30 +1496,84 @@ " - to 2016 - yyyy \n" ] }, + { + "cell_type": "code", + "metadata": { + "id": "lHh95mAIMrMy", + "colab_type": "code", + "outputId": "244b62b2-299c-4440-83d2-b5545712ba3e", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 67 + } + }, + "source": [ + "print(df_train.taxdelinquencyflag.value_counts())" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "0 88492\n", + "1 1783\n", + "Name: taxdelinquencyflag, dtype: int32\n" + ], + "name": "stdout" + } + ] + }, { "cell_type": "code", "metadata": { "id": "6Bic66I9LfGC", "colab_type": "code", - "colab": {} + "outputId": "4311fb13-6d49-44e1-83ef-73e27d4720c4", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 235 + } }, "source": [ - "# set year paris -- e.g. 
from 5 to 2016 - 2005\n", - "year_pairs = [(99,2016-1999),(6,2016-2006),(7,2016-2007),(8,2016-2008),\n", - " (9,2016-2009),(10,2016-2010),(11,2016-2011),(12,2016-2012),\n", - " (13,2016-2013),(14,2016-2014),(15,2016-2015)]\n", - "# go though year pairs\n", - "for pair in year_pairs:\n", - " # tag old value and new value it will be replaced with\n", - " old, new = pair\n", - " # replace old value with new value\n", - " df_train['taxdelinquencyyear'] = df_train['taxdelinquencyyear'].replace(old, \n", - " new)\n", - "# what're we lookin at?\n", - "print(df_train['taxdelinquencyyear'].value_counts())" + "# no delinquency? set year to 0\n", + "df_train.taxdelinquencyyear.loc[df_train.taxdelinquencyflag == 0] = 0\n", + "# collect x and xx formatted delinquency years w/ matching xxxx year format pair\n", + "year_pairs = [(99,1999), (6,2006), (7,2007), (8,2008), (9,2009), (10,2010),\n", + " (11,2011), (12,2012), (13,2013), (14,2014), (15,2015)]\n", + "# go through the pairs individually \n", + "for year in year_pairs:\n", + " # split the pair in question \n", + " old, new = year\n", + " # replace old year (e.g. 99) with new year (e.g. 1999)\n", + " df_train.taxdelinquencyyear.loc[df_train.taxdelinquencyyear == old] = new\n", + "\n", + "# adjust delinquency year relative to training year (2016) \n", + "df_train.taxdelinquencyyear.loc[df_train.taxdelinquencyyear>0] = 2016 - df_train.taxdelinquencyyear.loc[df_train.taxdelinquencyyear>0]\n", + "# what've we got? 
\n", + "print(df_train.taxdelinquencyyear.value_counts())" ], "execution_count": 0, - "outputs": [] + "outputs": [ + { + "output_type": "stream", + "text": [ + "0.0 88492\n", + "2.0 628\n", + "1.0 518\n", + "3.0 210\n", + "4.0 154\n", + "6.0 89\n", + "5.0 85\n", + "7.0 63\n", + "8.0 24\n", + "9.0 8\n", + "10.0 3\n", + "17.0 1\n", + "Name: taxdelinquencyyear, dtype: int32\n" + ], + "name": "stdout" + } + ] }, { "cell_type": "markdown", @@ -1099,12 +1591,14 @@ { "cell_type": "code", "metadata": { - "id": "b3sh8aGovTLT", + "id": "AWx7lq0xkDV2", "colab_type": "code", "colab": {} }, "source": [ - "print(df_train['rawcensustractandblock'].head())" + "# make a copy of dataframe at this point\n", + "# safe = df_train.copy()\n", + "df_train = safe.copy()" ], "execution_count": 0, "outputs": [] @@ -1112,39 +1606,106 @@ { "cell_type": "code", "metadata": { - "id": "AJrFMIuvvqUr", + "id": "Sg0eN-K1QdZy", "colab_type": "code", - "colab": {} + "outputId": "0e6ca58c-3b13-4c9e-c902-4d8a9c98a855", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 474 + } }, "source": [ - "# using series instead of dataframe\n", - "tractnumber = s_rawcensustractandblock.values_to_string()\n", - "# adjust tract number\n", - "for i in range(len(cudf_tractnumber)):\n", - " funct = slice(4,11)\n", - " tractnumber[i] = tractnumber[i][funct]\n", - "# set new tract number column\n", - "df_train['census_tractnumber'] = census_tractnumber\n", + "# copy rawcensustractandblock with values as string instead of float\n", + "string_data = cudf.Series(df_train['rawcensustractandblock'].values_to_string())\n", + "\n", + "# print(type(string_data))\n", + "# print(len(string_data))\n", + "# print(string_data)\n", + "\n", + "\"\"\"\n", + "CURRENT ERROR IN CONVERSION OF VALUES\n", + "\"\"\"\n", + "print(f\"\\nNOTE: THERE APPEARS TO BE AN ERROR WHEN CONVERTING TO STRING\\n\"\n", + " f\" > somewhat random numbers added to end of some values\\n >> e.g. 
004, 006\"\n", + " f\"\\n\\n\\ndf_train['rawcensustractandblock'].head(10).values\\n\"\n", + " f\"{df_train['rawcensustractandblock'].head(10).values}\\n\\n\"\n", + " f\"data.head(10).values\\n{string_data.head(10).values}\\n\\n\\n\"\n", + " f\"THE SAME NUMBERS OCCOUR IN THE FIRST WHEN PUT INTO A LIST\\n\"\n", + " f\" > not sure how to deal with this now\\n\"\n", + " f\" >> difficult to reproduce without data\\n\\n\")\n", + "\"\"\"\n", + "CURRENT ERROR IN CONVERSION OF VALUES\n", + "\"\"\"\n", + "\n", + "# set new tract number \n", + "df_train['census_tractnumber'] = string_data.str.slice(4, 11)\n", "\n", - "# using series instead of dataframe\n", - "block_number = s_rawcensustractandblock.values_to_string()\n", "# set/adjust block number\n", - "for i in range(len(block_number)):\n", - " funct = slice(11, None)\n", - " block_number[i] = block_number[i][funct]\n", - " block_number[i] = block_number[i][:4]+'.'+block_number[i][4:]+'0'\n", - " block_number[i] = int(round(float(block_number[i]), 0))\n", - " block_number[i] = str(block_number[i]).ljust(4,'0')\n", - "# add block number column to dataframe\n", - "df_train['block_number'] = block_number\n", - "\n", - "# rawcensustractandblock values have been converted\n", - "df_train = df_train.drop('rawcensustractandblock', axis=1)\n", - "# let's see what we've got\n", - "print(df_train[['census_tractnumber', 'block_number']].head(3))" + "df_train['block_number'] = string_data.str.slice(11)\n", + "df_train['block_number'] = df_train.block_number.str.slice(0,4).str.cat(df_train.block_number.str.slice(4), '.')\n", + "df_train['block_number'] = df_train.block_number.astype('float').round(0).astype('int')\n", + "df_train['block_number'] = df_train.block_number.astype('str').str.ljust(4, '0')\n", + "\n", + "# drop raw census tract and block column, no longer needed\n", + "df_train=df_train.drop('rawcensustractandblock', axis=1)\n", + "\n", + "\"\"\"\n", + "CORRECT NUMBERS THAT SHOULD BE DISPLAYED BY BELOW PRINT STATEMENT\n", + 
" > currently not being seen due to prior mentioned error\n", + "\n", + "tractnumber\n", + "0 1066.46\n", + "1 0524.22\n", + "2 4638.00\n", + "3 2963.00\n", + "4 0423.38\n", + "dtype: object\n", + "\n", + "blocknumber\n", + "0 1001\n", + "1 2024\n", + "2 3004\n", + "3 2002\n", + "4 1006\n", + "dtype: object\n", + "\"\"\"\n", + "print(df_train[['census_tractnumber', 'block_number']].head())" ], "execution_count": 0, - "outputs": [] + "outputs": [ + { + "output_type": "stream", + "text": [ + "\n", + "NOTE: THERE APPEARS TO BE AN ERROR WHEN CONVERTING TO STRING\n", + " > somewhat random numbers added to end of some values\n", + " >> e.g. 004, 006\n", + "\n", + "\n", + "df_train['rawcensustractandblock'].head(10).values\n", + "[60375315.031013 60374625.001017 60590114.012017 60376513.02100401\n", + " 60374087.031018 60375759.011001 60590630.044 60374061.011006\n", + " 60378001.022007 60590524.19100901]\n", + "\n", + "data.head(10).values\n", + "['60375315.031013004', '60374625.001017004', '60590114.012017', '60376513.021004006', '60374087.031018004', '60375759.011001', '60590630.044', '60374061.011006', '60378001.022007', '60590524.19100901']\n", + "\n", + "\n", + "THE SAME NUMBERS OCCOUR IN THE FIRST WHEN PUT INTO A LIST\n", + " > not sure how to deal with this now\n", + " >> difficult to reproduce without data\n", + "\n", + "\n", + " census_tractnumber block_number\n", + "0 5315.03 1013\n", + "1 4625.00 1017\n", + "2 0114.01 2017\n", + "3 6513.02 1004\n", + "4 4087.03 1018\n" + ], + "name": "stdout" + } + ] }, { "cell_type": "markdown", @@ -1165,7 +1726,11 @@ "metadata": { "id": "xhCosNpXvTVU", "colab_type": "code", - "colab": {} + "outputId": "b8ca9fb3-6c67-4466-d7cc-98ff52504659", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 84 + } }, "source": [ "# calculate null value % for each column & frame it\n", @@ -1185,7 +1750,18 @@ "df_train = df_train.drop(missingvaluescols['field'], axis=1)" ], "execution_count": 0, - "outputs": [] + "outputs": [ + 
{ + "output_type": "stream", + "text": [ + " field percentage\n", + "7 buildingclasstypeid 0.999823\n", + "3 architecturalstyletypeid 0.997109\n", + "33 typeconstructiontypeid 0.996688\n" + ], + "name": "stdout" + } + ] }, { "cell_type": "markdown", @@ -1216,92 +1792,50 @@ "metadata": { "id": "yB2lzAyopS_S", "colab_type": "code", - "colab": {} - }, - "source": [ - "# highly related propertylandusetypeid\n", - "conditions = df_train['propertylandusetypeid'] == 265\n", - "# unitcnt 360\n", - "df_train['unitcnt'] = df_train['unitcnt'].masked_assign(10, conditions)\n", - "# let's see what we've got\n", - "print(df_train['unitcnt'].value_counts())" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ofZIC0EdKJ0Y", - "colab_type": "text" - }, - "source": [ - "# -----current: test ready-----" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "c8Zfn-YhlSBO", - "colab_type": "code", - "outputId": "2087fa66-8683-4040-a3e1-7654942367b7", + "outputId": "2860febf-c7ad-4823-d170-2633c4be8ae5", "colab": { "base_uri": "https://localhost:8080/", - "height": 34 + "height": 218 } }, "source": [ - "poolsizesum_mean = df_train.loc[df_train['pool_count'] > 0].pool_sqft.mean()\n", - "\"\"\"\n", - "NEEDS TO BE CONFIRMED WITH OG\n", - "> is this supposed to only consider if pool_sqft > 0 as well?\n", - "\"\"\"\n", - "poolsizesum_mean" + "# highly related propertylandusetypeid\n", + "df_train['unitcnt'].loc[df_train['propertylandusetypeid'] == 265] = 10\n", + "# let's see what we've got\n", + "print(df_train['unitcnt'].value_counts())" ], "execution_count": 0, "outputs": [ { - "output_type": "execute_result", - "data": { - "text/plain": [ - "28.13881906038769" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 86 + "output_type": "stream", + "text": [ + "1.0 86035\n", + "2.0 2372\n", + "4.0 884\n", + "3.0 622\n", + "10.0 356\n", + "5.0 1\n", + "6.0 1\n", + "9.0 1\n", + "11.0 1\n", + "70.0 1\n", + "143.0 1\n", + 
"Name: unitcnt, dtype: int32\n" + ], + "name": "stdout" } ] }, { - "cell_type": "code", + "cell_type": "markdown", "metadata": { - "id": "cA30ozCWo5x3", - "colab_type": "code", - "outputId": "fda7011f-6bee-4b60-e137-ec04d05e440b", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 118 - } + "id": "iR1rBlz-dOdH", + "colab_type": "text" }, "source": [ - "print(df_train.loc[df_train['pool_count'] > 0].pool_sqft.head())" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - "5 0.0\n", - "8 0.0\n", - "11 0.0\n", - "13 0.0\n", - "23 0.0\n", - "Name: pool_sqft, dtype: float64\n" - ], - "name": "stdout" - } + "- a number of pool sizes are null despite there being a pool\n", + " - let's calculate the average pool size\n", + " - and assume those null values are pools of average size" ] }, { @@ -1309,35 +1843,23 @@ "metadata": { "id": "-icFDeLSoJwl", "colab_type": "code", - "outputId": "9c5035bd-b766-4509-c5a8-f3a475093dd4", + "outputId": "5ea8e799-3105-4601-82d4-54bd00c5056b", "colab": { "base_uri": "https://localhost:8080/", - "height": 907 + "height": 34 } }, "source": [ - "print(df_train.loc[df_train.pool_count>0].pool_sqft.value_counts())\n", - "print(df_train.pool_sqft.value_counts())\n", - "print(df_train.loc[df_train.pool_count>0].pool_sqft.isna().sum())\n", - "print(df_train.pool_sqft.isna().sum())\n", - "\n", - "\n", - "\n", "# calculate the average pool square footage for properties with a pool(s)\n", - "new_value = df_train.loc[df_train['pool_count'] > 0, 'pool_sqft'].mean()\n", + "poolsizesum_mean = df_train.pool_sqft.loc[df_train['pool_count'] > 0].mean()\n", "\n", "# where the property has a pool(s) but pool square feet is 0\n", "conditions = ((df_train['pool_count'] > 0) \n", - " & (df_train['pool_sqft'] == 0))\n", + " & (df_train['pool_sqft'].isna()==True))\n", "\n", "# set pool square feet to the average pool square footage of pool properties\n", - "df_train['pool_sqft'] = 
df_train['pool_sqft'].masked_assign(new_value, conditions)\n", - "\n", + "df_train['pool_sqft'].loc[conditions] = poolsizesum_mean\n", "\n", - "print(df_train.loc[df_train.pool_count>0].pool_sqft.value_counts())\n", - "print(df_train.pool_sqft.value_counts())\n", - "print()\n", - "print(df_train.loc[df_train.pool_count>0].pool_sqft.isna().sum())\n", "print(df_train.pool_sqft.isna().sum())" ], "execution_count": 0, @@ -1345,191 +1867,70 @@ { "output_type": "stream", "text": [ - "0.0 16932\n", - "450.0 105\n", - "400.0 41\n", - "800.0 39\n", - "500.0 36\n", - "600.0 35\n", - "512.0 30\n", - "480.0 27\n", - "648.0 18\n", - "420.0 17\n", - "[264 more rows]\n", - "dtype: int64\n", - "0.0 89306\n", - "450.0 105\n", - "400.0 41\n", - "800.0 39\n", - "500.0 36\n", - "600.0 35\n", - "512.0 30\n", - "480.0 27\n", - "648.0 18\n", - "420.0 17\n", - "[264 more rows]\n", - "dtype: int64\n", - "0\n", - "0\n", - "28.13881906038769 16932\n", - "450.0 105\n", - "400.0 41\n", - "800.0 39\n", - "500.0 36\n", - "600.0 35\n", - "512.0 30\n", - "480.0 27\n", - "648.0 18\n", - "420.0 17\n", - "[264 more rows]\n", - "dtype: int64\n", - "0.0 72374\n", - "28.13881906038769 16932\n", - "450.0 105\n", - "400.0 41\n", - "800.0 39\n", - "500.0 36\n", - "600.0 35\n", - "512.0 30\n", - "480.0 27\n", - "648.0 18\n", - "[265 more rows]\n", - "dtype: int64\n", - "\n", - "0\n", "0\n" ], "name": "stdout" } ] }, + { + "cell_type": "markdown", + "metadata": { + "id": "AyGeXJfEmJBU", + "colab_type": "text" + }, + "source": [ + "- total parcel tax\n", + "- structure tax\n", + "- land tax" + ] + }, { "cell_type": "code", "metadata": { "id": "3pVABkZTYK9F", "colab_type": "code", - "outputId": "42a0b5cc-42e2-41c5-8fdd-11485c45c933", + "outputId": "345e4225-6a09-4fae-efb3-c9abe56c622a", "colab": { "base_uri": "https://localhost:8080/", - "height": 774 - } - }, - "source": [ - "# where total tax and land tax are both greater than 0\n", - "\n", - "# TESTING (SWITCH TO OG)\n", - "# test = df_train.copy()\n", - "# 
test.loc[(test.total_parcel_tax>0) & (test.land_tax>0),'structure_tax']=test['total_parcel_tax']-test['land_tax']\n", - "hmm = df_train.loc[(df_train.total_parcel_tax>0) & (df_train.land_tax>0)]\n", - "print(f'{len(hmm)} rows where total and land are greater than 0')\n", - "print(f'{len(df_train)} total rows, hopefully the same as above number')\n", - "print()\n", - "print(len(hmm.loc[hmm.structure_tax!=hmm['total_parcel_tax']-hmm['land_tax']]))\n", - "print()\n", - "print(hmm.loc[hmm.structure_tax!=hmm['total_parcel_tax']-hmm['land_tax']])\n", - "print()\n", - "any_neg = hmm.loc[hmm.total_parcel_tax < hmm.land_tax]\n", - "# if this comes back as 0, setting all structures to total - land should work\n", - "print(f'{len(any_neg)} total taxes are less than same rows land tax\\n')\n", - "print(any_neg)\n", - "# SWITCH TO RAPIDS \n", - "\"\"\"current concern\n", - "are there places where total and land are not greater than 0 \n", - "and setting structure to their difference is not the best move\"\"\"\n", - "\n", - "\n", - "# # structure tax should be equal to total tax minus land tax\n", - "# df_train['structure_tax'] = df_train['total_parcel_tax'] - df_train['land_tax']\n", - "new_value = df_train['total_parcel_tax'] - df_train['land_tax']\n", - "conditions = (df_train.total_parcel_tax>0) & (df_train.land_tax>0)\n", - "df_train['structure_tax'] = df_train['structure_tax'].masked_assign(new_value, conditions)\n", - "\n", - "# # where structure tax is 0\n", - "conditions = df_train['structure_tax'] == 0\n", - "# # we do not know the structure tax, so insert a Nan value\n", - "df_train['structure_tax'] = df_train['structure_tax'].masked_assign(cupy.nan, conditions)\n", - "\n", - "# print(test.isna().sum())\n", - "# print(test.value_counts().head())\n", - "# print(test_1.isna().sum())\n", - "# print(test_1.value_counts().head())\n", - "\n", - "\n", - "# SWITCH TO OG \n", - "\"\"\"\n", + "height": 84 + } + }, + "source": [ "#total_parcel_tax\n", "#structure_tax\n", 
"#land_tax\n", "#total_property_tax_2016\n", - "#2)recalculate total_parcel_tax =structure_tax + land_tax\n", + "#2)recalculate total_parcel_tax = structure_tax + land_tax\n", "\n", "# total_parcel_tax =structure_tax + land_tax\n", "#->structure_tax=total_parcel_tax -land_tax\n", "\n", - "df_train.loc[(df_train.total_parcel_tax>0) & (df_train.land_tax>0),'structure_tax']=df_train['total_parcel_tax']-df_train['land_tax']\n", + "# where parcel and land taxes are greater than 0\n", + "parcel_taxes = df_train.total_parcel_tax.loc[(df_train.total_parcel_tax>0) & (df_train.land_tax>0)]\n", + "land_taxes = df_train.land_tax.loc[(df_train.total_parcel_tax>0) & (df_train.land_tax>0)]\n", + "# set structure tax to be their difference\n", + "df_train['structure_tax'].loc[(df_train.total_parcel_tax>0) & (df_train.land_tax>0)] = parcel_taxes - land_taxes\n", "\n", - "#structure_tax, i see a lot of structure tax is 0's, those must be NA's\n", + "# where structure tax is still 0, there isn't structure tax\n", + "df_train.structure_tax.loc[df_train.structure_tax==0] = np.nan\n", "\n", - "df_train.loc[df_train.structure_tax==0,'structure_tax']=np.nan\n", - "\"\"\"\n", "print(df_train.total_property_tax_2016.isnull().sum())\n", "print(df_train.structure_tax.isnull().sum())\n", "print(df_train.total_parcel_tax.isnull().sum())\n", - "print(df_train.land_tax.isnull().sum())\n", - "\n", - "# SWITCH TO RAPIDS\n", - "# print(test[['structure_tax','land_tax','total_parcel_tax']])" + "print(df_train.land_tax.isnull().sum())" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ - "90274 rows where total and land are greater than 0\n", - "90275 total rows, hopefully the same as above number\n", - "\n", - "379\n", - "\n", - " parcelid logerror ac_id basement_sqft total_bath bedroomcnt buildingqualitytypeid ... census_tractnumber\n", - "266 17188959 0.0944 0.0 0.0 0.0 ... 0056.00\n", - "297 12956410 -0.14850000000000002 0.0 0.0 0.0 ... 
4080.05\n", - "336 12966610 0.0488 0.0 6.0 9.0 7.0 ... 4303.01\n", - "454 17188961 0.003 0.0 0.0 0.0 ... 0056.00\n", - "474 17188974 0.10260000000000001 0.0 0.0 0.0 ... 0056.00\n", - "555 17266056 -0.5175 0.0 0.0 0.0 ... 0059.08\n", - "601 17205423 0.0733 0.0 0.0 0.0 ... 0076.06\n", - "790 10858080 0.05450000000000001 0.0 2.0 3.0 7.0 ... 1412.01\n", - "791 10858080 0.08620000000000001 0.0 2.0 3.0 7.0 ... 1412.01\n", - "976 11325190 -0.024300000000000002 0.0 0.0 0.0 ... 9102.06\n", - "[369 more rows]\n", - "[38 more columns]\n", - "\n", - "0 total taxes are less than same rows land tax\n", - "\n", - "Empty DataFrame\n", - "Columns: ['parcelid', 'logerror', 'ac_id', 'basement_sqft', 'total_bath', 'bedroomcnt', 'buildingqualitytypeid', 'census_tractnumber']\n", - "Index: []\n" + "6\n", + "380\n", + "1\n", + "1\n" ], "name": "stdout" - }, - { - "output_type": "error", - "ename": "ValueError", - "evalue": "ignored", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mnew_value\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'total_parcel_tax'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'land_tax'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0mconditions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtotal_parcel_tax\u001b[0m\u001b[0;34m>\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m&\u001b[0m 
\u001b[0;34m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mland_tax\u001b[0m\u001b[0;34m>\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 23\u001b[0;31m \u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'structure_tax'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'structure_tax'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmasked_assign\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnew_value\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconditions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 24\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0;31m# # where structure tax is 0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/series.py\u001b[0m in \u001b[0;36mmasked_assign\u001b[0;34m(self, value, mask)\u001b[0m\n\u001b[1;32m 1073\u001b[0m \"\"\"\n\u001b[1;32m 1074\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1075\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmasked_assign\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1076\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_copy_construct\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1077\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/column.py\u001b[0m 
in \u001b[0;36mmasked_assign\u001b[0;34m(self, value, mask)\u001b[0m\n\u001b[1;32m 494\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_gpu_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 495\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmask_invert\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_mask\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 496\u001b[0;31m \u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 497\u001b[0m )\n\u001b[1;32m 498\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mBuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnull_count\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/utils/cudautils.py\u001b[0m in \u001b[0;36mfill_mask\u001b[0;34m(data, mask, value)\u001b[0m\n\u001b[1;32m 235\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 236\u001b[0m \u001b[0mconfigured\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mgpu_fill_masked\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 237\u001b[0;31m \u001b[0mconfigured\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 238\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 239\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/numba/cuda/compiler.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 222\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 223\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkernel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mAutoJitCUDAKernel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 224\u001b[0;31m \u001b[0mkernel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkernel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mspecialize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 225\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 226\u001b[0m \u001b[0mkernel\u001b[0m 
\u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/numba/cuda/compiler.py\u001b[0m in \u001b[0;36mspecialize\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 761\u001b[0m '''\n\u001b[1;32m 762\u001b[0m argtypes = tuple(\n\u001b[0;32m--> 763\u001b[0;31m [self.typingctx.resolve_argument_type(a) for a in args])\n\u001b[0m\u001b[1;32m 764\u001b[0m \u001b[0mkernel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margtypes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 765\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mkernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/numba/cuda/compiler.py\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 761\u001b[0m '''\n\u001b[1;32m 762\u001b[0m argtypes = tuple(\n\u001b[0;32m--> 763\u001b[0;31m [self.typingctx.resolve_argument_type(a) for a in args])\n\u001b[0m\u001b[1;32m 764\u001b[0m \u001b[0mkernel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margtypes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 765\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mkernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/numba/typing/context.py\u001b[0m in \u001b[0;36mresolve_argument_type\u001b[0;34m(self, val)\u001b[0m\n\u001b[1;32m 296\u001b[0m \"\"\"\n\u001b[1;32m 297\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 298\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mtypeof\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mval\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mPurpose\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margument\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 299\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 300\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnumba\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_cuda_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mval\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/numba/typing/typeof.py\u001b[0m in \u001b[0;36mtypeof\u001b[0;34m(val, purpose)\u001b[0m\n\u001b[1;32m 32\u001b[0m msg = _termcolor.errmsg(\n\u001b[1;32m 33\u001b[0m \"cannot determine Numba type of %r\") % (type(val),)\n\u001b[0;32m---> 34\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 35\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mValueError\u001b[0m: cannot determine Numba type of " - ] } ] }, @@ -1538,7 +1939,11 @@ "metadata": { "id": "8SID48LOpYvu", "colab_type": "code", - "colab": {} + "outputId": "1d369c4a-759e-4331-b5fe-6c784ae66897", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } }, "source": [ "# regionidcounty is exact copy of fips code, dropping the dulicate column\n", @@ -1546,29 +1951,37 @@ "df_train.shape" ], "execution_count": 0, - "outputs": [] + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(90275, 45)" + ] + }, + "metadata": { + 
"tags": [] + }, + "execution_count": 51 + } + ] }, { "cell_type": "code", "metadata": { "id": "tWmM2J8_pkg1", "colab_type": "code", - "outputId": "2393cbab-218f-4849-c32c-700495dfb18e", + "outputId": "44689c09-a426-48c9-eae8-7e81af63080e", "colab": { "base_uri": "https://localhost:8080/", - "height": 622 + "height": 34 } }, "source": [ "#*******************************\n", "#bedroomcnt #1421 zero bed room houses ??, observed it's missing all other room count also missing\n", - "print(df_train.bedroomcnt.value_counts())\n", - "\n", - "conditions = df_train['bedroomcnt'] == 0\n", - "df_train['bedroomcnt'] = df_train['bedroomcnt'].masked_assign(cupy.nan, conditions)\n", - "\n", - "\n", - "print(df_train.bedroomcnt.value_counts())\n", + "# where there is no bedroom, null is a better representation \n", + "df_train['bedroomcnt'].loc[df_train['bedroomcnt'] == 0] = np.nan\n", "print(df_train.bedroomcnt.isnull().sum())" ], "execution_count": 0, @@ -1576,38 +1989,9 @@ { "output_type": "stream", "text": [ - "3.0 35447\n", - "2.0 22357\n", - "4.0 20279\n", - "5.0 5077\n", - "1.0 3897\n", - "0.0 1421\n", - "6.0 1120\n", - "8.0 274\n", - "7.0 234\n", - "9.0 91\n", - "[7 more rows]\n", - "dtype: int64\n" + "1421\n" ], "name": "stdout" - }, - { - "output_type": "error", - "ename": "RuntimeError", - "evalue": "ignored", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m 
\u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbedroomcnt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue_counts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbedroomcnt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misnull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/series.py\u001b[0m in \u001b[0;36mvalue_counts\u001b[0;34m(self, method, sort)\u001b[0m\n\u001b[1;32m 1827\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnull_count\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1828\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mint64\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1829\u001b[0;31m \u001b[0mvals\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcnts\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue_counts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1830\u001b[0m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcnts\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mas_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvals\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1831\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0msort\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/numerical.py\u001b[0m in \u001b[0;36mvalue_counts\u001b[0;34m(self, method)\u001b[0m\n\u001b[1;32m 215\u001b[0m \u001b[0mmsg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"non sort based value_count() not implemented yet\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 216\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mNotImplementedError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 217\u001b[0;31m \u001b[0msegs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msortedvals\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_unique_segments\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 218\u001b[0m \u001b[0;31m# Return both values and their counts\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 219\u001b[0m \u001b[0mout_vals\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mcpp_copying\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_gather_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msortedvals\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msegs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/column.py\u001b[0m in \u001b[0;36m_unique_segments\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 684\u001b[0m \u001b[0mdensecol\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_dense_buffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 685\u001b[0m \u001b[0;31m# sort the column\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 686\u001b[0;31m \u001b[0msortcol\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdensecol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msort_by_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 687\u001b[0m \u001b[0;31m# find segments\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 688\u001b[0m \u001b[0msortedvals\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msortcol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmem\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/numerical.py\u001b[0m in \u001b[0;36msort_by_values\u001b[0;34m(self, ascending, na_position)\u001b[0m\n\u001b[1;32m 161\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 162\u001b[0m 
\u001b[0;32mdef\u001b[0m \u001b[0msort_by_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mascending\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mna_position\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"last\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 163\u001b[0;31m \u001b[0msort_inds\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_sorted_inds\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mascending\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mna_position\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 164\u001b[0m \u001b[0mcol_keys\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcpp_copying\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_gather_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msort_inds\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 165\u001b[0m col_inds = self.replace(\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/_sort.py\u001b[0m in \u001b[0;36mget_sorted_inds\u001b[0;34m(by, ascending, na_position)\u001b[0m\n\u001b[1;32m 77\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Must use a boolean or list of booleans\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 78\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 79\u001b[0;31m \u001b[0mcpp_sort\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_order_by\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mby\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcol_inds\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mascending\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mna_position\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 80\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 81\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcol_inds\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32mcudf/bindings/sort.pyx\u001b[0m in \u001b[0;36mcudf.bindings.sort.apply_order_by\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32mcudf/bindings/sort.pyx\u001b[0m in \u001b[0;36mcudf.bindings.sort.apply_order_by\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mRuntimeError\u001b[0m: merge_sort: failed to synchronize: an illegal memory access was encountered" - ] } ] }, @@ -1616,10 +2000,10 @@ "metadata": { "id": "3qnP2L9LpmeJ", "colab_type": "code", - "outputId": "bc0119de-0644-414f-bf59-bd132c7c0e15", + "outputId": "a4e9550d-5ea8-4066-d3f3-ea73bfe04cef", "colab": { "base_uri": "https://localhost:8080/", - "height": 387 + "height": 101 } }, "source": [ @@ -1631,8 +2015,8 @@ "# roomcnt 1416\n", "\n", "\n", - "# roomcnt=(full_bath+half_bath)+ bedroomcnt\n", - "# total_bath=fullbath+ 0.5(half_bath)\n", + "# roomcnt = (full_bath + half_bath) + bedroomcnt\n", + "# total_bath = fullbath+ 0.5(half_bath)\n", "\n", "#caluculate full bath and half bath again from total bath as, it has few extra columns, (fixes 500 missing values in roomcnt )\n", "\n", @@ -1642,19 +2026,15 @@ " & (df_train['bedroomcnt'].isna() == False) \n", " & (df_train['roomcnt'].isna() == True))\n", "# calculate room count including all full & half baths along with bedroom count\n", - "new_values = df_train.full_bath + df_train.half_bath + df_train.bedroomcnt\n", - "df_train['roomcnt'] = df_train['roomcnt'].masked_assign(new_values, conditions)\n", - "\n", - "\"\"\"df_train.loc[(df_train.full_bath.notnull()) \n", - " & (df_train.half_bath.notnull()) \n", - " & (df_train.bedroomcnt.notnull()) \n", - " & (df_train.roomcnt.isnull()),['roomcnt']]=df_train.full_bath + df_train.half_bath + 
df_train.bedroomcnt\"\"\"\n", + "new_values = df_train.full_bath.loc[conditions] + df_train.half_bath.loc[conditions] + df_train.bedroomcnt.loc[conditions]\n", + "# df_train['roomcnt'] = df_train['roomcnt'].masked_assign(new_values, conditions)\n", + "df_train.roomcnt.loc[conditions] = new_values\n", "\n", "\n", "# most bedroom count and roomcount null are in same place\n", "# all column null count 1133 all columns are null\n", "\n", - "print(df_train.total_bath.isnull().sum())\n", + "print(df_train.total_bath.isna().sum())\n", "print(df_train.full_bath.isnull().sum())\n", "print(df_train.half_bath.isnull().sum())\n", "print(df_train.bedroomcnt.isnull().sum())\n", @@ -1665,28 +2045,13 @@ { "output_type": "stream", "text": [ - "ERROR:Call to cuOccupancyMaxPotentialBlockSize results in UNKNOWN_CUDA_ERROR\n" + "1165\n", + "1182\n", + "1182\n", + "1421\n", + "1416\n" ], - "name": "stderr" - }, - { - "output_type": "error", - "ename": "CudaAPIError", - "evalue": "ignored", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mCudaAPIError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m&\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'half_bath'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m&\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'bedroomcnt'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m 
\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m & (df_train['roomcnt'].isna() == True))\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;31m# calculate room count including all full & half baths along with bedroom count\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mnew_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfull_bath\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhalf_bath\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbedroomcnt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/series.py\u001b[0m in \u001b[0;36misna\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1238\u001b[0m \"\"\"Identify missing values in a Series. Alias for isnull.\n\u001b[1;32m 1239\u001b[0m \"\"\"\n\u001b[0;32m-> 1240\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misnull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1241\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1242\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mnotna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/series.py\u001b[0m in \u001b[0;36misnull\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1232\u001b[0m )\n\u001b[1;32m 1233\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1234\u001b[0;31m \u001b[0mmask\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mcudautils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misnull_mask\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnullmask\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1235\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1236\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/utils/cudautils.py\u001b[0m in \u001b[0;36misnull_mask\u001b[0;34m(data, mask)\u001b[0m\n\u001b[1;32m 432\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 433\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0moutput_dary\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 434\u001b[0;31m \u001b[0mgpu_isnull\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput_dary\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput_dary\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 435\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0moutput_dary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - 
"\u001b[0;32m/usr/local/lib/python3.6/dist-packages/numba/cuda/compiler.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 226\u001b[0m \u001b[0mkernel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 227\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 228\u001b[0;31m \u001b[0mtpb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compute_thread_per_block\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkernel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 229\u001b[0m \u001b[0mtpbm1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtpb\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 230\u001b[0m \u001b[0mblkct\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mntasks\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mtpbm1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m//\u001b[0m \u001b[0mtpb\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/numba/cuda/compiler.py\u001b[0m in \u001b[0;36m_compute_thread_per_block\u001b[0;34m(self, kernel)\u001b[0m\n\u001b[1;32m 249\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 250\u001b[0m \u001b[0;31m# Raises from the driver if the feature is unavailable\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 251\u001b[0;31m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtpb\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mctx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_max_potential_block_size\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 252\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mAttributeError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0;31m# Fallback to table-based approach.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/numba/cuda/cudadrv/driver.py\u001b[0m in \u001b[0;36mget_max_potential_block_size\u001b[0;34m(self, func, b2d_func, memsize, blocksizelimit, flags)\u001b[0m\n\u001b[1;32m 646\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhandle\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 647\u001b[0m \u001b[0mb2d_cb\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 648\u001b[0;31m memsize, blocksizelimit)\n\u001b[0m\u001b[1;32m 649\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 650\u001b[0m driver.cuOccupancyMaxPotentialBlockSizeWithFlags(byref(gridsize), byref(blocksize),\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/numba/cuda/cudadrv/driver.py\u001b[0m in \u001b[0;36msafe_cuda_api_call\u001b[0;34m(*args)\u001b[0m\n\u001b[1;32m 288\u001b[0m \u001b[0m_logger\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'call driver api: %s'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlibfn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 289\u001b[0m \u001b[0mretcode\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mlibfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 290\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretcode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 291\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msafe_cuda_api_call\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 292\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/numba/cuda/cudadrv/driver.py\u001b[0m in \u001b[0;36m_check_error\u001b[0;34m(self, fname, retcode)\u001b[0m\n\u001b[1;32m 323\u001b[0m \u001b[0m_logger\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcritical\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_getpid\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 324\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mCudaDriverError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"CUDA initialized before forking\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 325\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mCudaAPIError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mretcode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 326\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 327\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget_device\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mdevnum\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mCudaAPIError\u001b[0m: [700] Call to cuOccupancyMaxPotentialBlockSize results in UNKNOWN_CUDA_ERROR" - ] + "name": "stdout" } ] }, @@ -1709,12 +2074,16 @@ "metadata": { "id": "IW4CG2InpolD", "colab_type": "code", - "colab": {} + "outputId": "47e46700-fe9c-4b98-9941-014ee6dea441", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 252 + } }, "source": [ - "# before\n", - "print(df_train.numberofstories.isnull().sum())\n", - "print(df_train.numberofstories.value_counts())\n", + "# before (what's it look like?)\n", + "print(f'BEFORE\\n{df_train.numberofstories.value_counts()}\\n'\n", + " f'{df_train.numberofstories.isnull().sum()} remaining null values\\n')\n", "\n", "#numberofstories\t69705\n", "\n", @@ -1725,121 +2094,126 @@ "# go through each id pair \n", "for type_id in zillow_type_ids:\n", " # split the pair into type id and number of stories\n", - " id, n_stories = type_id\n", + " t_id, n_stories = type_id\n", "\n", " # when type id matches and story count is not null\n", - " conditions = ((df_train['propertylandusetypeid'] == id) \n", + " conditions = ((df_train['propertylandusetypeid'] == t_id) \n", " & (df_train['numberofstories'].isna() == False))\n", + " \n", " # calculate the mode story count for matching id properties\n", - " mode_stories = df_train.loc[conditions, 'numberofstories'].mode()\n", + " mode_stories = df_train.numberofstories.loc[conditions].value_counts()\n", + " # when there is at least one value in the value_counts of this property type\n", + " if len(mode_stories) > 0:\n", + " # set mode stories to the most popular value\n", + " mode_stories = mode_stories[0]\n", + " # otherwise\n", + " else:\n", + " # set mode stories to the general average for this property type\n", + " mode_stories = n_stories\n", + " \n", " # and set those non null values to 
the most common value seen\n", - " df_train['numberofstories'] = df_train['numberofstories'].masked_assign(mode_stories, \n", - " conditions)\n", + " df_train['numberofstories'].loc[conditions] = mode_stories\n", " \n", " # when type id matches and story count is null\n", - " conditions = ((df_train['propertylandusetypeid'] == id) \n", + " conditions = ((df_train['propertylandusetypeid'] == t_id) \n", " & (df_train['numberofstories'].isna() == False))\n", " # set null values to the common number of stories seen in that type id\n", - " df_train['numberofstories'] = df_train['numberofstories'].masked_assign(n_stories, \n", - " conditions)\n", - " \n", - "# TO BE ADDRESSED\n", - "# #https://en.wikipedia.org/wiki/Townhouse , typical town house are usually large, and has atleast 6 rooms\n", - "# df_train.loc[(df_train.propertylandusetypeid==264) & (df_train.numberofstories.isnull()),'numberofstories']=2\n", - "\n", - "\"\"\"\n", - "df_train.loc[(df_train.propertylandusetypeid==246) & (df_train.numberofstories.notnull()),'numberofstories'].mode()\n", - "df_train.loc[(df_train.propertylandusetypeid==246) & (df_train.numberofstories.isnull()),'numberofstories']=2\n", - "\n", - "df_train.loc[(df_train.propertylandusetypeid==247) & (df_train.numberofstories.notnull()),'numberofstories'].mode()\n", - "df_train.loc[(df_train.propertylandusetypeid==247) & (df_train.numberofstories.isnull()),'numberofstories']=2\n", - "\n", - "df_train.loc[(df_train.propertylandusetypeid==248) & (df_train.numberofstories.notnull()),'numberofstories'].mode()\n", - "df_train.loc[(df_train.propertylandusetypeid==248) & (df_train.numberofstories.isnull()),'numberofstories']=2\n", - "\n", - "df_train.loc[(df_train.propertylandusetypeid==260) & (df_train.numberofstories.notnull()),'numberofstories'].mode()\n", - "df_train.loc[(df_train.propertylandusetypeid==260) & (df_train.numberofstories.isnull()),'numberofstories']=2\n", - "\n", - "df_train.loc[(df_train.propertylandusetypeid==261) & 
(df_train.numberofstories.notnull()),'numberofstories'].mode()\n", - "df_train.loc[(df_train.propertylandusetypeid==261) & (df_train.numberofstories.isnull()),'numberofstories']=1\n", - "\n", - "df_train.loc[(df_train.propertylandusetypeid==263) & (df_train.numberofstories.notnull()),'numberofstories'].mode()\n", - "df_train.loc[(df_train.propertylandusetypeid==263) & (df_train.numberofstories.isnull()),'numberofstories']=1\n", + " df_train['numberofstories'].loc[conditions] = n_stories\n", "\n", - "df_train.loc[(df_train.propertylandusetypeid==266) & (df_train.numberofstories.notnull()),'numberofstories'].mode()\n", - "df_train.loc[(df_train.propertylandusetypeid==266) & (df_train.numberofstories.isnull()),'numberofstories']=1\n", + "# edge cases\n", + "conditions = ((df_train.propertylandusetypeid==264) \n", + " & (df_train.numberofstories.isnull()))\n", + "df_train.numberofstories.loc[conditions] = 2\n", "\n", - "df_train.loc[(df_train.propertylandusetypeid==269) & (df_train.numberofstories.notnull()),'numberofstories'].mode()\n", - "df_train.loc[(df_train.propertylandusetypeid==269) & (df_train.numberofstories.isnull()),'numberofstories']=2\n", - "\n", - "prop2016.loc[(prop2016.propertylandusetypeid==275) & (prop2016.numberofstories.notnull()),'numberofstories'].mode()\n", - "df_train.loc[(df_train.propertylandusetypeid==275) & (df_train.numberofstories.isnull()),'numberofstories']=1\n", - "\n", - "prop2016.loc[(prop2016.propertylandusetypeid==267) & (prop2016.numberofstories.notnull()),'numberofstories'].mode()\n", - "df_train.loc[(df_train.propertylandusetypeid==267) & (df_train.numberofstories.isnull()),'numberofstories']=1\n", - "\n", - "#https://en.wikipedia.org/wiki/Townhouse , typical town house are usually large, and has atleast 6 rooms\n", - "df_train.loc[(df_train.propertylandusetypeid==264) & (df_train.numberofstories.isnull()),'numberofstories']=2\n", - "\n", - "prop2016.loc[(prop2016.propertylandusetypeid==31) & 
(prop2016.numberofstories.notnull()),'numberofstories'].mode()\n", - "df_train.loc[(df_train.propertylandusetypeid==31) & (df_train.numberofstories.isnull()),'numberofstories']=2\"\"\"\n", - "\n", - "# after\n", - "print(df_train.numberofstories.isnull().sum())\n", - "print(df_train.numberofstories.value_counts())" + "# what's it looking like? \n", + "print(f'AFTER\\n{df_train.numberofstories.value_counts()}\\n'\n", + " f'{df_train.numberofstories.isnull().sum()} remaining null values')" ], "execution_count": 0, - "outputs": [] + "outputs": [ + { + "output_type": "stream", + "text": [ + "BEFORE\n", + "1.0 12016\n", + "2.0 8044\n", + "3.0 508\n", + "4.0 2\n", + "Name: numberofstories, dtype: int32\n", + "69705 remaining null values\n", + "\n", + "AFTER\n", + "1.0 20154\n", + "2.0 423\n", + "3.0 4\n", + "Name: numberofstories, dtype: int32\n", + "69694 remaining null values\n" + ], + "name": "stdout" + } + ] }, { "cell_type": "code", "metadata": { "id": "AHcMsDCxprd4", "colab_type": "code", - "colab": {} + "outputId": "3a327d21-4675-41ce-aa9e-f52ae86eb491", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 286 + } }, "source": [ - "\"\"\"skeptical of this0 cell (and the one above)..\n", - "author provides no explination for moding\"\"\"\n", - "\n", - "# before\n", - "print(df_train.fireplace_count.isnull().sum())\n", - "print(df_train.fireplace_count.value_counts())\n", + "# before (what's it looking like?) 
\n", + "print(f'BEFORE\\n{df_train.fireplace_count.value_counts()}\\n'\n", + " f'{df_train.fireplace_count.isnull().sum()} remaining null values\\n')\n", "\n", "# where there is a fire place, and count is not null\n", "conditions = ((df_train.fireplaceflag==1) \n", " & (df_train.fireplace_count.isna() == False))\n", "# calculate the mode fireplace count \n", - "mode_fire_count = df_train.loc[conditions, 'fireplace_count'].mode()\n", + "mode_fire_count = df_train.loc[conditions, 'fireplace_count'].value_counts()[0]\n", "# and set those non null values to the most common fireplace count\n", - "df_train['fireplace_count'] = df_train['fireplace_count'].masked_assign(mode_fire_count, \n", - " conditions)\n", + "df_train['fireplace_count'].loc[conditions] = mode_fire_count\n", "\n", "# where there is a fire place, and count is null\n", "conditions = ((df_train.fireplaceflag==1) \n", " & (df_train.fireplace_count.isna() == True))\n", "# set null values to the most common fireplace count\n", - "df_train['fireplace_count'] = df_train['fireplace_count'].masked_assign(1, \n", - " conditions)\n", + "df_train.fireplace_count.loc[conditions] = 1\n", "\n", "# df_train.loc[(df_train.fireplaceflag==1) & (df_train.fireplace_count.notnull()),'fireplace_count'].mode()\n", "# df_train.loc[(df_train.fireplaceflag==1) & (df_train.fireplace_count.isnull()),'fireplace_count']=1\n", "\n", "# after\n", - "print(df_train.fireplace_count.isnull().sum())\n", - "print(df_train.fireplace_count.value_counts())" + "print(f'AFTER\\n{df_train.fireplace_count.value_counts()}\\n'\n", + " f'{df_train.fireplace_count.isnull().sum()} remaining null values')" ], "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DVgF1c_p_bN1", - "colab_type": "text" - }, - "source": [ - "# -----current: break-----" + "outputs": [ + { + "output_type": "stream", + "text": [ + "BEFORE\n", + "0.0 80446\n", + "1.0 8165\n", + "2.0 1106\n", + "3.0 312\n", + "4.0 21\n", + "5.0 3\n", 
+ "Name: fireplace_count, dtype: int32\n", + "222 remaining null values\n", + "\n", + "AFTER\n", + "0.0 80446\n", + "8165.0 9607\n", + "1.0 222\n", + "Name: fireplace_count, dtype: int32\n", + "0 remaining null values\n" + ], + "name": "stdout" + } ] }, { @@ -1847,36 +2221,75 @@ "metadata": { "id": "FIuSWoJspt3H", "colab_type": "code", - "colab": {} + "outputId": "9c5daebd-4b2a-461b-8490-350d19fa7ba8", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 317 + } }, "source": [ - "import seaborn as sns\n", + "\n", + "# set basic sns \n", "color = sns.color_palette()\n", "sns.set(style=\"darkgrid\")\n", - "\n", - "\n", - "ax = sns.countplot(x=\"buildingqualitytypeid\", data=df_train)\n", - "\n", + "# convert dataframe to pandas for ease of use with sns\n", + "pd_train = df_train.to_pandas()\n", + "# set ax plot\n", + "ax = sns.countplot(x=\"buildingqualitytypeid\", data=pd_train)\n", + "# adjust fringe aesthetics\n", "plt.xticks(rotation='vertical')\n", "plt.title(\"Frequency of Bathroom count\", fontsize=15)\n", + "# display the graph\n", "plt.show()" ], "execution_count": 0, - "outputs": [] + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAaAAAAEsCAYAAACFRGf6AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3XlYVGX/P/D3DAiCghMIBFQuGEii\nYhBgYiaKIKGoaeKammtuZe4S9OAWiGEuqaVfzUxNS0UWBZdv+eRjmpUVov6U3NmUAXEDYeb8/uDL\neRwBHVC4B3y/rovrYs59Zs7n3LO859znzDkKSZIkEBER1TKl6AKIiOjZxAAiIiIhGEBERCQEA4iI\niIRgABERkRAMICIiEoIB9AxasWIFXFxcyv2NGDFCdGn1WkpKCgIDA+Hm5gZ/f/8K57l06ZLOc+Lq\n6oo333wT4eHhyMvLq/IyExMTsXv37nLTBw0ahA8//LDKj0dVc/LkSaxcuVJ0GQbLWHQBJIaFhQXW\nrVtXbhrVjJKSEsyaNQt+fn5YsGABGjVq9Mj558yZA3d3d2g0GqSnpyM2NhaZmZn46quvqrTcxMRE\n3L17F3369HmS8qmaTp48iS+++AKTJk0SXYpBYgA9o4yMjODu7q73/IWFhWjYsGENVlS/ZWVl4e7d\nu+jduzc8PT0fO3/Lli3l58fDwwOFhYVYvHix8OdB9PKpfuEQHJVTUlICFxcXfP3111iwYAF8fHx0\nvkGnpKSgX79+aNu2LXx9fRETE4OSkhKdx0hKSkKPHj3Qrl07DBs2DH/99RdcXFwQFxens4ytW7fq\n3C82NhadOnXSmXbt2jV88MEHeO2119C+fXuMHj0aFy9elNvLhq2Sk5MRFhYGDw8PvPHGG1i5ciUe\nPtHHmTNnMHbsWHh4eKBDhw545513cPToURQXF+P111/HF198Ua4/Bg0ahClTpjyyzxITExEcHAw3\nNze8+eab+Pzzz6HRaAAAO3bsQLdu3QAAY8eOhYuLS4XLeZRGjRpBq9VCq9XK03744QeEhobCy8sL\nXl5eePfdd3Hq1Cm5ffr06Th48CCOHj0qD+k9vNzdu3eje/fuePXVVzFmzBhkZ2fLbWX9mpiYiOnT\np8PT0xMTJ04EAGg0GixbtgxdunSBm5sbgoODkZiYWKV+KesbFxcXnD59GkOGDEH79u3Rt29fnD59\nGnfu3MGsWbPw6quvonv37khKSnpsP2k0GqxevRo9evSAm5sb3njjDcydO1dnnk2bNsHf3x9ubm7o\n0aMHNm3apNM+ffp0vPPOOzrTyvri8OHDAP77+t28eTNiYmLg7e2Njh07Yv78+bh//768bosXL4ZG\no+EwdyW4BfQMezg0jIyMoFAo5NtfffUVvL29ER0dLX+Qx8fHY+bMmRg0aBCmTZuGS5cuYenSpQBK\n37gA8Ndff+Gjjz5CQEAAwsLCcObMGXzwwQfVqlGtVmPQoEFo2rQpIiMjYWpqirVr12LUqFHYt28f\nTExM5HmjoqIQEBCA5cuX4+eff8aKFSvg7OyMHj16AADOnTuHQYMGwcnJCZGRkWjSpAlSU1ORmZmJ\nBg0aICQkBLt378b7778vP+bFixfx+++/48svv6y0xp9++gnTpk1Dv379MHPmTJw5cwbLly/HzZs3\nER4ejm7dusHCwgJTp06Vh9bs7e0fud6SJKGkpARarRbnz5/Hhg0b0KlTJ5ibm8vzZGRkoF+/fnjx\nxRdx//597NmzB0OGDEFiYiIcHR0xZcoUZGVlobCwEGFhYQCgs9zff/8dWVlZmDNnDu7evYtFixYh\nIiICa9as0all8eLFcr8qlaXfWT/77DN8/fXXmDRpEtq0aYO9e/di2rRpUCqV6Nmzp1798qCZM2di\n6NChGDt2LGJiYjB16lS0bt0aLVu2xIoVK7B9+3bMnDkTnp6es
LW1rbTf5s2bh4SEBIwZMwaenp7I\nz8/HgQMH5PYtW7Zg0aJFGDlyJDp16oSjR49i0aJFKC4uxnvvvffI56Qi69atw+uvv46YmBicPn0a\nsbGxeOGFFzBy5Eh069YN586dw+bNm7FlyxYAHOYuR6JnzvLlyyVnZ+dyf0eOHJEkSZKKi4slZ2dn\nqV+/fjr302g0UufOnaV58+bpTN+2bZvUrl07KT8/X5IkSZo4caIUHBwsabVaeZ4VK1ZIzs7O0u7d\nu3WWsWXLFp3H+uyzz6TXX39dvh0TEyN5e3tLN2/elKep1WrJ3d1d2rp1qyRJknTx4kXJ2dlZmj17\nts5jvfXWW9JHH30k3548ebL05ptvSoWFhRX2y/nz5yVnZ2fp119/lactXbpU8vX1lUpKSiq8jyRJ\nUt++faURI0boTFu9erXk6uoqZWdn69T4008/Vfo4D8738F9wcLCUlZVV6f00Go1UXFwsde/eXVq9\nerU8fcKECdK7775bbv7Q0FDJ09NTKigokKetW7dOcnFxkYqKinRqmTx5ss59c3NzpbZt20pffPGF\nzvSRI0dKQUFBVeqX7du3S87OzlJcXJw8z4EDByRnZ2cpLCxMnpafny+1bt1a+u677yrtg7Nnz0rO\nzs7S5s2bK2wvLi6WXn/99XKv37CwMMnT01Ne748++kgaMGCAzjwPP39lr99hw4bpzDd27FgpNDRU\nvr1hwwbJ1dW10pqfdRyCe0ZZWFjg+++/1/lr166dzjxvvvmmzu309HRkZ2ejZ8+eKCkpkf98fHxQ\nWFiI8+fPAyjdAvLz89PZmqrsqK/H+c9//gNfX1+Ym5vLy7OwsECbNm2QmpqqM6+vr6/ObScnJ50h\npWPHjiEoKAimpqYVLsvJyQkdOnTAzp07AQBarRZxcXHo06cPjIyMKrxPcXExzpw5g8DAQJ3pQUFB\n0Gg0+PPPP6u8zgAQFhaG77//Hjt27MDKlSvRsGFDjB07Fvfu3ZPnOXfuHN5//328/vrrcHV1RZs2\nbXD58mWd4clHadeunc438latWkGSJOTk5OjM9/Dr4OzZsygqKiq3zj179sT58+eRn59f5X7p2LGj\n/H+zZs0AAD4+PvK0Jk2aQKVS6TyfDzt27BgAoF+/fhW2Z2Zm4saNGxXWVFBQIL9+q+Jxrzl6NA7B\nPaOMjIzQtm3bR85jbW2tc7vsMOBRo0ZVOH9WVhYAIDc3t9x9H76tr7y8PKSmpiI+Pr5c28NB8vDw\nRoMGDVBUVASgdEgrPz8fNjY2j1xe//79sWjRInz88cc4ceIEsrKyKv1AA0qHCDUaTaXrm5+f/8jl\nVaZZs2Y6z0+HDh3g6+uL3bt3Y9CgQbh16xZGjRoFOzs7zJkzB/b29jA1NcXcuXPldX4cS0tLndsN\nGjQAgHL3f3jdrl+/DgBo2rSpzvSy2wUFBSgqKqpSvzxYS1kdj3o+K5Kfnw8LCwuYmZlV2F4WrA/X\nXVbTzZs3K33sylS1RtLFAKJKPbgFA5R+CwWARYsWwdnZudz8L774IoDSN3Rubq5O28O3jYyMYGxs\njOLiYp3pBQUFOrdVKhVeeeUVjBs3rtzyGjdurOealK6LSqWSPzwrExQUhEWLFiE5ORmHDx/Gq6++\nihYtWlQ6v5WVFYyMjKBWq3Wml62vSqXSu8ZHadq0KZo0aYL09HQApftvcnJysHnzZnmLASjff0/D\nw6+DshDPzc3V+QC+ceMGgNIwadSoUa30y4NUKhVu3bqFe/fuVRhCZfuOKnttlr2+TU1Ny70uqxNO\n9HgcgiO9tWrVCjY2Nrh27Rratm1b7q/sQ6Vt27Y4dOiQzhFo+/fv13kshUIBOzs7+QMVKD2C6ejR\nozrz+fj44Ny5c3BxcSm3vEcFQ0V8fHyQlJQkH6VUEXNzcwQFBeGbb77BgQMHHrn1A5R+43V1dcW+\nfft0pu/duxdGRkZo3759l
WqsTHZ2NvLz8+WDCAoLCwFA5yCMX3/9Vd4KfbC+p/2N3MXFBaampuXW\ned++fWjVqhVUKlWt9cuDyobxKvrhLQA4ODigadOmFdbUpEkTtGrVCgDw/PPP4+rVqzqvkyNHjlSr\npgYNGkCj0ZQ74IdKcQuI9GZkZISZM2di7ty5KCgoQOfOnWFsbIwrV65g//79WL16NUxMTDBmzBiE\nhoZi2rRp6Nu3L86ePSvvV3lQ9+7dsX37drRu3RoODg7YsWOH/MFa5r333kNCQgKGDx+OoUOHwtbW\nFjdu3MDx48fh5eWFoKAgveufMmUK+vfvj6FDh2LEiBFQqVQ4deoUmjZtir59+8rz9e/fHwMHDoS5\nubl8RNfjHnfs2LGYN28eAgMDcebMGaxYsQKhoaGPPGLrUf755x9YWlpCkiRkZWVh3bp1sLS0lNe3\nQ4cOMDMzQ1hYGEaNGoWMjAysWrWq3PJatmyJw4cP48CBA7Czs4OdnV21aypjZWWFYcOGYeXKlVAq\nlXjllVewb98+/Pzzz1i2bJk8X030y6O0atUKb7/9NhYuXIgbN27Aw8MDN2/exIEDB7B06VIYGRlh\n4sSJiIyMhKWlJTp27Ihjx45h+/btmDFjhhzm/v7+WLlyJcLCwtCnTx+kpqZWGmqP07JlSwDAxo0b\n4eXlBQsLiyp/carPGEBUJb1794alpSXWrl2L77//HkqlEi+99BK6du0KY+PSl5O7uzuWLl2K2NhY\nHDhwAO3atUNsbGy531ZMmTIFeXl5iI2NRYMGDTBs2DA4OTnh+++/l+extrbG9u3bERsbi0WLFqGg\noAC2trbw8PCAi4tLlWp3cnLCli1bEBMTg3nz5kGhUODll18ud0oad3d3NG3aFJ07d9ZrmK9Lly5Y\nunQp1qxZg7i4OFhZWWH06NGYPHlylep70OLFi+X/mzZtirZt22LhwoXyFpCtrS0+//xzREVFYfz4\n8WjRogUiIyOxevVqnccZOnQozp49izlz5qCgoABTp07VOcy8uj788EM0aNAAmzdvhlqtRvPmzbF0\n6VKdwK6Jfnmc+fPnw9HRETt37sTatWthbW2Nzp07y+2DBw9GcXExvvnmG3z99dewt7fHnDlz8O67\n78rztG7dGgsWLMDatWuRkpICHx8fLFy4EEOGDKlyPT4+Phg5ciQ2btyImJgY+Pj4YOPGjU9jVesF\nhSTxktxU8woKCvDaa68hOjoaISEhost5pDNnziAkJATffPMNvLy8RJdDVG9xC4jo/6jValy4cAHL\nli1D69atGT5ENYwHIRD9n4MHD2LIkCHIy8vTGQIjoprBITgiIhKCW0BERCQEA4iIiITgQQiVyMu7\nA62Wo5NERPpQKhV47rlHX2jxYQygSmi1EgOIiKgG1doQ3Pvvv4/evXujT58+GDx4ME6fPg0AuHDh\nAgYOHIiAgAAMHDhQ50y+NdFGRESGodaOgrt165Z84sIDBw5g1apV2LVrF4YPH463334bISEhiIuL\nww8//CBfobAm2vSVm3ubW0BERHpSKhWwttb/BMFALW4BPXjW3Nu3b0OhUCA3NxdpaWkIDg4GAAQH\nByMtLQ1qtbpG2oiIyHDU6j6gefPm4ciRI5AkCevWrUNmZibs7Ozki30ZGRnB1tYWmZmZkCTpqbdZ\nWVnpXWtVk5yIiKqmVgNo4cKFAEpPlx4dHY2pU6fW5uKrhENwRET6M+ghuAf16dMHx44dw/PPP4/s\n7GxoNBoApdeDycnJgb29Pezt7Z96GxERGY5aCaA7d+4gMzNTvn3o0CE0adIE1tbWcHV1RUJCAgAg\nISEBrq6usLKyqpE2IiIyHLVyFNyNGzfw/vvv4969e1AqlWjSpAlmzZqFNm3aID09HbNnz0ZBQQEs\nLS0RFRUlX8SpJtr0xSE4IiL9VWcIjicjrQQD6NmmsjBBg4amostAcWER8m9VfglxIkNRnQD
imRCI\nKtCgoSmSho8UXQaCNm0AGEBUT/FkpEREJAQDiIiIhGAAERGREAwgIiISggFERERCMICIiEgIBhAR\nEQnBACIiIiEYQEREJAQDiIiIhGAAERGREAwgIiISggFERERCMICIiEgIBhAREQnBACIiIiEYQERE\nJAQDiIiIhGAAERGREAwgIiISggFERERCMICIiEgIBhAREQnBACIiIiEYQEREJIRxbSwkLy8PM2fO\nxOXLl2FiYoJmzZohMjISVlZWcHFxgbOzM5TK0iyMjo6Gi4sLAODQoUOIjo6GRqNBmzZtsHjxYpiZ\nmT1RGxERGYZa2QJSKBQYPXo0kpOTER8fjxdffBExMTFy+7Zt2xAXF4e4uDg5fO7cuYOPP/4Ya9as\nwf79+9GoUSOsX7/+idqIiMhw1EoAqVQqeHt7y7fd3d2RkZHxyPscPnwYbm5uaN68OQAgNDQUe/fu\nfaI2IiIyHLUyBPcgrVaLrVu3ws/PT542bNgwaDQavPHGG5g8eTJMTEyQmZkJBwcHeR4HBwdkZmYC\nQLXbqsLaunGV70NUE2xsLESXQFQjaj2A5s+fD3NzcwwdOhQA8OOPP8Le3h63b9/GjBkzsGrVKnz4\n4Ye1XVY5ubm3odVKossgQQzpQ//69VuiSyB6LKVSUeUv7rV6FFxUVBQuXbqEZcuWyQcd2NvbAwAa\nN26MAQMG4Pfff5enPzhMl5GRIc9b3TYiIjIctRZAn332GVJTU7Fq1SqYmJgAAG7evInCwkIAQElJ\nCZKTk+Hq6goA6Ny5M/7++29cvHgRQOmBCj179nyiNiIiMhy1MgR37tw5rF27Fs2bN0doaCgA4IUX\nXsDo0aMRHh4OhUKBkpISdOjQAVOnTgVQukUUGRmJcePGQavVwtXVFfPmzXuiNiIiMhwKSZK4o6MC\n3Af0bLOxsUDS8JGiy0DQpg3cB0R1gsHvAyIiIirDACIiIiFq/TBsqjnPNTGBsYmp0BpK7hch7+Z9\noTUQUd3AAKpHjE1M8Vv0aKE1eMxcB4ABRESPxyE4IiISggFERERCMICIiEgIBhAREQnBACIiIiEY\nQEREJAQDiIiIhGAAERGREAwgIiISggFERERCMICIiEgIBhAREQnBACIiIiEYQEREJAQDiIiIhGAA\nERGREAwgIiISggFERERCMICIiEgIBhAREQnBACIiIiEYQEREJEStBFBeXh7GjBmDgIAA9OrVC5Mm\nTYJarQYAnDx5Er1790ZAQABGjRqF3Nxc+X410UZERIahVgJIoVBg9OjRSE5ORnx8PF588UXExMRA\nq9VixowZCA8PR3JyMjw9PRETEwMANdJGRESGo1YCSKVSwdvbW77t7u6OjIwMpKamwtTUFJ6engCA\n0NBQ7Nu3DwBqpI2IiAxHre8D0mq12Lp1K/z8/JCZmQkHBwe5zcrKClqtFvn5+TXSRkREhsO4thc4\nf/58mJubY+jQodi/f39tL15v1taNRZdQZ9nYWIguoV5hf1J9VasBFBUVhUuXLmHNmjVQKpWwt7dH\nRkaG3K5Wq6FUKqFSqWqkrSpyc29Dq5WeYG1rn6F8UF2/fkt0CU/MUPoSqB/9SfWfUqmo8hf3WhuC\n++yzz5CamopVq1bBxMQEAODm5obCwkKcOHECALBt2zYEBgbWWBsRERmOWtkCOnfuHNauXYvmzZsj\nNDQUAPDCCy9g1apViI6ORkREBIqKiuDo6IglS5YAAJRK5VNvIyIiw6GQJKlujTPVkro6BPdb9Gih\nNXjMXFcvhoxsbCyQNHyk6DIQtGlDvehPqv8MegiOiIjoQQwgIiISggFERERCMICIiEgIBhAREQnB\nACIiIiEYQEREJAQDiIiIhGAAERGREAwgIiISggFERERCMICIiEgIBhAREQmhdwCtX7++wukbNmx4\nasUQEdGzQ+8AWrVqVYXTV69e/dSKISKiZ8djL0h39Oh
RAIBWq8Uvv/yCBy8fdPXqVTRq1KjmqiMi\nonrrsQE0b948AEBRURHmzp0rT1coFLCxsUFYWFjNVUdERPXWYwPo0KFDAICZM2ciOjq6xgsiIqJn\nw2MDqMyD4aPVanXalEoeTEdERFWjdwCdOnUKkZGROHv2LIqKigAAkiRBoVDg9OnTNVYgERHVT3oH\n0OzZs9G1a1csWrQIDRs2rMmaiIjoGaB3AF27dg0ffvghFApFTdZDRETPCL133vj7++Pnn3+uyVqI\niOgZovcWUFFRESZNmgQPDw80bdpUp41HxxERUVXpHUCtWrVCq1atarIWIiJ6hugdQJMmTarJOoiI\n6BmjdwCVnZKnIh07dnwqxRAR0bND7wAqOyVPmby8PBQXF8POzg4HDx587P2joqKQnJyMa9euIT4+\nHs7OzgAAPz8/mJiYwNTUFAAwffp0dO7cGQBw8uRJhIeHo6ioCI6OjliyZAmsra2fqI2IiAyD3kfB\nHTp0SOfvxIkTGD9+PIYOHarX/bt164Zvv/0Wjo6O5dqWL1+OuLg4xMXFyeGj1WoxY8YMhIeHIzk5\nGZ6enoiJiXmiNiIiMhzVPoeOkZERxo8fj3Xr1uk1v6enJ+zt7fV+/NTUVJiamsLT0xMAEBoain37\n9j1RGxERGQ69h+AqcuTIkafyw9Tp06dDkiR4eHhg2rRpsLS0RGZmJhwcHOR5rKysoNVqkZ+fX+02\nlUqld03W1o2feL2eVTY2FqJLqFfYn1Rf6R1AXbp00Qmbe/fu4f79+4iIiHiiAr799lvY29vj/v37\nWLhwISIjIw1iyCw39za0WunxMxoQQ/mgun79lugSnpih9CVQP/qT6j+lUlHlL+56B9CSJUt0bpuZ\nmaFFixZo3PjJthTKhuVMTEwwePBgTJgwQZ6ekZEhz6dWq6FUKqFSqardRkREhkPvfUBeXl7w8vKC\np6cnmjdvjjZt2jxx+Ny9exe3bpV+u5MkCUlJSXB1dQUAuLm5obCwECdOnAAAbNu2DYGBgU/URkRE\nhkPvLaDbt28jMjISSUlJKCkpgbGxMd566y2EhYXBwuLxwxULFixASkoKbty4gZEjR0KlUmHNmjWY\nPHkyNBoNtFotnJyc5CE9pVKJ6OhoRERE6BxO/SRtRERkOBSSJOm1o2P27Nm4c+cOpk2bBkdHR1y7\ndg2xsbEwMzNDVFRUTddZ6+rqPqDfokcLrcFj5rp6sc/CxsYCScNHii4DQZs21Iv+pPqvRvcB/fvf\n/8aBAwdgZmYGAGjRogUWL14Mf3//qlVJRESEKuwDMjU1hVqt1pmWl5cHExOTp14UERHVf3pvAfXv\n3x+jRo3CiBEj4ODggIyMDGzcuBEDBgyoyfqIiKie0juAJkyYADs7O8THxyMnJwe2trYYPXo0A4iI\niKpF7yG4hQsXokWLFti4cSOSkpKwceNGODk5YeHChTVZHxER1VN6B1BCQgLc3Nx0prm5uSEhIeGp\nF0VERPWf3gGkUCig1Wp1ppX9foeIiKiq9A4gT09PfP7553LgaLVarFixQj7rNBERUVVU6YJ048aN\ng6+vLxwcHJCZmQkbGxusWbOmJusjIqJ6Su8Aev7557Fr1y789ddfyMzMhL29Pdq1awelstqXFCIi\nomdYla4HpFQq4e7uDnd395qqh4iInhHcfCEiIiEYQEREJAQDiIiIhGAAERGREAwgIiISggFERERC\nMICIiEgIBhAREQnBACIiIiEYQEREJAQDiIiIhGAAERGREAwgIiISggFERERCMICIiEiIWgmgqKgo\n+Pn5wcXFBf/v//0/efqFCxcwcOBABAQEYODAgbh48WKNthERkeGolQDq1q0bvv32Wzg6OupMj4iI\nwODBg5GcnIzBgwcjPDy8RtuIiMhw1EoAeXp6wt7eXmdabm4u0tLSEBwcDAAIDg5GWloa1Gp1jbQR\nEZFhqdIluZ+mzMx
M2NnZwcjICABgZGQEW1tbZGZmQpKkp95mZWVVpfqsrRs/xbV9ttjYWIguoV5h\nf1J9JSyADF1u7m1otZLoMqrEUD6orl+/JbqEJ2YofQnUj/6k+k+pVFT5i7uwALK3t0d2djY0Gg2M\njIyg0WiQk5MDe3t7SJL01NuIiMiwCDsM29raGq6urkhISAAAJCQkwNXVFVZWVjXSRkREhkUhSVKN\njzMtWLAAKSkpuHHjBp577jmoVCokJiYiPT0ds2fPRkFBASwtLREVFYWWLVsCQI20VUVdHYL7LXq0\n0Bo8Zq6rF0NGNjYWSBo+UnQZCNq0oV70J9V/1RmCq5UAqosYQNXDAHq6GEBUV1QngHgmBCIiEoIB\nREREQjCAiIhICAYQEREJwQAiIiIhGEBERCQEA4iIiIRgABERkRAMICIiEoIBREREQjCAiIhICAYQ\nEREJwQAiIiIhGEBERCQEA4iIiIRgABERkRAMICIiEoIBREREQjCAiIhICAYQEREJwQAiIiIhGEBE\nRCQEA4iIiIRgABERkRAMICIiEoIBREREQhiLLgAA/Pz8YGJiAlNTUwDA9OnT0blzZ5w8eRLh4eEo\nKiqCo6MjlixZAmtrawCodhsRERkGg9kCWr58OeLi4hAXF4fOnTtDq9VixowZCA8PR3JyMjw9PRET\nEwMA1W4jIiLDYTAB9LDU1FSYmprC09MTABAaGop9+/Y9URsRERkOgxiCA0qH3SRJgoeHB6ZNm4bM\nzEw4ODjI7VZWVtBqtcjPz692m0qlqtV1IiKiyhlEAH377bewt7fH/fv3sXDhQkRGRsLf319oTdbW\njYUuvy6zsbEQXUK9wv6k+sogAsje3h4AYGJigsGDB2PChAkYPnw4MjIy5HnUajWUSiVUKhXs7e2r\n1VYVubm3odVKT7hmtctQPqiuX78luoQnZih9CdSP/qT6T6lUVPmLu/B9QHfv3sWtW6VvMEmSkJSU\nBFdXV7i5uaGwsBAnTpwAAGzbtg2BgYEAUO02IiIyHMK3gHJzczF58mRoNBpotVo4OTkhIiICSqUS\n0dHRiIiI0DmcGkC124iIyHAoJEmqW+NMtaSuDsH9Fj1aaA0eM9fViyEjGxsLJA0fKboMBG3aUC/6\nk+q/OjkER0REzyYGEBERCcEAIiIiIRhAREQkBAOIiIiEYAAREZEQDCAiIhKCAUREREIwgIiISAgG\nEBERCcEAIiIiIYSfjLQusLBsiIamDYTWUFhUjFsFhUJrICJ6mhhAemho2gCDZ34rtIYt0UNwCwwg\nIqo/OARHRERCMICIiEgIBhAREQnBACIiIiEYQEREJAQDiIiIhGAAERGREPwdEBHVuCaWJjAxNRVd\nBu4XFeFmwX3RZdD/YQARUY0oWKqDAAAQ7ElEQVQzMTXFZ3PGiS4D0xavBcAAMhQcgiMiIiEYQERE\nJAQDiIiIhGAAERGREAwgIiISot4G0IULFzBw4EAEBARg4MCBuHjxouiSiIjoAfU2gCIiIjB48GAk\nJydj8ODBCA8PF10SERE9oF7+Dig3NxdpaWnYsGEDACA4OBjz58+HWq2GlZWVXo+hVCp0bjd9rtFT\nr7OqHq6pIiaW1rVQyaPpU2ddYNZUfF8C9ac/LVXsz6fFwsIUJiZir9J8/34xbt0qkm9Xp18VkiRJ\nT7MoQ5CamopZs2YhMTFRnhYUFIQlS5agTZs2AisjIqIy9XYIjoiIDFu9DCB7e3tkZ2dDo9EAADQa\nDXJycmBvby+4MiIiKlMvA8ja2hqurq5ISEgAACQkJMDV1VXv/T9ERFTz6uU+IABIT0/H7NmzUVBQ\nAEtLS0RFRaFly5aiyyIiov9TbwOIiIgMW70cgiMiIsPHACIiIiEYQEREJAQDiIiIhGAAERGREAwg\nIiISol6ejJT0c+/ePRw+fBiZmZkASs8g0blzZ5ibmwuujIieBfwd0DPqxx9/RFhYG
Nzc3ORTFGVm\nZiI1NRXz589H165dBVf4XxkZGdi3b59OUAYEBMDR0VFwZbpYJxkyQ/zCyQCqIYb+Jg8KCsLq1avR\nrFkznekXL17EhAkTsHfvXkGV6dqxYwdWrlyJ7t276wTlwYMHMXHiRAwYMEBwhaVY59Nn6O+hMnWh\nTkP9wskAqgF14U3eo0cPpKSkVLmttgUEBGDr1q3lzuOnVqsRGhrKOquortRZF95DQN2p01C/cHIf\nUA1Yt24ddu3aVe5NPnHiRISGhhrEi7JNmzYIDw/HwIED4eDgAKD0m9x3330HV1dXwdX9l1arrfAk\nss899xwM6bsT63y66sJ7CKg7dZaUlJQLHwBo3ry5fNUAERhANaAuvMkXL16M9evXY9asWcjIyIBC\noYCDgwMCAgIwZ84c0eXJfH19MXr0aLzzzjs6Qbl9+3Z06tRJcHX/xTqfrrrwHgLqTp2G+oWTQ3A1\n4F//+heuXLlS4Zv8hRdewCeffCK2wDpEq9Viz5492Lt3LzIyMgAADg4OCAwMREhICJRKw/glAet8\nuurKe6iu1FlYWIj169fLz/uDXzjfe+89mJmZCamLAVQD6sqbvDLXr1+HjY2N6DLoGVZX3kN1pU5D\nxQCicvr06YPdu3eLLuOxTp06hTZt2ogu47FYJxkykV84Gc+17NSpU6JLqNR//vMfAKgT4QMAn3/+\nuegS9MI6ny5Dfg89qK7UOWbMGGHL5hZQLRs7diy+/PJL0WXg/Pnz5aa99957+J//+R9IkoRWrVoJ\nqOrx7ty5g4sXL6JZs2Zo3Lix6HLqvHv37iE9PR0vvfQSLC0tRZejF0N5Dz1OXalTJAbQM6p169Zw\ndHTUOVInOzsbdnZ2UCgUOHjwoMDq/is8PBwffPABrKys8Ntvv2Hy5Ml47rnnoFarsWTJEvj6+oou\nEQDg7e2NXr164e233zaow9gftn//fsyaNQu2traIjo7G1KlTYWZmhtzcXCxevBh+fn6iS6RniUS1\nKjg4WHQJkiRJ0ooVK6TRo0dL165dk6d17dpVYEUV69Wrl/z/sGHDpD///FOSJEn6559/pL59+4oq\nq5yuXbtKCxculHx8fKQ+ffpI33zzjZSfny+6rHJCQkKkM2fOSMePH5e8vLyk3377TZIkSTp//rwU\nEhIiuLry1Gq1lJaWJqWlpUlqtVp0OXWWWq2W5s6dK40cOVLavHmzTtukSZMEVSVJ/B1QDahoeKtM\nXl5eLVZSuUmTJiEtLQ3Tpk1DSEgIBg0aBIVCIbqscoqKiuT/79y5g3bt2gEAWrRogeLiYlFlldOk\nSRPMnTsXM2bMwMGDB7Fz504sXboUb775Jvr3729Qv7FxcXEBADRq1AivvvoqAMDJyUlkSeVcvnwZ\nH3/8MdLS0mBrawsAyMnJwSuvvIJ//etfaN68udgC9dCrVy/Ex8eLLgMAEBERgRdeeAFdunTB1q1b\ncfToUSxbtgzGxsa4cuWKsLoYQDUgODi43PBWmfz8fAEVVeyVV17Bpk2bsHz5cowYMcKgPtDLdOzY\nEZ9++immTp0Kb29vJCUlISgoCEeOHIFKpRJdXjkNGjRAYGAgAgMDkZ2djV27dmH+/PnYt2+f6NIA\nAAqFAunp6SgoKMDdu3dx8uRJuLu748KFC0J/Ef+wmTNnYvDgwdiwYYN8KLNWq0V8fDxmzZqF7777\nTnCFperCl02g9JQ7y5cvBwD4+/sjMjIS48aNwxdffCG0Lu4DqgHdunXDli1bYGdnV66tS5cu+Omn\nnwRU9WgnT57E8ePHMXbsWNGl6Lh//z6io6MRFxcHlUqFK1euwNjYGN7e3vjkk0/w4osvii4RQN05\ndP1///d/MWvWLCiVSsTGxuLLL7/E9evXkZWVhU8++QTBwcGiSwQABAYGVhraj2qrbRXtSy2Tk5OD\n1NRUAVWV17Nnz3Lne4uKikJaWhpycnLEnXxY2
OBfPfbpp5/KY+sPmz9/fi1XUz/cuXNHOn36tHTq\n1CmD3Bdw9epV0SVUS0lJifT3339L169fF12KjoEDB0rx8fGSVquVp2m1WikuLk4aMGCAwMp0+fn5\nSVlZWRW2vfHGG7VcTeXGjBkjHT9+vNz0pUuXSi4uLgIqKsUtICIyOBcvXkRERAROnz4tjyRkZ2ej\ndevW+OSTT9CyZUvBFZaKioqCv7+/vC/tQQsWLEBYWJiAqsrLz8+HQqFAkyZNyrWdP39e2M8uGEBE\nZLDUarXOdXYqOvEnPRmRB0vwIAQiMlhWVlblQseQji57FEOq01APlmAAEZHBqewDU5Ikgzq6zFA/\n2B9mqEfmMoCIyOAY6gfmw+pKnY6Ojo88MlcUBhARGRxD/cB8WF2ps0ePHrh27VqFdfr7+wuoqBQD\niIgMjqF+YD6srtQ5a9asSttEHqnHo+CIiEgIXg+IiIiEYAAREZEQDCCqc/z8/OSrt1ZFeHg4Vq1a\nBQA4duwY3njjjUrnnT17NmJjYwEAJ06cQEBAQPWKrUUP9suaNWswb948wRU93p49ezBq1KhK24cN\nG4YdO3bUYkVUm3gQAj0zIiMjq3U/T09PJCcnP+Vqatb48ePl/69evYpu3brh1KlTMDZ+/Fv+2LFj\nmDFjBg4fPlyTJQIAevfujd69e9f4csgwcQuIiIiEYABRnfT3338jKCgIr732GubMmYOioiLs3LkT\ngwYN0pnPxcUFly5dAqA7rPawtLQ09O3bFx06dMAHH3ygcyG8h4fr/Pz8sH79evTq1QseHh7l5v/q\nq6/g6+sLX19f7NixQ6eGvLw8jB8/Hq+++ir69++PZcuWyTVfvXoVLi4uKCkpkR/rwSGoy5cvY/jw\n4fD29oa3tzc++ugjFBQUVLg+K1aswPTp0wEAQ4cOBQC89tpr6NChA44fPw4vLy+cPXtWnj83Nxft\n27fHtWvXMGbMGOTk5KBDhw7o0KEDsrOz0b59e51f9p86dQo+Pj4oLi7Gzp07ERoaisjISHh4eCAw\nMBBHjx6V57116xbmzp0LX19fdO7cGbGxsfK1hx5+zo4cOYLAwEB4eHggMjKywh94Uv3BAKI6KT4+\nHuvXr8f+/ftx4cKFJ7qw1v379zFx4kSEhITg+PHjCAwMREpKyiPvs3fvXqxbtw4HDx7E2bNnsXPn\nTgDA4cOHsXHjRmzYsAH79+/HsWPHdO4XGRkJU1NT/Pzzz1i0aBF++OEHveuUJAnjxo3Dv//9b+zd\nuxdZWVlYsWLFY++3efNmAMCvv/6KP/74A15eXggKCsKePXvkeRISEtCxY0c4Ojriq6++gq2tLf74\n4w/88ccfsLOzg5eXl841Y+Li4vDWW2+hQYMGAIC//voLL730En755RdMmTIFkyZNks8EMHv2bBgb\nGyMlJQW7d+/GkSNHKtyvo1arMWnSJHzwwQf45Zdf8NJLL+H333/Xu3+o7mEAUZ00ZMgQ2NvbQ6VS\nYcKECUhMTKz2Y/35558oLi7Gu+++K1/RtG3bto+8z7Bhw2BnZweVSoWuXbvi9OnTAEqDqV+/fnj5\n5ZdhZmaGyZMny/fRaDRISUnBlClTYG5uDmdnZ/Tt21fvOps1a4ZOnTrBxMQEVlZWGDlyJH799ddq\nrXPfvn2RmJgob2HExcU9cl9M37595cDSaDRITExESEiI3G5lZSX3X1BQEFq0aIEff/wRN27cwE8/\n/YS5c+fC3Nwc1tbWGDFiRIXP1+HDh/Hyyy8jMDAQDRo0wLvvvoumTZtWa/2obuBBCFQn2dvby/87\nODggJyen2o+Vk5MDOzs7KBQKncd8FBsbG/l/MzMzefk5OTlwc3OrsE61Wo2SkpJytevrxo0bWLhw\nIU6cOIE7d+5AkiRYWlrqff8HtW/fHg0bNsSxY8dgY2ODy5cvo1u3bpXO361bN0RERODKlSu4cOEC\nGjdujHbt2
sntFfVfTk4OMjIyUFJSAl9fX7lNq9Xq9EGZnJwcPP/88/JthUJR4XxUfzCAqE4qu0YM\nAGRkZMDW1hZmZmYoLCyUp1+/fl2vx7KxsUF2djYkSZI/RDMyMqp1uW9bW1tkZ2dXWKeVlRWMjY2R\nmZkJJyencu3m5uYAgMLCQjRu3LjcOnz22WdQKBSIj4+HSqXCgQMH9Dqy78FgeFDZVo2NjQ0CAgJg\nampa6fympqbo2bMn9uzZg3/++Udn6wdAuf7LzMyEn58fnn/+eZiYmOCXX3557BF4NjY2yMrKkm9L\nkqTTP1T/cAiO6qQtW7YgKysL+fn5WLNmDYKCgtC6dWucO3cOp0+fRlFRkV77RwDA3d0dxsbG2LRp\nE4qLi5GSkoK///67WnUFBgZi586dSE9Px71793T2TRkZGcHf3x8rV67EvXv3cP78eezatUtut7Ky\ngp2dHeLi4qDRaPD999/jypUrcvudO3dgbm4OCwsLZGdnY926dXrVZGVlBaVSqfNYQOkh0AcOHMCe\nPXvQp08febq1tTXy8/Nx69YtnflDQkKwa9cuHDp0qFwAqdVquf/27t2L9PR0dOnSBba2tujUqRM+\n/fRT3L59G1qtFpcvX8bx48fL1dmlSxecO3cOKSkpKCkpwaZNm3Djxg291pHqJgYQ1UnBwcEYNWoU\nunfvjpdeegkTJkxAixYtMHHiRIwYMQI9evSAh4eHXo9lYmKCFStWYNeuXfDy8kJSUlK1TyTZpUsX\nDBs2DMOHD4e/vz/at28vLwMo/THs3bt30alTJ8yePRv9+vXTuf/8+fOxfv16eHt74/z58+jQoYPc\nNmnSJKSlpcHT0xNjx45Fjx499KrJzMwM48ePx6BBg+Dp6YmTJ08CKB0efOWVV6BQKODp6SnP7+Tk\nhLfeegvdu3eHp6envEXn4eEBpVKJNm3awNHRUWcZ7dq1w6VLl+Dj44Nly5Zh+fLleO655wAA0dHR\nKC4ulo9anDJlSoVbp1ZWVvj888+xdOlSeHt749KlSxVe6prqD56MlKgGpaenIzg4GH///XeFQ1A7\nd+7Ejh07sHXrVgHVAXPmzIGtrS0+/PBDveYfPnw4evXqhQEDBsjTRK8D1V3cAiJ6yvbv34/79+/j\n5s2bWLJkCbp27arXGQhq29WrV7F//370799fr/n/+usvpKWloWfPnjVcGT0rGEBET9m2bdvQsWNH\n+Pv7w8jICJ988onokspZtmwZevXqhffee0+vgy1mzZqFkSNHYu7cufIBEkRPikNwREQkBLeAiIhI\nCAYQEREJwQAiIiIhGEBERCQEA4iIiIT4/9XIitKxsMjJAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [] + } + } + ] }, { "cell_type": "code", "metadata": { "id": "KOHPCFRSp5y9", "colab_type": "code", - "colab": {} + "outputId": "3aa099cd-791f-4a5a-9ea7-29168fc239b9", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 274 + } }, "source": [ - "plt.plot(df_train.yearbuilt,df_train.buildingqualitytypeid , 'ro')\n", + "# let's look more into year built vs type \n", + "plt.plot(pd_train.yearbuilt, pd_train.buildingqualitytypeid, 'ro')\n", + "# display the graph\n", "plt.show()" ], "execution_count": 0, - "outputs": [] + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX8AAAEBCAYAAACQbKXWAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAGx1JREFUeJzt3WtwE+ehBuB3JSFjA6ot2YC5ppnG\n1JmG0DqDGTeFYAimDZDWzJSUi5OBhLYpKbRhUkLTkgYCUUlomEBi0kNLSDLwxx7aQE8hHS4pUCg0\nF+oMxdQY8Az4ItkcY4LtSPudHyDFF0nWZaXd9fc+v8iu9tt3V6tX8mqjVYQQAkREJBWL3gGIiCj1\nWP5ERBJi+RMRSYjlT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0Qk\nIZY/EZGEbHoHaGm5AVUVcLkGw+tt0ztOXJhdH8yeembNDfSf7BaLgqysQQmPqXv5q6qAqorgv82K\n2fXB7Kln1twAs3fF0z5ERBJi+RMRSYjlT0QkoajK3+12o7i4GOPGjUN1dTUAoKWlBU888QRKSkow\ne/ZsLFu2DM3NzUkNS0RE2ojqC99p06ahrKwMCxYsCE5TFAWPP/44CgsLAdx6g3j55Zexfv365CSl\npGs9cRyeygr4mr2AxQKoKmxOF7JL58Ixqaj3YwIsFjgmT8HwhY9GHLPnWPFmC4yTM7sk7m1NRLTb\nFHJf3VYdYtzAWNeO/R3tZ8+GnNfXemxOFzLGj8dnZ850ywcg7HPbdV5g2mf/PY/WD44Aqtrr+a1/\n560v5vWkKEDP24LfXp8yaBAURYHa1hacFjOLBQPHjYOvofHWtqSlAZ2d3dYZbj/Wj78HnZ/7eu1b\nTVitgN/fLWe414RRKLHcwL24uBjl5eXIy8vrNW///v3YtWsXduzYEVMAr7cNqiqQkzMETU3XY1rW\nKPpD9tYTx9GwcwdEZ2evxyh2O4aVPQYAYR8DAI4HpnY72EONGRgrljeAcON8ZdmPodz99ajH0UK0\n2xRpf0YUqjwTXY/VCkAB/L7o5oXJ4HhgKtLT7Wj43/1RboyOIuzHVOr5mohX146xWBS4XIMTHlOT\nc/6qqmLXrl0oLi7WYjjSgaeyImyBiM5OeCorIj4GwK1Pg32MGRgr0WyisxOX3343pnG0EO029bWv\nwopQWHGvx+8PXfzh5oXJ0PrBETTsfz/yuozCAMUP9H5NGIkm1/mvXbsWGRkZWLhwYczLdn0Hy8kZ\nokUcXZg9e3VL5O9rfH3MBw
Coarf9EG5MX0tzTPsr3DgdHm/K93u029TX/oxXqtYTUjynaWTX4zWR\nCK2P9YTL3+1249KlSygvL4fFEvsfEjzto69AdluWM+S56QBblhMAIj4GFku3/RBuTFuWM6b9FW6c\ntGxXyvd7tNvU1/5MZP2pWE9Igdc33wSi1+M1ES/DnfbZtGkTqqqqsHXrVtjt9oTDkH6yS+dCCfMc\nKnY7skvnRnwMADgmT+lzzMBYiWZT7HaMWbQgzBLJE+029bWvwlKU8LPiXY/VCljDfM4LNS9MBsfk\nKRhW8mDkdRlFhP2YSj1fE0Ziff7555/v60Hr1q3Dc889h8bGRuzfvx+VlZWYOHEinn76aQwYMAB7\n9uzB7t27cfToUTz00EMxBbh5sxNCAIMGpeGzz+I4R2oA/SF72qjRGOByof3iRag3b976lCcEbE4X\nhj4yH45JRb0fE2CxwDHlgV5fbPV8fNexYhFunNEzpqV8v0e7TWH3VQQ2pwtDFyyE7+Zn8Hk8vedF\nsR6b04XBhZPgb73+Rb75CzD4618P/dz2mBfIYHE40HH58q1z512e39FTivB/9U1fzOspVOneXp8y\naBAsaWm3vqO4PS1mFgsGfvWrgCpubUtaWq+/RMLtR8f4e6A4Xb32rSas1u7bE+Y1Ea+uHaMoCjIy\nEv+wHdPVPsnA0z76YnZ9mDW7WXMD/Se7IU77EBGRObH8iYgkxPInIpIQy5+ISEIsfyIiCbH8iYgk\nxPInIpIQy5+ISEIsfyIiCbH8iYgkpMlPOhMRxUqru7xRfFj+RJRyPe9A5mv2omHnDgDgG0CK8LQP\nEaWcVnd5o/ix/Iko5cLdgCZlN6Yhlj8RpZ7N6YppOmmP5U9EKafVXd4ofvzCl4hSLvClLq/20Q/L\nn4h04ZhUxLLXEU/7EBFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJ\nqM/yd7vdKC4uxrhx41BdXR2cXltbi3nz5qGkpATz5s3DxYsXk5mTiIg01OfPO0ybNg1lZWVYsGBB\nt+lr1qzB/Pnz8fDDD+NPf/oTfv3rX2Pnzp1JC6q3+nfeQusHRwBVBSwWOCZPQcZX7oKnsgLVzV7A\nYgFUNam/UdL1zkdBigIMGAB0dkaVoecY1b0ekTqBnPVv/RH4/POYlw+XfWB+PsY8/Yte0y+/4kb7\n2bNfTEhPB27ejHm9WtBzvyfCrLkBg2VPT0fea2/oGkERQohoHlhcXIzy8nLk5eXB6/WipKQEJ0+e\nhNVqhd/vR2FhIQ4cOACn0xlTAK+3DaoqkJMzBE1N1+PaiGSrf+cttB4+1HuGogAhdp9it2NY2WOa\nvgH0vPNRX0JliHUMM+v5BtCr+In0FsMbQNd+tFgUuFyDE159XOf8r169imHDhsFqtQIArFYrhg4d\niqtXryYcyIhaPzgSekaY981k3JEo1J2PIgmVIdYxzKxn0bP4yXB0+qszQPdf9ez6DpaTM0THJOFV\nq2rMy/hamjXdnuqW5oQzxDOGmXXbdh1zEIUTS0do3Y9xlX9ubi4aGhrg9/uDp30aGxuRm5sb81hm\nOO0TOJceC1uWU9PtsWU5Y77FXc8M8YxhZoY9nohui/YYNcxpH5fLhfz8fOzduxcAsHfvXuTn58d8\nvt8sHJOnhJ6hKKEnJ+GORKHufBRJqAyxjmFmA/PzI/43ke7S03VdfZ9f+K5btw4HDhyAx+NBVlYW\nMjMzsW/fPtTU1GDVqlVobW2Fw+GA2+3GnXfeGXMAU3zyR+SrfXwmvtpHT4le7ROOGa72IcnFeLVP\nMj75R321T7KYpfwjYXZ9MHvqmTU30H+y63rah4iIzI3lT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0Qk\nIZY/EZGEWP5ERBJi+RMRSYjlT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0QkIZY/EZGEWP5ERBJi
+RMR\nSYjlT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0QkIZY/EZGEbIkO\ncOjQIWzevBlCCAghsGzZMsyYMUOLbERElCQJlb8QAs888wzeffdd5OXl4T//+Q9+8IMfYPr06bBY\n+EeFLFpPHIensgK+Zi9sTheyS+fCMalI71hx6U/bYib9ab+H2hYAhtu+hD/5WywWXL9+HQBw/fp1\nDB06lMUvkdYTx9GwcwdEZycAwNfsRcPOHQCg+8Edq/60LWbSn/Z7qG2p/+MfAAjA7w9OM8L2JdTS\niqLg1VdfxZNPPompU6fiJz/5Cdxut1bZyAQ8lRXBAz1AdHbCU1mhU6L49adtMZP+tN9DbQv8vmDx\nBxhh+xL65O/z+bBt2za8/vrrKCgowL/+9S+sWLEC+/btw6BBg6Iaw+UaHPx3Ts6QROLoStbs1S3N\nIaf7WppTsk+0XEeqt8Wsx4zWuVO535O9z8NtSyixbp/W2RMq/7Nnz6KxsREFBQUAgIKCAqSnp6Om\npgbjx4+Pagyvtw2qKpCTMwRNTdcTiaMbmbPbspzwNXtDTk/2PtF6v6dyW8x6zCQjd6r2eyr2ebht\nCffYaPN0zW6xKN0+NMcrodM+w4cPR319PS5cuAAAqKmpgdfrxZgxYxIORuaQXToXit3ebZpitwe/\n5DKT/rQtZtKf9nuobYHVBlit3SYZYfsS+uSfk5OD559/HsuXL4eiKACA9evXIzMzU5NwZHyBL6yM\ndiVDPPrTtphJf9rv4bYl1DS9t08RQgg9A/C0j76YXR9mzW7W3ED/yW6I0z5ERGROLH8iIgmx/ImI\nJMTyJyKSEMufiEhCLH8iIgmx/ImIJMTyJyKSEMufiEhCLH8iIgklfDOXVKt+6sfAzZvdpgV+K+Oz\n/55H6wdHAFUFLBY4Jk9B6+FDyc+U9DUkD7Prw6zZzZob0DG7ogBCABYLoKr8bZ+AWH7bJ1TxBwV2\nMBGRwSl2O4aVPRb1GwB/2ydc8QMsfiIyDSPcyctc5U9E1E9Ee9OXZGH5ExHpwOZ06bp+c5V/enr4\nebdvJkNEZHRGuJOXqco/77U3Qr4B2JwuDF/yBBwPTL31jTpw62qfB6amOCERUQ+BD6a3u8nmdMX0\nZW+ymOpqH6Nidn0we+qZNTfQf7LLebUPERFpguVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8R\nkYRY/kREEmL5ExFJKOGbuXR0dGD9+vX4xz/+gbS0NEyYMAFr167VIhsRESVJwuW/ceNGpKWlYf/+\n/VAUBR6PR4tcRESm13riODyVFfA1ew1zB6+AhMr/xo0b2LNnD44cOQLl9o8XZWdnaxKMiMjMWk8c\nR8POHRCdnQBu/X5/w84dAGCIN4CEzvnX1dUhMzMTW7ZsQWlpKRYtWoTTp09rlY2IyLQ8lRXB4g8w\nwh28AhL65O/3+1FXV4e7774bv/jFL/DJJ5/gRz/6Ed5//30MHhzdr851/XW6nJwhicTRFbPrg9lT\nz6y5gdRmr25pDjnd19IcVw6tsydU/rm5ubDZbJg1axYA4N5770VWVhZqa2txzz33RDUGf9JZX8yu\nD7NmN2tuIPXZbVnOkLdqtGU5Y85huJ90djqdKCwsxLFjxwAAtbW18Hq9GDt2bMLBiIjMLLt0LhS7\nvds0I9zBKyDhq31+85vfYPXq1XC73bDZbPjtb38Lh8OhRTYiItMKfKnbL6/2AYDRo0fj7bff1iIL\nEVG/4phUZJiy74n/hy8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8R\nkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVP\nRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kRE
EmL5ExFJSLPy37JlC8aNG4fq6mqthiQi\noiSxaTHIp59+io8//hgjR47UYriYtZ44Dk9lBXzNXticLmSXzoVjUlGf8+Id8/IrbrSfPRt8bLe3\nO0UBBgwAOjsBiwVQ1eDyAIJjBubpzcxv1cyeembNDRg3e97/7NBlvQmXf2dnJ1544QW88sorKCsr\n0yJTTFpPHEfDzh0QnZ0AAF+zFw07dwTnh5sX6Q0g0pjXjv29W/H3IsSt4geC5e5r9qL+j38AIAC/\nv9s8IpJb9eOP6fIGkHD5b968GXPmzMGoUaO0yBMzT2VFsKQDRGcnPJUVwX+Hmhep/CON6Wv2xhfU\n74tvOSKiJEio/D/66CNUVVVh5cqVcY/hcg0O/jsnZ0jMy1e3NIec7gszPTAv0rriGZOIKF7RdF88\n/RhJQuV/6tQp1NTUYNq0aQCA+vp6LFmyBBs2bMD9998f1RhebxtUVSAnZwiamq7HnMGW5Qz5adyW\n5QSAsPMirSvSmHF/8iciCqOv7uvajxaL0u1Dc7wSutpn6dKlOHr0KA4ePIiDBw9i+PDh2L59e9TF\nr4Xs0rlQ7PZu0xS7HdmlcyPOi3fMgfn58QW12gCrNb5liYg0psnVPnoKnLuPdEVPrFf7RBrTMamo\n19U+3Zjsah8i0pdeV/soQgihy5pvS/S0jxEwuz6YPfXMmhvoP9kNcdqHiIjMieVPRCQhlj8RkYRY\n/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQh\nlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJ\niOVPRCQhlj8RkYRsiSzc0tKCZ555BpcvX4bdbsfYsWPxwgsvwOl0apWPiIiSIKFP/oqi4PHHH8f+\n/fvx3nvvYfTo0Xj55Ze1ykZEREmSUPlnZmaisLAw+N8TJkzAlStXEg5FRETJpQghhBYDqaqKxYsX\no7i4GGVlZVoMSURESZLQOf+u1q5di4yMDCxcuDCm5bzeNqiqQE7OEDQ1XdcqTkoxuz6YPfXMmhvo\nP9ktFgUu1+CEx9Sk/N1uNy5duoTy8nJYLLyAiIjI6BIu/02bNqGqqgpvvvkm7Ha7FpmIiCjJEir/\n8+fPY9u2bbjjjjvwyCOPAABGjRqFrVu3ahKOiIiSI6Hyv+uuu3Du3DmtshARUYrwBD0RkYRY/kRE\nEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYQ0+1VPuqX1xHF4Kivga/bC5nQh\nu3QuHJOKYl4uY/x4fHbmTMhxuj7WMngwhBAQN270uVyo9VU3ewGLBVBVzfdFslWHma6kpUF0dMDm\ndME2bCjaz50z3PaFy250Zs0NGDC7xQLH5CkYvvBRXVav2e/5x6s//aRz64njaNi5A6KzMzhPsdsx\nrOyxiG8AoZbrKTAOgD4fG2q5ruuPZn1ElBqOB6b2+QaQjJ905mkfDXkqK3oVqujshKeyIublegqM\nE81j+1p/rGMQUfK0fnBEl/XytI+GfM3emKZHOz/Wx/W1XLzjEFES6HRKkp/8NWRzumKaHu38ro+L\n9rGRxo9nDCJKEp1ugMXy11B26VwoPW5oo9jtyC6dG/NyPQXGieaxfa0/1jGIKHkck6fosl6e9tFQ\n4EvVWK/2CbVcX1ftJHK1T8/1mfVqn3DMcLUPEa/26UdX+5gRs+vDrNnNmhvoP9l5tQ8REcWN5U9E\nJCGWPxGRhFj+REQS0v1qH4tFCflvs2F2fTB76pk1N9A/smu1Dbpf7UNERKnH0z5ERBJi+RMRSYjl\nT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0QkIZY/EZGEWP5ERBJKSvm73W4UFxdj3LhxqK6uDk4/dOgQ\nvvvd7+Lhhx/GnDlz
cODAgajm1dbWYt68eSgpKcG8efNw8eLFZMSOmP3w4cP43ve+h9mzZ2PhwoWo\nq6uLKp+Rs7e0tOCJJ55ASUkJZs+ejWXLlqG5uTm43Mcff4w5c+agpKQEixcvhtebvHv/xrPfA7Zs\n2dJrOaNn7+jowJo1azBjxgzMnj0bv/rVr4LzjHzMAMZ4rUY6diM99/HO0zt7bW0tFi1ahJkzZ2LW\nrFl49tln0d7eHhzz4MGDmDlzJh588EGsWLECN2/e7DuISIJTp06JK1euiKlTp4pz584JIYRQVVXc\nd999wf8+e/asmDBhgvD7/RHnCSHEokWLxJ49e4QQQuzZs0csWrQoGbHDZr927ZqYOHGiuHDhQjDD\n4sWLg8tEymfk7C0tLeLEiRPB5V966SXx7LPPCiGE8Pv9Yvr06eLUqVNCCCG2bt0qVq1aZZjsAVVV\nVWLJkiXdljND9rVr14oXX3xRqKoqhBCiqakpOM/Ix4xRXqvhjt1Iz32884yQva6uTnz66afBrMuX\nLxdbtmwRQgjR1tYmioqKRG1trRBCiNWrV4vXXnutzxxJKf+AnuU/ceJEcfr0aSGEEP/85z/FjBkz\n+pzn8XhEQUGB8Pl8QgghfD6fKCgoEF6vN5nRu2X/5JNPxHe+853gvJaWFpGXlye8Xm/EfEbP3tNf\n//pX8eijjwaXe+ihh4LzvF6vmDBhQlJzCxFb9o6ODvH9739f1NXV9VrOyNnb2tpEQUGBaGtr6zWG\n0Y8ZI75Whfji2I303Mc7zwjZe9q+fbtYvXq1EEKIv/zlL2Lp0qXBeWfOnOn2/IWTsl/1VBQFr776\nKp588klkZGTgxo0bePPNN/ucd/XqVQwbNgxWqxUAYLVaMXToUFy9ehVOpzMl2b/85S/D4/HgzJkz\nGD9+PN57771gNiFE2HyR5hkhe9cMqqpi165dKC4uDs4fMWJEcL7T6YSqqrh27RoyMzMNkX3z5s2Y\nM2cORo0a1W05o2e3Wq3IzMzEli1bcPLkSQwaNAjLly/HfffdZ/jj3el0Gu612vXYjfTcxzsvmcdM\ntNm7Zmhvb0dFRQV+/vOfA+h9vI8YMQJXr17tc90p+8LX5/Nh27ZteP3113Ho0CG88cYbWLFiBW7c\nuBFxnhEMGTIEv/vd77BhwwaUlpbC6/XC4XAED3Ijizb72rVrkZGRgYULF+qUtLdI2T/66CNUVVVh\n/vz5escMKVJ2v9+Puro63H333aisrMTKlSvx1FNPoa2tTe/YACJnN+Jr1YjHbrRize7z+fCzn/0M\nkyZNwrRp0xJad8o++Z89exaNjY0oKCgAABQUFCA9PR01NTVQFCXsvJEjR6KhoQF+vz/4wmlsbERu\nbm6qogMAioqKUFRUBADweDzYvn07xowZg5s3b4bNJ4QwdPYAt9uNS5cuoby8HBbLrc8Dubm5uHLl\nSvAxzc3NsFgsKfvk3Ff2d955BzU1NcEXQH19PZYsWYINGzYYPnt7eztsNhtmzZoFALj33nuRlZWF\n2tpajBgxwtDHTKTXsR6v1Z7HbqTnPt55RsgOAH6/HytXrsSXvvQlPPfcc8HH5ebm4uTJk8H/vnLl\nSlT7PGWf/IcPH476+npcuHABAFBTUwOv14sxY8ZEnOdyuZCfn4+9e/cCAPbu3Yv8/PyU/Qkc0NTU\nBODWn2mbNm3CI488goyMjIj5jJ4dADZt2oSqqips3boVdrs9uMzXvvY1tLe34/Tp0wCA3bt3Y+bM\nmSnNHSn70qVLcfToURw8eBAHDx7E8OHDsX37dtx///2Gz+50OlFYWIhjx44BuHUlh9frxdixYw1/\nzBjptRrq2I303Mc7zwjZVVXFqlWrYLVa8eKLL0JRvrihy7e+9S38+9//Dl5ZtXv3bnz729/uM0NS\nbuaybt06HDhwAB6PB1lZWcjMzMS+ffvw5z//Gb///e+DwX/6059i+vTpABBxXk1NDV
atWoXW1lY4\nHA643W7ceeedWseOmP2Xv/wlPvzwQ3z++ef45je/idWrVyMtLa3PfEbOfv78ecyaNQt33HEHBg4c\nCAAYNWoUtm7dCgD48MMPsWbNGnR0dGDkyJHYuHEjsrOzDZG9p+LiYpSXlyMvL88U2evq6rB69Wpc\nu3YNNpsNK1aswJQpUwAY+5gBjPFajXTsRnru452nd/bDhw/jhz/8IfLy8oJ/nX/jG9/AmjVrAAB/\n+9vfsHHjRqiqivz8fLz00kvBD3jh8E5eREQS4v/hS0QkIZY/EZGEWP5ERBJi+RMRSYjlT0QkIZY/\nEZGEWP5ERBJi+RMRSej/AZusTW/jKGeJAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [] + } + } + ] }, { "cell_type": "markdown", @@ -1889,6 +2302,16 @@ "- filling nans" ] }, + { + "cell_type": "markdown", + "metadata": { + "id": "ofZIC0EdKJ0Y", + "colab_type": "text" + }, + "source": [ + "# -----current: test ready-----" + ] + }, { "cell_type": "code", "metadata": { @@ -1937,16 +2360,258 @@ { "cell_type": "code", "metadata": { - "id": "uCyRxp-7qEXf", + "id": "AT8Osn51lD9v", + "colab_type": "code", + "outputId": "9a3af301-2c19-4bfd-faca-3dba219a270c", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 50 + } + }, + "source": [ + "print(df_train.buildingqualitytypeid.isnull().sum())\n", + "print(df_train.shape)\n", + "temp=df_train.copy()\n", + "temp['buildingqualitytypeid']=temp['buildingqualitytypeid'].fillna(-1)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "32911\n", + "(90275, 45)\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "f8rNxkrxACGe", "colab_type": "code", "colab": {} }, "source": [ + "\"\"\"RESET WIRE\"\"\"\n", + "# hold_df = df_train.copy()\n", + "df_train = hold_df.copy()" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "OkyuebKaACxa", + "colab_type": "code", + "outputId": "d0dc876b-b02f-4179-91d0-d9a9b42e0e27", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 185 + } + }, + "source": [ + "\n", "print(df_train.buildingqualitytypeid.isnull().sum())\n", "print(df_train.shape)\n", "temp=df_train.copy()\n", "temp['buildingqualitytypeid']=temp['buildingqualitytypeid'].fillna(-1)\n", - "temp=temp.groupby(\"buildingqualitytypeid\").filter(lambda x: x.buildingqualitytypeid.size > 3)\n", + "print(temp.to_pandas().head())\n" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "32911\n", + "(90275, 45)\n", + " parcelid logerror ac_id ... 
transaction_month census_tractnumber block_number\n", + "0 11827818 0.0402 NaN ... 3 5315.03 1013\n", + "1 12123024 0.0296 NaN ... 3 4625.00 1017\n", + "2 13867327 0.0344 NaN ... 3 0114.01 2017\n", + "3 12681894 0.0060 NaN ... 3 6513.02 1004\n", + "4 12848541 0.0695 1.0 ... 3 4087.03 1018\n", + "\n", + "[5 rows x 45 columns]\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "79bB7JKdAEtX", + "colab_type": "code", + "outputId": "29f38a6a-dac2-4917-8f1b-8a4b198afe67", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 118 + } + }, + "source": [ + "print(temp.to_pandas().buildingqualitytypeid.head())" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "0 7.0\n", + "1 -1.0\n", + "2 -1.0\n", + "3 7.0\n", + "4 4.0\n", + "Name: buildingqualitytypeid, dtype: float64\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DVgF1c_p_bN1", + "colab_type": "text" + }, + "source": [ + "# -----current: break-----" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "mAB9bsrPAGzQ", + "colab_type": "code", + "outputId": "2f9eaa73-a7b2-4634-e24d-9aec777b2536", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 387 + } + }, + "source": [ + "# say we run this whole thing by buildingqualitytypeid\n", + "# temp=temp.groupby(\"buildingqualitytypeid\")\n", + "# drop building types that aren't seen at least 3 times in the data\n", + "# .filter(lambda x: x.buildingqualitytypeid.size > 3)\n", + "# conditions = (temp.buildingqualitytypeid.value_counts > 3)\n", + "# print(temp.loc[temp.buildingqualitytypeid.astype(int) > 3].head())\n", + "# temp.loc[temp.census_tractnumber.value_counts() > 3]\n", + "print(temp.loc[temp.census_tractnumber.value_counts().values > 3].to_pandas().head())\n", + "\n", + "# temp = temp.loc[]\n", + "print(temp.to_pandas().buildingqualitytypeid.head())\n" + ], + "execution_count": 0, + "outputs": [ + { + 
"output_type": "error", + "ename": "RuntimeError", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtemp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mtemp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcensus_tractnumber\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue_counts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_pandas\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# temp = temp.loc[]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtemp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_pandas\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuildingqualitytypeid\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/indexing.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, arg)\u001b[0m\n\u001b[1;32m 108\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mtuple\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 109\u001b[0m \u001b[0marg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mslice\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 110\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_tuple_arg\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 111\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/indexing.py\u001b[0m in \u001b[0;36m_getitem_tuple_arg\u001b[0;34m(self, arg)\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 211\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mcol\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcolumns_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 212\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mcolumns_df\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 213\u001b[0m \u001b[0;31m# Step 3: Gather index\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# we have a single row\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/indexing.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, arg)\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__getitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[0marg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_loc_to_iloc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 56\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 57\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/indexing.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, arg)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtuple\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0marg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 38\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 39\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/series.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, arg)\u001b[0m\n\u001b[1;32m 390\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 391\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__getitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 392\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 393\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 394\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_scalar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/column.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, arg)\u001b[0m\n\u001b[1;32m 530\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 531\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapi\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtypes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_bool_dtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 532\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_boolean_mask\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 533\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mNotImplementedError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 534\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/columnops.py\u001b[0m in \u001b[0;36mapply_boolean_mask\u001b[0;34m(self, mask)\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[0mmask\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mas_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"bool\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 118\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mapply_apply_boolean_mask\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 119\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 120\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcolumn_empty_like\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnewsize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32mcudf/bindings/stream_compaction.pyx\u001b[0m in 
\u001b[0;36mcudf.bindings.stream_compaction.apply_apply_boolean_mask\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mRuntimeError\u001b[0m: cuDF failure at: /conda/conda-bld/libcudf_1566412619056/work/cpp/src/stream_compaction/apply_boolean_mask.cu:64: Column size mismatch" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "QCyed1SjAJFP", + "colab_type": "code", + "colab": {} + }, + "source": [ + "print(temp.to_pandas().head())\n" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "1JgQ1Tq2NRsz", + "colab_type": "code", + "outputId": "c113cc08-3a69-4aa1-d05e-7b4d2a5df9fa", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 162 + } + }, + "source": [ + "df_train.loc[df_train.buildingqualitytypeid>3]" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "error", + "ename": "NameError", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuildingqualitytypeid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m>\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mNameError\u001b[0m: name 'buildingqualitytypeid' is not defined" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "XFkPwjUmHu4Y", + "colab_type": "code", + "outputId": "00b5fdb3-25fc-460a-bbd3-aaa421a93555", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 279 + } + }, + "source": [ + "temp=temp.groupby(\"buildingqualitytypeid\").filter(lambda x: x.buildingqualitytypeid.size > 3)\n" + ], + "execution_count": 0, + "outputs": [ 
+ { + "output_type": "error", + "ename": "AttributeError", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtemp\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtemp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"buildingqualitytypeid\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfilter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuildingqualitytypeid\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/groupby/groupby.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 133\u001b[0m )\n\u001b[1;32m 134\u001b[0m raise AttributeError(\n\u001b[0;32m--> 135\u001b[0;31m \u001b[0;34m\"'DataFrameGroupBy' object has no attribute \"\u001b[0m \u001b[0;34m\"'{}'\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 136\u001b[0m )\n\u001b[1;32m 137\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAttributeError\u001b[0m: 'DataFrameGroupBy' object has no attribute 'filter'" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "uCyRxp-7qEXf", + "colab_type": "code", + "outputId": "969848f0-fbc6-4388-dca2-08f8bde03990", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 380 + } + }, + "source": [ + "\n", "temp['buildingqualitytypeid'] = 
temp['buildingqualitytypeid'].replace(-1,np.nan)\n", "print(temp.buildingqualitytypeid.isnull().sum())\n", "print(temp.shape)\n", @@ -1962,7 +2627,28 @@ "print(df_train.buildingqualitytypeid.isnull().sum())" ], "execution_count": 0, - "outputs": [] + "outputs": [ + { + "output_type": "stream", + "text": [ + "32911\n", + "(90275, 45)\n" + ], + "name": "stdout" + }, + { + "output_type": "error", + "ename": "AttributeError", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'buildingqualitytypeid'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtemp\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'buildingqualitytypeid'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mtemp\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtemp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"buildingqualitytypeid\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfilter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuildingqualitytypeid\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m \u001b[0;34m>\u001b[0m 
\u001b[0;36m3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'buildingqualitytypeid'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'buildingqualitytypeid'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnan\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtemp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuildingqualitytypeid\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misnull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/groupby/groupby.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 133\u001b[0m )\n\u001b[1;32m 134\u001b[0m raise AttributeError(\n\u001b[0;32m--> 135\u001b[0;31m \u001b[0;34m\"'DataFrameGroupBy' object has no attribute \"\u001b[0m \u001b[0;34m\"'{}'\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 136\u001b[0m )\n\u001b[1;32m 137\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAttributeError\u001b[0m: 'DataFrameGroupBy' object has no attribute 'filter'" + ] + } + ] }, { "cell_type": "code", From 6936eed3c12bd592bbddddb899ffed00aca5065e Mon Sep 17 00:00:00 2001 From: Winston Date: Wed, 4 Sep 2019 23:19:21 -0700 Subject: [PATCH 3/7] current issues labeled --- 
.../zillow_kaggle_zestimate_comp.ipynb | 853 +++++++++--------- 1 file changed, 433 insertions(+), 420 deletions(-) diff --git a/colab_notebooks/zillow_kaggle_zestimate_comp.ipynb b/colab_notebooks/zillow_kaggle_zestimate_comp.ipynb index cda0658e..f05586f5 100644 --- a/colab_notebooks/zillow_kaggle_zestimate_comp.ipynb +++ b/colab_notebooks/zillow_kaggle_zestimate_comp.ipynb @@ -43,10 +43,10 @@ "metadata": { "id": "W-um5d-x7o46", "colab_type": "code", - "outputId": "35d83399-515c-4172-e915-3886511baba2", + "outputId": "a3d473ea-3028-49fb-b769-c78616b388ae", "colab": { "base_uri": "https://localhost:8080/", - "height": 302 + "height": 312 } }, "source": [ @@ -56,20 +56,20 @@ "# display gpu specs\n", "!nvidia-smi" ], - "execution_count": 0, + "execution_count": 1, "outputs": [ { "output_type": "stream", "text": [ - "Wed Aug 21 22:49:26 2019 \n", + "Thu Sep 5 06:04:00 2019 \n", "+-----------------------------------------------------------------------------+\n", - "| NVIDIA-SMI 430.40 Driver Version: 410.79 CUDA Version: 10.0 |\n", + "| NVIDIA-SMI 430.40 Driver Version: 418.67 CUDA Version: 10.1 |\n", "|-------------------------------+----------------------+----------------------+\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. 
|\n", "|===============================+======================+======================|\n", "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", - "| N/A 49C P8 16W / 70W | 0MiB / 15079MiB | 0% Default |\n", + "| N/A 39C P8 10W / 70W | 0MiB / 15079MiB | 0% Default |\n", "+-------------------------------+----------------------+----------------------+\n", " \n", "+-----------------------------------------------------------------------------+\n", @@ -98,16 +98,16 @@ "metadata": { "id": "p129YxxnihcV", "colab_type": "code", - "outputId": "a7de3ee2-b456-45d7-ab54-03eb1d72a956", + "outputId": "ce0d1990-45c5-4c91-d1f2-86cedd666bbc", "colab": { "base_uri": "https://localhost:8080/", "height": 1000 } }, "source": [ - "!wget -nc https://raw.githubusercontent.com/randerzander/notebooks-contrib/master/utils/rapids-colab.sh\n", - "# RAPIDS 0.9 nightly\n", - "!bash rapids-colab.sh 0.9\n", + "!wget -nc https://raw.githubusercontent.com/rapidsai/notebooks-contrib/master/utils/rapids-colab.sh\n", + "# RAPIDS 0.10 nightly\n", + "!bash rapids-colab.sh \n", "\n", "import sys, os\n", "\n", @@ -115,32 +115,32 @@ "os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'\n", "os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'" ], - "execution_count": 0, + "execution_count": 2, "outputs": [ { "output_type": "stream", "text": [ - "--2019-08-21 22:49:32-- https://raw.githubusercontent.com/randerzander/notebooks-contrib/master/utils/rapids-colab.sh\n", + "--2019-09-05 06:04:07-- https://raw.githubusercontent.com/rapidsai/notebooks-contrib/master/utils/rapids-colab.sh\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 1606 (1.6K) [text/plain]\n", + "Length: 1609 (1.6K) [text/plain]\n", "Saving to: ‘rapids-colab.sh’\n", "\n", "\rrapids-colab.sh 0%[ ] 0 --.-KB/s \rrapids-colab.sh 100%[===================>] 1.57K --.-KB/s in 0s \n", "\n", - "2019-08-21 22:49:33 (231 MB/s) - ‘rapids-colab.sh’ saved [1606/1606]\n", + "2019-09-05 06:04:08 (510 MB/s) - ‘rapids-colab.sh’ saved [1609/1609]\n", "\n", - "--2019-08-21 22:49:33-- https://github.com/rapidsai/notebooks-extended/raw/master/utils/env-check.py\n", - "Resolving github.com (github.com)... 140.82.113.3\n", - "Connecting to github.com (github.com)|140.82.113.3|:443... connected.\n", + "--2019-09-05 06:04:09-- https://github.com/rapidsai/notebooks-extended/raw/master/utils/env-check.py\n", + "Resolving github.com (github.com)... 13.114.40.48\n", + "Connecting to github.com (github.com)|13.114.40.48|:443... connected.\n", "HTTP request sent, awaiting response... 301 Moved Permanently\n", "Location: https://github.com/rapidsai/notebooks-contrib/raw/master/utils/env-check.py [following]\n", - "--2019-08-21 22:49:33-- https://github.com/rapidsai/notebooks-contrib/raw/master/utils/env-check.py\n", + "--2019-09-05 06:04:09-- https://github.com/rapidsai/notebooks-contrib/raw/master/utils/env-check.py\n", "Reusing existing connection to github.com:443.\n", "HTTP request sent, awaiting response... 302 Found\n", "Location: https://raw.githubusercontent.com/rapidsai/notebooks-contrib/master/utils/env-check.py [following]\n", - "--2019-08-21 22:49:33-- https://raw.githubusercontent.com/rapidsai/notebooks-contrib/master/utils/env-check.py\n", + "--2019-09-05 06:04:10-- https://raw.githubusercontent.com/rapidsai/notebooks-contrib/master/utils/env-check.py\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", @@ -149,7 +149,7 @@ "\n", "env-check.py 100%[===================>] 783 --.-KB/s in 0s \n", "\n", - "2019-08-21 22:49:33 (125 MB/s) - ‘env-check.py’ saved [783/783]\n", + "2019-09-05 06:04:10 (162 MB/s) - ‘env-check.py’ saved [783/783]\n", "\n", "Checking for GPU type:\n", "*********************************************\n", @@ -164,16 +164,16 @@ "Uninstalling distributed-1.25.3:\n", " Successfully uninstalled distributed-1.25.3\n", "Installing conda\n", - "--2019-08-21 22:49:38-- https://repo.continuum.io/miniconda/Miniconda3-4.5.4-Linux-x86_64.sh\n", + "--2019-09-05 06:04:14-- https://repo.continuum.io/miniconda/Miniconda3-4.5.4-Linux-x86_64.sh\n", "Resolving repo.continuum.io (repo.continuum.io)... 104.18.200.79, 104.18.201.79, 2606:4700::6812:c94f, ...\n", "Connecting to repo.continuum.io (repo.continuum.io)|104.18.200.79|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 58468498 (56M) [application/x-sh]\n", "Saving to: ‘Miniconda3-4.5.4-Linux-x86_64.sh’\n", "\n", - "Miniconda3-4.5.4-Li 100%[===================>] 55.76M 151MB/s in 0.4s \n", + "Miniconda3-4.5.4-Li 100%[===================>] 55.76M 65.1MB/s in 0.9s \n", "\n", - "2019-08-21 22:49:38 (151 MB/s) - ‘Miniconda3-4.5.4-Linux-x86_64.sh’ saved [58468498/58468498]\n", + "2019-09-05 06:04:15 (65.1 MB/s) - ‘Miniconda3-4.5.4-Linux-x86_64.sh’ saved [58468498/58468498]\n", "\n", "PREFIX=/usr/local\n", "installing: python-3.6.5-hc3d631a_2 ...\n", @@ -217,7 +217,7 @@ " For best results, please verify that your PYTHONPATH only points to\n", " directories of packages that are compatible with the Python interpreter\n", " in Miniconda3: /usr/local\n", - "Installing RAPIDS packages\n", + "Installing RAPIDS 0.10 packages\n", "Please standby, this will take a few minutes...\n", "\n", "\n", @@ -230,133 +230,133 @@ " $ conda update -n base conda\n", "\n", "\n", - "bzip2-1.0.8 | 396 KB | : 100% 1.0/1 [00:00<00:00, 6.99it/s] \n", - "requests-2.22.0 | 84 KB | : 100% 1.0/1 
[00:00<00:00, 6.56it/s] \n", - "olefile-0.46 | 31 KB | : 100% 1.0/1 [00:00<00:00, 23.03it/s]\n", - "yaml-0.1.7 | 78 KB | : 100% 1.0/1 [00:00<00:00, 16.84it/s]\n", - "zlib-1.2.11 | 105 KB | : 100% 1.0/1 [00:00<00:00, 15.03it/s]\n", - "llvmlite-0.29.0 | 19.9 MB | : 100% 1.0/1 [00:03<00:00, 3.64s/it] \n", - "pyopenssl-19.0.0 | 81 KB | : 100% 1.0/1 [00:00<00:00, 16.66it/s]\n", - "thrift-cpp-0.12.0 | 2.4 MB | : 100% 1.0/1 [00:00<00:00, 1.76it/s] \n", - "toolz-0.10.0 | 46 KB | : 100% 1.0/1 [00:00<00:00, 17.97it/s]\n", - "libevent-2.1.10 | 1.3 MB | : 100% 1.0/1 [00:00<00:00, 2.23it/s] \n", - "libffi-3.2.1 | 46 KB | : 100% 1.0/1 [00:00<00:00, 18.49it/s]\n", - "cudf-0.10.0a | 4.8 MB | : 100% 1.0/1 [00:01<00:00, 1.50s/it] \n", - "snappy-1.1.7 | 39 KB | : 100% 1.0/1 [00:00<00:00, 19.74it/s]\n", - "cloudpickle-1.2.1 | 22 KB | : 100% 1.0/1 [00:00<00:00, 17.29it/s]\n", - "re2-2019.08.01 | 420 KB | : 100% 1.0/1 [00:00<00:00, 6.36it/s] \n", - "pyjwt-1.7.1 | 17 KB | : 100% 1.0/1 [00:00<00:00, 23.11it/s]\n", - "libstdcxx-ng-9.1.0 | 4.0 MB | : 100% 1.0/1 [00:00<00:00, 1.44it/s] \n", - "libgfortran-ng-7.3.0 | 1.3 MB | : 100% 1.0/1 [00:00<00:00, 3.53it/s] \n", - "cython-0.29.13 | 2.2 MB | : 100% 1.0/1 [00:00<00:00, 1.72it/s] \n", - "pyparsing-2.4.2 | 57 KB | : 100% 1.0/1 [00:00<00:00, 19.30it/s]\n", - "chardet-3.0.4 | 190 KB | : 100% 1.0/1 [00:00<00:00, 9.45it/s]\n", - "rsa-3.4.2 | 31 KB | : 100% 1.0/1 [00:00<00:00, 19.23it/s]\n", - "libxgboost-0.90.rapi | 33.2 MB | : 100% 1.0/1 [00:08<00:00, 8.58s/it] \n", - "pyasn1-modules-0.2.6 | 47 KB | : 100% 1.0/1 [00:00<00:00, 12.11it/s]\n", - "lz4-c-1.8.3 | 187 KB | : 100% 1.0/1 [00:00<00:00, 12.18it/s]\n", - "freetype-2.10.0 | 884 KB | : 100% 1.0/1 [00:00<00:00, 4.76it/s] \n", - "arrow-cpp-0.14.1 | 17.3 MB | : 100% 1.0/1 [00:03<00:00, 3.36s/it] \n", - "oauthlib-3.0.1 | 82 KB | : 100% 1.0/1 [00:00<00:00, 12.63it/s]\n", - "libcumlprims-0.9.0 | 3.9 MB | : 100% 1.0/1 [00:01<00:00, 1.55s/it] \n", - "libcugraph-0.10.0a | 11.2 MB | : 100% 1.0/1 
[00:02<00:00, 2.33s/it] \n", - "dask-cuml-0.8.0a | 30 KB | : 100% 1.0/1 [00:00<00:00, 3.87it/s] \n", - "fastavro-0.22.3 | 408 KB | : 100% 1.0/1 [00:00<00:00, 6.77it/s] \n", - "scipy-1.3.1 | 18.1 MB | : 100% 1.0/1 [00:03<00:00, 3.52s/it] \n", - "certifi-2019.6.16 | 149 KB | : 100% 1.0/1 [00:00<00:00, 15.17it/s]\n", - "decorator-4.4.0 | 11 KB | : 100% 1.0/1 [00:00<00:00, 20.06it/s]\n", - "google-auth-1.6.3 | 45 KB | : 100% 1.0/1 [00:00<00:00, 16.56it/s]\n", - "parquet-cpp-1.5.1 | 3 KB | : 100% 1.0/1 [00:00<00:00, 27.43it/s]\n", - "rmm-0.10.0a | 14 KB | : 100% 1.0/1 [00:00<00:00, 3.98it/s] \n", - "glog-0.4.0 | 104 KB | : 100% 1.0/1 [00:00<00:00, 15.00it/s]\n", - "wheel-0.33.6 | 35 KB | : 100% 1.0/1 [00:00<00:00, 17.29it/s]\n", - "bokeh-1.3.4 | 4.0 MB | : 100% 1.0/1 [00:01<00:00, 1.56s/it] \n", - "scikit-learn-0.21.3 | 6.7 MB | : 100% 1.0/1 [00:01<00:00, 1.60s/it] \n", - "libtiff-4.0.10 | 587 KB | : 100% 1.0/1 [00:00<00:00, 6.63it/s] \n", - "idna-2.8 | 132 KB | : 100% 1.0/1 [00:00<00:00, 15.63it/s]\n", - "pillow-6.1.0 | 634 KB | : 100% 1.0/1 [00:00<00:00, 4.86it/s] \n", - "_libgcc_mutex-0.1 | 3 KB | : 100% 1.0/1 [00:00<00:00, 43.53it/s]\n", - "nccl-2.4.6.1 | 66.6 MB | : 100% 1.0/1 [00:10<00:00, 10.59s/it] \n", - "pyyaml-5.1.2 | 184 KB | : 100% 1.0/1 [00:00<00:00, 10.61it/s]\n", - "blinker-1.4 | 13 KB | : 100% 1.0/1 [00:00<00:00, 20.08it/s]\n", - "librmm-0.10.0a | 44 KB | : 100% 1.0/1 [00:00<00:00, 3.31it/s] \n", - "sortedcontainers-2.1 | 25 KB | : 100% 1.0/1 [00:00<00:00, 14.67it/s]\n", - "cytoolz-0.10.0 | 429 KB | : 100% 1.0/1 [00:00<00:00, 7.83it/s] \n", - "dask-cuda-0.10.0a | 911 KB | : 100% 1.0/1 [00:00<00:00, 1.66it/s] \n", - "libblas-3.8.0 | 10 KB | : 100% 1.0/1 [00:00<00:00, 5.23it/s] \n", - "distributed-2.3.0 | 366 KB | : 100% 1.0/1 [00:00<00:00, 5.36it/s] \n", - "libpng-1.6.37 | 343 KB | : 100% 1.0/1 [00:00<00:00, 8.59it/s] \n", - "jinja2-2.10.1 | 91 KB | : 100% 1.0/1 [00:00<00:00, 15.90it/s]\n", - "msgpack-python-0.6.1 | 89 KB | : 100% 1.0/1 [00:00<00:00, 
17.11it/s]\n", - "numpy-1.17.0 | 5.2 MB | : 100% 1.0/1 [00:01<00:00, 1.30s/it] \n", - "gflags-2.2.2 | 177 KB | : 100% 1.0/1 [00:00<00:00, 11.98it/s]\n", - "tk-8.6.9 | 3.2 MB | : 100% 1.0/1 [00:00<00:00, 1.35it/s] \n", - "ca-certificates-2019 | 145 KB | : 100% 1.0/1 [00:00<00:00, 15.40it/s]\n", - "cffi-1.12.3 | 218 KB | : 100% 1.0/1 [00:00<00:00, 11.34it/s]\n", - "asn1crypto-0.24.0 | 154 KB | : 100% 1.0/1 [00:00<00:00, 11.99it/s]\n", - "dlpack-0.2 | 12 KB | : 100% 1.0/1 [00:00<00:00, 24.28it/s]\n", - "boost-cpp-1.70.0 | 21.1 MB | : 100% 1.0/1 [00:09<00:00, 9.52s/it] \n", - "pyarrow-0.14.1 | 2.8 MB | : 100% 1.0/1 [00:00<00:00, 1.14it/s] \n", - "markupsafe-1.1.1 | 26 KB | : 100% 1.0/1 [00:00<00:00, 21.22it/s]\n", - "six-1.12.0 | 22 KB | : 100% 1.0/1 [00:00<00:00, 17.89it/s]\n", - "python-3.6.7 | 34.6 MB | : 100% 1.0/1 [00:05<00:00, 5.94s/it] \n", - "icu-64.2 | 12.6 MB | : 100% 1.0/1 [00:02<00:00, 2.19s/it] \n", - "libopenblas-0.3.7 | 7.6 MB | : 100% 1.0/1 [00:01<00:00, 1.52s/it] \n", - "c-ares-1.15.0 | 100 KB | : 100% 1.0/1 [00:00<00:00, 17.03it/s]\n", - "numba-0.45.1 | 3.1 MB | : 100% 1.0/1 [00:00<00:00, 1.00it/s] \n", - "zstd-1.4.0 | 928 KB | : 100% 1.0/1 [00:00<00:00, 5.27it/s] \n", - "pycparser-2.19 | 173 KB | : 100% 1.0/1 [00:00<00:00, 11.22it/s]\n", - "openssl-1.1.1c | 2.1 MB | : 100% 1.0/1 [00:00<00:00, 2.22it/s] \n", - "dask-cudf-0.10.0a | 63 KB | : 100% 1.0/1 [00:00<00:00, 2.84it/s] \n", - "sqlite-3.29.0 | 2.0 MB | : 100% 1.0/1 [00:00<00:00, 2.75it/s] \n", - "readline-8.0 | 441 KB | : 100% 1.0/1 [00:00<00:00, 7.41it/s] \n", - "tblib-1.4.0 | 12 KB | : 100% 1.0/1 [00:00<00:00, 25.51it/s]\n", - "locket-0.2.0 | 6 KB | : 100% 1.0/1 [00:00<00:00, 29.95it/s]\n", - "pyasn1-0.4.6 | 52 KB | : 100% 1.0/1 [00:00<00:00, 15.07it/s]\n", - "pytz-2019.2 | 228 KB | : 100% 1.0/1 [00:00<00:00, 4.22it/s] \n", - "libcudf-0.10.0a | 26.0 MB | : 100% 1.0/1 [00:05<00:00, 5.98s/it] \n", - "double-conversion-3. 
| 85 KB | : 100% 1.0/1 [00:00<00:00, 15.44it/s]\n", - "fsspec-0.4.1 | 39 KB | : 100% 1.0/1 [00:00<00:00, 19.96it/s]\n", - "uriparser-0.9.3 | 49 KB | : 100% 1.0/1 [00:00<00:00, 19.50it/s]\n", - "requests-oauthlib-1. | 19 KB | : 100% 1.0/1 [00:00<00:00, 19.66it/s]\n", - "cryptography-2.7 | 607 KB | : 100% 1.0/1 [00:00<00:00, 3.52it/s] \n", - "cachetools-2.1.0 | 10 KB | : 100% 1.0/1 [00:00<00:00, 24.47it/s]\n", - "ncurses-6.1 | 1.3 MB | : 100% 1.0/1 [00:01<00:00, 1.02s/it] \n", - "gcsfs-0.3.0 | 19 KB | : 100% 1.0/1 [00:00<00:00, 15.81it/s]\n", - "libnvstrings-0.10.0a | 16.8 MB | : 100% 1.0/1 [00:07<00:00, 7.28s/it] \n", - "cudatoolkit-10.0.130 | 380.0 MB | : 100% 1.0/1 [00:56<00:00, 57.00s/it] \n", - "pip-19.2.2 | 1.9 MB | : 100% 1.0/1 [00:00<00:00, 1.62it/s] \n", - "liblapack-3.8.0 | 10 KB | : 100% 1.0/1 [00:00<00:00, 18.78it/s]\n", - "click-7.0 | 61 KB | : 100% 1.0/1 [00:00<00:00, 18.70it/s]\n", - "cuml-0.10.0a | 6.0 MB | : 100% 1.0/1 [00:01<00:00, 1.69s/it] \n", - "grpc-cpp-1.23.0 | 4.5 MB | : 100% 1.0/1 [00:01<00:00, 1.10s/it] \n", - "dask-2.3.0 | 4 KB | : 100% 1.0/1 [00:00<00:00, 27.57it/s]\n", - "brotli-1.0.7 | 1.0 MB | : 100% 1.0/1 [00:00<00:00, 5.00it/s] \n", - "nvstrings-0.10.0a | 124 KB | : 100% 1.0/1 [00:00<00:00, 3.47it/s] \n", - "tornado-6.0.3 | 636 KB | : 100% 1.0/1 [00:00<00:00, 4.58it/s] \n", - "pynvml-8.0.2 | 30 KB | : 100% 1.0/1 [00:00<00:00, 21.55it/s]\n", - "libgcc-ng-9.1.0 | 8.1 MB | : 100% 1.0/1 [00:01<00:00, 1.40s/it] \n", - "libcblas-3.8.0 | 10 KB | : 100% 1.0/1 [00:00<00:00, 22.83it/s]\n", - "joblib-0.13.2 | 180 KB | : 100% 1.0/1 [00:00<00:00, 8.76it/s]\n", - "pandas-0.24.2 | 11.1 MB | : 100% 1.0/1 [00:02<00:00, 2.68s/it] \n", - "psutil-5.6.3 | 322 KB | : 100% 1.0/1 [00:00<00:00, 7.88it/s] \n", - "heapdict-1.0.0 | 7 KB | : 100% 1.0/1 [00:00<00:00, 21.63it/s]\n", - "jpeg-9c | 251 KB | : 100% 1.0/1 [00:00<00:00, 10.08it/s]\n", - "zict-1.0.0 | 10 KB | : 100% 1.0/1 [00:00<00:00, 20.76it/s]\n", - "libprotobuf-3.8.0 | 4.7 MB | : 100% 1.0/1 
[00:01<00:00, 1.06s/it] \n", - "packaging-19.0 | 23 KB | : 100% 1.0/1 [00:00<00:00, 20.95it/s]\n", - "xgboost-0.90.rapidsd | 12 KB | : 100% 1.0/1 [00:00<00:00, 2.77it/s] \n", - "cugraph-0.10.0a | 1.3 MB | : 100% 1.0/1 [00:00<00:00, 1.74it/s] \n", - "urllib3-1.25.3 | 187 KB | : 100% 1.0/1 [00:00<00:00, 9.23it/s]\n", - "py-xgboost-0.90.rapi | 87 KB | : 100% 1.0/1 [00:00<00:00, 3.59it/s] \n", - "dask-core-2.3.0 | 574 KB | : 100% 1.0/1 [00:00<00:00, 4.29it/s] \n", - "setuptools-41.2.0 | 634 KB | : 100% 1.0/1 [00:00<00:00, 4.25it/s] \n", - "pysocks-1.7.0 | 26 KB | : 100% 1.0/1 [00:00<00:00, 21.18it/s]\n", - "libcuml-0.10.0a | 29.7 MB | : 100% 1.0/1 [00:07<00:00, 7.44s/it] \n", - "partd-1.0.0 | 16 KB | : 100% 1.0/1 [00:00<00:00, 21.76it/s]\n", - "google-auth-oauthlib | 18 KB | : 100% 1.0/1 [00:00<00:00, 23.67it/s]\n", - "python-dateutil-2.8. | 219 KB | : 100% 1.0/1 [00:00<00:00, 11.17it/s]\n", - "xz-5.2.4 | 366 KB | : 100% 1.0/1 [00:00<00:00, 7.94it/s] \n", + "dask-cuda-0.10.0a | 921 KB | : 100% 1.0/1 [00:02<00:00, 2.81s/it] \n", + "jpeg-9c | 251 KB | : 100% 1.0/1 [00:00<00:00, 8.31it/s]\n", + "ca-certificates-2019 | 145 KB | : 100% 1.0/1 [00:00<00:00, 10.93it/s]\n", + "joblib-0.13.2 | 180 KB | : 100% 1.0/1 [00:00<00:00, 7.30it/s]\n", + "blinker-1.4 | 13 KB | : 100% 1.0/1 [00:00<00:00, 13.37it/s]\n", + "dask-core-2.3.0 | 574 KB | : 100% 1.0/1 [00:00<00:00, 4.16it/s] \n", + "cudf-0.10.0a | 4.7 MB | : 100% 1.0/1 [00:01<00:00, 1.74s/it] \n", + "pyasn1-modules-0.2.6 | 47 KB | : 100% 1.0/1 [00:00<00:00, 10.35it/s]\n", + "jinja2-2.10.1 | 91 KB | : 100% 1.0/1 [00:00<00:00, 11.34it/s]\n", + "grpc-cpp-1.23.0 | 4.5 MB | : 100% 1.0/1 [00:01<00:00, 1.05s/it] \n", + "boost-cpp-1.70.0 | 21.1 MB | : 100% 1.0/1 [00:08<00:00, 8.53s/it] \n", + "idna-2.8 | 132 KB | : 100% 1.0/1 [00:00<00:00, 11.10it/s]\n", + "numba-0.45.1 | 3.1 MB | : 100% 1.0/1 [00:00<00:00, 1.04it/s] \n", + "numpy-1.17.1 | 5.2 MB | : 100% 1.0/1 [00:01<00:00, 1.13s/it] \n", + "yaml-0.1.7 | 78 KB | : 100% 1.0/1 
[00:00<00:00, 12.23it/s]\n", + "click-7.0 | 61 KB | : 100% 1.0/1 [00:00<00:00, 12.19it/s]\n", + "python-dateutil-2.8. | 219 KB | : 100% 1.0/1 [00:00<00:00, 11.76it/s]\n", + "google-auth-1.6.3 | 45 KB | : 100% 1.0/1 [00:00<00:00, 11.44it/s]\n", + "gcsfs-0.3.0 | 19 KB | : 100% 1.0/1 [00:00<00:00, 15.31it/s]\n", + "tk-8.6.9 | 3.2 MB | : 100% 1.0/1 [00:00<00:00, 1.57it/s] \n", + "pytz-2019.2 | 228 KB | : 100% 1.0/1 [00:00<00:00, 4.04it/s] \n", + "pip-19.2.3 | 1.9 MB | : 100% 1.0/1 [00:00<00:00, 1.82it/s] \n", + "cachetools-2.1.0 | 10 KB | : 100% 1.0/1 [00:00<00:00, 15.55it/s]\n", + "zict-1.0.0 | 10 KB | : 100% 1.0/1 [00:00<00:00, 14.96it/s]\n", + "cloudpickle-1.2.1 | 22 KB | : 100% 1.0/1 [00:00<00:00, 14.84it/s]\n", + "dask-cudf-0.10.0a | 62 KB | : 100% 1.0/1 [00:01<00:00, 1.34s/it] \n", + "libcudf-0.10.0a | 26.0 MB | : 100% 1.0/1 [00:07<00:00, 7.09s/it] \n", + "pillow-6.1.0 | 634 KB | : 100% 1.0/1 [00:00<00:00, 4.42it/s] \n", + "libcumlprims-0.9.0 | 3.9 MB | : 100% 1.0/1 [00:02<00:00, 2.24s/it] \n", + "cytoolz-0.10.0 | 429 KB | : 100% 1.0/1 [00:00<00:00, 6.44it/s] \n", + "requests-oauthlib-1. 
| 19 KB | : 100% 1.0/1 [00:00<00:00, 14.79it/s]\n", + "six-1.12.0 | 22 KB | : 100% 1.0/1 [00:00<00:00, 13.72it/s]\n", + "bzip2-1.0.8 | 396 KB | : 100% 1.0/1 [00:00<00:00, 7.69it/s] \n", + "llvmlite-0.29.0 | 19.9 MB | : 100% 1.0/1 [00:03<00:00, 3.15s/it] \n", + "re2-2019.09.01 | 431 KB | : 100% 1.0/1 [00:00<00:00, 7.14it/s] \n", + "zstd-1.4.0 | 928 KB | : 100% 1.0/1 [00:00<00:00, 4.81it/s] \n", + "pycparser-2.19 | 173 KB | : 100% 1.0/1 [00:00<00:00, 9.89it/s]\n", + "urllib3-1.25.3 | 187 KB | : 100% 1.0/1 [00:00<00:00, 7.69it/s]\n", + "uriparser-0.9.3 | 49 KB | : 100% 1.0/1 [00:00<00:00, 10.99it/s]\n", + "gflags-2.2.2 | 177 KB | : 100% 1.0/1 [00:00<00:00, 9.88it/s]\n", + "libpng-1.6.37 | 343 KB | : 100% 1.0/1 [00:00<00:00, 8.51it/s] \n", + "certifi-2019.6.16 | 149 KB | : 100% 1.0/1 [00:00<00:00, 12.26it/s]\n", + "libcblas-3.8.0 | 10 KB | : 100% 1.0/1 [00:00<00:00, 15.55it/s]\n", + "_libgcc_mutex-0.1 | 3 KB | : 100% 1.0/1 [00:00<00:00, 18.61it/s]\n", + "psutil-5.6.3 | 322 KB | : 100% 1.0/1 [00:00<00:00, 7.35it/s] \n", + "lz4-c-1.8.3 | 187 KB | : 100% 1.0/1 [00:00<00:00, 9.43it/s]\n", + "zlib-1.2.11 | 105 KB | : 100% 1.0/1 [00:00<00:00, 11.30it/s]\n", + "fsspec-0.4.4 | 39 KB | : 100% 1.0/1 [00:00<00:00, 13.92it/s]\n", + "thrift-cpp-0.12.0 | 2.4 MB | : 100% 1.0/1 [00:00<00:00, 2.11it/s] \n", + "double-conversion-3. 
| 85 KB | : 100% 1.0/1 [00:00<00:00, 11.97it/s]\n", + "heapdict-1.0.0 | 7 KB | : 100% 1.0/1 [00:00<00:00, 16.08it/s]\n", + "libffi-3.2.1 | 46 KB | : 100% 1.0/1 [00:00<00:00, 13.59it/s]\n", + "chardet-3.0.4 | 190 KB | : 100% 1.0/1 [00:00<00:00, 8.67it/s] \n", + "pynvml-8.0.3 | 30 KB | : 100% 1.0/1 [00:00<00:00, 2.88it/s] \n", + "bokeh-1.3.4 | 4.0 MB | : 100% 1.0/1 [00:01<00:00, 1.30s/it] \n", + "freetype-2.10.0 | 884 KB | : 100% 1.0/1 [00:00<00:00, 4.89it/s] \n", + "nvstrings-0.10.0a | 124 KB | : 100% 1.0/1 [00:01<00:00, 1.37s/it] \n", + "libxgboost-0.90.rapi | 32.8 MB | : 100% 1.0/1 [00:09<00:00, 9.67s/it] \n", + "pyasn1-0.4.6 | 52 KB | : 100% 1.0/1 [00:00<00:00, 12.03it/s]\n", + "brotli-1.0.7 | 1.0 MB | : 100% 1.0/1 [00:00<00:00, 4.92it/s] \n", + "setuptools-41.2.0 | 634 KB | : 100% 1.0/1 [00:00<00:00, 4.53it/s] \n", + "wheel-0.33.6 | 35 KB | : 100% 1.0/1 [00:00<00:00, 13.26it/s]\n", + "libgcc-ng-9.1.0 | 8.1 MB | : 100% 1.0/1 [00:01<00:00, 1.23s/it] \n", + "libcuml-0.10.0a | 29.3 MB | : 100% 1.0/1 [00:10<00:00, 10.22s/it] \n", + "dlpack-0.2 | 12 KB | : 100% 1.0/1 [00:00<00:00, 1.39it/s] \n", + "pandas-0.24.2 | 11.1 MB | : 100% 1.0/1 [00:02<00:00, 2.32s/it] \n", + "dask-cuml-0.8.0a | 30 KB | : 100% 1.0/1 [00:01<00:00, 1.14s/it] \n", + "sqlite-3.29.0 | 1.9 MB | : 100% 1.0/1 [00:00<00:00, 2.74it/s] \n", + "libgfortran-ng-7.3.0 | 1.3 MB | : 100% 1.0/1 [00:00<00:00, 3.61it/s] \n", + "toolz-0.10.0 | 46 KB | : 100% 1.0/1 [00:00<00:00, 11.64it/s]\n", + "asn1crypto-0.24.0 | 154 KB | : 100% 1.0/1 [00:00<00:00, 9.53it/s]\n", + "liblapack-3.8.0 | 10 KB | : 100% 1.0/1 [00:00<00:00, 15.48it/s]\n", + "packaging-19.0 | 23 KB | : 100% 1.0/1 [00:00<00:00, 3.70it/s] \n", + "cryptography-2.7 | 607 KB | : 100% 1.0/1 [00:00<00:00, 3.62it/s] \n", + "olefile-0.46 | 31 KB | : 100% 1.0/1 [00:00<00:00, 15.17it/s]\n", + "libopenblas-0.3.7 | 7.6 MB | : 100% 1.0/1 [00:01<00:00, 1.29s/it] \n", + "libtiff-4.0.10 | 587 KB | : 100% 1.0/1 [00:00<00:00, 6.35it/s] \n", + "cffi-1.12.3 | 218 KB | : 
100% 1.0/1 [00:00<00:00, 8.74it/s]\n", + "ncurses-6.1 | 1.3 MB | : 100% 1.0/1 [00:00<00:00, 1.19it/s] \n", + "rmm-0.10.0a | 14 KB | : 100% 1.0/1 [00:00<00:00, 1.98it/s] \n", + "libprotobuf-3.8.0 | 4.7 MB | : 100% 1.0/1 [00:01<00:00, 1.71s/it] \n", + "pyopenssl-19.0.0 | 81 KB | : 100% 1.0/1 [00:00<00:00, 12.69it/s]\n", + "libevent-2.1.10 | 1.3 MB | : 100% 1.0/1 [00:00<00:00, 2.66it/s] \n", + "librmm-0.10.0a | 44 KB | : 100% 1.0/1 [00:00<00:00, 1.99it/s] \n", + "scipy-1.3.1 | 18.1 MB | : 100% 1.0/1 [00:03<00:00, 3.18s/it] \n", + "readline-8.0 | 441 KB | : 100% 1.0/1 [00:00<00:00, 6.79it/s] \n", + "msgpack-python-0.6.1 | 89 KB | : 100% 1.0/1 [00:00<00:00, 13.43it/s]\n", + "requests-2.22.0 | 84 KB | : 100% 1.0/1 [00:00<00:00, 1.49it/s]\n", + "py-xgboost-0.90.rapi | 86 KB | : 100% 1.0/1 [00:00<00:00, 1.41it/s] \n", + "cuml-0.10.0a | 5.9 MB | : 100% 1.0/1 [00:02<00:00, 2.12s/it] \n", + "libblas-3.8.0 | 10 KB | : 100% 1.0/1 [00:00<00:00, 13.55it/s]\n", + "c-ares-1.15.0 | 100 KB | : 100% 1.0/1 [00:00<00:00, 12.24it/s]\n", + "glog-0.4.0 | 104 KB | : 100% 1.0/1 [00:00<00:00, 12.09it/s]\n", + "pyarrow-0.14.1 | 2.8 MB | : 100% 1.0/1 [00:00<00:00, 1.27it/s] \n", + "xz-5.2.4 | 366 KB | : 100% 1.0/1 [00:00<00:00, 7.05it/s] \n", + "arrow-cpp-0.14.1 | 17.3 MB | : 100% 1.0/1 [00:02<00:00, 2.84s/it] \n", + "icu-64.2 | 12.6 MB | : 100% 1.0/1 [00:01<00:00, 1.93s/it] \n", + "distributed-2.3.2 | 370 KB | : 100% 1.0/1 [00:00<00:00, 5.21it/s] \n", + "xgboost-0.90.rapidsd | 11 KB | : 100% 1.0/1 [00:01<00:00, 1.01s/it] \n", + "locket-0.2.0 | 6 KB | : 100% 1.0/1 [00:00<00:00, 15.02it/s]\n", + "snappy-1.1.7 | 39 KB | : 100% 1.0/1 [00:00<00:00, 14.68it/s]\n", + "pyjwt-1.7.1 | 17 KB | : 100% 1.0/1 [00:00<00:00, 13.20it/s]\n", + "libstdcxx-ng-9.1.0 | 4.0 MB | : 100% 1.0/1 [00:00<00:00, 1.61it/s] \n", + "pysocks-1.7.0 | 26 KB | : 100% 1.0/1 [00:00<00:00, 15.49it/s]\n", + "dask-2.3.0 | 4 KB | : 100% 1.0/1 [00:00<00:00, 15.11it/s]\n", + "sortedcontainers-2.1 | 25 KB | : 100% 1.0/1 [00:00<00:00, 
14.59it/s]\n", + "parquet-cpp-1.5.1 | 3 KB | : 100% 1.0/1 [00:00<00:00, 14.74it/s]\n", + "nccl-2.4.6.1 | 66.6 MB | : 100% 1.0/1 [00:12<00:00, 12.58s/it] \n", + "google-auth-oauthlib | 18 KB | : 100% 1.0/1 [00:00<00:00, 13.18it/s]\n", + "cugraph-0.10.0a | 1.3 MB | : 100% 1.0/1 [00:10<00:00, 3.84s/it] \n", + "libcugraph-0.10.0a | 11.3 MB | : 100% 1.0/1 [00:18<00:00, 18.20s/it] \n", + "python-3.6.7 | 34.6 MB | : 100% 1.0/1 [00:05<00:00, 5.00s/it] \n", + "openssl-1.1.1c | 2.1 MB | : 100% 1.0/1 [00:00<00:00, 2.56it/s] \n", + "tornado-6.0.3 | 636 KB | : 100% 1.0/1 [00:00<00:00, 4.46it/s] \n", + "partd-1.0.0 | 16 KB | : 100% 1.0/1 [00:00<00:00, 13.42it/s]\n", + "markupsafe-1.1.1 | 26 KB | : 100% 1.0/1 [00:00<00:00, 14.08it/s]\n", + "fastavro-0.22.4 | 405 KB | : 100% 1.0/1 [00:00<00:00, 7.21it/s] \n", + "cython-0.29.13 | 2.2 MB | : 100% 1.0/1 [00:00<00:00, 1.86it/s] \n", + "rsa-3.4.2 | 31 KB | : 100% 1.0/1 [00:00<00:00, 13.62it/s]\n", + "pyyaml-5.1.2 | 184 KB | : 100% 1.0/1 [00:00<00:00, 10.32it/s]\n", + "scikit-learn-0.21.3 | 6.7 MB | : 100% 1.0/1 [00:01<00:00, 1.44s/it] \n", + "decorator-4.4.0 | 11 KB | : 100% 1.0/1 [00:00<00:00, 15.90it/s]\n", + "oauthlib-3.0.1 | 82 KB | : 100% 1.0/1 [00:00<00:00, 9.79it/s]\n", + "pyparsing-2.4.2 | 57 KB | : 100% 1.0/1 [00:00<00:00, 13.40it/s]\n", + "tblib-1.4.0 | 12 KB | : 100% 1.0/1 [00:00<00:00, 15.25it/s]\n", + "cudatoolkit-10.0.130 | 380.0 MB | : 100% 1.0/1 [00:46<00:00, 46.86s/it] \n", + "libnvstrings-0.10.0a | 24.8 MB | : 100% 1.0/1 [00:07<00:00, 7.82s/it] \n", "Copying shared object files to /usr/lib\n", "\n", "*********************************************\n", @@ -386,10 +386,10 @@ "metadata": { "id": "x1dLRTm168Tk", "colab_type": "code", - "outputId": "e4ee4a4e-64f3-4e87-8b87-472b02f84325", + "outputId": "406a519a-e019-46cf-f0bf-7bba3dd2bb79", "colab": { "base_uri": "https://localhost:8080/", - "height": 958 + "height": 1000 } }, "source": [ @@ -397,7 +397,7 @@ "!pip install kaggle\n", "!mkdir /root/.kaggle\n", "# plug api -- 
get your own API key\n", - "!echo '{\"username\":\"warobson\",\"key\":\"\"}' > /root/.kaggle/kaggle.json\n", + "!echo '{\"username\":\"warobson\",\"key\":\"YOUR_KAGGLE_API_KEY\"}' > /root/.kaggle/kaggle.json\n", "!chmod 600 /root/.kaggle/kaggle.json\n", "# !kaggle datasets download\n", "!kaggle competitions download -c zillow-prize-1\n", @@ -409,44 +409,44 @@ "!unzip -q \"/content/train_2017.csv.zip\"\n", "!unzip -q \"/content/properties_2017.csv.zip\"" ], - "execution_count": 0, + "execution_count": 3, "outputs": [ { "output_type": "stream", "text": [ "Collecting kaggle\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e9/fc/0de659ea1f2096563204925b6660ae141f3d85bbe9e8a1571c3eb6cc1fdd/kaggle-1.5.5.tar.gz (56kB)\n", - "\u001b[K |████████████████████████████████| 61kB 2.9MB/s \n", + "\u001b[K |████████████████████████████████| 61kB 31.1MB/s \n", "\u001b[?25hCollecting urllib3<1.25,>=1.21.1 (from kaggle)\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/01/11/525b02e4acc0c747de8b6ccdab376331597c569c42ea66ab0a1dbd36eca2/urllib3-1.24.3-py2.py3-none-any.whl (118kB)\n", - "\u001b[K |████████████████████████████████| 122kB 9.7MB/s \n", + "\u001b[K |████████████████████████████████| 122kB 35.9MB/s \n", "\u001b[?25hRequirement already satisfied: six>=1.10 in /usr/local/lib/python3.6/site-packages (from kaggle) (1.12.0)\n", "Requirement already satisfied: certifi in /usr/local/lib/python3.6/site-packages (from kaggle) (2019.6.16)\n", "Requirement already satisfied: python-dateutil in /usr/local/lib/python3.6/site-packages (from kaggle) (2.8.0)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.6/site-packages (from kaggle) (2.22.0)\n", "Collecting tqdm (from kaggle)\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/a5/83/06029af22fe06b8a7be013aeae5e104b3ed26867e5d4ca91408b30aa602e/tqdm-4.34.0-py2.py3-none-any.whl (50kB)\n", - "\u001b[K |████████████████████████████████| 51kB 12.9MB/s 
\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/dc/88/d3213e2f3492daf09d8b41631ad6899f56db17ce83ea9c8a579902bafe5e/tqdm-4.35.0-py2.py3-none-any.whl (50kB)\n", + "\u001b[K |████████████████████████████████| 51kB 29.7MB/s \n", "\u001b[?25hCollecting python-slugify (from kaggle)\n", " Downloading https://files.pythonhosted.org/packages/a2/5d/bd30413c00bbed3945558aca07c55944073e1e30abeee1f06515281f9811/python-slugify-3.0.3.tar.gz\n", "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/site-packages (from requests->kaggle) (2.8)\n", "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/site-packages (from requests->kaggle) (3.0.4)\n", "Collecting text-unidecode==1.2 (from python-slugify->kaggle)\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/79/42/d717cc2b4520fb09e45b344b1b0b4e81aa672001dd128c180fabc655c341/text_unidecode-1.2-py2.py3-none-any.whl (77kB)\n", - "\u001b[K |████████████████████████████████| 81kB 28.8MB/s \n", + "\u001b[K |████████████████████████████████| 81kB 32.1MB/s \n", "\u001b[?25hBuilding wheels for collected packages: kaggle, python-slugify\n", " Building wheel for kaggle (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for kaggle: filename=kaggle-1.5.5-cp36-none-any.whl size=71896 sha256=ee79b8c43069539b819caedf251aae4360d5dd43aec6a5bc2734275442177e60\n", + " Created wheel for kaggle: filename=kaggle-1.5.5-cp36-none-any.whl size=71896 sha256=d9815b0d9eae6d3594e8dc1a57a33174b1dbe24f623e1d688a92a2588f4e1be0\n", " Stored in directory: /root/.cache/pip/wheels/db/6a/80/6cd1892eb9b9b136333db3c74e16cba4e17e2c700f51541f06\n", " Building wheel for python-slugify (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for python-slugify: filename=python_slugify-3.0.3-py2.py3-none-any.whl size=4789 sha256=a8f8df8b4a56a8db4fc841f6b6ff5f89a9a3c7e641ff4fc8c41d5e7a5c1ec087\n", + " Created wheel for python-slugify: filename=python_slugify-3.0.3-py2.py3-none-any.whl size=4789 sha256=ccca227a48fbd1c2f5ba45701b0b52f1a12b7d1484ba459889128b3712c17b88\n", " Stored in directory: /root/.cache/pip/wheels/0f/96/ca/85f5b01165975402d1e37f8dd346df00dc39be1d0761bd17bb\n", "Successfully built kaggle python-slugify\n", "Installing collected packages: urllib3, tqdm, text-unidecode, python-slugify, kaggle\n", " Found existing installation: urllib3 1.25.3\n", " Uninstalling urllib3-1.25.3:\n", " Successfully uninstalled urllib3-1.25.3\n", - "Successfully installed kaggle-1.5.5 python-slugify-3.0.3 text-unidecode-1.2 tqdm-4.34.0 urllib3-1.24.3\n" + "Successfully installed kaggle-1.5.5 python-slugify-3.0.3 text-unidecode-1.2 tqdm-4.35.0 urllib3-1.24.3\n" ], "name": "stdout" }, @@ -469,23 +469,23 @@ "output_type": "stream", "text": [ "Downloading sample_submission.csv.zip to /content\n", - " 91% 9.00M/9.86M [00:00<00:00, 17.1MB/s]\n", - "100% 9.86M/9.86M [00:00<00:00, 22.0MB/s]\n", + " 51% 5.00M/9.86M [00:00<00:00, 15.9MB/s]\n", + "100% 9.86M/9.86M [00:00<00:00, 29.2MB/s]\n", "Downloading properties_2016.csv.zip to /content\n", - " 98% 156M/159M [00:01<00:00, 103MB/s] \n", - "100% 159M/159M [00:01<00:00, 92.1MB/s]\n", + " 91% 145M/159M [00:02<00:00, 54.3MB/s]\n", + "100% 159M/159M [00:02<00:00, 59.1MB/s]\n", "Downloading zillow_data_dictionary.xlsx.zip to /content\n", " 0% 0.00/15.7k [00:00 0) & (df_train.garage_sqft == 0)\n", "print(df_train.loc[conditions][garage].head())" ], - "execution_count": 0, + "execution_count": 18, "outputs": [ { "output_type": "stream", @@ -1125,9 +1125,9 @@ " garagecarcnt garage_sqft\n", "16 2.0 0.0\n", "29 1.0 0.0\n", - "32 1.0 0.0\n", - "35 1.0 0.0\n", - "36 2.0 0.0\n" + "36 2.0 0.0\n", + "54 2.0 0.0\n", + "65 1.0 0.0\n" ], 
"name": "stdout" } @@ -1244,10 +1244,10 @@ "metadata": { "id": "yHZH4rMNLfBA", "colab_type": "code", - "outputId": "6ba5f661-caa5-44b8-b492-b9f5708181db", + "outputId": "53844d43-16cb-41f2-8684-5268848f1476", "colab": { "base_uri": "https://localhost:8080/", - "height": 202 + "height": 208 } }, "source": [ @@ -1303,7 +1303,7 @@ "# let's see how out unit counts look\n", "print(df_train.unitcnt.value_counts())" ], - "execution_count": 0, + "execution_count": 22, "outputs": [ { "output_type": "stream", @@ -1396,10 +1396,10 @@ "metadata": { "id": "8lYcO_T5XKNN", "colab_type": "code", - "outputId": "2440dccb-bc7d-459c-ae1a-cc31388be45e", + "outputId": "e7ff645b-ac87-4039-d135-db8fd49855da", "colab": { "base_uri": "https://localhost:8080/", - "height": 303 + "height": 311 } }, "source": [ @@ -1415,7 +1415,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'taxdelinquencyflag'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'taxdelinquencyflag'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/series.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, value, method, axis, inplace, limit)\u001b[0m\n\u001b[1;32m 1141\u001b[0m \u001b[0;32mraise\u001b[0m 
\u001b[0mNotImplementedError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"The axis keyword is not supported\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1142\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1143\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minplace\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1144\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1145\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/string.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, fill_value, inplace)\u001b[0m\n\u001b[1;32m 717\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfill_value\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStringColumn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 718\u001b[0m ):\n\u001b[0;32m--> 719\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"fill_value must be a string or a string series\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 720\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 721\u001b[0m \u001b[0;31m# replace fill_value with nvstrings\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mTypeError\u001b[0m: fill_value must be a string or a string series" @@ -1449,10 +1449,10 @@ "metadata": { 
"id": "Svp6J0cJ5dL0", "colab_type": "code", - "outputId": "352d2f36-658f-4698-bfdb-5c748b67f0d7", + "outputId": "fd9373e8-8a1f-45a3-a7bd-387f0e678d2c", "colab": { "base_uri": "https://localhost:8080/", - "height": 67 + "height": 69 } }, "source": [ @@ -1471,7 +1471,7 @@ "# display values in tax delinquency flag column\n", "print(df_train['taxdelinquencyflag'].value_counts())" ], - "execution_count": 0, + "execution_count": 24, "outputs": [ { "output_type": "stream", @@ -1501,16 +1501,16 @@ "metadata": { "id": "lHh95mAIMrMy", "colab_type": "code", - "outputId": "244b62b2-299c-4440-83d2-b5545712ba3e", + "outputId": "37c584b1-76a4-4df7-ca71-23a9d50f165a", "colab": { "base_uri": "https://localhost:8080/", - "height": 67 + "height": 69 } }, "source": [ "print(df_train.taxdelinquencyflag.value_counts())" ], - "execution_count": 0, + "execution_count": 25, "outputs": [ { "output_type": "stream", @@ -1528,10 +1528,10 @@ "metadata": { "id": "6Bic66I9LfGC", "colab_type": "code", - "outputId": "4311fb13-6d49-44e1-83ef-73e27d4720c4", + "outputId": "af959592-7a80-42ee-8fda-3e18d6b9c514", "colab": { "base_uri": "https://localhost:8080/", - "height": 235 + "height": 243 } }, "source": [ @@ -1552,7 +1552,7 @@ "# what've we got? 
\n", "print(df_train.taxdelinquencyyear.value_counts())" ], - "execution_count": 0, + "execution_count": 26, "outputs": [ { "output_type": "stream", @@ -1608,10 +1608,10 @@ "metadata": { "id": "Sg0eN-K1QdZy", "colab_type": "code", - "outputId": "0e6ca58c-3b13-4c9e-c902-4d8a9c98a855", + "outputId": "3cdd2ef6-fb68-46bd-a611-5dd3d9ff45d2", "colab": { "base_uri": "https://localhost:8080/", - "height": 474 + "height": 489 } }, "source": [ @@ -1671,7 +1671,7 @@ "\"\"\"\n", "print(df_train[['census_tractnumber', 'block_number']].head())" ], - "execution_count": 0, + "execution_count": 29, "outputs": [ { "output_type": "stream", @@ -1726,10 +1726,10 @@ "metadata": { "id": "xhCosNpXvTVU", "colab_type": "code", - "outputId": "b8ca9fb3-6c67-4466-d7cc-98ff52504659", + "outputId": "3f70a009-a211-46fd-ecc9-731be3d15fe1", "colab": { "base_uri": "https://localhost:8080/", - "height": 84 + "height": 86 } }, "source": [ @@ -1749,7 +1749,7 @@ "# drop columns with more than 95% null values\n", "df_train = df_train.drop(missingvaluescols['field'], axis=1)" ], - "execution_count": 0, + "execution_count": 30, "outputs": [ { "output_type": "stream", @@ -1792,10 +1792,10 @@ "metadata": { "id": "yB2lzAyopS_S", "colab_type": "code", - "outputId": "2860febf-c7ad-4823-d170-2633c4be8ae5", + "outputId": "06922e76-c61b-4212-afc5-4f3a51eaaa09", "colab": { "base_uri": "https://localhost:8080/", - "height": 218 + "height": 225 } }, "source": [ @@ -1804,7 +1804,7 @@ "# let's see what we've got\n", "print(df_train['unitcnt'].value_counts())" ], - "execution_count": 0, + "execution_count": 31, "outputs": [ { "output_type": "stream", @@ -1843,7 +1843,7 @@ "metadata": { "id": "-icFDeLSoJwl", "colab_type": "code", - "outputId": "5ea8e799-3105-4601-82d4-54bd00c5056b", + "outputId": "03d9a89b-6e21-4bba-ae75-aa229c744dcf", "colab": { "base_uri": "https://localhost:8080/", "height": 34 @@ -1862,7 +1862,7 @@ "\n", "print(df_train.pool_sqft.isna().sum())" ], - "execution_count": 0, + "execution_count": 32, 
"outputs": [ { "output_type": "stream", @@ -1890,10 +1890,10 @@ "metadata": { "id": "3pVABkZTYK9F", "colab_type": "code", - "outputId": "345e4225-6a09-4fae-efb3-c9abe56c622a", + "outputId": "e926b021-7cbb-4acd-96c8-2a15fbe1afd7", "colab": { "base_uri": "https://localhost:8080/", - "height": 84 + "height": 86 } }, "source": [ @@ -1920,7 +1920,7 @@ "print(df_train.total_parcel_tax.isnull().sum())\n", "print(df_train.land_tax.isnull().sum())" ], - "execution_count": 0, + "execution_count": 33, "outputs": [ { "output_type": "stream", @@ -1939,7 +1939,7 @@ "metadata": { "id": "8SID48LOpYvu", "colab_type": "code", - "outputId": "1d369c4a-759e-4331-b5fe-6c784ae66897", + "outputId": "842c0ccb-1710-4e73-f85a-599d5b27988b", "colab": { "base_uri": "https://localhost:8080/", "height": 34 @@ -1950,7 +1950,7 @@ "df_train = df_train.drop(['regionidcounty'], axis=1)\n", "df_train.shape" ], - "execution_count": 0, + "execution_count": 34, "outputs": [ { "output_type": "execute_result", @@ -1962,7 +1962,7 @@ "metadata": { "tags": [] }, - "execution_count": 51 + "execution_count": 34 } ] }, @@ -1971,7 +1971,7 @@ "metadata": { "id": "tWmM2J8_pkg1", "colab_type": "code", - "outputId": "44689c09-a426-48c9-eae8-7e81af63080e", + "outputId": "e544a196-1e32-4d18-a6b5-98c49a57589e", "colab": { "base_uri": "https://localhost:8080/", "height": 34 @@ -1984,7 +1984,7 @@ "df_train['bedroomcnt'].loc[df_train['bedroomcnt'] == 0] = np.nan\n", "print(df_train.bedroomcnt.isnull().sum())" ], - "execution_count": 0, + "execution_count": 35, "outputs": [ { "output_type": "stream", @@ -2000,10 +2000,10 @@ "metadata": { "id": "3qnP2L9LpmeJ", "colab_type": "code", - "outputId": "a4e9550d-5ea8-4066-d3f3-ea73bfe04cef", + "outputId": "2e863b26-9267-45f2-f31f-3305f5577ce3", "colab": { "base_uri": "https://localhost:8080/", - "height": 101 + "height": 104 } }, "source": [ @@ -2040,7 +2040,7 @@ "print(df_train.bedroomcnt.isnull().sum())\n", "print(df_train.roomcnt.isnull().sum())" ], - "execution_count": 0, + 
"execution_count": 36, "outputs": [ { "output_type": "stream", @@ -2074,10 +2074,10 @@ "metadata": { "id": "IW4CG2InpolD", "colab_type": "code", - "outputId": "47e46700-fe9c-4b98-9941-014ee6dea441", + "outputId": "288444a4-d153-4624-c961-b3956092d87e", "colab": { "base_uri": "https://localhost:8080/", - "height": 252 + "height": 260 } }, "source": [ @@ -2129,7 +2129,7 @@ "print(f'AFTER\\n{df_train.numberofstories.value_counts()}\\n'\n", " f'{df_train.numberofstories.isnull().sum()} remaining null values')" ], - "execution_count": 0, + "execution_count": 37, "outputs": [ { "output_type": "stream", @@ -2158,10 +2158,10 @@ "metadata": { "id": "AHcMsDCxprd4", "colab_type": "code", - "outputId": "3a327d21-4675-41ce-aa9e-f52ae86eb491", + "outputId": "516954f4-d1d9-4876-e3a4-4545d865d9f6", "colab": { "base_uri": "https://localhost:8080/", - "height": 286 + "height": 295 } }, "source": [ @@ -2190,7 +2190,7 @@ "print(f'AFTER\\n{df_train.fireplace_count.value_counts()}\\n'\n", " f'{df_train.fireplace_count.isnull().sum()} remaining null values')" ], - "execution_count": 0, + "execution_count": 38, "outputs": [ { "output_type": "stream", @@ -2221,14 +2221,13 @@ "metadata": { "id": "FIuSWoJspt3H", "colab_type": "code", - "outputId": "9c5daebd-4b2a-461b-8490-350d19fa7ba8", + "outputId": "d8b6ef02-d214-4530-ce3a-c6efc5bd01cf", "colab": { "base_uri": "https://localhost:8080/", "height": 317 } }, "source": [ - "\n", "# set basic sns \n", "color = sns.color_palette()\n", "sns.set(style=\"darkgrid\")\n", @@ -2242,7 +2241,7 @@ "# display the graph\n", "plt.show()" ], - "execution_count": 0, + "execution_count": 41, "outputs": [ { "output_type": "display_data", @@ -2263,7 +2262,7 @@ "metadata": { "id": "KOHPCFRSp5y9", "colab_type": "code", - "outputId": "3aa099cd-791f-4a5a-9ea7-29168fc239b9", + "outputId": "471d6f7c-607a-4520-d219-3ab56500c004", "colab": { "base_uri": "https://localhost:8080/", "height": 274 @@ -2275,12 +2274,12 @@ "# display the graph\n", "plt.show()" ], - 
"execution_count": 0, + "execution_count": 42, "outputs": [ { "output_type": "display_data", "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX8AAAEBCAYAAACQbKXWAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAGx1JREFUeJzt3WtwE+ehBuB3JSFjA6ot2YC5ppnG\n1JmG0DqDGTeFYAimDZDWzJSUi5OBhLYpKbRhUkLTkgYCUUlomEBi0kNLSDLwxx7aQE8hHS4pUCg0\nF+oMxdQY8Az4ItkcY4LtSPudHyDFF0nWZaXd9fc+v8iu9tt3V6tX8mqjVYQQAkREJBWL3gGIiCj1\nWP5ERBJi+RMRSYjlT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0Qk\nIZY/EZGEbHoHaGm5AVUVcLkGw+tt0ztOXJhdH8yeembNDfSf7BaLgqysQQmPqXv5q6qAqorgv82K\n2fXB7Kln1twAs3fF0z5ERBJi+RMRSYjlT0QkoajK3+12o7i4GOPGjUN1dTUAoKWlBU888QRKSkow\ne/ZsLFu2DM3NzUkNS0RE2ojqC99p06ahrKwMCxYsCE5TFAWPP/44CgsLAdx6g3j55Zexfv365CSl\npGs9cRyeygr4mr2AxQKoKmxOF7JL58Ixqaj3YwIsFjgmT8HwhY9GHLPnWPFmC4yTM7sk7m1NRLTb\nFHJf3VYdYtzAWNeO/R3tZ8+GnNfXemxOFzLGj8dnZ850ywcg7HPbdV5g2mf/PY/WD44Aqtrr+a1/\n560v5vWkKEDP24LfXp8yaBAURYHa1hacFjOLBQPHjYOvofHWtqSlAZ2d3dYZbj/Wj78HnZ/7eu1b\nTVitgN/fLWe414RRKLHcwL24uBjl5eXIy8vrNW///v3YtWsXduzYEVMAr7cNqiqQkzMETU3XY1rW\nKPpD9tYTx9GwcwdEZ2evxyh2O4aVPQYAYR8DAI4HpnY72EONGRgrljeAcON8ZdmPodz99ajH0UK0\n2xRpf0YUqjwTXY/VCkAB/L7o5oXJ4HhgKtLT7Wj43/1RboyOIuzHVOr5mohX146xWBS4XIMTHlOT\nc/6qqmLXrl0oLi7WYjjSgaeyImyBiM5OeCorIj4GwK1Pg32MGRgr0WyisxOX3343pnG0EO029bWv\nwopQWHGvx+8PXfzh5oXJ0PrBETTsfz/yuozCAMUP9H5NGIkm1/mvXbsWGRkZWLhwYczLdn0Hy8kZ\nokUcXZg9e3VL5O9rfH3MBwCoarf9EG5MX0tzTPsr3DgdHm/K93u029TX/oxXqtYTUjynaWTX4zWR\nCK2P9YTL3+1249KlSygvL4fFEvsfEjzto69AdluWM+S56QBblhMAIj4GFku3/RBuTFuWM6b9FW6c\ntGxXyvd7tNvU1/5MZP2pWE9Igdc33wSi1+M1ES/DnfbZtGkTqqqqsHXrVtjt9oTDkH6yS+dCCfMc\nKnY7skvnRnwMADgmT+lzzMBYiWZT7HaMWbQgzBLJE+029bWvwlKU8LPiXY/VCljDfM4LNS9MBsfk\nKRhW8mDkdRlFhP2YSj1fE0Ziff7555/v60Hr1q3Dc889h8bGRuzfvx+VlZWYOHEinn76aQwYMAB7\n9uzB7t27cfToUTz00EMxBbh5sxNCAIMGpeGzz+I4R2oA/SF72qjRGOByof3iRag3b976lCcEbE4X\nhj4yH45JRb0fE2CxwDHlgV5fbPV8fNexYhFunNEzpqV8v0e7TWH3VQQ2pwtDFyyE7+Zn8Hk8vedF\nsR6b04XBh
ZPgb73+Rb75CzD4618P/dz2mBfIYHE40HH58q1z512e39FTivB/9U1fzOspVOneXp8y\naBAsaWm3vqO4PS1mFgsGfvWrgCpubUtaWq+/RMLtR8f4e6A4Xb32rSas1u7bE+Y1Ea+uHaMoCjIy\nEv+wHdPVPsnA0z76YnZ9mDW7WXMD/Se7IU77EBGRObH8iYgkxPInIpIQy5+ISEIsfyIiCbH8iYgk\nxPInIpIQy5+ISEIsfyIiCbH8iYgkpMlPOhMRxUqru7xRfFj+RJRyPe9A5mv2omHnDgDgG0CK8LQP\nEaWcVnd5o/ix/Iko5cLdgCZlN6Yhlj8RpZ7N6YppOmmP5U9EKafVXd4ofvzCl4hSLvClLq/20Q/L\nn4h04ZhUxLLXEU/7EBFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJ\nqM/yd7vdKC4uxrhx41BdXR2cXltbi3nz5qGkpATz5s3DxYsXk5mTiIg01OfPO0ybNg1lZWVYsGBB\nt+lr1qzB/Pnz8fDDD+NPf/oTfv3rX2Pnzp1JC6q3+nfeQusHRwBVBSwWOCZPQcZX7oKnsgLVzV7A\nYgFUNam/UdL1zkdBigIMGAB0dkaVoecY1b0ekTqBnPVv/RH4/POYlw+XfWB+PsY8/Yte0y+/4kb7\n2bNfTEhPB27ejHm9WtBzvyfCrLkBg2VPT0fea2/oGkERQohoHlhcXIzy8nLk5eXB6/WipKQEJ0+e\nhNVqhd/vR2FhIQ4cOACn0xlTAK+3DaoqkJMzBE1N1+PaiGSrf+cttB4+1HuGogAhdp9it2NY2WOa\nvgH0vPNRX0JliHUMM+v5BtCr+In0FsMbQNd+tFgUuFyDE159XOf8r169imHDhsFqtQIArFYrhg4d\niqtXryYcyIhaPzgSekaY981k3JEo1J2PIgmVIdYxzKxn0bP4yXB0+qszQPdf9ez6DpaTM0THJOFV\nq2rMy/hamjXdnuqW5oQzxDOGmXXbdh1zEIUTS0do3Y9xlX9ubi4aGhrg9/uDp30aGxuRm5sb81hm\nOO0TOJceC1uWU9PtsWU5Y77FXc8M8YxhZoY9nohui/YYNcxpH5fLhfz8fOzduxcAsHfvXuTn58d8\nvt8sHJOnhJ6hKKEnJ+GORKHufBRJqAyxjmFmA/PzI/43ke7S03VdfZ9f+K5btw4HDhyAx+NBVlYW\nMjMzsW/fPtTU1GDVqlVobW2Fw+GA2+3GnXfeGXMAU3zyR+SrfXwmvtpHT4le7ROOGa72IcnFeLVP\nMj75R321T7KYpfwjYXZ9MHvqmTU30H+y63rah4iIzI3lT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0Qk\nIZY/EZGEWP5ERBJi+RMRSYjlT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0QkIZY/EZGEWP5ERBJi+RMR\nSYjlT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0QkIZY/EZGEbIkO\ncOjQIWzevBlCCAghsGzZMsyYMUOLbERElCQJlb8QAs888wzeffdd5OXl4T//+Q9+8IMfYPr06bBY\n+EeFLFpPHIensgK+Zi9sTheyS+fCMalI71hx6U/bYib9ab+H2hYAhtu+hD/5WywWXL9+HQBw/fp1\nDB06lMUvkdYTx9GwcwdEZycAwNfsRcPOHQCg+8Edq/60LWbSn/Z7qG2p/+MfAAjA7w9OM8L2JdTS\niqLg1VdfxZNPPompU6fiJz/5Cdxut1bZyAQ8lRXBAz1AdHbCU1mhU6L49adtMZP+tN9DbQv8vmDx\nBxhh+xL65O/z+bBt2za8/vrrKCgowL/+9S+sWLEC+/btw6BBg6Iaw+UaHPx3Ts6QROLoStbs1S3N\nIaf7WppTsk+0XEeqt8Wsx4zWuVO535O9z8NtSyixbp/W2RMq/7Nnz6KxsRE
FBQUAgIKCAqSnp6Om\npgbjx4+Pagyvtw2qKpCTMwRNTdcTiaMbmbPbspzwNXtDTk/2PtF6v6dyW8x6zCQjd6r2eyr2ebht\nCffYaPN0zW6xKN0+NMcrodM+w4cPR319PS5cuAAAqKmpgdfrxZgxYxIORuaQXToXit3ebZpitwe/\n5DKT/rQtZtKf9nuobYHVBlit3SYZYfsS+uSfk5OD559/HsuXL4eiKACA9evXIzMzU5NwZHyBL6yM\ndiVDPPrTtphJf9rv4bYl1DS9t08RQgg9A/C0j76YXR9mzW7W3ED/yW6I0z5ERGROLH8iIgmx/ImI\nJMTyJyKSEMufiEhCLH8iIgmx/ImIJMTyJyKSEMufiEhCLH8iIgklfDOXVKt+6sfAzZvdpgV+K+Oz\n/55H6wdHAFUFLBY4Jk9B6+FDyc+U9DUkD7Prw6zZzZob0DG7ogBCABYLoKr8bZ+AWH7bJ1TxBwV2\nMBGRwSl2O4aVPRb1GwB/2ydc8QMsfiIyDSPcyctc5U9E1E9Ee9OXZGH5ExHpwOZ06bp+c5V/enr4\nebdvJkNEZHRGuJOXqco/77U3Qr4B2JwuDF/yBBwPTL31jTpw62qfB6amOCERUQ+BD6a3u8nmdMX0\nZW+ymOpqH6Nidn0we+qZNTfQf7LLebUPERFpguVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8R\nkYRY/kREEmL5ExFJKOGbuXR0dGD9+vX4xz/+gbS0NEyYMAFr167VIhsRESVJwuW/ceNGpKWlYf/+\n/VAUBR6PR4tcRESm13riODyVFfA1ew1zB6+AhMr/xo0b2LNnD44cOQLl9o8XZWdnaxKMiMjMWk8c\nR8POHRCdnQBu/X5/w84dAGCIN4CEzvnX1dUhMzMTW7ZsQWlpKRYtWoTTp09rlY2IyLQ8lRXB4g8w\nwh28AhL65O/3+1FXV4e7774bv/jFL/DJJ5/gRz/6Ed5//30MHhzdr851/XW6nJwhicTRFbPrg9lT\nz6y5gdRmr25pDjnd19IcVw6tsydU/rm5ubDZbJg1axYA4N5770VWVhZqa2txzz33RDUGf9JZX8yu\nD7NmN2tuIPXZbVnOkLdqtGU5Y85huJ90djqdKCwsxLFjxwAAtbW18Hq9GDt2bMLBiIjMLLt0LhS7\nvds0I9zBKyDhq31+85vfYPXq1XC73bDZbPjtb38Lh8OhRTYiItMKfKnbL6/2AYDRo0fj7bff1iIL\nEVG/4phUZJiy74n/hy8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8R\nkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVP\nRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJSLPy37JlC8aNG4fq6mqthiQi\noiSxaTHIp59+io8//hgjR47UYriYtZ44Dk9lBXzNXticLmSXzoVjUlGf8+Id8/IrbrSfPRt8bLe3\nO0UBBgwAOjsBiwVQ1eDyAIJjBubpzcxv1cyeembNDRg3e97/7NBlvQmXf2dnJ1544QW88sorKCsr\n0yJTTFpPHEfDzh0QnZ0AAF+zFw07dwTnh5sX6Q0g0pjXjv29W/H3IsSt4geC5e5r9qL+j38AIAC/\nv9s8IpJb9eOP6fIGkHD5b968GXPmzMGoUaO0yBMzT2VFsKQDRGcnPJUVwX+Hmhep/CON6Wv2xhfU\n74tvOSKiJEio/D/66CNUVVVh5cqVcY/hcg0O/jsnZ0jMy1e3NIec7gszPTAv0rriGZOIKF7RdF88\n/RhJQuV/6tQp1NTUYNq0aQCA+vp6LFmyBBs2bMD9998f1RhebxtUVSAnZwiamq7HnMGW5Qz5adyW\n5QSAsPMirSvSmHF/8iciCqOv7uvajxa
L0u1Dc7wSutpn6dKlOHr0KA4ePIiDBw9i+PDh2L59e9TF\nr4Xs0rlQ7PZu0xS7HdmlcyPOi3fMgfn58QW12gCrNb5liYg0psnVPnoKnLuPdEVPrFf7RBrTMamo\n19U+3Zjsah8i0pdeV/soQgihy5pvS/S0jxEwuz6YPfXMmhvoP9kNcdqHiIjMieVPRCQhlj8RkYRY\n/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQh\nlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJ\niOVPRCQhlj8RkYRsiSzc0tKCZ555BpcvX4bdbsfYsWPxwgsvwOl0apWPiIiSIKFP/oqi4PHHH8f+\n/fvx3nvvYfTo0Xj55Ze1ykZEREmSUPlnZmaisLAw+N8TJkzAlStXEg5FRETJpQghhBYDqaqKxYsX\no7i4GGVlZVoMSURESZLQOf+u1q5di4yMDCxcuDCm5bzeNqiqQE7OEDQ1XdcqTkoxuz6YPfXMmhvo\nP9ktFgUu1+CEx9Sk/N1uNy5duoTy8nJYLLyAiIjI6BIu/02bNqGqqgpvvvkm7Ha7FpmIiCjJEir/\n8+fPY9u2bbjjjjvwyCOPAABGjRqFrVu3ahKOiIiSI6Hyv+uuu3Du3DmtshARUYrwBD0RkYRY/kRE\nEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYQ0+1VPuqX1xHF4Kivga/bC5nQh\nu3QuHJOKYl4uY/x4fHbmTMhxuj7WMngwhBAQN270uVyo9VU3ewGLBVBVzfdFslWHma6kpUF0dMDm\ndME2bCjaz50z3PaFy250Zs0NGDC7xQLH5CkYvvBRXVav2e/5x6s//aRz64njaNi5A6KzMzhPsdsx\nrOyxiG8AoZbrKTAOgD4fG2q5ruuPZn1ElBqOB6b2+QaQjJ905mkfDXkqK3oVqujshKeyIublegqM\nE81j+1p/rGMQUfK0fnBEl/XytI+GfM3emKZHOz/Wx/W1XLzjEFES6HRKkp/8NWRzumKaHu38ro+L\n9rGRxo9nDCJKEp1ugMXy11B26VwoPW5oo9jtyC6dG/NyPQXGieaxfa0/1jGIKHkck6fosl6e9tFQ\n4EvVWK/2CbVcX1ftJHK1T8/1mfVqn3DMcLUPEa/26UdX+5gRs+vDrNnNmhvoP9l5tQ8REcWN5U9E\nJCGWPxGRhFj+REQS0v1qH4tFCflvs2F2fTB76pk1N9A/smu1Dbpf7UNERKnH0z5ERBJi+RMRSYjl\nT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0QkIZY/EZGEWP5ERBJKSvm73W4UFxdj3LhxqK6uDk4/dOgQ\nvvvd7+Lhhx/GnDlzcODAgajm1dbWYt68eSgpKcG8efNw8eLFZMSOmP3w4cP43ve+h9mzZ2PhwoWo\nq6uLKp+Rs7e0tOCJJ55ASUkJZs+ejWXLlqG5uTm43Mcff4w5c+agpKQEixcvhtebvHv/xrPfA7Zs\n2dJrOaNn7+jowJo1azBjxgzMnj0bv/rVr4LzjHzMAMZ4rUY6diM99/HO0zt7bW0tFi1ahJkzZ2LW\nrFl49tln0d7eHhzz4MGDmDlzJh588EGsWLECN2/e7DuISIJTp06JK1euiKlTp4pz584JIYRQVVXc\nd999wf8+e/asmDBhgvD7/RHnCSHEokWLxJ49e4QQQuzZs0csWrQoGbHDZr927ZqYOHGiuHDhQjDD\n4sWLg8tEymfk7C0tLeLEiRPB5V966SXx7LPPCiGE8Pv9Yvr06eLUqVNCCCG2bt0qVq1aZZjsAVVV\nVWLJkiXdljND9rVr14oXX3xRqKoqhBCiqakpOM/Ix4xRXqvhjt1Iz32884yQva6uTnz66afBrMuX\nLxd
btmwRQgjR1tYmioqKRG1trRBCiNWrV4vXXnutzxxJKf+AnuU/ceJEcfr0aSGEEP/85z/FjBkz\n+pzn8XhEQUGB8Pl8QgghfD6fKCgoEF6vN5nRu2X/5JNPxHe+853gvJaWFpGXlye8Xm/EfEbP3tNf\n//pX8eijjwaXe+ihh4LzvF6vmDBhQlJzCxFb9o6ODvH9739f1NXV9VrOyNnb2tpEQUGBaGtr6zWG\n0Y8ZI75Whfji2I303Mc7zwjZe9q+fbtYvXq1EEKIv/zlL2Lp0qXBeWfOnOn2/IWTsl/1VBQFr776\nKp588klkZGTgxo0bePPNN/ucd/XqVQwbNgxWqxUAYLVaMXToUFy9ehVOpzMl2b/85S/D4/HgzJkz\nGD9+PN57771gNiFE2HyR5hkhe9cMqqpi165dKC4uDs4fMWJEcL7T6YSqqrh27RoyMzMNkX3z5s2Y\nM2cORo0a1W05o2e3Wq3IzMzEli1bcPLkSQwaNAjLly/HfffdZ/jj3el0Gu612vXYjfTcxzsvmcdM\ntNm7Zmhvb0dFRQV+/vOfA+h9vI8YMQJXr17tc90p+8LX5/Nh27ZteP3113Ho0CG88cYbWLFiBW7c\nuBFxnhEMGTIEv/vd77BhwwaUlpbC6/XC4XAED3Ijizb72rVrkZGRgYULF+qUtLdI2T/66CNUVVVh\n/vz5escMKVJ2v9+Puro63H333aisrMTKlSvx1FNPoa2tTe/YACJnN+Jr1YjHbrRize7z+fCzn/0M\nkyZNwrRp0xJad8o++Z89exaNjY0oKCgAABQUFCA9PR01NTVQFCXsvJEjR6KhoQF+vz/4wmlsbERu\nbm6qogMAioqKUFRUBADweDzYvn07xowZg5s3b4bNJ4QwdPYAt9uNS5cuoby8HBbLrc8Dubm5uHLl\nSvAxzc3NsFgsKfvk3Ff2d955BzU1NcEXQH19PZYsWYINGzYYPnt7eztsNhtmzZoFALj33nuRlZWF\n2tpajBgxwtDHTKTXsR6v1Z7HbqTnPt55RsgOAH6/HytXrsSXvvQlPPfcc8HH5ebm4uTJk8H/vnLl\nSlT7PGWf/IcPH476+npcuHABAFBTUwOv14sxY8ZEnOdyuZCfn4+9e/cCAPbu3Yv8/PyU/Qkc0NTU\nBODWn2mbNm3CI488goyMjIj5jJ4dADZt2oSqqips3boVdrs9uMzXvvY1tLe34/Tp0wCA3bt3Y+bM\nmSnNHSn70qVLcfToURw8eBAHDx7E8OHDsX37dtx///2Gz+50OlFYWIhjx44BuHUlh9frxdixYw1/\nzBjptRrq2I303Mc7zwjZVVXFqlWrYLVa8eKLL0JRvrihy7e+9S38+9//Dl5ZtXv3bnz729/uM0NS\nbuaybt06HDhwAB6PB1lZWcjMzMS+ffvw5z//Gb///e+DwX/6059i+vTpABBxXk1NDVatWoXW1lY4\nHA643W7ceeedWseOmP2Xv/wlPvzwQ3z++ef45je/idWrVyMtLa3PfEbOfv78ecyaNQt33HEHBg4c\nCAAYNWoUtm7dCgD48MMPsWbNGnR0dGDkyJHYuHEjsrOzDZG9p+LiYpSXlyMvL88U2evq6rB69Wpc\nu3YNNpsNK1aswJQpUwAY+5gBjPFajXTsRnru452nd/bDhw/jhz/8IfLy8oJ/nX/jG9/AmjVrAAB/\n+9vfsHHjRqiqivz8fLz00kvBD3jh8E5eREQS4v/hS0QkIZY/EZGEWP5ERBJi+RMRSYjlT0QkIZY/\nEZGEWP5ERBJi+RMRSej/AZusTW/jKGeJAAAAAElFTkSuQmCC\n", + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAX8AAAEBCAYAAACQbKXWAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAGxxJREFUeJzt3X1wFPXBB/Dv3p0XEuCa3CVAeLVO\nDY1TkTYOYVILEpDQCtiGmWJ5iQ4obS0WWhlFaosVBE+Uygga7EOLqAP/JEMr9CnY4cUChUJ9oXEo\noSFAZiAvdwlPCJLEu/09f8Cdebm73Mve7W5+389fuHv72+/u7X3vsrfeKkIIASIikopF7wBERJR6\nLH8iIgmx/ImIJMTyJyKSEMufiEhCLH8iIgmx/ImIJMTyJyKSEMufiEhCLH8iIgmx/ImIJMTyJyKS\nEMufiEhCNr0DtLRch6oKuFyD4PW26R0nLsyuD2ZPPbPmBvpPdotFQVbWwITH1L38VVVAVUXw32bF\n7Ppg9tQza26A2bviaR8iIgmx/ImIJMTyJyKSUFTl73a7UVxcjLFjx6K6uhoA0NLSgscffxwlJSWY\nNWsWli5diubm5qSGJSIibUT1he/UqVNRVlaG+fPnB6cpioLHHnsMhYWFAG6+QbzyyitYt25dcpJS\n0rUePwZPZQV8zV7AYgFUFTanC9mlc+CYWNT7MQEWCxyTJmPYgkcijtlzrHizBcbJmVUS97YmItpt\nCrmvbqkOMW5grKtH/472M2dCzutrPTanCxnjxuHz06e75QMQ9rntOi8w7fP/nkPrh4cBVe31/Na/\n+/aX83pSFKDnbcFvrU8ZOBCKokBtawtOi5nFggFjx8LX0HhzW9LSgM7ObusMtx/rx92Nzi98vfat\nJqxWwO/vljPca8IolFhu4F5cXIzy8nLk5eX1mrdv3z7s3LkT27dvjymA19sGVRXIyRmMpqZrMS1r\nFP0he+vxY2jYsR2is7PXYxS7HUPLHgWAsI8BAMf9U7od7KHGDIwVyxtAuHG+tvSnUO76ZtTjaCHa\nbYq0PyMKVZ6JrsdqBaAAfl9088JkcNw/BenpdjT8774oN0ZHEfZjKvV8TcSra8dYLApcrkEJj6nJ\nOX9VVbFz504UFxdrMRzpwFNZEbZARGcnPJUVER8D4OanwT7GDIyVaDbR2YlL77wX0zhaiHab+tpX\nYUUorLjX4/eHLv5w88JkaP3wMBr2fRB5XUZhgOIHer8mjEST6/zXrFmDjIwMLFiwIOZlu76D5eQM\n1iKOLsyevbol8vc1vj7mAwBUtdt+CDemr6U5pv0VbpwOjzfl+z3abeprf8YrVesJKZ7TNLLr8ZpI\nhNbHesLl73a7cfHiRZSXl8Niif0PCZ720Vcguy3LGfLcdIAtywkAER8Di6Xbfgg3pi3LGdP+CjdO\nWrYr5fs92m3qa38msv5UrCekwOubbwLR6/GaiJfhTvts3LgRVVVV2LJlC+x2e8JhSD/ZpXOghHkO\nFbsd2aVzIj4GAByTJvc5ZmCsRLMpdjtGL5wfZonkiXab+tpXYSlK+FnxrsdqBaxhPueFmhcmg2PS\nZAwteSDyuowiwn5MpZ6vCSOxPv/888/39aC1a9fiueeeQ2NjI/bt24fKykpMmDABTz31FG677Tbs\n3r0bu3btwpEjR/Dggw/GFODGjU4IAQwcmIbPP4/jHKkB9IfsaSNH4TaXC+0XLkC9cePmpzwhYHO6\nMOTheXBMLOr9mACLBY7J9/f6Yqvn47uOFYtw44yaPjXl+z3abQq7ryKwOV0YMn8BfDc+h8/j6T0v\nivXYnC4MKpwIf+u1L/PNm49B3/xm6Oe2x7xABovDgY5Ll26eO+/y/I6aXIT/q2/6cl5PoUr31vqU\ngQNhSUu7+R3FrWkxs1gw4OtfB1Rxc1vS0nr9JRJuPzrG3Q3F6
eq1bzVhtXbfnjCviXh17RhFUZCR\nkfiH7Ziu9kkGnvbRF7Prw6zZzZob6D/ZDXHah4iIzInlT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0Qk\nIZY/EZGEWP5ERBJi+RMRSYjlT0QkIU1+0pmIKFZa3eWN4sPyJ6KU63kHMl+zFw07tgMA3wBShKd9\niCjltLrLG8WP5U9EKRfuBjQpuzENsfyJKPVsTldM00l7LH8iSjmt7vJG8eMXvkSUcoEvdXm1j35Y\n/kSkC8fEIpa9jnjah4hIQix/IiIJsfyJiCTE8icikhDLn4hIQix/IiIJsfyJiCTE8icikhDLn4hI\nQn2Wv9vtRnFxMcaOHYvq6urg9NraWsydOxclJSWYO3cuLly4kMycRESkoT5/3mHq1KkoKyvD/Pnz\nu01fvXo15s2bh4ceegh/+tOf8Jvf/AY7duxIWlC91b/7Nlo/PAyoKmCxwDFpMjK+dic8lRWobvYC\nFgugqkn9jZKudz4KUhTAbgc6OqLK0HOM6l6PSJ1Azvq3/wh88UXMy4fLPiA/H6OfeqbX9EuvutF+\n5syXE9LTgRs3Yl6vFvTc74kwa27AYNnT05H3+pu6RlCEECKaBxYXF6O8vBx5eXnwer0oKSnBiRMn\nYLVa4ff7UVhYiP3798PpdMYUwOttg6oK5OQMRlPTtbg2Itnq330brYcO9p6hKECI3afY7Rha9qim\nbwA973zUl1AZYh3DzHq+AfQqfiK9xfAG0LUfLRYFLteghFcf1zn/K1euYOjQobBarQAAq9WKIUOG\n4MqVKwkHMqLWDw+HnhHmfTMZdyQKdeejSEJliHUMM+tZ9Cx+Mhyd/uoM0P1XPbu+g+XkDNYxSXjV\nqhrzMr6WZk23p7qlOeEM8YxhZt22XcccROHE0hFa92Nc5Z+bm4uGhgb4/f7gaZ/Gxkbk5ubGPJYZ\nTvsEzqXHwpbl1HR7bFnOmG9x1zNDPGOYmWGPJ6Jboj1GDXPax+VyIT8/H3v27AEA7NmzB/n5+TGf\n7zcLx6TJoWcoSujJSbgjUag7H0USKkOsY5jZgPz8iP9NpLv0dF1X3+cXvmvXrsX+/fvh8XiQlZWF\nzMxM7N27FzU1NVi5ciVaW1vhcDjgdrtxxx13xBzAFJ/8EflqH5+Jr/bRU6JX+4Rjhqt9SHIxXu2T\njE/+UV/tkyxmKf9ImF0fzJ56Zs0N9J/sup72ISIic2P5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJ\niOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kRE\nEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhW6ID\nHDx4EJs2bYIQAkIILF26FNOnT9ciGxERJUlC5S+EwNNPP4333nsPeXl5+M9//oMf/ehHmDZtGiwW\n/lEhi9bjx+CprICv2Qub04Xs0jlwTCzSO1Zc+tO2mEl/2u+htgWA4bYv4U/+FosF165dAwBcu3YN\nQ4YMYfFLpPX4MTTs2A7R2QkA8DV70bBjOwDofnDHqj9ti5n0p/0ealvq//gHAALw+4PTjLB9CbW0\noih47bXX8MQTT2DKlCn42c9+BrfbrVU2MgFPZUXwQA8QnZ3wVFbolCh+/WlbzKQ/7fdQ2wK/L1j8\nAUbYvoQ++ft8PmzduhVvvPEGCgoK8K9//QvLly/H3r17MXDgwKjGcLkGBf+dkzM4kTi6kjV7dUtz\nyOm+luaU7BMt15HqbTHrMaN17lTu92Tv83DbEkqs26d19oTK/8yZM2hsbERBQQEAoKCgAOnp6aip\nqcG4ceOiGsPrbYOqCuTkDEZT07VE4uhG5uy2LCd8zd6Q05O9T7Te76ncFrMeM8nInar9nop9Hm5b\nwj022jxds1ssSrcPzfFK6
LTPsGHDUF9fj/PnzwMAampq4PV6MXr06ISDkTlkl86BYrd3m6bY7cEv\nucykP22LmfSn/R5qW2C1AVZrt0lG2L6EPvnn5OTg+eefx7Jly6AoCgBg3bp1yMzM1CQcGV/gCyuj\nXckQj/60LWbSn/Z7uG0JNU3v7VOEEELPADztoy9m14dZs5s1N9B/shvitA8REZkTy5+ISEIsfyIi\nCbH8iYgkxPInIpIQy5+ISEIsfyIiCbH8iYgkxPInIpIQy5+ISEIJ38wl1aqf/Clw40a3aYHfyvj8\nv+fQ+uFhQFUBiwWOSZPReuhg8jMlfQ3Jw+z6MGt2s+YGdMyuKIAQgMUCqCp/2ycglt/2CVX8QYEd\nTERkcIrdjqFlj0b9BsDf9glX/ACLn4hMwwh38jJX+RMR9RPR3vQlWVj+REQ6sDlduq7fXOWfnh5+\n3q2byRARGZ0R7uRlqvLPe/3NkG8ANqcLwxY/Dsf9U25+ow7cvNrn/ikpTkhE1EPgg+mtbrI5XTF9\n2Zssprrax6iYXR/MnnpmzQ30n+xyXu1DRESaYPkTEUmI5U9EJCGWPxGRhFj+REQSYvkTEUmI5U9E\nJCGWPxGRhFj+REQSSvhmLh0dHVi3bh3+8Y9/IC0tDePHj8eaNWu0yEZEREmScPlv2LABaWlp2Ldv\nHxRFgcfj0SIXEZHptR4/Bk9lBXzNXsPcwSsgofK/fv06du/ejcOHD0O59eNF2dnZmgQjIjKz1uPH\n0LBjO0RnJ4Cbv9/fsGM7ABjiDSChc/51dXXIzMzE5s2bUVpaioULF+LUqVNaZSMiMi1PZUWw+AOM\ncAevgIQ++fv9ftTV1eGuu+7CM888g08//RQ/+clP8MEHH2DQoOh+da7rr9Pl5AxOJI6umF0fzJ56\nZs0NpDZ7dUtzyOm+lua4cmidPaHyz83Nhc1mw8yZMwEA99xzD7KyslBbW4u77747qjH4k876YnZ9\nmDW7WXMDqc9uy3KGvFWjLcsZcw7D/aSz0+lEYWEhjh49CgCora2F1+vFmDFjEg5GRGRm2aVzoNjt\n3aYZ4Q5eAQlf7fPb3/4Wq1atgtvths1mw8svvwyHw6FFNiIi0wp8qdsvr/YBgFGjRuGdd97RIgsR\nUb/imFhkmLLvif+HLxGRhFj+REQSYvkTEUmI5U9EJCGWPxGRhFj+REQSYvkTEUmI5U9EJCGWPxGR\nhFj+REQSYvkTEUmI5U9EJCGWPxGRhFj+REQSYvkTEUmI5U9EJCGWPxGRhFj+REQSYvkTEUmI5U9E\nJCGWPxGRhFj+REQSYvkTEUmI5U9EJCGWPxGRhFj+REQSYvkTEUlIs/LfvHkzxo4di+rqaq2GJCKi\nJLFpMchnn32GTz75BCNGjNBiuJi1Hj8GT2UFfM1e2JwuZJfOgWNiUZ/z4h3z0qtutJ85E3xst7c7\nRQFuuw3o7AQsFkBVg8sDCI4ZmKc3M79VM3vqmTU3YNzsef+zXZf1Jlz+nZ2deOGFF/Dqq6+irKxM\ni0wxaT1+DA07tkN0dgIAfM1eNOzYHpwfbl6kN4BIY149+vduxd+LEDeLHwiWu6/Zi/o//gGAAPz+\nbvOISG7Vjz2qyxtAwuW/adMmzJ49GyNHjtQiT8w8lRXBkg4QnZ3wVFYE/x1qXqTyjzSmr9kbX1C/\nL77liIiSIKHy//jjj1FVVYUVK1bEPYbLNSj475ycwTEvX93SHHK6L8z0wLxI64pnTCKieEXTffH0\nYyQJlf/JkydRU1ODqVOnAgDq6+uxePFirF+/Hvfdd19UY3i9bVBVgZycwWhquhZzBluWM+SncVuW\nEwDCzou0rkhjxv3Jn4gojL66r2s/WixKtw/N8Uroap8lS5bgyJEjOHDgAA4cOIBhw4Zh27ZtURe/\nFrJL50Cx27tNU+x2ZJfOiTgv3jEH5OfHF9RqA6zW+JYlItKYJlf76Clw7j7SFT2xXu0TaUz
HxKJe\nV/t0Y7KrfYhIX3pd7aMIIYQua74l0dM+RsDs+mD21DNrbqD/ZDfEaR8iIjInlj8RkYRY/kREEmL5\nExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY\n/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQh\nlj8RkYRY/kREErIlsnBLSwuefvppXLp0CXa7HWPGjMELL7wAp9OpVT4iIkqChD75K4qCxx57DPv2\n7cP777+PUaNG4ZVXXtEqGxERJUlC5Z+ZmYnCwsLgf48fPx6XL19OOBQRESWXIoQQWgykqioWLVqE\n4uJilJWVaTEkERElSULn/Ltas2YNMjIysGDBgpiW83rboKoCOTmD0dR0Tas4KcXs+mD21DNrbqD/\nZLdYFLhcgxIeU5Pyd7vduHjxIsrLy2Gx8AIiIiKjS7j8N27ciKqqKrz11luw2+1aZCIioiRLqPzP\nnTuHrVu34vbbb8fDDz8MABg5ciS2bNmiSTgiIkqOhMr/zjvvxNmzZ7XKQkREKcIT9EREEmL5ExFJ\niOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEtLsVz3pptbjx+CprICv2Qub04Xs\n0jlwTCyKebmMcePw+enTIcfp+ljLoEEQQkBcv97ncqHWV93sBSwWQFU13xfJVh1mupKWBtHRAZvT\nBdvQIWg/e9Zw2xcuu9GZNTdgwOwWCxyTJmPYgkd0Wb1mv+cfr/70k86tx4+hYcd2iM7O4DzFbsfQ\nskcjvgGEWq6nwDgA+nxsqOW6rj+a9RFRajjun9LnG0AyftKZp3005Kms6FWoorMTnsqKmJfrKTBO\nNI/ta/2xjkFEydP64WFd1svTPhryNXtjmh7t/Fgf19dy8Y5DREmg0ylJfvLXkM3piml6tPO7Pi7a\nx0YaP54xiChJdLoBFstfQ9mlc6D0uKGNYrcju3ROzMv1FBgnmsf2tf5YxyCi5HFMmqzLennaR0OB\nL1Vjvdon1HJ9XbWTyNU+Pddn1qt9wjHD1T5EvNqnH13tY0bMrg+zZjdrbqD/ZOfVPkREFDeWPxGR\nhFj+REQSYvkTEUlI96t9LBYl5L/Nhtn1weypZ9bcQP/IrtU26H61DxERpR5P+xARSYjlT0QkIZY/\nEZGEWP5ERBJi+RMRSYjlT0QkIZY/EZGEWP5ERBJi+RMRSSgp5e92u1FcXIyxY8eiuro6OP3gwYP4\n/ve/j4ceegizZ8/G/v37o5pXW1uLuXPnoqSkBHPnzsWFCxeSETti9kOHDuEHP/gBZs2ahQULFqCu\nri6qfEbO3tLSgscffxwlJSWYNWsWli5diubm5uByn3zyCWbPno2SkhIsWrQIXm/y7v0bz34P2Lx5\nc6/ljJ69o6MDq1evxvTp0zFr1iz8+te/Ds4z8jEDGOO1GunYjfTcxztP7+y1tbVYuHAhZsyYgZkz\nZ+LZZ59Fe3t7cMwDBw5gxowZeOCBB7B8+XLcuHGj7yAiCU6ePCkuX74spkyZIs6ePSuEEEJVVXHv\nvfcG//vMmTNi/Pjxwu/3R5wnhBALFy4Uu3fvFkIIsXv3brFw4cJkxA6b/erVq2LChAni/PnzwQyL\nFi0KLhMpn5Gzt7S0iOPHjweXf+mll8Szzz4rhBDC7/eLadOmiZMnTwohhNiyZYtYuXKlYbIHVFVV\nicWLF3dbzgzZ16xZI1588UWhqqoQQoimpqbgPCMfM0Z5rYY7diM99/HOM0L2uro68dlnnwWzLlu2\nTGzevFkIIURbW5soKioStbW1QgghVq1aJV5//fU+cySl/AN6lv+ECRPEqVOnhBBC/POf/xTTp0/v\nc57H4xEFBQXC5/MJIYTw+XyioKBAeL3eZEbvlv3TTz8
V3/ve94LzWlpaRF5envB6vRHzGT17T3/9\n61/FI488ElzuwQcfDM7zer1i/PjxSc0tRGzZOzo6xA9/+ENRV1fXazkjZ29raxMFBQWira2t1xhG\nP2aM+FoV4stjN9JzH+88I2Tvadu2bWLVqlVCCCH+8pe/iCVLlgTnnT59utvzF07KftVTURS89tpr\neOKJJ5CRkYHr16/jrbfe6nPelStXMHToUFitVgCA1WrFkCFDcOXKFTidzpRk/+pXvwqPx4PTp09j\n3LhxeP/994PZhBBh80WaZ4TsXTOoqoqdO3eiuLg4OH/48OHB+U6nE6qq4urVq8jMzDRE9k2bNmH2\n7NkYOXJkt+WMnt1qtSIzMxObN2/GiRMnMHDgQCxbtgz33nuv4Y93p9NpuNdq12M30nMf77xkHjPR\nZu+aob29HRUVFfjlL38JoPfxPnz4cFy5cqXPdafsC1+fz4etW7fijTfewMGDB/Hmm29i+fLluH79\nesR5RjB48GD87ne/w/r161FaWgqv1wuHwxE8yI0s2uxr1qxBRkYGFixYoFPS3iJl//jjj1FVVYV5\n8+bpHTOkSNn9fj/q6upw1113obKyEitWrMCTTz6JtrY2vWMDiJzdiK9VIx670Yo1u8/nwy9+8QtM\nnDgRU6dOTWjdKfvkf+bMGTQ2NqKgoAAAUFBQgPT0dNTU1EBRlLDzRowYgYaGBvj9/uALp7GxEbm5\nuamKDgAoKipCUVERAMDj8WDbtm0YPXo0bty4ETafEMLQ2QPcbjcuXryI8vJyWCw3Pw/k5ubi8uXL\nwcc0NzfDYrGk7JNzX9nfffdd1NTUBF8A9fX1WLx4MdavX2/47O3t7bDZbJg5cyYA4J577kFWVhZq\na2sxfPhwQx8zkV7HerxWex67kZ77eOcZITsA+P1+rFixAl/5ylfw3HPPBR+Xm5uLEydOBP/78uXL\nUe3zlH3yHzZsGOrr63H+/HkAQE1NDbxeL0aPHh1xnsvlQn5+Pvbs2QMA2LNnD/Lz81P2J3BAU1MT\ngJt/pm3cuBEPP/wwMjIyIuYzenYA2LhxI6qqqrBlyxbY7fbgMt/4xjfQ3t6OU6dOAQB27dqFGTNm\npDR3pOxLlizBkSNHcODAARw4cADDhg3Dtm3bcN999xk+u9PpRGFhIY4ePQrg5pUcXq8XY8aMMfwx\nY6TXaqhjN9JzH+88I2RXVRUrV66E1WrFiy++CEX58oYu3/nOd/Dvf/87eGXVrl278N3vfrfPDEm5\nmcvatWuxf/9+eDweZGVlITMzE3v37sWf//xn/P73vw8G//nPf45p06YBQMR5NTU1WLlyJVpbW+Fw\nOOB2u3HHHXdoHTti9l/96lf46KOP8MUXX+Db3/42Vq1ahbS0tD7zGTn7uXPnMHPmTNx+++0YMGAA\nAGDkyJHYsmULAOCjjz7C6tWr0dHRgREjRmDDhg3Izs42RPaeiouLUV5ejry8PFNkr6urw6pVq3D1\n6lXYbDYsX74ckydPBmDsYwYwxms10rEb6bmPd57e2Q8dOoQf//jHyMvLC/51/q1vfQurV68GAPzt\nb3/Dhg0boKoq8vPz8dJLLwU/4IXDO3kREUmI/4cvEZGEWP5ERBJi+RMRSYjlT0QkIZY/EZGEWP5E\nRBJi+RMRSYjlT0Qkof8Hm7xNb6groUQAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] @@ -2320,10 +2319,11 @@ "colab": {} }, "source": [ + "from cuml.preprocessing.model_selection import train_test_split\n", "#location seems to be related to building quality, (knnclassifier)\n", "\n", - "def fillna_knn( df, base, target):\n", - " data_colnames = [ target ] + base\n", + "def fillna_knn(df, base, target):\n", + " data_colnames = [target] + base\n", " #print(\"data_colnames\",data_colnames)\n", " missing_values_boolflag = df[target].isnull() #true for missing rows, false for columns with values\n", " #print(\"miss\",missing_values_boolflag.head())\n", @@ -2331,11 +2331,12 @@ " #print(\"not miss\",not_missing_boolflag.head())\n", " number_of_missing_val = missing_values_boolflag.sum()\n", " print(\"# of miss\",number_of_missing_val)\n", - " not_missing_rows = df.loc[ not_missing_boolflag, data_colnames ]\n", + " not_missing_rows = df.loc[not_missing_boolflag, data_colnames]\n", " #print(not_missing_rows.head())\n", " Y = not_missing_rows[target]\n", " X = not_missing_rows[base]\n", - " X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=3192,stratify=Y)\n", + " #X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=3192,stratify=Y)\n", + " X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8)\n", " metrics = ['euclidean'] \n", " weights = ['distance'] \n", " numNeighbors = [5,10,15,20,25]\n", @@ -2360,43 +2361,13 @@ { "cell_type": "code", "metadata": { - "id": "AT8Osn51lD9v", - "colab_type": "code", - "outputId": "9a3af301-2c19-4bfd-faca-3dba219a270c", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 50 - } - }, - "source": [ - "print(df_train.buildingqualitytypeid.isnull().sum())\n", - "print(df_train.shape)\n", - "temp=df_train.copy()\n", - "temp['buildingqualitytypeid']=temp['buildingqualitytypeid'].fillna(-1)" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - "32911\n", - "(90275, 45)\n" - ], - "name": 
"stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "f8rNxkrxACGe", + "id": "6eES-hq--NKZ", "colab_type": "code", "colab": {} }, "source": [ - "\"\"\"RESET WIRE\"\"\"\n", - "# hold_df = df_train.copy()\n", - "df_train = hold_df.copy()" + "# test = df_train.copy()\n", + "df_train = test.copy()" ], "execution_count": 0, "outputs": [] @@ -2404,35 +2375,56 @@ { "cell_type": "code", "metadata": { - "id": "OkyuebKaACxa", + "id": "AT8Osn51lD9v", "colab_type": "code", - "outputId": "d0dc876b-b02f-4179-91d0-d9a9b42e0e27", + "outputId": "83435ba5-0887-47fb-f8fb-ceeb9dd92fda", "colab": { "base_uri": "https://localhost:8080/", - "height": 185 + "height": 573 } }, "source": [ - "\n", - "print(df_train.buildingqualitytypeid.isnull().sum())\n", - "print(df_train.shape)\n", - "temp=df_train.copy()\n", - "temp['buildingqualitytypeid']=temp['buildingqualitytypeid'].fillna(-1)\n", - "print(temp.to_pandas().head())\n" + "print('CURRENT DF SITUATION\\n')\n", + "print(f'SHAPE = {df_train.shape}')\n", + "print(f'NULL COUNT = {df_train.buildingqualitytypeid.isnull().sum()}\\nVALUE COUNTS\\n{df_train.buildingqualitytypeid.value_counts()}\\n')\n", + "print(f'BUILDINGTYPEID HEAD\\n{df_train.buildingqualitytypeid.head()}\\n')\n", + "print(f'DF TRAIN HEAD\\n{df_train.head()}')" ], - "execution_count": 0, + "execution_count": 49, "outputs": [ { "output_type": "stream", "text": [ - "32911\n", - "(90275, 45)\n", - " parcelid logerror ac_id ... transaction_month census_tractnumber block_number\n", - "0 11827818 0.0402 NaN ... 3 5315.03 1013\n", - "1 12123024 0.0296 NaN ... 3 4625.00 1017\n", - "2 13867327 0.0344 NaN ... 3 0114.01 2017\n", - "3 12681894 0.0060 NaN ... 3 6513.02 1004\n", - "4 12848541 0.0695 1.0 ... 
3 4087.03 1018\n", + "CURRENT DF SITUATION\n", + "\n", + "SHAPE = (90275, 45)\n", + "NULL COUNT = 32911\n", + "VALUE COUNTS\n", + "7.0 29310\n", + "4.0 23839\n", + "1.0 2627\n", + "10.0 1461\n", + "12.0 119\n", + "8.0 5\n", + "6.0 2\n", + "11.0 1\n", + "Name: buildingqualitytypeid, dtype: int32\n", + "\n", + "BUILDINGTYPEID HEAD\n", + "0 7.0\n", + "1 null\n", + "2 null\n", + "3 7.0\n", + "4 4.0\n", + "Name: buildingqualitytypeid, dtype: float64\n", + "\n", + "DF TRAIN HEAD\n", + " parcelid logerror ac_id ... transaction_month census_tractnumber block_number\n", + "0 11827818 0.0402 null ... 3 5315.03 1013\n", + "1 12123024 0.0296 null ... 3 4625.00 1017\n", + "2 13867327 0.0344 null ... 3 0114.01 2017\n", + "3 12681894 0.0060 null ... 3 6513.02 1004\n", + "4 12848541 0.0695 1.0 ... 3 4087.03 1018\n", "\n", "[5 rows x 45 columns]\n" ], @@ -2445,50 +2437,48 @@ "metadata": { "id": "79bB7JKdAEtX", "colab_type": "code", - "outputId": "29f38a6a-dac2-4917-8f1b-8a4b198afe67", + "outputId": "b1b1e940-e89a-40e8-c5af-5919c896ca19", "colab": { "base_uri": "https://localhost:8080/", - "height": 118 + "height": 225 } }, "source": [ - "print(temp.to_pandas().buildingqualitytypeid.head())" + "temp=df_train.copy()\n", + "temp['buildingqualitytypeid'] = temp['buildingqualitytypeid'].fillna(-1)\n", + "print(f'NULL COUNT = {temp.buildingqualitytypeid.isnull().sum()}\\nVALUE COUNTS\\n{temp.buildingqualitytypeid.value_counts()}')" ], - "execution_count": 0, + "execution_count": 50, "outputs": [ { "output_type": "stream", "text": [ - "0 7.0\n", - "1 -1.0\n", - "2 -1.0\n", - "3 7.0\n", - "4 4.0\n", - "Name: buildingqualitytypeid, dtype: float64\n" + "NULL COUNT = 0\n", + "VALUE COUNTS\n", + "-1.0 32911\n", + " 7.0 29310\n", + " 4.0 23839\n", + " 1.0 2627\n", + " 10.0 1461\n", + " 12.0 119\n", + " 8.0 5\n", + " 6.0 2\n", + " 11.0 1\n", + "Name: buildingqualitytypeid, dtype: int32\n" ], "name": "stdout" } ] }, - { - "cell_type": "markdown", - "metadata": { - "id": "DVgF1c_p_bN1", - 
"colab_type": "text" - }, - "source": [ - "# -----current: break-----" - ] - }, { "cell_type": "code", "metadata": { "id": "mAB9bsrPAGzQ", "colab_type": "code", - "outputId": "2f9eaa73-a7b2-4634-e24d-9aec777b2536", + "outputId": "ff5376d3-6854-4d05-a7c1-7ffe0a6136a4", "colab": { "base_uri": "https://localhost:8080/", - "height": 387 + "height": 347 } }, "source": [ @@ -2499,157 +2489,139 @@ "# conditions = (temp.buildingqualitytypeid.value_counts > 3)\n", "# print(temp.loc[temp.buildingqualitytypeid.astype(int) > 3].head())\n", "# temp.loc[temp.census_tractnumber.value_counts() > 3]\n", - "print(temp.loc[temp.census_tractnumber.value_counts().values > 3].to_pandas().head())\n", + "# print(temp.loc[temp.census_tractnumber.value_counts().values > 3].to_pandas().head())\n", "\n", - "# temp = temp.loc[]\n", - "print(temp.to_pandas().buildingqualitytypeid.head())\n" + "\"\"\"still working on how to best do this in RAPIDS\n", + "\"\"\"\n", + "print(f'{temp.buildingqualitytypeid.value_counts()}\\n')\n", + "temp = temp.to_pandas()\n", + "temp = temp.groupby(\"buildingqualitytypeid\").filter(lambda x: x.buildingqualitytypeid.size > 3)\n", + "temp = cudf.from_pandas(temp)\n", + "print(temp.buildingqualitytypeid.value_counts())" ], - "execution_count": 0, + "execution_count": 51, "outputs": [ { - "output_type": "error", - "ename": "RuntimeError", - "evalue": "ignored", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m 
\u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtemp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mtemp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcensus_tractnumber\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue_counts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_pandas\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# temp = temp.loc[]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtemp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_pandas\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuildingqualitytypeid\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/indexing.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, arg)\u001b[0m\n\u001b[1;32m 108\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtuple\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 109\u001b[0m \u001b[0marg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mslice\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 110\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_tuple_arg\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 111\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/indexing.py\u001b[0m in \u001b[0;36m_getitem_tuple_arg\u001b[0;34m(self, arg)\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 211\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mcol\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcolumns_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 212\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcolumns_df\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 213\u001b[0m \u001b[0;31m# Step 3: Gather 
index\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# we have a single row\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/indexing.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, arg)\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__getitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[0marg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_loc_to_iloc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 56\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 57\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/indexing.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, arg)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtuple\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0marg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 38\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 39\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/series.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, arg)\u001b[0m\n\u001b[1;32m 390\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 391\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__getitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 392\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 393\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 394\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_scalar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/column.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, arg)\u001b[0m\n\u001b[1;32m 530\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 531\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapi\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtypes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_bool_dtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 532\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_boolean_mask\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 533\u001b[0m \u001b[0;32mraise\u001b[0m 
\u001b[0mNotImplementedError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 534\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/columnops.py\u001b[0m in \u001b[0;36mapply_boolean_mask\u001b[0;34m(self, mask)\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[0mmask\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mas_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"bool\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 118\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mapply_apply_boolean_mask\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 119\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 120\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcolumn_empty_like\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnewsize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32mcudf/bindings/stream_compaction.pyx\u001b[0m in \u001b[0;36mcudf.bindings.stream_compaction.apply_apply_boolean_mask\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mRuntimeError\u001b[0m: cuDF failure at: /conda/conda-bld/libcudf_1566412619056/work/cpp/src/stream_compaction/apply_boolean_mask.cu:64: Column size mismatch" - ] + "output_type": "stream", + "text": 
[ + "-1.0 32911\n", + " 7.0 29310\n", + " 4.0 23839\n", + " 1.0 2627\n", + " 10.0 1461\n", + " 12.0 119\n", + " 8.0 5\n", + " 6.0 2\n", + " 11.0 1\n", + "Name: buildingqualitytypeid, dtype: int32\n", + "\n", + "-1.0 32911\n", + " 7.0 29310\n", + " 4.0 23839\n", + " 1.0 2627\n", + " 10.0 1461\n", + " 12.0 119\n", + " 8.0 5\n", + "Name: buildingqualitytypeid, dtype: int32\n" + ], + "name": "stdout" } ] }, { "cell_type": "code", "metadata": { - "id": "QCyed1SjAJFP", - "colab_type": "code", - "colab": {} - }, - "source": [ - "print(temp.to_pandas().head())\n" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "1JgQ1Tq2NRsz", + "id": "uCyRxp-7qEXf", "colab_type": "code", - "outputId": "c113cc08-3a69-4aa1-d05e-7b4d2a5df9fa", + "outputId": "629f0745-3a63-4bd8-aa10-835a94450cb6", "colab": { "base_uri": "https://localhost:8080/", - "height": 162 + "height": 52 } }, "source": [ - "df_train.loc[df_train.buildingqualitytypeid>3]" + "temp['buildingqualitytypeid'] = temp['buildingqualitytypeid'].replace(-1,np.nan)\n", + "print(temp.buildingqualitytypeid.isnull().sum())\n", + "print(temp.shape)" ], - "execution_count": 0, + "execution_count": 52, "outputs": [ { - "output_type": "error", - "ename": "NameError", - "evalue": "ignored", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuildingqualitytypeid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m>\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mNameError\u001b[0m: name 'buildingqualitytypeid' is not defined" - ] + 
"output_type": "stream", + "text": [ + "32911\n", + "(90272, 45)\n" + ], + "name": "stdout" } ] }, { - "cell_type": "code", + "cell_type": "markdown", "metadata": { - "id": "XFkPwjUmHu4Y", - "colab_type": "code", - "outputId": "00b5fdb3-25fc-460a-bbd3-aaa421a93555", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 279 - } + "id": "DVgF1c_p_bN1", + "colab_type": "text" }, "source": [ - "temp=temp.groupby(\"buildingqualitytypeid\").filter(lambda x: x.buildingqualitytypeid.size > 3)\n" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "error", - "ename": "AttributeError", - "evalue": "ignored", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtemp\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtemp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"buildingqualitytypeid\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfilter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuildingqualitytypeid\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/groupby/groupby.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 133\u001b[0m )\n\u001b[1;32m 134\u001b[0m raise AttributeError(\n\u001b[0;32m--> 135\u001b[0;31m \u001b[0;34m\"'DataFrameGroupBy' object has no attribute \"\u001b[0m 
\u001b[0;34m\"'{}'\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 136\u001b[0m )\n\u001b[1;32m 137\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mAttributeError\u001b[0m: 'DataFrameGroupBy' object has no attribute 'filter'" - ] - } + "# -----current: break-----\n", + "- below is last cell run" ] }, { "cell_type": "code", "metadata": { - "id": "uCyRxp-7qEXf", + "id": "Q3ZBSOHm-79A", "colab_type": "code", - "outputId": "969848f0-fbc6-4388-dca2-08f8bde03990", + "outputId": "3da3e840-8d13-426a-e0aa-8ae20679326b", "colab": { "base_uri": "https://localhost:8080/", - "height": 380 + "height": 394 } }, "source": [ - "\n", - "temp['buildingqualitytypeid'] = temp['buildingqualitytypeid'].replace(-1,np.nan)\n", - "print(temp.buildingqualitytypeid.isnull().sum())\n", - "print(temp.shape)\n", - "\n", - "missing_values=fillna_knn(temp,\n", - " base = [ 'latitude', 'longitude' ] ,\n", - " target = 'buildingqualitytypeid')\n", + "missing_values = fillna_knn(temp, \n", + " base = ['latitude', 'longitude'], \n", + " target = 'buildingqualitytypeid')\n", "\n", "print(\"predicted output shape\",missing_values.shape)\n", "missing_values_boolflag = df_train['buildingqualitytypeid'].isnull()\n", - "df_train.loc[ missing_values_boolflag, 'buildingqualitytypeid' ] = missing_values\n", + "df_train.loc[missing_values_boolflag, 'buildingqualitytypeid'] = missing_values\n", "\n", "print(df_train.buildingqualitytypeid.isnull().sum())" ], - "execution_count": 0, + "execution_count": 53, "outputs": [ { "output_type": "stream", "text": [ - "32911\n", - "(90275, 45)\n" + "# of miss 32911\n" ], "name": "stdout" }, { "output_type": "error", - "ename": "AttributeError", + "ename": "NameError", "evalue": "ignored", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - 
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'buildingqualitytypeid'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtemp\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'buildingqualitytypeid'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mtemp\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtemp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"buildingqualitytypeid\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfilter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuildingqualitytypeid\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'buildingqualitytypeid'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mtemp\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'buildingqualitytypeid'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnan\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtemp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuildingqualitytypeid\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misnull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/groupby/groupby.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 133\u001b[0m )\n\u001b[1;32m 134\u001b[0m raise AttributeError(\n\u001b[0;32m--> 135\u001b[0;31m \u001b[0;34m\"'DataFrameGroupBy' object has no attribute \"\u001b[0m \u001b[0;34m\"'{}'\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 136\u001b[0m )\n\u001b[1;32m 137\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mAttributeError\u001b[0m: 'DataFrameGroupBy' object has no attribute 'filter'" + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m missing_values = fillna_knn(temp, \n\u001b[1;32m 2\u001b[0m \u001b[0mbase\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'latitude'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'longitude'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 
3\u001b[0;31m target = 'buildingqualitytypeid')\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"predicted output shape\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mmissing_values\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36mfillna_knn\u001b[0;34m(df, base, target)\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mnumNeighbors\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m15\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m20\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m25\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0mparam_grid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmetric\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmetrics\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mweights\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mweights\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mn_neighbors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnumNeighbors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 23\u001b[0;31m \u001b[0mcv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mStratifiedKFold\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn_splits\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m3192\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mshuffle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 24\u001b[0m \u001b[0mgrid\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mGridSearchCV\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mneighbors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mKNeighborsClassifier\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mparam_grid\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparam_grid\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mcv\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcv\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mscoring\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'f1_weighted'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mrefit\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mreturn_train_score\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mverbose\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mpre_dispatch\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'n_jobs'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0mgrid\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m \u001b[0;34m,\u001b[0m\u001b[0mY_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'StratifiedKFold' is not defined" ] } ] }, + { + "cell_type": "markdown", + "metadata": { + "id": "bgXh5OATEacY", + "colab_type": "text" + }, + "source": [ + "# BELOW NOT RUN" + ] + }, { "cell_type": "code", "metadata": { @@ -3727,6 +3699,47 @@ ], "execution_count": 0, "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WzATgLxmam5w", + "colab_type": "text" + }, + "source": [ + "In this competition, Zillow is asking you to predict the log-error between their Zestimate and the actual sale price, given all the features of a home. 
The log error is defined as\n", + "\n", + "logerror=log(Zestimate)−log(SalePrice)\n", + "and it is recorded in the transactions file train.csv. In this competition, you are going to predict the logerror for the months in Fall 2017. Since all the real estate transactions in the U.S. are publicly available, we will close the competition (no longer accepting submissions) before the evaluation period begins.\n", + "\n", + "Train/Test split\n", + "You are provided with a full list of real estate properties in three counties (Los Angeles, Orange and Ventura, California) data in 2016.\n", + "The train data has all the transactions before October 15, 2016, plus some of the transactions after October 15, 2016.\n", + "The test data in the public leaderboard has the rest of the transactions between October 15 and December 31, 2016.\n", + "The rest of the test data, which is used for calculating the private leaderboard, is all the properties in October 15, 2017, to December 15, 2017. This period is called the \"sales tracking period\", during which we will not be taking any submissions.\n", + "You are asked to predict 6 time points for all properties: October 2016 (201610), November 2016 (201611), December 2016 (201612), October 2017 (201710), November 2017 (201711), and December 2017 (201712).\n", + "Not all the properties are sold in each time period. If a property was not sold in a certain time period, that particular row will be ignored when calculating your score.\n", + "If a property is sold multiple times within 31 days, we take the first reasonable value as the ground truth. By \"reasonable\", we mean if the data seems wrong, we will take the transaction that has a value that makes more sense.\n", + "File descriptions\n", + "properties_2016.csv - all the properties with their home features for 2016. Note: Some 2017 new properties don't have any data yet except for their parcelid's. 
Those data points should be populated when properties_2017.csv is available.\n", + "properties_2017.csv - all the properties with their home features for 2017 (released on 10/2/2017)\n", + "train_2016.csv - the training set with transactions from 1/1/2016 to 12/31/2016\n", + "train_2017.csv - the training set with transactions from 1/1/2017 to 9/15/2017 (released on 10/2/2017)\n", + "sample_submission.csv - a sample submission file in the correct format" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "R0yrYUf7anN0", + "colab_type": "code", + "colab": {} + }, + "source": [ + "" + ], + "execution_count": 0, + "outputs": [] } ] } \ No newline at end of file From 189711a1a39f4cde39b1c2aa9280d0a039c614eb Mon Sep 17 00:00:00 2001 From: gumdropsteve Date: Mon, 9 Sep 2019 08:13:24 -0700 Subject: [PATCH 4/7] run through as is; data string conversion and current .filter workaround issues displayed/labeled; removed rapids and kaggle install logs (first few cell outputs) for readability --- .../zillow_kaggle_zestimate_comp.ipynb | 1077 ++++++++--------- 1 file changed, 533 insertions(+), 544 deletions(-) diff --git a/colab_notebooks/zillow_kaggle_zestimate_comp.ipynb b/colab_notebooks/zillow_kaggle_zestimate_comp.ipynb index f05586f5..c8d68291 100644 --- a/colab_notebooks/zillow_kaggle_zestimate_comp.ipynb +++ b/colab_notebooks/zillow_kaggle_zestimate_comp.ipynb @@ -43,11 +43,11 @@ "metadata": { "id": "W-um5d-x7o46", "colab_type": "code", - "outputId": "a3d473ea-3028-49fb-b769-c78616b388ae", "colab": { "base_uri": "https://localhost:8080/", "height": 312 - } + }, + "outputId": "a604e66b-95d7-44fb-f8d3-848fcedaf796" }, "source": [ "\"\"\"make sure we have the right GPU\n", @@ -61,7 +61,7 @@ { "output_type": "stream", "text": [ - "Thu Sep 5 06:04:00 2019 \n", + "Mon Sep 9 14:17:51 2019 \n", "+-----------------------------------------------------------------------------+\n", "| NVIDIA-SMI 430.40 Driver Version: 418.67 CUDA Version: 10.1 |\n", 
"|-------------------------------+----------------------+----------------------+\n", @@ -69,7 +69,7 @@ "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", "|===============================+======================+======================|\n", "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", - "| N/A 39C P8 10W / 70W | 0MiB / 15079MiB | 0% Default |\n", + "| N/A 68C P0 28W / 70W | 0MiB / 15079MiB | 0% Default |\n", "+-------------------------------+----------------------+----------------------+\n", " \n", "+-----------------------------------------------------------------------------+\n", @@ -98,11 +98,7 @@ "metadata": { "id": "p129YxxnihcV", "colab_type": "code", - "outputId": "ce0d1990-45c5-4c91-d1f2-86cedd666bbc", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - } + "colab": {} }, "source": [ "!wget -nc https://raw.githubusercontent.com/rapidsai/notebooks-contrib/master/utils/rapids-colab.sh\n", @@ -115,257 +111,8 @@ "os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'\n", "os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'" ], - "execution_count": 2, - "outputs": [ - { - "output_type": "stream", - "text": [ - "--2019-09-05 06:04:07-- https://raw.githubusercontent.com/rapidsai/notebooks-contrib/master/utils/rapids-colab.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 1609 (1.6K) [text/plain]\n", - "Saving to: ‘rapids-colab.sh’\n", - "\n", - "\rrapids-colab.sh 0%[ ] 0 --.-KB/s \rrapids-colab.sh 100%[===================>] 1.57K --.-KB/s in 0s \n", - "\n", - "2019-09-05 06:04:08 (510 MB/s) - ‘rapids-colab.sh’ saved [1609/1609]\n", - "\n", - "--2019-09-05 06:04:09-- https://github.com/rapidsai/notebooks-extended/raw/master/utils/env-check.py\n", - "Resolving github.com (github.com)... 13.114.40.48\n", - "Connecting to github.com (github.com)|13.114.40.48|:443... connected.\n", - "HTTP request sent, awaiting response... 301 Moved Permanently\n", - "Location: https://github.com/rapidsai/notebooks-contrib/raw/master/utils/env-check.py [following]\n", - "--2019-09-05 06:04:09-- https://github.com/rapidsai/notebooks-contrib/raw/master/utils/env-check.py\n", - "Reusing existing connection to github.com:443.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://raw.githubusercontent.com/rapidsai/notebooks-contrib/master/utils/env-check.py [following]\n", - "--2019-09-05 06:04:10-- https://raw.githubusercontent.com/rapidsai/notebooks-contrib/master/utils/env-check.py\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 783 [text/plain]\n", - "Saving to: ‘env-check.py’\n", - "\n", - "env-check.py 100%[===================>] 783 --.-KB/s in 0s \n", - "\n", - "2019-09-05 06:04:10 (162 MB/s) - ‘env-check.py’ saved [783/783]\n", - "\n", - "Checking for GPU type:\n", - "*********************************************\n", - "Woo! 
Your instance has the right kind of GPU!\n", - "*********************************************\n", - "\n", - "Removing conflicting packages, will replace with RAPIDS compatible versions\n", - "Uninstalling xgboost-0.90:\n", - " Successfully uninstalled xgboost-0.90\n", - "Uninstalling dask-1.1.5:\n", - " Successfully uninstalled dask-1.1.5\n", - "Uninstalling distributed-1.25.3:\n", - " Successfully uninstalled distributed-1.25.3\n", - "Installing conda\n", - "--2019-09-05 06:04:14-- https://repo.continuum.io/miniconda/Miniconda3-4.5.4-Linux-x86_64.sh\n", - "Resolving repo.continuum.io (repo.continuum.io)... 104.18.200.79, 104.18.201.79, 2606:4700::6812:c94f, ...\n", - "Connecting to repo.continuum.io (repo.continuum.io)|104.18.200.79|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 58468498 (56M) [application/x-sh]\n", - "Saving to: ‘Miniconda3-4.5.4-Linux-x86_64.sh’\n", - "\n", - "Miniconda3-4.5.4-Li 100%[===================>] 55.76M 65.1MB/s in 0.9s \n", - "\n", - "2019-09-05 06:04:15 (65.1 MB/s) - ‘Miniconda3-4.5.4-Linux-x86_64.sh’ saved [58468498/58468498]\n", - "\n", - "PREFIX=/usr/local\n", - "installing: python-3.6.5-hc3d631a_2 ...\n", - "Python 3.6.5 :: Anaconda, Inc.\n", - "installing: ca-certificates-2018.03.07-0 ...\n", - "installing: conda-env-2.6.0-h36134e3_1 ...\n", - "installing: libgcc-ng-7.2.0-hdf63c60_3 ...\n", - "installing: libstdcxx-ng-7.2.0-hdf63c60_3 ...\n", - "installing: libffi-3.2.1-hd88cf55_4 ...\n", - "installing: ncurses-6.1-hf484d3e_0 ...\n", - "installing: openssl-1.0.2o-h20670df_0 ...\n", - "installing: tk-8.6.7-hc745277_3 ...\n", - "installing: xz-5.2.4-h14c3975_4 ...\n", - "installing: yaml-0.1.7-had09818_2 ...\n", - "installing: zlib-1.2.11-ha838bed_2 ...\n", - "installing: libedit-3.1.20170329-h6b74fdf_2 ...\n", - "installing: readline-7.0-ha6073c6_4 ...\n", - "installing: sqlite-3.23.1-he433501_0 ...\n", - "installing: asn1crypto-0.24.0-py36_0 ...\n", - "installing: certifi-2018.4.16-py36_0 
...\n", - "installing: chardet-3.0.4-py36h0f667ec_1 ...\n", - "installing: idna-2.6-py36h82fb2a8_1 ...\n", - "installing: pycosat-0.6.3-py36h0a5515d_0 ...\n", - "installing: pycparser-2.18-py36hf9f622e_1 ...\n", - "installing: pysocks-1.6.8-py36_0 ...\n", - "installing: ruamel_yaml-0.15.37-py36h14c3975_2 ...\n", - "installing: six-1.11.0-py36h372c433_1 ...\n", - "installing: cffi-1.11.5-py36h9745a5d_0 ...\n", - "installing: setuptools-39.2.0-py36_0 ...\n", - "installing: cryptography-2.2.2-py36h14c3975_0 ...\n", - "installing: wheel-0.31.1-py36_0 ...\n", - "installing: pip-10.0.1-py36_0 ...\n", - "installing: pyopenssl-18.0.0-py36_0 ...\n", - "installing: urllib3-1.22-py36hbe7ace6_0 ...\n", - "installing: requests-2.18.4-py36he2e5f8d_1 ...\n", - "installing: conda-4.5.4-py36_0 ...\n", - "installation finished.\n", - "WARNING:\n", - " You currently have a PYTHONPATH environment variable set. This may cause\n", - " unexpected behavior when running the Python interpreter in Miniconda3.\n", - " For best results, please verify that your PYTHONPATH only points to\n", - " directories of packages that are compatible with the Python interpreter\n", - " in Miniconda3: /usr/local\n", - "Installing RAPIDS 0.10 packages\n", - "Please standby, this will take a few minutes...\n", - "\n", - "\n", - "==> WARNING: A newer version of conda exists. 
<==\n", - " current version: 4.5.4\n", - " latest version: 4.7.11\n", - "\n", - "Please update conda by running\n", - "\n", - " $ conda update -n base conda\n", - "\n", - "\n", - "dask-cuda-0.10.0a | 921 KB | : 100% 1.0/1 [00:02<00:00, 2.81s/it] \n", - "jpeg-9c | 251 KB | : 100% 1.0/1 [00:00<00:00, 8.31it/s]\n", - "ca-certificates-2019 | 145 KB | : 100% 1.0/1 [00:00<00:00, 10.93it/s]\n", - "joblib-0.13.2 | 180 KB | : 100% 1.0/1 [00:00<00:00, 7.30it/s]\n", - "blinker-1.4 | 13 KB | : 100% 1.0/1 [00:00<00:00, 13.37it/s]\n", - "dask-core-2.3.0 | 574 KB | : 100% 1.0/1 [00:00<00:00, 4.16it/s] \n", - "cudf-0.10.0a | 4.7 MB | : 100% 1.0/1 [00:01<00:00, 1.74s/it] \n", - "pyasn1-modules-0.2.6 | 47 KB | : 100% 1.0/1 [00:00<00:00, 10.35it/s]\n", - "jinja2-2.10.1 | 91 KB | : 100% 1.0/1 [00:00<00:00, 11.34it/s]\n", - "grpc-cpp-1.23.0 | 4.5 MB | : 100% 1.0/1 [00:01<00:00, 1.05s/it] \n", - "boost-cpp-1.70.0 | 21.1 MB | : 100% 1.0/1 [00:08<00:00, 8.53s/it] \n", - "idna-2.8 | 132 KB | : 100% 1.0/1 [00:00<00:00, 11.10it/s]\n", - "numba-0.45.1 | 3.1 MB | : 100% 1.0/1 [00:00<00:00, 1.04it/s] \n", - "numpy-1.17.1 | 5.2 MB | : 100% 1.0/1 [00:01<00:00, 1.13s/it] \n", - "yaml-0.1.7 | 78 KB | : 100% 1.0/1 [00:00<00:00, 12.23it/s]\n", - "click-7.0 | 61 KB | : 100% 1.0/1 [00:00<00:00, 12.19it/s]\n", - "python-dateutil-2.8. 
| 219 KB | : 100% 1.0/1 [00:00<00:00, 11.76it/s]\n", - "google-auth-1.6.3 | 45 KB | : 100% 1.0/1 [00:00<00:00, 11.44it/s]\n", - "gcsfs-0.3.0 | 19 KB | : 100% 1.0/1 [00:00<00:00, 15.31it/s]\n", - "tk-8.6.9 | 3.2 MB | : 100% 1.0/1 [00:00<00:00, 1.57it/s] \n", - "pytz-2019.2 | 228 KB | : 100% 1.0/1 [00:00<00:00, 4.04it/s] \n", - "pip-19.2.3 | 1.9 MB | : 100% 1.0/1 [00:00<00:00, 1.82it/s] \n", - "cachetools-2.1.0 | 10 KB | : 100% 1.0/1 [00:00<00:00, 15.55it/s]\n", - "zict-1.0.0 | 10 KB | : 100% 1.0/1 [00:00<00:00, 14.96it/s]\n", - "cloudpickle-1.2.1 | 22 KB | : 100% 1.0/1 [00:00<00:00, 14.84it/s]\n", - "dask-cudf-0.10.0a | 62 KB | : 100% 1.0/1 [00:01<00:00, 1.34s/it] \n", - "libcudf-0.10.0a | 26.0 MB | : 100% 1.0/1 [00:07<00:00, 7.09s/it] \n", - "pillow-6.1.0 | 634 KB | : 100% 1.0/1 [00:00<00:00, 4.42it/s] \n", - "libcumlprims-0.9.0 | 3.9 MB | : 100% 1.0/1 [00:02<00:00, 2.24s/it] \n", - "cytoolz-0.10.0 | 429 KB | : 100% 1.0/1 [00:00<00:00, 6.44it/s] \n", - "requests-oauthlib-1. | 19 KB | : 100% 1.0/1 [00:00<00:00, 14.79it/s]\n", - "six-1.12.0 | 22 KB | : 100% 1.0/1 [00:00<00:00, 13.72it/s]\n", - "bzip2-1.0.8 | 396 KB | : 100% 1.0/1 [00:00<00:00, 7.69it/s] \n", - "llvmlite-0.29.0 | 19.9 MB | : 100% 1.0/1 [00:03<00:00, 3.15s/it] \n", - "re2-2019.09.01 | 431 KB | : 100% 1.0/1 [00:00<00:00, 7.14it/s] \n", - "zstd-1.4.0 | 928 KB | : 100% 1.0/1 [00:00<00:00, 4.81it/s] \n", - "pycparser-2.19 | 173 KB | : 100% 1.0/1 [00:00<00:00, 9.89it/s]\n", - "urllib3-1.25.3 | 187 KB | : 100% 1.0/1 [00:00<00:00, 7.69it/s]\n", - "uriparser-0.9.3 | 49 KB | : 100% 1.0/1 [00:00<00:00, 10.99it/s]\n", - "gflags-2.2.2 | 177 KB | : 100% 1.0/1 [00:00<00:00, 9.88it/s]\n", - "libpng-1.6.37 | 343 KB | : 100% 1.0/1 [00:00<00:00, 8.51it/s] \n", - "certifi-2019.6.16 | 149 KB | : 100% 1.0/1 [00:00<00:00, 12.26it/s]\n", - "libcblas-3.8.0 | 10 KB | : 100% 1.0/1 [00:00<00:00, 15.55it/s]\n", - "_libgcc_mutex-0.1 | 3 KB | : 100% 1.0/1 [00:00<00:00, 18.61it/s]\n", - "psutil-5.6.3 | 322 KB | : 100% 1.0/1 
[00:00<00:00, 7.35it/s] \n", - "lz4-c-1.8.3 | 187 KB | : 100% 1.0/1 [00:00<00:00, 9.43it/s]\n", - "zlib-1.2.11 | 105 KB | : 100% 1.0/1 [00:00<00:00, 11.30it/s]\n", - "fsspec-0.4.4 | 39 KB | : 100% 1.0/1 [00:00<00:00, 13.92it/s]\n", - "thrift-cpp-0.12.0 | 2.4 MB | : 100% 1.0/1 [00:00<00:00, 2.11it/s] \n", - "double-conversion-3. | 85 KB | : 100% 1.0/1 [00:00<00:00, 11.97it/s]\n", - "heapdict-1.0.0 | 7 KB | : 100% 1.0/1 [00:00<00:00, 16.08it/s]\n", - "libffi-3.2.1 | 46 KB | : 100% 1.0/1 [00:00<00:00, 13.59it/s]\n", - "chardet-3.0.4 | 190 KB | : 100% 1.0/1 [00:00<00:00, 8.67it/s] \n", - "pynvml-8.0.3 | 30 KB | : 100% 1.0/1 [00:00<00:00, 2.88it/s] \n", - "bokeh-1.3.4 | 4.0 MB | : 100% 1.0/1 [00:01<00:00, 1.30s/it] \n", - "freetype-2.10.0 | 884 KB | : 100% 1.0/1 [00:00<00:00, 4.89it/s] \n", - "nvstrings-0.10.0a | 124 KB | : 100% 1.0/1 [00:01<00:00, 1.37s/it] \n", - "libxgboost-0.90.rapi | 32.8 MB | : 100% 1.0/1 [00:09<00:00, 9.67s/it] \n", - "pyasn1-0.4.6 | 52 KB | : 100% 1.0/1 [00:00<00:00, 12.03it/s]\n", - "brotli-1.0.7 | 1.0 MB | : 100% 1.0/1 [00:00<00:00, 4.92it/s] \n", - "setuptools-41.2.0 | 634 KB | : 100% 1.0/1 [00:00<00:00, 4.53it/s] \n", - "wheel-0.33.6 | 35 KB | : 100% 1.0/1 [00:00<00:00, 13.26it/s]\n", - "libgcc-ng-9.1.0 | 8.1 MB | : 100% 1.0/1 [00:01<00:00, 1.23s/it] \n", - "libcuml-0.10.0a | 29.3 MB | : 100% 1.0/1 [00:10<00:00, 10.22s/it] \n", - "dlpack-0.2 | 12 KB | : 100% 1.0/1 [00:00<00:00, 1.39it/s] \n", - "pandas-0.24.2 | 11.1 MB | : 100% 1.0/1 [00:02<00:00, 2.32s/it] \n", - "dask-cuml-0.8.0a | 30 KB | : 100% 1.0/1 [00:01<00:00, 1.14s/it] \n", - "sqlite-3.29.0 | 1.9 MB | : 100% 1.0/1 [00:00<00:00, 2.74it/s] \n", - "libgfortran-ng-7.3.0 | 1.3 MB | : 100% 1.0/1 [00:00<00:00, 3.61it/s] \n", - "toolz-0.10.0 | 46 KB | : 100% 1.0/1 [00:00<00:00, 11.64it/s]\n", - "asn1crypto-0.24.0 | 154 KB | : 100% 1.0/1 [00:00<00:00, 9.53it/s]\n", - "liblapack-3.8.0 | 10 KB | : 100% 1.0/1 [00:00<00:00, 15.48it/s]\n", - "packaging-19.0 | 23 KB | : 100% 1.0/1 [00:00<00:00, 
3.70it/s] \n", - "cryptography-2.7 | 607 KB | : 100% 1.0/1 [00:00<00:00, 3.62it/s] \n", - "olefile-0.46 | 31 KB | : 100% 1.0/1 [00:00<00:00, 15.17it/s]\n", - "libopenblas-0.3.7 | 7.6 MB | : 100% 1.0/1 [00:01<00:00, 1.29s/it] \n", - "libtiff-4.0.10 | 587 KB | : 100% 1.0/1 [00:00<00:00, 6.35it/s] \n", - "cffi-1.12.3 | 218 KB | : 100% 1.0/1 [00:00<00:00, 8.74it/s]\n", - "ncurses-6.1 | 1.3 MB | : 100% 1.0/1 [00:00<00:00, 1.19it/s] \n", - "rmm-0.10.0a | 14 KB | : 100% 1.0/1 [00:00<00:00, 1.98it/s] \n", - "libprotobuf-3.8.0 | 4.7 MB | : 100% 1.0/1 [00:01<00:00, 1.71s/it] \n", - "pyopenssl-19.0.0 | 81 KB | : 100% 1.0/1 [00:00<00:00, 12.69it/s]\n", - "libevent-2.1.10 | 1.3 MB | : 100% 1.0/1 [00:00<00:00, 2.66it/s] \n", - "librmm-0.10.0a | 44 KB | : 100% 1.0/1 [00:00<00:00, 1.99it/s] \n", - "scipy-1.3.1 | 18.1 MB | : 100% 1.0/1 [00:03<00:00, 3.18s/it] \n", - "readline-8.0 | 441 KB | : 100% 1.0/1 [00:00<00:00, 6.79it/s] \n", - "msgpack-python-0.6.1 | 89 KB | : 100% 1.0/1 [00:00<00:00, 13.43it/s]\n", - "requests-2.22.0 | 84 KB | : 100% 1.0/1 [00:00<00:00, 1.49it/s]\n", - "py-xgboost-0.90.rapi | 86 KB | : 100% 1.0/1 [00:00<00:00, 1.41it/s] \n", - "cuml-0.10.0a | 5.9 MB | : 100% 1.0/1 [00:02<00:00, 2.12s/it] \n", - "libblas-3.8.0 | 10 KB | : 100% 1.0/1 [00:00<00:00, 13.55it/s]\n", - "c-ares-1.15.0 | 100 KB | : 100% 1.0/1 [00:00<00:00, 12.24it/s]\n", - "glog-0.4.0 | 104 KB | : 100% 1.0/1 [00:00<00:00, 12.09it/s]\n", - "pyarrow-0.14.1 | 2.8 MB | : 100% 1.0/1 [00:00<00:00, 1.27it/s] \n", - "xz-5.2.4 | 366 KB | : 100% 1.0/1 [00:00<00:00, 7.05it/s] \n", - "arrow-cpp-0.14.1 | 17.3 MB | : 100% 1.0/1 [00:02<00:00, 2.84s/it] \n", - "icu-64.2 | 12.6 MB | : 100% 1.0/1 [00:01<00:00, 1.93s/it] \n", - "distributed-2.3.2 | 370 KB | : 100% 1.0/1 [00:00<00:00, 5.21it/s] \n", - "xgboost-0.90.rapidsd | 11 KB | : 100% 1.0/1 [00:01<00:00, 1.01s/it] \n", - "locket-0.2.0 | 6 KB | : 100% 1.0/1 [00:00<00:00, 15.02it/s]\n", - "snappy-1.1.7 | 39 KB | : 100% 1.0/1 [00:00<00:00, 14.68it/s]\n", - 
"pyjwt-1.7.1 | 17 KB | : 100% 1.0/1 [00:00<00:00, 13.20it/s]\n", - "libstdcxx-ng-9.1.0 | 4.0 MB | : 100% 1.0/1 [00:00<00:00, 1.61it/s] \n", - "pysocks-1.7.0 | 26 KB | : 100% 1.0/1 [00:00<00:00, 15.49it/s]\n", - "dask-2.3.0 | 4 KB | : 100% 1.0/1 [00:00<00:00, 15.11it/s]\n", - "sortedcontainers-2.1 | 25 KB | : 100% 1.0/1 [00:00<00:00, 14.59it/s]\n", - "parquet-cpp-1.5.1 | 3 KB | : 100% 1.0/1 [00:00<00:00, 14.74it/s]\n", - "nccl-2.4.6.1 | 66.6 MB | : 100% 1.0/1 [00:12<00:00, 12.58s/it] \n", - "google-auth-oauthlib | 18 KB | : 100% 1.0/1 [00:00<00:00, 13.18it/s]\n", - "cugraph-0.10.0a | 1.3 MB | : 100% 1.0/1 [00:10<00:00, 3.84s/it] \n", - "libcugraph-0.10.0a | 11.3 MB | : 100% 1.0/1 [00:18<00:00, 18.20s/it] \n", - "python-3.6.7 | 34.6 MB | : 100% 1.0/1 [00:05<00:00, 5.00s/it] \n", - "openssl-1.1.1c | 2.1 MB | : 100% 1.0/1 [00:00<00:00, 2.56it/s] \n", - "tornado-6.0.3 | 636 KB | : 100% 1.0/1 [00:00<00:00, 4.46it/s] \n", - "partd-1.0.0 | 16 KB | : 100% 1.0/1 [00:00<00:00, 13.42it/s]\n", - "markupsafe-1.1.1 | 26 KB | : 100% 1.0/1 [00:00<00:00, 14.08it/s]\n", - "fastavro-0.22.4 | 405 KB | : 100% 1.0/1 [00:00<00:00, 7.21it/s] \n", - "cython-0.29.13 | 2.2 MB | : 100% 1.0/1 [00:00<00:00, 1.86it/s] \n", - "rsa-3.4.2 | 31 KB | : 100% 1.0/1 [00:00<00:00, 13.62it/s]\n", - "pyyaml-5.1.2 | 184 KB | : 100% 1.0/1 [00:00<00:00, 10.32it/s]\n", - "scikit-learn-0.21.3 | 6.7 MB | : 100% 1.0/1 [00:01<00:00, 1.44s/it] \n", - "decorator-4.4.0 | 11 KB | : 100% 1.0/1 [00:00<00:00, 15.90it/s]\n", - "oauthlib-3.0.1 | 82 KB | : 100% 1.0/1 [00:00<00:00, 9.79it/s]\n", - "pyparsing-2.4.2 | 57 KB | : 100% 1.0/1 [00:00<00:00, 13.40it/s]\n", - "tblib-1.4.0 | 12 KB | : 100% 1.0/1 [00:00<00:00, 15.25it/s]\n", - "cudatoolkit-10.0.130 | 380.0 MB | : 100% 1.0/1 [00:46<00:00, 46.86s/it] \n", - "libnvstrings-0.10.0a | 24.8 MB | : 100% 1.0/1 [00:07<00:00, 7.82s/it] \n", - "Copying shared object files to /usr/lib\n", - "\n", - "*********************************************\n", - "Your Google Colab instance is 
RAPIDS ready!\n", - "*********************************************\n" - ], - "name": "stdout" - } - ] + "execution_count": 0, + "outputs": [] }, { "cell_type": "markdown", @@ -386,18 +133,15 @@ "metadata": { "id": "x1dLRTm168Tk", "colab_type": "code", - "outputId": "406a519a-e019-46cf-f0bf-7bba3dd2bb79", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - } + "colab": {} }, "source": [ + "# 5b4ecdb3cb122fb692a8349124960424\n", "# Info on how to get your api key (kaggle.json) here: https://github.com/Kaggle/kaggle-api#api-credentials\n", "!pip install kaggle\n", "!mkdir /root/.kaggle\n", "# plug api -- get your own API key\n", - "!echo '{\"username\":\"warobson\",\"key\":\"5b4ecdb3cb122fb692a8349124960424\"}' > /root/.kaggle/kaggle.json\n", + "!echo '{\"username\":\"warobson\",\"key\":\"\"}' > /root/.kaggle/kaggle.json\n", "!chmod 600 /root/.kaggle/kaggle.json\n", "# !kaggle datasets download\n", "!kaggle competitions download -c zillow-prize-1\n", @@ -409,87 +153,8 @@ "!unzip -q \"/content/train_2017.csv.zip\"\n", "!unzip -q \"/content/properties_2017.csv.zip\"" ], - "execution_count": 3, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Collecting kaggle\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e9/fc/0de659ea1f2096563204925b6660ae141f3d85bbe9e8a1571c3eb6cc1fdd/kaggle-1.5.5.tar.gz (56kB)\n", - "\u001b[K |████████████████████████████████| 61kB 31.1MB/s \n", - "\u001b[?25hCollecting urllib3<1.25,>=1.21.1 (from kaggle)\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/01/11/525b02e4acc0c747de8b6ccdab376331597c569c42ea66ab0a1dbd36eca2/urllib3-1.24.3-py2.py3-none-any.whl (118kB)\n", - "\u001b[K |████████████████████████████████| 122kB 35.9MB/s \n", - "\u001b[?25hRequirement already satisfied: six>=1.10 in /usr/local/lib/python3.6/site-packages (from kaggle) (1.12.0)\n", - "Requirement already satisfied: certifi in /usr/local/lib/python3.6/site-packages (from kaggle) (2019.6.16)\n", - 
"Requirement already satisfied: python-dateutil in /usr/local/lib/python3.6/site-packages (from kaggle) (2.8.0)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.6/site-packages (from kaggle) (2.22.0)\n", - "Collecting tqdm (from kaggle)\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/dc/88/d3213e2f3492daf09d8b41631ad6899f56db17ce83ea9c8a579902bafe5e/tqdm-4.35.0-py2.py3-none-any.whl (50kB)\n", - "\u001b[K |████████████████████████████████| 51kB 29.7MB/s \n", - "\u001b[?25hCollecting python-slugify (from kaggle)\n", - " Downloading https://files.pythonhosted.org/packages/a2/5d/bd30413c00bbed3945558aca07c55944073e1e30abeee1f06515281f9811/python-slugify-3.0.3.tar.gz\n", - "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/site-packages (from requests->kaggle) (2.8)\n", - "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/site-packages (from requests->kaggle) (3.0.4)\n", - "Collecting text-unidecode==1.2 (from python-slugify->kaggle)\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/79/42/d717cc2b4520fb09e45b344b1b0b4e81aa672001dd128c180fabc655c341/text_unidecode-1.2-py2.py3-none-any.whl (77kB)\n", - "\u001b[K |████████████████████████████████| 81kB 32.1MB/s \n", - "\u001b[?25hBuilding wheels for collected packages: kaggle, python-slugify\n", - " Building wheel for kaggle (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for kaggle: filename=kaggle-1.5.5-cp36-none-any.whl size=71896 sha256=d9815b0d9eae6d3594e8dc1a57a33174b1dbe24f623e1d688a92a2588f4e1be0\n", - " Stored in directory: /root/.cache/pip/wheels/db/6a/80/6cd1892eb9b9b136333db3c74e16cba4e17e2c700f51541f06\n", - " Building wheel for python-slugify (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for python-slugify: filename=python_slugify-3.0.3-py2.py3-none-any.whl size=4789 sha256=ccca227a48fbd1c2f5ba45701b0b52f1a12b7d1484ba459889128b3712c17b88\n", - " Stored in directory: /root/.cache/pip/wheels/0f/96/ca/85f5b01165975402d1e37f8dd346df00dc39be1d0761bd17bb\n", - "Successfully built kaggle python-slugify\n", - "Installing collected packages: urllib3, tqdm, text-unidecode, python-slugify, kaggle\n", - " Found existing installation: urllib3 1.25.3\n", - " Uninstalling urllib3-1.25.3:\n", - " Successfully uninstalled urllib3-1.25.3\n", - "Successfully installed kaggle-1.5.5 python-slugify-3.0.3 text-unidecode-1.2 tqdm-4.35.0 urllib3-1.24.3\n" - ], - "name": "stdout" - }, - { - "output_type": "display_data", - "data": { - "application/vnd.colab-display-data+json": { - "pip_warning": { - "packages": [ - "urllib3" - ] - } - } - }, - "metadata": { - "tags": [] - } - }, - { - "output_type": "stream", - "text": [ - "Downloading sample_submission.csv.zip to /content\n", - " 51% 5.00M/9.86M [00:00<00:00, 15.9MB/s]\n", - "100% 9.86M/9.86M [00:00<00:00, 29.2MB/s]\n", - "Downloading properties_2016.csv.zip to /content\n", - " 91% 145M/159M [00:02<00:00, 54.3MB/s]\n", - "100% 159M/159M [00:02<00:00, 59.1MB/s]\n", - "Downloading zillow_data_dictionary.xlsx.zip to /content\n", - " 0% 0.00/15.7k [00:00 0) & (df_train.garage_sqft == 0)\n", "print(df_train.loc[conditions][garage].head())" ], - "execution_count": 18, + "execution_count": 166, "outputs": [ { "output_type": "stream", @@ -1125,9 +797,9 @@ " garagecarcnt garage_sqft\n", "16 2.0 0.0\n", "29 1.0 0.0\n", - "36 2.0 0.0\n", - "54 2.0 0.0\n", - "65 1.0 0.0\n" + "32 2.0 0.0\n", + "49 1.0 0.0\n", + "52 2.0 0.0\n" ], "name": "stdout" } @@ -1244,7 +916,7 @@ "metadata": { "id": "yHZH4rMNLfBA", "colab_type": "code", - "outputId": "53844d43-16cb-41f2-8684-5268848f1476", + "outputId": "97106bb4-10f2-49a9-f821-03a3972db136", "colab": { "base_uri": "https://localhost:8080/", 
"height": 208 @@ -1303,7 +975,7 @@ "# let's see how out unit counts look\n", "print(df_train.unitcnt.value_counts())" ], - "execution_count": 22, + "execution_count": 170, "outputs": [ { "output_type": "stream", @@ -1396,17 +1068,17 @@ "metadata": { "id": "8lYcO_T5XKNN", "colab_type": "code", - "outputId": "e7ff645b-ac87-4039-d135-db8fd49855da", "colab": { "base_uri": "https://localhost:8080/", "height": 311 - } + }, + "outputId": "596cfad3-890d-4241-b8b8-347673082a7f" }, "source": [ "# how we'd normally take care of this\n", "df_train['taxdelinquencyflag'].fillna(0)" ], - "execution_count": 0, + "execution_count": 172, "outputs": [ { "output_type": "error", @@ -1415,9 +1087,9 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'taxdelinquencyflag'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/series.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, value, method, axis, inplace, limit)\u001b[0m\n\u001b[1;32m 1141\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mNotImplementedError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"The axis keyword is not supported\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1142\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1143\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0minplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minplace\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1144\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1145\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/dataframe/string.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, fill_value, inplace)\u001b[0m\n\u001b[1;32m 717\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfill_value\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStringColumn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 718\u001b[0m ):\n\u001b[0;32m--> 719\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"fill_value must be a string or a string series\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 720\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 721\u001b[0m \u001b[0;31m# replace fill_value with nvstrings\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'taxdelinquencyflag'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/core/series.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, value, method, axis, inplace, limit)\u001b[0m\n\u001b[1;32m 1165\u001b[0m \u001b[0;32mraise\u001b[0m 
\u001b[0mNotImplementedError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"The axis keyword is not supported\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1166\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1167\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minplace\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1168\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1169\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/core/column/string.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, fill_value, inplace)\u001b[0m\n\u001b[1;32m 720\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfill_value\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStringColumn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 721\u001b[0m ):\n\u001b[0;32m--> 722\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"fill_value must be a string or a string series\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 723\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 724\u001b[0m \u001b[0;31m# replace fill_value with nvstrings\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mTypeError\u001b[0m: fill_value must be a string or a string series" ] } @@ -1449,7 +1121,7 @@ "metadata": 
{ "id": "Svp6J0cJ5dL0", "colab_type": "code", - "outputId": "fd9373e8-8a1f-45a3-a7bd-387f0e678d2c", + "outputId": "03862711-e104-4954-bf9c-61bd51b3a9e3", "colab": { "base_uri": "https://localhost:8080/", "height": 69 @@ -1471,7 +1143,7 @@ "# display values in tax delinquency flag column\n", "print(df_train['taxdelinquencyflag'].value_counts())" ], - "execution_count": 24, + "execution_count": 173, "outputs": [ { "output_type": "stream", @@ -1501,7 +1173,7 @@ "metadata": { "id": "lHh95mAIMrMy", "colab_type": "code", - "outputId": "37c584b1-76a4-4df7-ca71-23a9d50f165a", + "outputId": "832c405d-d89f-4b85-d77d-7a6726a61907", "colab": { "base_uri": "https://localhost:8080/", "height": 69 @@ -1510,7 +1182,7 @@ "source": [ "print(df_train.taxdelinquencyflag.value_counts())" ], - "execution_count": 25, + "execution_count": 174, "outputs": [ { "output_type": "stream", @@ -1528,7 +1200,7 @@ "metadata": { "id": "6Bic66I9LfGC", "colab_type": "code", - "outputId": "af959592-7a80-42ee-8fda-3e18d6b9c514", + "outputId": "baaa5387-bbd7-4242-a336-0b6b90606935", "colab": { "base_uri": "https://localhost:8080/", "height": 243 @@ -1552,7 +1224,7 @@ "# what've we got? 
\n", "print(df_train.taxdelinquencyyear.value_counts())" ], - "execution_count": 26, + "execution_count": 175, "outputs": [ { "output_type": "stream", @@ -1597,8 +1269,8 @@ }, "source": [ "# make a copy of dataframe at this point\n", - "# safe = df_train.copy()\n", - "df_train = safe.copy()" + "# pre_string = df_train.copy()\n", + "df_train = pre_string.copy()" ], "execution_count": 0, "outputs": [] @@ -1608,7 +1280,7 @@ "metadata": { "id": "Sg0eN-K1QdZy", "colab_type": "code", - "outputId": "3cdd2ef6-fb68-46bd-a611-5dd3d9ff45d2", + "outputId": "a90de47f-5c88-4834-df44-75a9dedcd07c", "colab": { "base_uri": "https://localhost:8080/", "height": 489 @@ -1671,7 +1343,7 @@ "\"\"\"\n", "print(df_train[['census_tractnumber', 'block_number']].head())" ], - "execution_count": 29, + "execution_count": 177, "outputs": [ { "output_type": "stream", @@ -1726,7 +1398,7 @@ "metadata": { "id": "xhCosNpXvTVU", "colab_type": "code", - "outputId": "3f70a009-a211-46fd-ecc9-731be3d15fe1", + "outputId": "2d969756-decb-4912-94f6-19836eb0323a", "colab": { "base_uri": "https://localhost:8080/", "height": 86 @@ -1749,7 +1421,7 @@ "# drop columns with more than 95% null values\n", "df_train = df_train.drop(missingvaluescols['field'], axis=1)" ], - "execution_count": 30, + "execution_count": 178, "outputs": [ { "output_type": "stream", @@ -1792,7 +1464,7 @@ "metadata": { "id": "yB2lzAyopS_S", "colab_type": "code", - "outputId": "06922e76-c61b-4212-afc5-4f3a51eaaa09", + "outputId": "db6c7add-5452-4535-8948-a426654851b7", "colab": { "base_uri": "https://localhost:8080/", "height": 225 @@ -1804,7 +1476,7 @@ "# let's see what we've got\n", "print(df_train['unitcnt'].value_counts())" ], - "execution_count": 31, + "execution_count": 179, "outputs": [ { "output_type": "stream", @@ -1843,7 +1515,7 @@ "metadata": { "id": "-icFDeLSoJwl", "colab_type": "code", - "outputId": "03d9a89b-6e21-4bba-ae75-aa229c744dcf", + "outputId": "b1ed39c3-3a14-4dc1-eb48-b3429da5cffe", "colab": { "base_uri": 
"https://localhost:8080/", "height": 34 @@ -1862,7 +1534,7 @@ "\n", "print(df_train.pool_sqft.isna().sum())" ], - "execution_count": 32, + "execution_count": 180, "outputs": [ { "output_type": "stream", @@ -1890,10 +1562,10 @@ "metadata": { "id": "3pVABkZTYK9F", "colab_type": "code", - "outputId": "e926b021-7cbb-4acd-96c8-2a15fbe1afd7", + "outputId": "b5cb7ced-7458-4971-936c-b6e5d33bc126", "colab": { "base_uri": "https://localhost:8080/", - "height": 86 + "height": 173 } }, "source": [ @@ -1902,6 +1574,11 @@ "#land_tax\n", "#total_property_tax_2016\n", "#2)recalculate total_parcel_tax = structure_tax + land_tax\n", + "print(df_train.total_property_tax_2016.isnull().sum())\n", + "print(df_train.structure_tax.isnull().sum())\n", + "print(df_train.total_parcel_tax.isnull().sum())\n", + "print(df_train.land_tax.isnull().sum())\n", + "print()\n", "\n", "# total_parcel_tax =structure_tax + land_tax\n", "#->structure_tax=total_parcel_tax -land_tax\n", @@ -1920,11 +1597,16 @@ "print(df_train.total_parcel_tax.isnull().sum())\n", "print(df_train.land_tax.isnull().sum())" ], - "execution_count": 33, + "execution_count": 181, "outputs": [ { "output_type": "stream", "text": [ + "6\n", + "380\n", + "1\n", + "1\n", + "\n", "6\n", "380\n", "1\n", @@ -1939,7 +1621,7 @@ "metadata": { "id": "8SID48LOpYvu", "colab_type": "code", - "outputId": "842c0ccb-1710-4e73-f85a-599d5b27988b", + "outputId": "6d20a3ba-4360-4554-908d-f6d673aece12", "colab": { "base_uri": "https://localhost:8080/", "height": 34 @@ -1950,7 +1632,7 @@ "df_train = df_train.drop(['regionidcounty'], axis=1)\n", "df_train.shape" ], - "execution_count": 34, + "execution_count": 182, "outputs": [ { "output_type": "execute_result", @@ -1962,7 +1644,7 @@ "metadata": { "tags": [] }, - "execution_count": 34 + "execution_count": 182 } ] }, @@ -1971,7 +1653,7 @@ "metadata": { "id": "tWmM2J8_pkg1", "colab_type": "code", - "outputId": "e544a196-1e32-4d18-a6b5-98c49a57589e", + "outputId": "6362e07f-e363-4884-b0c5-9380b5fee956", 
"colab": { "base_uri": "https://localhost:8080/", "height": 34 @@ -1984,7 +1666,7 @@ "df_train['bedroomcnt'].loc[df_train['bedroomcnt'] == 0] = np.nan\n", "print(df_train.bedroomcnt.isnull().sum())" ], - "execution_count": 35, + "execution_count": 183, "outputs": [ { "output_type": "stream", @@ -2000,10 +1682,10 @@ "metadata": { "id": "3qnP2L9LpmeJ", "colab_type": "code", - "outputId": "2e863b26-9267-45f2-f31f-3305f5577ce3", + "outputId": "c0eabce4-3232-4435-8733-779526f18c57", "colab": { "base_uri": "https://localhost:8080/", - "height": 104 + "height": 208 } }, "source": [ @@ -2014,6 +1696,12 @@ "# bedroomcnt 1421\n", "# roomcnt 1416\n", "\n", + "print(df_train.total_bath.isna().sum())\n", + "print(df_train.full_bath.isnull().sum())\n", + "print(df_train.half_bath.isnull().sum())\n", + "print(df_train.bedroomcnt.isnull().sum())\n", + "print(df_train.roomcnt.isnull().sum())\n", + "print()\n", "\n", "# roomcnt = (full_bath + half_bath) + bedroomcnt\n", "# total_bath = fullbath+ 0.5(half_bath)\n", @@ -2040,11 +1728,17 @@ "print(df_train.bedroomcnt.isnull().sum())\n", "print(df_train.roomcnt.isnull().sum())" ], - "execution_count": 36, + "execution_count": 184, "outputs": [ { "output_type": "stream", "text": [ + "1165\n", + "1182\n", + "1182\n", + "1421\n", + "69700\n", + "\n", "1165\n", "1182\n", "1182\n", @@ -2074,7 +1768,7 @@ "metadata": { "id": "IW4CG2InpolD", "colab_type": "code", - "outputId": "288444a4-d153-4624-c961-b3956092d87e", + "outputId": "02375307-54e2-432b-8b87-1397c73d56b2", "colab": { "base_uri": "https://localhost:8080/", "height": 260 @@ -2129,7 +1823,7 @@ "print(f'AFTER\\n{df_train.numberofstories.value_counts()}\\n'\n", " f'{df_train.numberofstories.isnull().sum()} remaining null values')" ], - "execution_count": 37, + "execution_count": 185, "outputs": [ { "output_type": "stream", @@ -2158,7 +1852,7 @@ "metadata": { "id": "AHcMsDCxprd4", "colab_type": "code", - "outputId": "516954f4-d1d9-4876-e3a4-4545d865d9f6", + "outputId": 
"30481b2c-e035-4478-d62f-63e10a09c17e", "colab": { "base_uri": "https://localhost:8080/", "height": 295 @@ -2190,7 +1884,7 @@ "print(f'AFTER\\n{df_train.fireplace_count.value_counts()}\\n'\n", " f'{df_train.fireplace_count.isnull().sum()} remaining null values')" ], - "execution_count": 38, + "execution_count": 186, "outputs": [ { "output_type": "stream", @@ -2221,7 +1915,7 @@ "metadata": { "id": "FIuSWoJspt3H", "colab_type": "code", - "outputId": "d8b6ef02-d214-4530-ce3a-c6efc5bd01cf", + "outputId": "cb11c3a1-1658-4bce-cbde-a1a47ccdc0a8", "colab": { "base_uri": "https://localhost:8080/", "height": 317 @@ -2241,7 +1935,7 @@ "# display the graph\n", "plt.show()" ], - "execution_count": 41, + "execution_count": 187, "outputs": [ { "output_type": "display_data", @@ -2262,7 +1956,7 @@ "metadata": { "id": "KOHPCFRSp5y9", "colab_type": "code", - "outputId": "471d6f7c-607a-4520-d219-3ab56500c004", + "outputId": "e0f3fe2e-a82a-49e8-a798-a3f79a30bcee", "colab": { "base_uri": "https://localhost:8080/", "height": 274 @@ -2274,12 +1968,12 @@ "# display the graph\n", "plt.show()" ], - "execution_count": 42, + "execution_count": 188, "outputs": [ { "output_type": "display_data", "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAX8AAAEBCAYAAACQbKXWAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAGxxJREFUeJzt3X1wFPXBB/Dv3p0XEuCa3CVAeLVO\nDY1TkTYOYVILEpDQCtiGmWJ5iQ4obS0WWhlFaosVBE+Uygga7EOLqAP/JEMr9CnY4cUChUJ9oXEo\noSFAZiAvdwlPCJLEu/09f8Cdebm73Mve7W5+389fuHv72+/u7X3vsrfeKkIIASIikopF7wBERJR6\nLH8iIgmx/ImIJMTyJyKSEMufiEhCLH8iIgmx/ImIJMTyJyKSEMufiEhCLH8iIgmx/ImIJMTyJyKS\nEMufiEhCNr0DtLRch6oKuFyD4PW26R0nLsyuD2ZPPbPmBvpPdotFQVbWwITH1L38VVVAVUXw32bF\n7Ppg9tQza26A2bviaR8iIgmx/ImIJMTyJyKSUFTl73a7UVxcjLFjx6K6uhoA0NLSgscffxwlJSWY\nNWsWli5diubm5qSGJSIibUT1he/UqVNRVlaG+fPnB6cpioLHHnsMhYWFAG6+QbzyyitYt25dcpJS\n0rUePwZPZQV8zV7AYgFUFTanC9mlc+CYWNT7MQEWCxyTJmPYgkcijtlzrHizBcbJmVUS97YmItpt\nCrmvbqkOMW5grKtH/472M2dCzutrPTanCxnjxuHz06e75QMQ9rntOi8w7fP/nkPrh4cBVe31/Na/\n+/aX83pSFKDnbcFvrU8ZOBCKokBtawtOi5nFggFjx8LX0HhzW9LSgM7ObusMtx/rx92Nzi98vfat\nJqxWwO/vljPca8IolFhu4F5cXIzy8nLk5eX1mrdv3z7s3LkT27dvjymA19sGVRXIyRmMpqZrMS1r\nFP0he+vxY2jYsR2is7PXYxS7HUPLHgWAsI8BAMf9U7od7KHGDIwVyxtAuHG+tvSnUO76ZtTjaCHa\nbYq0PyMKVZ6JrsdqBaAAfl9088JkcNw/BenpdjT8774oN0ZHEfZjKvV8TcSra8dYLApcrkEJj6nJ\nOX9VVbFz504UFxdrMRzpwFNZEbZARGcnPJUVER8D4OanwT7GDIyVaDbR2YlL77wX0zhaiHab+tpX\nYUUorLjX4/eHLv5w88JkaP3wMBr2fRB5XUZhgOIHer8mjEST6/zXrFmDjIwMLFiwIOZlu76D5eQM\n1iKOLsyevbol8vc1vj7mAwBUtdt+CDemr6U5pv0VbpwOjzfl+z3abeprf8YrVesJKZ7TNLLr8ZpI\nhNbHesLl73a7cfHiRZSXl8Niif0PCZ720Vcguy3LGfLcdIAtywkAER8Di6Xbfgg3pi3LGdP+CjdO\nWrYr5fs92m3qa38msv5UrCekwOubbwLR6/GaiJfhTvts3LgRVVVV2LJlC+x2e8JhSD/ZpXOghHkO\nFbsd2aVzIj4GAByTJvc5ZmCsRLMpdjtGL5wfZonkiXab+tpXYSlK+FnxrsdqBaxhPueFmhcmg2PS\nZAwteSDyuowiwn5MpZ6vCSOxPv/888/39aC1a9fiueeeQ2NjI/bt24fKykpMmDABTz31FG677Tbs\n3r0bu3btwpEjR/Dggw/GFODGjU4IAQwcmIbPP4/jHKkB9IfsaSNH4TaXC+0XLkC9cePmpzwhYHO6\nMOTheXBMLOr9mACLBY7J9/f6Yqvn47uOFYtw44yaPjXl+z3abQq7ryKwOV0YMn8BfDc+h8/j6T0v\nivXYnC4MKpwIf+u1L/PNm49B3/xm6Oe2x7xABovDgY5Ll26eO+/y/I6aXIT/q2/6cl5PoUr31vqU\ngQNhSUu7+R3FrWkxs1gw4OtfB1Rxc1vS0nr9JRJuPzrG3Q3F6
eq1bzVhtXbfnjCviXh17RhFUZCR\nkfiH7Ziu9kkGnvbRF7Prw6zZzZob6D/ZDXHah4iIzInlT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0Qk\nIZY/EZGEWP5ERBJi+RMRSYjlT0QkIU1+0pmIKFZa3eWN4sPyJ6KU63kHMl+zFw07tgMA3wBShKd9\niCjltLrLG8WP5U9EKRfuBjQpuzENsfyJKPVsTldM00l7LH8iSjmt7vJG8eMXvkSUcoEvdXm1j35Y\n/kSkC8fEIpa9jnjah4hIQix/IiIJsfyJiCTE8icikhDLn4hIQix/IiIJsfyJiCTE8icikhDLn4hI\nQn2Wv9vtRnFxMcaOHYvq6urg9NraWsydOxclJSWYO3cuLly4kMycRESkoT5/3mHq1KkoKyvD/Pnz\nu01fvXo15s2bh4ceegh/+tOf8Jvf/AY7duxIWlC91b/7Nlo/PAyoKmCxwDFpMjK+dic8lRWobvYC\nFgugqkn9jZKudz4KUhTAbgc6OqLK0HOM6l6PSJ1Azvq3/wh88UXMy4fLPiA/H6OfeqbX9EuvutF+\n5syXE9LTgRs3Yl6vFvTc74kwa27AYNnT05H3+pu6RlCEECKaBxYXF6O8vBx5eXnwer0oKSnBiRMn\nYLVa4ff7UVhYiP3798PpdMYUwOttg6oK5OQMRlPTtbg2Itnq330brYcO9p6hKECI3afY7Rha9qim\nbwA973zUl1AZYh3DzHq+AfQqfiK9xfAG0LUfLRYFLteghFcf1zn/K1euYOjQobBarQAAq9WKIUOG\n4MqVKwkHMqLWDw+HnhHmfTMZdyQKdeejSEJliHUMM+tZ9Cx+Mhyd/uoM0P1XPbu+g+XkDNYxSXjV\nqhrzMr6WZk23p7qlOeEM8YxhZt22XcccROHE0hFa92Nc5Z+bm4uGhgb4/f7gaZ/Gxkbk5ubGPJYZ\nTvsEzqXHwpbl1HR7bFnOmG9x1zNDPGOYmWGPJ6Jboj1GDXPax+VyIT8/H3v27AEA7NmzB/n5+TGf\n7zcLx6TJoWcoSujJSbgjUag7H0USKkOsY5jZgPz8iP9NpLv0dF1X3+cXvmvXrsX+/fvh8XiQlZWF\nzMxM7N27FzU1NVi5ciVaW1vhcDjgdrtxxx13xBzAFJ/8EflqH5+Jr/bRU6JX+4Rjhqt9SHIxXu2T\njE/+UV/tkyxmKf9ImF0fzJ56Zs0N9J/sup72ISIic2P5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJ\niOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kRE\nEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhW6ID\nHDx4EJs2bYIQAkIILF26FNOnT9ciGxERJUlC5S+EwNNPP4333nsPeXl5+M9//oMf/ehHmDZtGiwW\n/lEhi9bjx+CprICv2Qub04Xs0jlwTCzSO1Zc+tO2mEl/2u+htgWA4bYv4U/+FosF165dAwBcu3YN\nQ4YMYfFLpPX4MTTs2A7R2QkA8DV70bBjOwDofnDHqj9ti5n0p/0ealvq//gHAALw+4PTjLB9CbW0\noih47bXX8MQTT2DKlCn42c9+BrfbrVU2MgFPZUXwQA8QnZ3wVFbolCh+/WlbzKQ/7fdQ2wK/L1j8\nAUbYvoQ++ft8PmzduhVvvPEGCgoK8K9//QvLly/H3r17MXDgwKjGcLkGBf+dkzM4kTi6kjV7dUtz\nyOm+luaU7BMt15HqbTHrMaN17lTu92Tv83DbEkqs26d19oTK/8yZM2hsbERBQQEAoKCgAOnp6aip\nqcG4ceOiGsPrbYOqCuTkDEZT07VE4uhG5uy2LCd8zd6Q05O9T7Te76ncFrMeM8nInar9nop9Hm5b\nwj022jxds1ssSrcPzfFK6
LTPsGHDUF9fj/PnzwMAampq4PV6MXr06ISDkTlkl86BYrd3m6bY7cEv\nucykP22LmfSn/R5qW2C1AVZrt0lG2L6EPvnn5OTg+eefx7Jly6AoCgBg3bp1yMzM1CQcGV/gCyuj\nXckQj/60LWbSn/Z7uG0JNU3v7VOEEELPADztoy9m14dZs5s1N9B/shvitA8REZkTy5+ISEIsfyIi\nCbH8iYgkxPInIpIQy5+ISEIsfyIiCbH8iYgkxPInIpIQy5+ISEIJ38wl1aqf/Clw40a3aYHfyvj8\nv+fQ+uFhQFUBiwWOSZPReuhg8jMlfQ3Jw+z6MGt2s+YGdMyuKIAQgMUCqCp/2ycglt/2CVX8QYEd\nTERkcIrdjqFlj0b9BsDf9glX/ACLn4hMwwh38jJX+RMR9RPR3vQlWVj+REQ6sDlduq7fXOWfnh5+\n3q2byRARGZ0R7uRlqvLPe/3NkG8ANqcLwxY/Dsf9U25+ow7cvNrn/ikpTkhE1EPgg+mtbrI5XTF9\n2Zssprrax6iYXR/MnnpmzQ30n+xyXu1DRESaYPkTEUmI5U9EJCGWPxGRhFj+REQSYvkTEUmI5U9E\nJCGWPxGRhFj+REQSSvhmLh0dHVi3bh3+8Y9/IC0tDePHj8eaNWu0yEZEREmScPlv2LABaWlp2Ldv\nHxRFgcfj0SIXEZHptR4/Bk9lBXzNXsPcwSsgofK/fv06du/ejcOHD0O59eNF2dnZmgQjIjKz1uPH\n0LBjO0RnJ4Cbv9/fsGM7ABjiDSChc/51dXXIzMzE5s2bUVpaioULF+LUqVNaZSMiMi1PZUWw+AOM\ncAevgIQ++fv9ftTV1eGuu+7CM888g08//RQ/+clP8MEHH2DQoOh+da7rr9Pl5AxOJI6umF0fzJ56\nZs0NpDZ7dUtzyOm+lua4cmidPaHyz83Nhc1mw8yZMwEA99xzD7KyslBbW4u77747qjH4k876YnZ9\nmDW7WXMDqc9uy3KGvFWjLcsZcw7D/aSz0+lEYWEhjh49CgCora2F1+vFmDFjEg5GRGRm2aVzoNjt\n3aYZ4Q5eAQlf7fPb3/4Wq1atgtvths1mw8svvwyHw6FFNiIi0wp8qdsvr/YBgFGjRuGdd97RIgsR\nUb/imFhkmLLvif+HLxGRhFj+REQSYvkTEUmI5U9EJCGWPxGRhFj+REQSYvkTEUmI5U9EJCGWPxGR\nhFj+REQSYvkTEUmI5U9EJCGWPxGRhFj+REQSYvkTEUmI5U9EJCGWPxGRhFj+REQSYvkTEUmI5U9E\nJCGWPxGRhFj+REQSYvkTEUmI5U9EJCGWPxGRhFj+REQSYvkTEUlIs/LfvHkzxo4di+rqaq2GJCKi\nJLFpMchnn32GTz75BCNGjNBiuJi1Hj8GT2UFfM1e2JwuZJfOgWNiUZ/z4h3z0qtutJ85E3xst7c7\nRQFuuw3o7AQsFkBVg8sDCI4ZmKc3M79VM3vqmTU3YNzsef+zXZf1Jlz+nZ2deOGFF/Dqq6+irKxM\ni0wxaT1+DA07tkN0dgIAfM1eNOzYHpwfbl6kN4BIY149+vduxd+LEDeLHwiWu6/Zi/o//gGAAPz+\nbvOISG7Vjz2qyxtAwuW/adMmzJ49GyNHjtQiT8w8lRXBkg4QnZ3wVFYE/x1qXqTyjzSmr9kbX1C/\nL77liIiSIKHy//jjj1FVVYUVK1bEPYbLNSj475ycwTEvX93SHHK6L8z0wLxI64pnTCKieEXTffH0\nYyQJlf/JkydRU1ODqVOnAgDq6+uxePFirF+/Hvfdd19UY3i9bVBVgZycwWhquhZzBluWM+SncVuW\nEwDCzou0rkhjxv3Jn4gojL66r2s/WixKtw/N8Uroap8lS5bgyJEjOHDgAA4cOIBhw4Zh27ZtURe/\nFrJL50Cx27tNU+x2ZJfOiTgv3jEH5OfHF9RqA6zW+JYlItKYJlf76Clw7j7SFT2xXu0TaUz
HxKJe\nV/t0Y7KrfYhIX3pd7aMIIYQua74l0dM+RsDs+mD21DNrbqD/ZDfEaR8iIjInlj8RkYRY/kREEmL5\nExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY\n/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQh\nlj8RkYRY/kREErIlsnBLSwuefvppXLp0CXa7HWPGjMELL7wAp9OpVT4iIkqChD75K4qCxx57DPv2\n7cP777+PUaNG4ZVXXtEqGxERJUlC5Z+ZmYnCwsLgf48fPx6XL19OOBQRESWXIoQQWgykqioWLVqE\n4uJilJWVaTEkERElSULn/Ltas2YNMjIysGDBgpiW83rboKoCOTmD0dR0Tas4KcXs+mD21DNrbqD/\nZLdYFLhcgxIeU5Pyd7vduHjxIsrLy2Gx8AIiIiKjS7j8N27ciKqqKrz11luw2+1aZCIioiRLqPzP\nnTuHrVu34vbbb8fDDz8MABg5ciS2bNmiSTgiIkqOhMr/zjvvxNmzZ7XKQkREKcIT9EREEmL5ExFJ\niOVPRCQhlj8RkYRY/kREEmL5ExFJiOVPRCQhlj8RkYRY/kREEtLsVz3pptbjx+CprICv2Qub04Xs\n0jlwTCyKebmMcePw+enTIcfp+ljLoEEQQkBcv97ncqHWV93sBSwWQFU13xfJVh1mupKWBtHRAZvT\nBdvQIWg/e9Zw2xcuu9GZNTdgwOwWCxyTJmPYgkd0Wb1mv+cfr/70k86tx4+hYcd2iM7O4DzFbsfQ\nskcjvgGEWq6nwDgA+nxsqOW6rj+a9RFRajjun9LnG0AyftKZp3005Kms6FWoorMTnsqKmJfrKTBO\nNI/ta/2xjkFEydP64WFd1svTPhryNXtjmh7t/Fgf19dy8Y5DREmg0ylJfvLXkM3piml6tPO7Pi7a\nx0YaP54xiChJdLoBFstfQ9mlc6D0uKGNYrcju3ROzMv1FBgnmsf2tf5YxyCi5HFMmqzLennaR0OB\nL1Vjvdon1HJ9XbWTyNU+Pddn1qt9wjHD1T5EvNqnH13tY0bMrg+zZjdrbqD/ZOfVPkREFDeWPxGR\nhFj+REQSYvkTEUlI96t9LBYl5L/Nhtn1weypZ9bcQP/IrtU26H61DxERpR5P+xARSYjlT0QkIZY/\nEZGEWP5ERBJi+RMRSYjlT0QkIZY/EZGEWP5ERBJi+RMRSSgp5e92u1FcXIyxY8eiuro6OP3gwYP4\n/ve/j4ceegizZ8/G/v37o5pXW1uLuXPnoqSkBHPnzsWFCxeSETti9kOHDuEHP/gBZs2ahQULFqCu\nri6qfEbO3tLSgscffxwlJSWYNWsWli5diubm5uByn3zyCWbPno2SkhIsWrQIXm/y7v0bz34P2Lx5\nc6/ljJ69o6MDq1evxvTp0zFr1iz8+te/Ds4z8jEDGOO1GunYjfTcxztP7+y1tbVYuHAhZsyYgZkz\nZ+LZZ59Fe3t7cMwDBw5gxowZeOCBB7B8+XLcuHGj7yAiCU6ePCkuX74spkyZIs6ePSuEEEJVVXHv\nvfcG//vMmTNi/Pjxwu/3R5wnhBALFy4Uu3fvFkIIsXv3brFw4cJkxA6b/erVq2LChAni/PnzwQyL\nFi0KLhMpn5Gzt7S0iOPHjweXf+mll8Szzz4rhBDC7/eLadOmiZMnTwohhNiyZYtYuXKlYbIHVFVV\nicWLF3dbzgzZ16xZI1588UWhqqoQQoimpqbgPCMfM0Z5rYY7diM99/HOM0L2uro68dlnnwWzLlu2\nTGzevFkIIURbW5soKioStbW1QgghVq1aJV5//fU+cySl/AN6lv+ECRPEqVOnhBBC/POf/xTTp0/v\nc57H4xEFBQXC5/MJIYTw+XyioKBAeL3eZEbvlv3TTz8
V3/ve94LzWlpaRF5envB6vRHzGT17T3/9\n61/FI488ElzuwQcfDM7zer1i/PjxSc0tRGzZOzo6xA9/+ENRV1fXazkjZ29raxMFBQWira2t1xhG\nP2aM+FoV4stjN9JzH+88I2Tvadu2bWLVqlVCCCH+8pe/iCVLlgTnnT59utvzF07KftVTURS89tpr\neOKJJ5CRkYHr16/jrbfe6nPelStXMHToUFitVgCA1WrFkCFDcOXKFTidzpRk/+pXvwqPx4PTp09j\n3LhxeP/994PZhBBh80WaZ4TsXTOoqoqdO3eiuLg4OH/48OHB+U6nE6qq4urVq8jMzDRE9k2bNmH2\n7NkYOXJkt+WMnt1qtSIzMxObN2/GiRMnMHDgQCxbtgz33nuv4Y93p9NpuNdq12M30nMf77xkHjPR\nZu+aob29HRUVFfjlL38JoPfxPnz4cFy5cqXPdafsC1+fz4etW7fijTfewMGDB/Hmm29i+fLluH79\nesR5RjB48GD87ne/w/r161FaWgqv1wuHwxE8yI0s2uxr1qxBRkYGFixYoFPS3iJl//jjj1FVVYV5\n8+bpHTOkSNn9fj/q6upw1113obKyEitWrMCTTz6JtrY2vWMDiJzdiK9VIx670Yo1u8/nwy9+8QtM\nnDgRU6dOTWjdKfvkf+bMGTQ2NqKgoAAAUFBQgPT0dNTU1EBRlLDzRowYgYaGBvj9/uALp7GxEbm5\nuamKDgAoKipCUVERAMDj8WDbtm0YPXo0bty4ETafEMLQ2QPcbjcuXryI8vJyWCw3Pw/k5ubi8uXL\nwcc0NzfDYrGk7JNzX9nfffdd1NTUBF8A9fX1WLx4MdavX2/47O3t7bDZbJg5cyYA4J577kFWVhZq\na2sxfPhwQx8zkV7HerxWex67kZ77eOcZITsA+P1+rFixAl/5ylfw3HPPBR+Xm5uLEydOBP/78uXL\nUe3zlH3yHzZsGOrr63H+/HkAQE1NDbxeL0aPHh1xnsvlQn5+Pvbs2QMA2LNnD/Lz81P2J3BAU1MT\ngJt/pm3cuBEPP/wwMjIyIuYzenYA2LhxI6qqqrBlyxbY7fbgMt/4xjfQ3t6OU6dOAQB27dqFGTNm\npDR3pOxLlizBkSNHcODAARw4cADDhg3Dtm3bcN999xk+u9PpRGFhIY4ePQrg5pUcXq8XY8aMMfwx\nY6TXaqhjN9JzH+88I2RXVRUrV66E1WrFiy++CEX58oYu3/nOd/Dvf/87eGXVrl278N3vfrfPDEm5\nmcvatWuxf/9+eDweZGVlITMzE3v37sWf//xn/P73vw8G//nPf45p06YBQMR5NTU1WLlyJVpbW+Fw\nOOB2u3HHHXdoHTti9l/96lf46KOP8MUXX+Db3/42Vq1ahbS0tD7zGTn7uXPnMHPmTNx+++0YMGAA\nAGDkyJHYsmULAOCjjz7C6tWr0dHRgREjRmDDhg3Izs42RPaeiouLUV5ejry8PFNkr6urw6pVq3D1\n6lXYbDYsX74ckydPBmDsYwYwxms10rEb6bmPd57e2Q8dOoQf//jHyMvLC/51/q1vfQurV68GAPzt\nb3/Dhg0boKoq8vPz8dJLLwU/4IXDO3kREUmI/4cvEZGEWP5ERBJi+RMRSYjlT0QkIZY/EZGEWP5E\nRBJi+RMRSYjlT0Qkof8Hm7xNb6groUQAAAAASUVORK5CYII=\n", + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAX8AAAEBCAYAAACQbKXWAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAGxtJREFUeJzt3WtwE+ehBuB3JUXGxqi2ZAPmmmYa\nU2caQusMZtwUgiGYNkBaM1NSLk4GEtqmpNCGSQlNSxoIRCWhYQKJSQ8tIcnAH3toAz2FdLikQKHQ\nXKgzFFNjwDPgiyRzjIkvSPrOD5DiiyTrstLu+nufX7Cr/fbd1eqVWC1aRQghQEREUjFpHYCIiFKP\n5U9EJCGWPxGRhFj+REQSYvkTEUmI5U9EJCGWPxGRhFj+REQSYvkTEUmI5U9EJCGWPxGRhFj+REQS\nYvkTEUnIonWAlpYb8PsFHI5MuN1tWseJC7Nrg9lTz6i5gYGT3WRSkJ09OOExNS9/v1/A7xfBPxsV\ns2uD2VPPqLkBZu+Op32IiCTE8icikhDLn4hIQlGVv9PpRElJCcaNG4eamhoAQEtLC5588kmUlpZi\n9uzZWLZsGTweT1LDEhGROqL6wnfatGkoLy/HggULgtMURcETTzyBoqIiALfeIF555RWsX78+OUkp\n6VpPHIerqhJejxswmQC/Hxa7Azllc2GbVNz3MQEmE2yTp2D4wscijtl7rHizBcbJnV0a97YmItpt\nCrmvbqsJMW5grGvH/o6Os2dDzutvPRa7Axnjx+PzM2d65AMQ9rntPi8w7fP/nkfrh0cAv7/P89vw\n7ttfzOtNUYDetwW/vT5TZiaEEBA3bgSnxcxkwqBx4+BtbILX44aSlgbR2RnVfmwYfy+6bnr77FtV\nmM2Az9cjZ7jXhF4osdzAvaSkBBUVFcjPz+8zb//+/di1axd27NgRUwC3uw1+v0Bu7hA0N1+PaVm9\nGAjZW08cR+POHRBdXX0eo1itGFb+OACEfQwA2B6c2uNgDzVmYKxY3gDCjfOVZT+Gcs/Xox5HDdFu\nU6T9GVGo8kx0PWYzAAXweaObFyaD7cGpSE+3ovF/90e5MRqKsB9TqfdrIl7dO8ZkUuBwZCY8pirn\n/P1+P3bt2oWSkhI1hiMNuKoqwxaI6OqCq6oy4mMA3Po02M+YgbESzSa6unD5nfdiGkcN0W5Tf/sq\nrAiFFfd6fL7QxR9uXpgMrR8eQeP+DyKvSy90UPxA39eEnqhynf/atWuRkZGBhQsXxrxs93ew3Nwh\nasTRhNGz17RE/r7G2898AIDf32M/hBvT2+KJaX+FG6fT5U75fo92m/rbn/FK1XpCiuc0jex6vSYS\nofaxnnD5O51OXLp0CRUVFTCZYv+HBE/7aCuQ3ZJtD3luOsCSbQeAiI+BydRjP4Qb05Jtj2l/hRsn\nLceR8v0e7Tb1tz8TWX8q1hNS4PXNN4Ho9XpNxEt3p302bdqE6upqbN26FVarNeEwpJ2csrlQwjyH\nitWKnLK5ER8DALbJU/odMzBWotkUqxVjFi0Is0TyRLtN/e2rsBQl/Kx412M2A+Ywn/NCzQuTwTZ5\nCoaVPhR5XXoRYT+mUu/XhJ6YX3jhhRf6e9C6devw/PPPo6mpCfv370dVVRUmTpyIZ555BnfccQf2\n7NmD3bt34+jRo3j44YdjCtDe3gUhgMGD0/D553GcI9WBgZA9bdRo3OFwoOPiRfjb2299yhMCFrsD\nQx+dD9uk4r6PCTCZYJvyYJ8vtno/vvtYsQg3zugZ01K+36PdprD7KgKL3YGhCxbC2/45vC5X33lR\nrMdidyCzaBJ8rde/yDd/ATK//vXQz22veYEMJpsNnZcv3zp33u35HT2lGP/X0PzFvN5Cle7t9Zky\nM4E77gBu3gxOi5nJhEFf/SrgF/C3t0NJS+t5lU2E/Wgbfy8Uu
6PPvlWF2dxze8K8JuLVvWMURUFG\nRuIftmO62icZeNpHW8yuDaNmN2puYOBk18VpHyIiMiaWPxGRhFj+REQSYvkTEUmI5U9EJCGWPxGR\nhFj+REQSYvkTEUmI5U9EJCGWPxGRhFT5SWciolipdZc3ig/Ln4hSrvcdyLweNxp37gAAvgGkCE/7\nEFHKqXWXN4ofy5+IUi7cDWhSdmMaYvkTUepZ7I6YppP6WP5ElHJq3eWN4scvfIko5QJf6vJqH+2w\n/IlIE7ZJxSx7DfG0DxGRhFj+REQSYvkTEUmI5U9EJCGWPxGRhFj+REQSYvkTEUmI5U9EJCGWPxGR\nhPotf6fTiZKSEowbNw41NTXB6XV1dZg3bx5KS0sxb948XLx4MZk5iYhIRf3+vMO0adNQXl6OBQsW\n9Ji+Zs0azJ8/H4888gj+9Kc/4de//jV27tyZtKBaa3j3bbR+eATw+wGTCbbJU5DxlbvhqqpEjccN\nmEyA35/U3yjpfuejIEUBrFagszOqDL3HqOnziNQJ5Gx4+4/AzZsxLx8u+6CCAox55hd9pl9+1YmO\ns2e/mJCeDrS3x7xeNWi53xNh1NyAzrKnpyP/9Tc1jaAIIUQ0DywpKUFFRQXy8/PhdrtRWlqKkydP\nwmw2w+fzoaioCAcOHIDdbo8pgNvdBr9fIDd3CJqbr8e1EcnW8O7baD18qO8MRQFC7D7FasWw8sdV\nfQPofeej/oTKEOsYRtb7DaBP8RNpLYY3gO79aDIpcDgyE159XOf8r169imHDhsFsNgMAzGYzhg4d\niqtXryYcSI9aPzwSekaY981k3JEo1J2PIgmVIdYxjKx30bP4SXc0+ldngOa/6tn9HSw3d4iGScKr\n8ftjXsbb4lF1e2paPAlniGcMI+ux7RrmIAonlo5Qux/jKv+8vDw0NjbC5/MFT/s0NTUhLy8v5rGM\ncNoncC49FpZsu6rbY8m2x3yLu94Z4hnDyHR7PBHdFu0xqpvTPg6HAwUFBdi7dy8AYO/evSgoKIj5\nfL9R2CZPCT1DUUJPTsIdiULd+SiSUBliHcPIBhUURPw7kebS0zVdfb9f+K5btw4HDhyAy+VCdnY2\nsrKysG/fPtTW1mLVqlVobW2FzWaD0+nEXXfdFXMAQ3zyR+SrfbwGvtpHS4le7ROOEa72IcnFeLVP\nMj75R321T7IYpfwjYXZtMHvqGTU3MHCya3rah4iIjI3lT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0Qk\nIZY/EZGEWP5ERBJi+RMRSYjlT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0QkIZY/EZGEWP5ERBJi+RMR\nSYjlT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0QkIZY/EZGELIkO\ncOjQIWzevBlCCAghsGzZMsyYMUONbERElCQJlb8QAs8++yzee+895Ofn4z//+Q9+8IMfYPr06TCZ\n+I8KWbSeOA5XVSW8HjcsdgdyyubCNqlY61hxGUjbYiQDab+H2hYAutu+hD/5m0wmXL9+HQBw/fp1\nDB06lMUvkdYTx9G4cwdEVxcAwOtxo3HnDgDQ/OCO1UDaFiMZSPs91LY0/PEPAATg8wWn6WH7Empp\nRVHw2muv4amnnsLUqVPxk5/8BE6nU61sZACuqsrggR4gurrgqqrUKFH8BtK2GMlA2u+htgU+b7D4\nA/SwfQl98vd6vdi2bRveeOMNFBYW4l//+hdWrFiBffv2YfDgwVGN4XBkBv+cmzskkTiakjV7TYsn\n5HRviycl+0TNdaR6W4x6zKidO5X7Pdn7PNy2hBLr9qmdPaHyP3v2LJqamlBYWAgAKCwsRHp6Ompr\nazF+/PioxnC72+D3C+TmDkFz8/VE4mhG5uyWbDu8HnfI6cneJ2rv91Rui1GPmWTkTtV+T8U+D7ct\n4R4bbZ7u2U0mpceH5ngld
Npn+PDhaGhowIULFwAAtbW1cLvdGDNmTMLByBhyyuZCsVp7TFOs1uCX\nXEYykLbFSAbSfg+1LTBbALO5xyQ9bF9Cn/xzc3PxwgsvYPny5VAUBQCwfv16ZGVlqRKO9C/whZXe\nrmSIx0DaFiMZSPs93LaEmqb19ilCCKFlAJ720Raza8Oo2Y2aGxg42XVx2oeIiIyJ5U9EJCGWPxGR\nhFj+REQSYvkTEUmI5U9EJCGWPxGRhFj+REQSYvkTEUmI5U9EJKGEb+aSajVP/xhob+8xLfBbGZ//\n9zxaPzwC+P2AyQTb5CloPXwo+ZmSvobkYXZtGDW7UXMDGmZXFEAIwGQC/H7+tk9ALL/tE6r4gwI7\nmIhI5xSrFcPKH4/6DYC/7ROu+AEWPxEZhh7u5GWs8iciGiCivelLsrD8iYg0YLE7NF2/sco/PT38\nvNs3kyEi0js93MnLUOWf//qbId8ALHYHhi95ErYHp976Rh24dbXPg1NTnJCIqJfAB9Pb3WSxO2L6\nsjdZDHW1j14xuzaYPfWMmhsYONnlvNqHiIhUwfInIpIQy5+ISEIsfyIiCbH8iYgkxPInIpIQy5+I\nSEIsfyIiCbH8iYgklPDNXDo7O7F+/Xr84x//QFpaGiZMmIC1a9eqkY2IiJIk4fLfuHEj0tLSsH//\nfiiKApfLpUYuIiLDaz1xHK6qSng9bt3cwSsgofK/ceMG9uzZgyNHjkC5/eNFOTk5qgQjIjKy1hPH\n0bhzB0RXF4Bbv9/fuHMHAOjiDSChc/719fXIysrCli1bUFZWhkWLFuH06dNqZSMiMixXVWWw+AP0\ncAevgIQ++ft8PtTX1+Oee+7BL37xC3z66af40Y9+hA8++ACZmdH96lz3X6fLzR2SSBxNMbs2mD31\njJobSG32mhZPyOneFk9cOdTOnlD55+XlwWKxYNasWQCA++67D9nZ2airq8O9994b1Rj8SWdtMbs2\njJrdqLmB1Ge3ZNtD3qrRkm2POYfuftLZbrejqKgIx44dAwDU1dXB7XZj7NixCQcjIjKynLK5UKzW\nHtP0cAevgISv9vnNb36D1atXw+l0wmKx4Le//S1sNpsa2YiIDCvwpe6AvNoHAEaPHo133nlHjSxE\nRAOKbVKxbsq+N/4PXyIiCbH8iYgkxPInIpIQy5+ISEIsfyIiCbH8iYgkxPInIpIQy5+ISEIsfyIi\nCbH8iYgkxPInIpIQy5+ISEIsfyIiCbH8iYgkxPInIpIQy5+ISEIsfyIiCbH8iYgkxPInIpIQy5+I\nSEIsfyIiCbH8iYgkxPInIpIQy5+ISEIsfyIiCbH8iYgkxPInIpKQauW/ZcsWjBs3DjU1NWoNSURE\nSWJRY5DPPvsMn3zyCUaOHKnGcDFrPXEcrqpKeD1uWOwO5JTNhW1Scb/z4h3z8qtOdJw9G3xsj7c7\nRQGsVqCzEzCZAL8/uDyA4JiBeVoz8ls1s6eeUXMD+s2e/z87NFlvwuXf1dWFF198Ea+++irKy8vV\nyBST1hPH0bhzB0RXFwDA63GjceeO4Pxw8yK9AUQa89qxv/co/j6EuFX8QLDcvR43Gv74BwAC8Pl6\nzCMiudU88bgmbwAJl//mzZsxZ84cjBo1So08MXNVVQZLOkB0dcFVVRn8c6h5kco/0phejzu+oD5v\nfMsRESVBQuX/8ccfo7q6GitXrox7DIcjM/jn3NwhMS9f0+IJOd0bZnpgXqR1xTMmEVG8oum+ePox\nkoTK/9SpU6itrcW0adMAAA0NDViyZAk2bNiABx54IKox3O42+P0CublD0Nx8PeYMlmx7yE/jlmw7\nAISdF2ldkcaM+5M/EVEY/XVf9340mZQeH5rjldDVPkuXLsXRo0dx8OBBHDx4EMOHD8f27dujLn41\n5JTNhWK19pimWK3IKZsbcV68Yw4qKIgvqNkCmM3xLUtEpDJVrvbRUuDcfaQremK92ifSmLZ
JxX2u\n9unBYFf7EJG2tLraRxFCCE3WfFuip330gNm1weypZ9TcwMDJrovTPkREZEwsfyIiCbH8iYgkxPIn\nIpIQy5+ISEIsfyIiCbH8iYgkxPInIpIQy5+ISEIsfyIiCbH8iYgkxPInIpIQy5+ISEIsfyIiCbH8\niYgkxPInIpIQy5+ISEIsfyIiCbH8iYgkxPInIpIQy5+ISEIsfyIiCbH8iYgkxPInIpIQy5+ISEIs\nfyIiCbH8iYgkZElk4ZaWFjz77LO4fPkyrFYrxo4dixdffBF2u12tfERElAQJffJXFAVPPPEE9u/f\nj/fffx+jR4/GK6+8olY2IiJKkoTKPysrC0VFRcG/T5gwAVeuXEk4FBERJZcihBBqDOT3+7F48WKU\nlJSgvLxcjSGJiChJEjrn393atWuRkZGBhQsXxrSc290Gv18gN3cImpuvqxUnpZhdG8yeekbNDQyc\n7CaTAocjM+ExVSl/p9OJS5cuoaKiAiYTLyAiItK7hMt/06ZNqK6uxltvvQWr1apGJiIiSrKEyv/8\n+fPYtm0b7rzzTjz66KMAgFGjRmHr1q2qhCMiouRIqPzvvvtunDt3Tq0sRESUIjxBT0QkIZY/EZGE\nWP5ERBJi+RMRSYjlT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0QkIdV+1ZNuaT1xHK6qSng9bljsDuSU\nzYVtUnHMy2WMH4/Pz5wJOU73x5oyMyGEgLhxo9/lQq2vxuMGTCbA71d9XyRbTZjpSloaRGcnLHYH\nLMOGouPcOd1tX7jsemfU3IAOs5tMsE2eguELH9Nk9ar9nn+8BtJPOreeOI7GnTsgurqC8xSrFcPK\nH4/4BhBqud4C4wDo97Ghluu+/mjWR0SpYXtwar9vAMn4SWee9lGRq6qyT6GKri64qipjXq63wDjR\nPLa/9cc6BhElT+uHRzRZL0/7qMjrccc0Pdr5sT6uv+XiHYeIkkCjU5L85K8ii90R0/Ro53d/XLSP\njTR+PGMQUZJodAMslr+KcsrmQul1QxvFakVO2dyYl+stME40j+1v/bGOQUTJY5s8RZP18rSPigJf\nqsZ6tU+o5fq7aieRq316r8+oV/uEY4SrfYh4tc8AutrHiJhdG0bNbtTcwMDJzqt9iIgobix/IiIJ\nsfyJiCTE8icikpDmV/uYTErIPxsNs2uD2VPPqLmBgZFdrW3Q/GofIiJKPZ72ISKSEMufiEhCLH8i\nIgmx/ImIJMTyJyKSEMufiEhCLH8iIgmx/ImIJMTyJyKSUFLK3+l0oqSkBOPGjUNNTU1w+qFDh/Dd\n734XjzzyCObMmYMDBw5ENa+urg7z5s1DaWkp5s2bh4sXLyYjdsTshw8fxve+9z3Mnj0bCxcuRH19\nfVT59Jy9paUFTz75JEpLSzF79mwsW7YMHo8nuNwnn3yCOXPmoLS0FIsXL4bbnbx7/8az3wO2bNnS\nZzm9Z+/s7MSaNWswY8YMzJ49G7/61a+C8/R8zAD6eK1GOnYjPffxztM6e11dHRYtWoSZM2di1qxZ\neO6559DR0REc8+DBg5g5cyYeeughrFixAu3t7f0HEUlw6tQpceXKFTF16lRx7tw5IYQQfr9f3H//\n/cG/nz17VkyYMEH4fL6I84QQYtGiRWLPnj1CCCH27NkjFi1alIzYYbNfu3ZNTJw4UVy4cCGYYfHi\nxcFlIuXTc/aWlhZx4sSJ4PIvv/yyeO6554QQQvh8PjF9+nRx6tQpIYQQW7duFatWrdJN9oDq6mqx\nZMmSHssZIfvatWvFSy+9JPx+vxBCiObm5uA8PR8zenmthjt2Iz338c7TQ/b6+nrx2WefBbMuX75c\nbNmyRQghRFtbmyguLhZ1dXVCCCFWr14tXn/99X5zJKX8A3qX/8SJE8Xp06eFEEL885//FDNmzOh3\nnsvlEoWFhcLr9QohhPB6vaKwsFC43e5kRu+R/dNPPxX
f+c53gvNaWlpEfn6+cLvdEfPpPXtvf/3r\nX8Vjjz0WXO7hhx8OznO73WLChAlJzS1EbNk7OzvF97//fVFfX99nOT1nb2trE4WFhaKtra3PGHo/\nZvT4WhXii2M30nMf7zw9ZO9t+/btYvXq1UIIIf7yl7+IpUuXBuedOXOmx/MXTsp+1VNRFLz22mt4\n6qmnkJGRgRs3buCtt97qd97Vq1cxbNgwmM1mAIDZbMbQoUNx9epV2O32lGT/8pe/DJfLhTNnzmD8\n+PF4//33g9mEEGHzRZqnh+zdM/j9fuzatQslJSXB+SNGjAjOt9vt8Pv9uHbtGrKysnSRffPmzZgz\nZw5GjRrVYzm9ZzebzcjKysKWLVtw8uRJDB48GMuXL8f999+v++Pdbrfr7rXa/diN9NzHOy+Zx0y0\n2btn6OjoQGVlJX7+858D6Hu8jxgxAlevXu133Sn7wtfr9WLbtm144403cOjQIbz55ptYsWIFbty4\nEXGeHgwZMgS/+93vsGHDBpSVlcHtdsNmswUPcj2LNvvatWuRkZGBhQsXapS0r0jZP/74Y1RXV2P+\n/PlaxwwpUnafz4f6+nrcc889qKqqwsqVK/H000+jra1N69gAImfX42tVj8dutGLN7vV68bOf/QyT\nJk3CtGnTElp3yj75nz17Fk1NTSgsLAQAFBYWIj09HbW1tVAUJey8kSNHorGxET6fL/jCaWpqQl5e\nXqqiAwCKi4tRXFwMAHC5XNi+fTvGjBmD9vb2sPmEELrOHuB0OnHp0iVUVFTAZLr1eSAvLw9XrlwJ\nPsbj8cBkMqXsk3N/2d99913U1tYGXwANDQ1YsmQJNmzYoPvsHR0dsFgsmDVrFgDgvvvuQ3Z2Nurq\n6jBixAhdHzORXsdavFZ7H7uRnvt45+khOwD4fD6sXLkSX/rSl/D8888HH5eXl4eTJ08G/37lypWo\n9nnKPvkPHz4cDQ0NuHDhAgCgtrYWbrcbY8aMiTjP4XCgoKAAe/fuBQDs3bsXBQUFKfsncEBzczOA\nW/9M27RpEx599FFkZGREzKf37ACwadMmVFdXY+vWrbBarcFlvva1r6GjowOnT58GAOzevRszZ85M\nae5I2ZcuXYqjR4/i4MGDOHjwIIYPH47t27fjgQce0H12u92OoqIiHDt2DMCtKzncbjfGjh2r+2NG\nT6/VUMdupOc+3nl6yO73+7Fq1SqYzWa89NJLUJQvbujyrW99C//+97+DV1bt3r0b3/72t/vNkJSb\nuaxbtw4HDhyAy+VCdnY2srKysG/fPvz5z3/G73//+2Dwn/70p5g+fToARJxXW1uLVatWobW1FTab\nDU6nE3fddZfasSNm/+Uvf4mPPvoIN2/exDe/+U2sXr0aaWlp/ebTc/bz589j1qxZuPPOOzFo0CAA\nwKhRo7B161YAwEcffYQ1a9ags7MTI0eOxMaNG5GTk6OL7L2VlJSgoqIC+fn5hsheX1+P1atX49q1\na7BYLFixYgWmTJkCQN/HDKCP12qkYzfScx/vPK2zHz58GD/84Q+Rn58f/Nf5N77xDaxZswYA8Le/\n/Q0bN26E3+9HQUEBXn755eAHvHB4Jy8iIgnxf/gSEUmI5U9EJCGWPxGRhFj+REQSYvkTEUmI5U9E\nJCGWPxGRhFj+REQS+n9YnE5sVgm99QAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] @@ -2308,7 +2002,10 @@ "colab_type": "text" }, "source": [ - "# -----current: test ready-----" + "# -----current: test ready-----\n", + "- converting to pandas \n", + " - to see what's going on\n", + " - figuring out what can and what can't be replicated in cuML" ] }, { @@ -2319,7 +2016,9 @@ "colab": {} }, "source": [ - "from cuml.preprocessing.model_selection import train_test_split\n", + "from sklearn import neighbors\n", + "# from cuml.preprocessing.model_selection import train_test_split\n", + "from sklearn.model_selection import StratifiedKFold,GridSearchCV,train_test_split\n", "#location seems to be related to building quality, (knnclassifier)\n", "\n", "def fillna_knn(df, base, target):\n", @@ -2335,8 +2034,10 @@ " #print(not_missing_rows.head())\n", " Y = not_missing_rows[target]\n", " X = not_missing_rows[base]\n", - " #X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=3192,stratify=Y)\n", - " X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8)\n", + " X_train, X_test, Y_train, Y_test = train_test_split(X, Y, \n", + " test_size=0.20,\n", + " random_state=3192,\n", + " stratify=Y)\n", " metrics = ['euclidean'] \n", " weights = ['distance'] \n", " numNeighbors = [5,10,15,20,25]\n", @@ -2363,21 +2064,88 @@ "metadata": { "id": "6eES-hq--NKZ", "colab_type": "code", - "colab": {} + "outputId": "2bc86856-507d-47bf-cfab-d29649cba819", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 903 + } }, "source": [ + "# make safe copy\n", "# test = df_train.copy()\n", - "df_train = test.copy()" + "df_train = test.copy()\n", + "# switch to pandas (figuring out what's going on)\n", + "df_train = df_train.to_pandas()\n", + "\n", + "print(df_train.info())" ], - "execution_count": 0, - "outputs": [] + "execution_count": 191, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 90275 entries, 0 to 90274\n", + "Data columns (total 45 columns):\n", + "parcelid 90275 non-null 
int64\n", + "logerror 90275 non-null float64\n", + "ac_id 28781 non-null float64\n", + "basement_sqft 90275 non-null float64\n", + "total_bath 89110 non-null float64\n", + "bedroomcnt 88854 non-null float64\n", + "buildingqualitytypeid 57364 non-null float64\n", + "deck_flag 90275 non-null float64\n", + "finished_living_area_entryfloor_sqft2 6856 non-null float64\n", + "total_finished_living_area_sqft 89614 non-null float64\n", + "finished_living_area_entryfloor_sqft1 6856 non-null float64\n", + "fips 90275 non-null float64\n", + "fireplace_count 90275 non-null float64\n", + "full_bath 89093 non-null float64\n", + "garagecarcnt 29937 non-null float64\n", + "garage_sqft 21017 non-null float64\n", + "has_hottub_or_spa 90275 non-null int64\n", + "heating_system_id 56080 non-null float64\n", + "latitude 90275 non-null float64\n", + "longitude 90275 non-null float64\n", + "lot_area_sqft 80125 non-null float64\n", + "pool_count 90275 non-null float64\n", + "pool_sqft 90275 non-null float64\n", + "just_hottub_or_spa 90275 non-null float64\n", + "pool_with_spa_tub_yes 90275 non-null float64\n", + "pool_with_spa_tub_no 90275 non-null float64\n", + "propertylandusetypeid 90275 non-null float64\n", + "roomcnt 88859 non-null float64\n", + "basement_flag 90275 non-null float64\n", + "half_bath 89093 non-null float64\n", + "unitcnt 90275 non-null float64\n", + "patio_sqft 90275 non-null float64\n", + "storage_sqft 90275 non-null float64\n", + "yearbuilt 89519 non-null float64\n", + "numberofstories 20581 non-null float64\n", + "fireplaceflag 90275 non-null bool\n", + "structure_tax 89895 non-null float64\n", + "total_parcel_tax 90274 non-null float64\n", + "land_tax 90274 non-null float64\n", + "total_property_tax_2016 90269 non-null float64\n", + "taxdelinquencyflag 90275 non-null int64\n", + "taxdelinquencyyear 90275 non-null float64\n", + "transaction_month 90275 non-null int16\n", + "census_tractnumber 90275 non-null object\n", + "block_number 90275 non-null object\n", + 
"dtypes: bool(1), float64(38), int16(1), int64(3), object(2)\n", + "memory usage: 29.9+ MB\n", + "None\n" + ], + "name": "stdout" + } + ] }, { "cell_type": "code", "metadata": { "id": "AT8Osn51lD9v", "colab_type": "code", - "outputId": "83435ba5-0887-47fb-f8fb-ceeb9dd92fda", + "outputId": "8ab0690a-2e06-468e-b7ce-f4d051a3ce83", "colab": { "base_uri": "https://localhost:8080/", "height": 573 @@ -2390,7 +2158,7 @@ "print(f'BUILDINGTYPEID HEAD\\n{df_train.buildingqualitytypeid.head()}\\n')\n", "print(f'DF TRAIN HEAD\\n{df_train.head()}')" ], - "execution_count": 49, + "execution_count": 192, "outputs": [ { "output_type": "stream", @@ -2408,23 +2176,23 @@ "8.0 5\n", "6.0 2\n", "11.0 1\n", - "Name: buildingqualitytypeid, dtype: int32\n", + "Name: buildingqualitytypeid, dtype: int64\n", "\n", "BUILDINGTYPEID HEAD\n", - "0 7.0\n", - "1 null\n", - "2 null\n", - "3 7.0\n", - "4 4.0\n", + "0 7.0\n", + "1 NaN\n", + "2 NaN\n", + "3 7.0\n", + "4 4.0\n", "Name: buildingqualitytypeid, dtype: float64\n", "\n", "DF TRAIN HEAD\n", - " parcelid logerror ac_id ... transaction_month census_tractnumber block_number\n", - "0 11827818 0.0402 null ... 3 5315.03 1013\n", - "1 12123024 0.0296 null ... 3 4625.00 1017\n", - "2 13867327 0.0344 null ... 3 0114.01 2017\n", - "3 12681894 0.0060 null ... 3 6513.02 1004\n", - "4 12848541 0.0695 1.0 ... 3 4087.03 1018\n", + " parcelid logerror ac_id ... transaction_month census_tractnumber block_number\n", + "0 11827818 0.0402 NaN ... 3 5315.03 1013\n", + "1 12123024 0.0296 NaN ... 3 4625.00 1017\n", + "2 13867327 0.0344 NaN ... 3 0114.01 2017\n", + "3 12681894 0.0060 NaN ... 3 6513.02 1004\n", + "4 12848541 0.0695 1.0 ... 
3 4087.03 1018\n", "\n", "[5 rows x 45 columns]\n" ], @@ -2437,18 +2205,17 @@ "metadata": { "id": "79bB7JKdAEtX", "colab_type": "code", - "outputId": "b1b1e940-e89a-40e8-c5af-5919c896ca19", + "outputId": "32b79160-fd19-4d39-988a-fc5fcd7c3284", "colab": { "base_uri": "https://localhost:8080/", "height": 225 } }, "source": [ - "temp=df_train.copy()\n", - "temp['buildingqualitytypeid'] = temp['buildingqualitytypeid'].fillna(-1)\n", - "print(f'NULL COUNT = {temp.buildingqualitytypeid.isnull().sum()}\\nVALUE COUNTS\\n{temp.buildingqualitytypeid.value_counts()}')" + "df_train['buildingqualitytypeid'] = df_train['buildingqualitytypeid'].fillna(-1)\n", + "print(f'NULL COUNT = {df_train.buildingqualitytypeid.isnull().sum()}\\nVALUE COUNTS\\n{df_train.buildingqualitytypeid.value_counts()}')" ], - "execution_count": 50, + "execution_count": 193, "outputs": [ { "output_type": "stream", @@ -2464,42 +2231,75 @@ " 8.0 5\n", " 6.0 2\n", " 11.0 1\n", - "Name: buildingqualitytypeid, dtype: int32\n" + "Name: buildingqualitytypeid, dtype: int64\n" ], "name": "stdout" } ] }, + { + "cell_type": "markdown", + "metadata": { + "id": "DVgF1c_p_bN1", + "colab_type": "text" + }, + "source": [ + "# -----current: break-----\n", + "- break 1 of 2" + ] + }, { "cell_type": "code", "metadata": { "id": "mAB9bsrPAGzQ", "colab_type": "code", - "outputId": "ff5376d3-6854-4d05-a7c1-7ffe0a6136a4", + "outputId": "d847758e-212e-4de8-85c4-89b469b71c48", "colab": { "base_uri": "https://localhost:8080/", - "height": 347 + "height": 762 } }, "source": [ "# say we run this whole thing by buildingqualitytypeid\n", - "# temp=temp.groupby(\"buildingqualitytypeid\")\n", "# drop building types that aren't seen at least 3 times in the data\n", - "# .filter(lambda x: x.buildingqualitytypeid.size > 3)\n", - "# conditions = (temp.buildingqualitytypeid.value_counts > 3)\n", - "# print(temp.loc[temp.buildingqualitytypeid.astype(int) > 3].head())\n", - "# temp.loc[temp.census_tractnumber.value_counts() > 3]\n", - "# 
print(temp.loc[temp.census_tractnumber.value_counts().values > 3].to_pandas().head())\n", + "# df_train = df_train.groupby(\"buildingqualitytypeid\").filter(lambda x: x.buildingqualitytypeid.size > 3)\n", "\n", - "\"\"\"still working on how to best do this in RAPIDS\n", - "\"\"\"\n", - "print(f'{temp.buildingqualitytypeid.value_counts()}\\n')\n", - "temp = temp.to_pandas()\n", - "temp = temp.groupby(\"buildingqualitytypeid\").filter(lambda x: x.buildingqualitytypeid.size > 3)\n", - "temp = cudf.from_pandas(temp)\n", - "print(temp.buildingqualitytypeid.value_counts())" + "# BACK TO cuDF\n", + "df_train = cudf.from_pandas(df_train)\n", + "\n", + "print(df_train.buildingqualitytypeid.value_counts())\n", + "print()\n", + "print(df_train.buildingqualitytypeid.isnull().sum())\n", + "print(df_train.shape)\n", + "print()\n", + "\n", + "type_ids = list(set(df_train.buildingqualitytypeid.values))\n", + "from time import sleep\n", + "safe = []\n", + "for tid in type_ids:\n", + " print(tid)\n", + " sleep(5)\n", + " t = len(df_train.loc[df_train.buildingqualitytypeid == tid])\n", + " if t > 3:\n", + " safe.append(tid)\n", + " else:\n", + " print(f'{tid} count too low @ {t}')\n", + "for tid in type_ids:\n", + " if tid not in safe:\n", + " df_train = df_train.loc[df_train.buildingqualitytypeid != tid]\n", + "\n", + "print()\n", + "print(df_train.buildingqualitytypeid.value_counts())\n", + "print()\n", + "\n", + "df_train['buildingqualitytypeid'] = df_train['buildingqualitytypeid'].replace(-1,np.nan)\n", + "print(df_train.buildingqualitytypeid.isnull().sum())\n", + "print(df_train.shape)\n", + "\n", + "# BACK TO PANDAS\n", + "df_train = df_train.to_pandas()" ], - "execution_count": 51, + "execution_count": 194, "outputs": [ { "output_type": "stream", @@ -2515,56 +2315,46 @@ " 11.0 1\n", "Name: buildingqualitytypeid, dtype: int32\n", "\n", - "-1.0 32911\n", - " 7.0 29310\n", - " 4.0 23839\n", - " 1.0 2627\n", - " 10.0 1461\n", - " 12.0 119\n", - " 8.0 5\n", - "Name: 
buildingqualitytypeid, dtype: int32\n" + "0\n", + "(90275, 45)\n", + "\n", + "1.0\n", + "4.0\n", + "6.0\n", + "6.0 count too low @ 2\n", + "7.0\n", + "8.0\n", + "10.0\n", + "11.0\n" ], "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "uCyRxp-7qEXf", - "colab_type": "code", - "outputId": "629f0745-3a63-4bd8-aa10-835a94450cb6", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 52 - } - }, - "source": [ - "temp['buildingqualitytypeid'] = temp['buildingqualitytypeid'].replace(-1,np.nan)\n", - "print(temp.buildingqualitytypeid.isnull().sum())\n", - "print(temp.shape)" - ], - "execution_count": 52, - "outputs": [ + }, { - "output_type": "stream", - "text": [ - "32911\n", - "(90272, 45)\n" - ], - "name": "stdout" + "output_type": "error", + "ename": "ValueError", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuildingqualitytypeid\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mtid\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mt\u001b[0m \u001b[0;34m>\u001b[0m 
\u001b[0;36m3\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0msafe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/core/indexing.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, arg)\u001b[0m\n\u001b[1;32m 107\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtuple\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 108\u001b[0m \u001b[0marg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mslice\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 109\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_tuple_arg\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 110\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 111\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/core/indexing.py\u001b[0m in \u001b[0;36m_getitem_tuple_arg\u001b[0;34m(self, arg)\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mas_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 219\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 220\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mas_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 221\u001b[0m \u001b[0;31m# Step 4: Downcast\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 222\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_can_downcast_to_series\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/core/dataframe.py\u001b[0m in \u001b[0;36mindex\u001b[0;34m(self, _index)\u001b[0m\n\u001b[1;32m 1058\u001b[0m \u001b[0;34m\"have %d elements\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mold_length\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnew_length\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1059\u001b[0m )\n\u001b[0;32m-> 1060\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1061\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1062\u001b[0m \u001b[0;31m# try to build an index from generic _index\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: Length 
mismatch: Expected axis has 1 elements, new values have 90275 elements" + ] } ] }, { "cell_type": "markdown", "metadata": { - "id": "DVgF1c_p_bN1", + "id": "Zl7eXGt_g1uU", "colab_type": "text" }, "source": [ "# -----current: break-----\n", - "- below is last cell run" + "- break 2 of 2\n", + " - below is last cell run" ] }, { @@ -2572,42 +2362,76 @@ "metadata": { "id": "Q3ZBSOHm-79A", "colab_type": "code", - "outputId": "3da3e840-8d13-426a-e0aa-8ae20679326b", + "outputId": "e9ddb9b3-0bb0-4cf7-fa8e-ca35b9ea7f46", "colab": { "base_uri": "https://localhost:8080/", - "height": 394 + "height": 557 } }, "source": [ - "missing_values = fillna_knn(temp, \n", + "# run cell above (currently broken) as would be in pandas\n", + "not_df_train = df_train.to_pandas()\n", + "not_df_train = not_df_train.groupby(\"buildingqualitytypeid\").filter(lambda x: x.buildingqualitytypeid.size > 3)\n", + "\n", + "missing_values = fillna_knn(not_df_train, \n", " base = ['latitude', 'longitude'], \n", " target = 'buildingqualitytypeid')\n", "\n", "print(\"predicted output shape\",missing_values.shape)\n", - "missing_values_boolflag = df_train['buildingqualitytypeid'].isnull()\n", - "df_train.loc[missing_values_boolflag, 'buildingqualitytypeid'] = missing_values\n", + "missing_values_boolflag = not_df_train['buildingqualitytypeid'].isnull()\n", + "not_df_train.loc[missing_values_boolflag, 'buildingqualitytypeid'] = missing_values\n", "\n", - "print(df_train.buildingqualitytypeid.isnull().sum())" + "print(not_df_train.buildingqualitytypeid.isnull().sum())" ], - "execution_count": 53, + "execution_count": 195, "outputs": [ { "output_type": "stream", "text": [ - "# of miss 32911\n" + "# of miss 0\n", + "Fitting 3 folds for each of 5 candidates, totalling 15 fits\n" ], "name": "stdout" }, + { + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + 
"grid.best_estimator_ KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',\n", + " metric_params=None, n_jobs=-1, n_neighbors=15, p=2,\n", + " weights='distance')\n", + "grid.best_params_ {'metric': 'euclidean', 'n_neighbors': 15, 'weights': 'distance'}\n", + "grid.scorer_ make_scorer(f1_score, pos_label=None, average=weighted)\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=-1)]: Done 15 out of 15 | elapsed: 7.1s finished\n" + ], + "name": "stderr" + }, { "output_type": "error", - "ename": "NameError", + "ename": "ValueError", "evalue": "ignored", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m missing_values = fillna_knn(temp, \n\u001b[1;32m 2\u001b[0m \u001b[0mbase\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'latitude'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'longitude'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m target = 'buildingqualitytypeid')\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"predicted output shape\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mmissing_values\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m\u001b[0m in \u001b[0;36mfillna_knn\u001b[0;34m(df, base, target)\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mnumNeighbors\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0;34m[\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m15\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m20\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m25\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0mparam_grid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmetric\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmetrics\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mweights\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mweights\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mn_neighbors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnumNeighbors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 23\u001b[0;31m \u001b[0mcv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mStratifiedKFold\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn_splits\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m3192\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mshuffle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 24\u001b[0m \u001b[0mgrid\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mGridSearchCV\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mneighbors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mKNeighborsClassifier\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mparam_grid\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparam_grid\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mcv\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcv\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mscoring\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'f1_weighted'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mrefit\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mreturn_train_score\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mverbose\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mpre_dispatch\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'n_jobs'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0mgrid\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m \u001b[0;34m,\u001b[0m\u001b[0mY_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'StratifiedKFold' is not defined" + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 4\u001b[0m missing_values = fillna_knn(not_df_train, \n\u001b[1;32m 5\u001b[0m \u001b[0mbase\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'latitude'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'longitude'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m target = 
'buildingqualitytypeid')\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"predicted output shape\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mmissing_values\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36mfillna_knn\u001b[0;34m(df, base, target)\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[0my_true\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mY_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgrid\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 37\u001b[0;31m \u001b[0mZ\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgrid\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mmissing_values_boolflag\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbase\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 38\u001b[0m \u001b[0;31m#df.loc[ missing_values_boolflag, target ] = Z\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mZ\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/sklearn/utils/metaestimators.py\u001b[0m in \u001b[0;36m\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 114\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[0;31m# lambda, but not partial, allows help() to work with 
update_wrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 116\u001b[0;31m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 117\u001b[0m \u001b[0;31m# update the docstring of the returned function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[0mupdate_wrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 455\u001b[0m \"\"\"\n\u001b[1;32m 456\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_is_fitted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'predict'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 457\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbest_estimator_\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 458\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 459\u001b[0m 
\u001b[0;34m@\u001b[0m\u001b[0mif_delegate_has_method\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdelegate\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'best_estimator_'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'estimator'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/sklearn/neighbors/classification.py\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 145\u001b[0m \u001b[0mClass\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0meach\u001b[0m \u001b[0mdata\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 146\u001b[0m \"\"\"\n\u001b[0;32m--> 147\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'csr'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 148\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 149\u001b[0m \u001b[0mneigh_dist\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mneigh_ind\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkneighbors\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)\u001b[0m\n\u001b[1;32m 548\u001b[0m \u001b[0;34m\" minimum of %d is required%s.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 549\u001b[0m % (n_samples, array.shape, 
ensure_min_samples,\n\u001b[0;32m--> 550\u001b[0;31m context))\n\u001b[0m\u001b[1;32m 551\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 552\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mensure_min_features\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0marray\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndim\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: Found array with 0 sample(s) (shape=(0, 2)) while a minimum of 1 is required." ] } ] @@ -2619,7 +2443,8 @@ "colab_type": "text" }, "source": [ - "# BELOW NOT RUN" + "# BELOW NOT (really) RUN\n", + "- if run, was in pandas" ] }, { @@ -2627,7 +2452,11 @@ "metadata": { "id": "oTh_XPErqkHf", "colab_type": "code", - "colab": {} + "outputId": "3e667bca-70c5-4b66-c7d2-12d171cb140b", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 278 + } }, "source": [ "print(df_train.heating_system_id.isnull().sum())\n", @@ -2651,14 +2480,52 @@ "print(df_train.heating_system_id.isnull().sum())" ], "execution_count": 0, - "outputs": [] + "outputs": [ + { + "output_type": "stream", + "text": [ + "34194\n", + "(90272, 45)\n", + "34194\n", + "(90266, 45)\n", + "# of miss 34194\n", + "Fitting 3 folds for each of 5 candidates, totalling 15 fits\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.\n", + "[Parallel(n_jobs=-1)]: Done 15 out of 15 | elapsed: 3.3s finished\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "grid.best_estimator_ KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',\n", + " metric_params=None, n_jobs=-1, n_neighbors=15, p=2,\n", + " weights='distance')\n", + "grid.best_params_ {'metric': 'euclidean', 'n_neighbors': 15, 'weights': 'distance'}\n", + "grid.scorer_ make_scorer(f1_score, pos_label=None, 
average=weighted)\n", + "predicted output shape (34194,)\n", + "0\n" + ], + "name": "stdout" + } + ] }, { "cell_type": "code", "metadata": { "id": "oVjNSkUYqnCt", "colab_type": "code", - "colab": {} + "outputId": "80fc7e87-36cd-44b7-96e9-ef0631c7d10c", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 278 + } }, "source": [ "print(df_train.ac_id.isnull().sum())\n", @@ -2681,14 +2548,52 @@ "print(df_train.ac_id.isnull().sum())" ], "execution_count": 0, - "outputs": [] + "outputs": [ + { + "output_type": "stream", + "text": [ + "61492\n", + "(90272, 45)\n", + "61492\n", + "(90270, 45)\n", + "# of miss 61492\n", + "Fitting 3 folds for each of 5 candidates, totalling 15 fits\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.\n", + "[Parallel(n_jobs=-1)]: Done 15 out of 15 | elapsed: 2.0s finished\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "grid.best_estimator_ KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',\n", + " metric_params=None, n_jobs=-1, n_neighbors=25, p=2,\n", + " weights='distance')\n", + "grid.best_params_ {'metric': 'euclidean', 'n_neighbors': 25, 'weights': 'distance'}\n", + "grid.scorer_ make_scorer(f1_score, pos_label=None, average=weighted)\n", + "predicted output shape (61492,)\n", + "0\n" + ], + "name": "stdout" + } + ] }, { "cell_type": "code", "metadata": { "id": "qTbcYbexqr0Y", "colab_type": "code", - "colab": {} + "outputId": "3459affa-a41a-4241-ab62-f0dfcadda039", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 278 + } }, "source": [ "#yearbuilt\n", @@ -2711,7 +2616,41 @@ "print(df_train.yearbuilt.isnull().sum())" ], "execution_count": 0, - "outputs": [] + "outputs": [ + { + "output_type": "stream", + "text": [ + "755\n", + "(90272, 45)\n", + "755\n", + "(90258, 45)\n", + "# of miss 755\n", + "Fitting 3 folds for each of 5 candidates, totalling 15 
fits\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.\n", + "[Parallel(n_jobs=-1)]: Done 15 out of 15 | elapsed: 44.3s finished\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "grid.best_estimator_ KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',\n", + " metric_params=None, n_jobs=-1, n_neighbors=5, p=2,\n", + " weights='distance')\n", + "grid.best_params_ {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}\n", + "grid.scorer_ make_scorer(f1_score, pos_label=None, average=weighted)\n", + "predicted output shape (755,)\n", + "0\n" + ], + "name": "stdout" + } + ] }, { "cell_type": "code", @@ -2763,7 +2702,11 @@ "metadata": { "id": "pj5PXm7ozg5l", "colab_type": "code", - "colab": {} + "outputId": "3d42279f-221c-444c-8795-05a0832f97cd", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 606 + } }, "source": [ "#garage_sqft\n", @@ -2780,11 +2723,57 @@ "\n", "print(\"predicted output shape\",missing_values.shape)\n", "missing_values_boolflag = df_train['garage_sqft'].isnull()\n", - "df_train.loc[ missing_values_boolflag, 'garage_sqft' ] = missing_values\n", + "df_train.loc[missing_values_boolflag, 'garage_sqft'] = missing_values\n", "print(df_train.garage_sqft.isnull().sum())" ], "execution_count": 0, - "outputs": [] + "outputs": [ + { + "output_type": "stream", + "text": [ + "69255\n", + "(90272, 45)\n", + "8920\n", + "(29647, 45)\n", + "# of miss 8920\n", + "Fitting 3 folds for each of 5 candidates, totalling 15 fits\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.\n", + "[Parallel(n_jobs=-1)]: Done 15 out of 15 | elapsed: 2.7s finished\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "grid.best_estimator_ KNeighborsRegressor(algorithm='auto', 
leaf_size=30, metric='euclidean',\n", + " metric_params=None, n_jobs=-1, n_neighbors=5, p=2,\n", + " weights='distance')\n", + "grid.best_params_ {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}\n", + "grid.scorer_ make_scorer(mean_absolute_error, greater_is_better=False)\n", + "predicted output shape (8920,)\n" + ], + "name": "stdout" + }, + { + "output_type": "error", + "ename": "ValueError", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"predicted output shape\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mmissing_values\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0mmissing_values_boolflag\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'garage_sqft'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misnull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 14\u001b[0;31m \u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mmissing_values_boolflag\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'garage_sqft'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmissing_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 15\u001b[0m 
\u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgarage_sqft\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misnull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m__setitem__\u001b[0;34m(self, key, value)\u001b[0m\n\u001b[1;32m 188\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_if_callable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_setitem_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 190\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_setitem_with_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 191\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 192\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_validate_key\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_setitem_with_indexer\u001b[0;34m(self, indexer, 
value)\u001b[0m\n\u001b[1;32m 609\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 610\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 611\u001b[0;31m raise ValueError('Must have equal len keys and value '\n\u001b[0m\u001b[1;32m 612\u001b[0m 'when setting with an iterable')\n\u001b[1;32m 613\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: Must have equal len keys and value when setting with an iterable" + ] + } + ] }, { "cell_type": "code", From af425f5bc700c2d4c756d070f059f635393d7262 Mon Sep 17 00:00:00 2001 From: Winston <43570913+gumdropsteve@users.noreply.github.com> Date: Thu, 3 Oct 2019 23:59:41 -0700 Subject: [PATCH 5/7] Create linear_regression_boston_demo.ipynb requested change: needs non colab version in intermediate > examples --- .../linear_regression_boston_demo.ipynb | 768 ++++++++++++++++++ 1 file changed, 768 insertions(+) create mode 100644 intermediate_notebooks/examples/linear_regression_boston_demo.ipynb diff --git a/intermediate_notebooks/examples/linear_regression_boston_demo.ipynb b/intermediate_notebooks/examples/linear_regression_boston_demo.ipynb new file mode 100644 index 00000000..53b868d2 --- /dev/null +++ b/intermediate_notebooks/examples/linear_regression_boston_demo.ipynb @@ -0,0 +1,768 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "LOCAL_intro_lin_reg_cuml", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "2tZ3RLnlkrkg", + "colab_type": "text" + }, + "source": [ + "# Intro to Linear Regression with cuML\n", + 
"Corresponding notebook to [*Beginner’s Guide to Linear Regression in Python with cuML*](https://medium.com/future-vision/beginners-guide-to-linear-regression-in-python-with-cuml-30e2709c761) story on Medium\n", + "\n", + "Linear Regression is a simple machine learning model where the response `y` is modelled by a linear combination of the predictors in `X`. The `LinearRegression` function implemented in the `cuML` library allows users to change the `fit_intercept`, `normalize`, and `algorithm` parameters. \n", + "\n", + "Here is a brief on RAPIDS' Linear Regression parameters:\n", + "\n", + "- `algorithm`: 'eig' or 'svd' (default = 'eig')\n", + " - `Eig` uses a eigendecomposition of the covariance matrix, and is much faster\n", + " - `SVD` is slower, but guaranteed to be stable\n", + "- `fit_intercept`: boolean (default = True)\n", + " - If `True`, `LinearRegresssion` tries to correct for the global mean of `y`\n", + " - If `False`, the model expects that you have centered the data.\n", + "- `normalize`: boolean (default = False)\n", + " - If True, the predictors in X will be normalized by dividing by it’s L2 norm\n", + " - If False, no scaling will be done\n", + "\n", + "Methods that can be used with `LinearRegression` are:\n", + "\n", + "- `fit`: Fit the model with `X` and `y`\n", + "- `get_params`: Sklearn style return parameter state\n", + "- `predict`: Predicts the `y` for `X`\n", + "- `set_params`: Sklearn style set parameter state to dictionary of params\n", + "\n", + "`cuML`'s `LinearRegression` expects expects either `cuDF` DataFrame or `NumPy` matrix inputs\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-tG6ezqKh1Z0", + "colab_type": "text" + }, + "source": [ + "Note: `CuPy` is not installed by default with RAPIDS `Conda` or `Docker` packages, but is needed for visualizing results in this notebook.\n", + "- install with `pip` via the cell below " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "pxBcXor_0-Jd", + 
"colab_type": "code", + "colab": {} + }, + "source": [ + "# install cupy\n", + "!pip install cupy" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "N20le3_KlP3O", + "colab_type": "text" + }, + "source": [ + "## Load data\n", + "- for this demo, we will be utilizing the Boston housing dataset from `sklearn`\n", + " - start by loading in the set and printing a map of the contents" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "RFE-nxxlTajg", + "colab_type": "code", + "outputId": "04f89e88-61a3-4dd2-9088-123b410e508c", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "source": [ + "from sklearn.datasets import load_boston\n", + "\n", + "# load Boston dataset\n", + "boston = load_boston()\n", + "\n", + "# let's see what's inside\n", + "print(boston.keys())" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wmcO8dxO0uOB", + "colab_type": "text" + }, + "source": [ + "#### Boston house prices dataset\n", + "- a description of the dataset is provided in `DESCR`\n", + " - let's explore " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "c3kLHAsP-Al2", + "colab_type": "code", + "outputId": "02518c3c-7767-42a7-b6f4-6756ace741cc", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 923 + } + }, + "source": [ + "# what do we know about this dataset?\n", + "print(boston.DESCR)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + ".. _boston_dataset:\n", + "\n", + "Boston house prices dataset\n", + "---------------------------\n", + "\n", + "**Data Set Characteristics:** \n", + "\n", + " :Number of Instances: 506 \n", + "\n", + " :Number of Attributes: 13 numeric/categorical predictive. 
Median Value (attribute 14) is usually the target.\n", + "\n", + " :Attribute Information (in order):\n", + " - CRIM per capita crime rate by town\n", + " - ZN proportion of residential land zoned for lots over 25,000 sq.ft.\n", + " - INDUS proportion of non-retail business acres per town\n", + " - CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n", + " - NOX nitric oxides concentration (parts per 10 million)\n", + " - RM average number of rooms per dwelling\n", + " - AGE proportion of owner-occupied units built prior to 1940\n", + " - DIS weighted distances to five Boston employment centres\n", + " - RAD index of accessibility to radial highways\n", + " - TAX full-value property-tax rate per $10,000\n", + " - PTRATIO pupil-teacher ratio by town\n", + " - B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n", + " - LSTAT % lower status of the population\n", + " - MEDV Median value of owner-occupied homes in $1000's\n", + "\n", + " :Missing Attribute Values: None\n", + "\n", + " :Creator: Harrison, D. and Rubinfeld, D.L.\n", + "\n", + "This is a copy of UCI ML housing dataset.\n", + "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/\n", + "\n", + "\n", + "This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\n", + "\n", + "The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\n", + "prices and the demand for clean air', J. Environ. Economics & Management,\n", + "vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics\n", + "...', Wiley, 1980. N.B. Various transformations are used in the table on\n", + "pages 244-261 of the latter.\n", + "\n", + "The Boston house-price data has been used in many machine learning papers that address regression\n", + "problems. \n", + " \n", + ".. 
topic:: References\n", + "\n", + " - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\n", + " - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wI_sB78vE297", + "colab_type": "text" + }, + "source": [ + "### Build Dataframe\n", + "- Import `cuDF` and input the data into a DataFrame \n", + " - Then add a `PRICE` column equal to the `target` key" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "xiMmIZ8O5scJ", + "colab_type": "code", + "outputId": "fd09db1f-fb41-4494-bb8b-eab6e18c258f", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + } + }, + "source": [ + "import cudf\n", + "\n", + "# build dataframe from data key\n", + "bos = cudf.DataFrame(list(boston.data))\n", + "# set column names to feature_names\n", + "bos.columns = boston.feature_names\n", + "\n", + "# add PRICE column from target\n", + "bos['PRICE'] = boston.target\n", + "\n", + "# let's see what we're working with\n", + "bos.head()" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTATPRICE
00.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.904.9824.0
10.027310.07.070.00.4696.42178.94.96712.0242.017.8396.909.1421.6
20.027290.07.070.00.4697.18561.14.96712.0242.017.8392.834.0334.7
30.032370.02.180.00.4586.99845.86.06223.0222.018.7394.632.9433.4
40.069050.02.180.00.4587.14754.26.06223.0222.018.7396.905.3336.2
\n", + "
" + ], + "text/plain": [ + " CRIM ZN INDUS CHAS NOX ... TAX PTRATIO B LSTAT PRICE\n", + "0 0.00632 18.0 2.31 0.0 0.538 ... 296.0 15.3 396.90 4.98 24.0\n", + "1 0.02731 0.0 7.07 0.0 0.469 ... 242.0 17.8 396.90 9.14 21.6\n", + "2 0.02729 0.0 7.07 0.0 0.469 ... 242.0 17.8 392.83 4.03 34.7\n", + "3 0.03237 0.0 2.18 0.0 0.458 ... 222.0 18.7 394.63 2.94 33.4\n", + "4 0.06905 0.0 2.18 0.0 0.458 ... 222.0 18.7 396.90 5.33 36.2\n", + "\n", + "[5 rows x 14 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 5 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "r2qrTxo4ljZp", + "colab_type": "text" + }, + "source": [ + "### Split Train from Test\n", + "- For basic Linear Regression, we will predict `PRICE` (Median value of owner-occupied homes) based on `TAX` (full-value property-tax rate per $10,000)\n", + " - Go ahead and trim data to just these columns" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "spaDB10E3okF", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# simple linear regression X and Y\n", + "X = bos['TAX']\n", + "Y = bos['PRICE']" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4TKLv8FjIBuI", + "colab_type": "text" + }, + "source": [ + "We can now set training and testing sets for our model\n", + "- Use `cuML`'s `train_test_split` to do this\n", + " - Train on 70% of data\n", + " - Test on 30% of data" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "1DC6FHsNIKH_", + "colab_type": "code", + "outputId": "4c932268-7a82-4ac3-c7b9-9966ffc2b12e", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 86 + } + }, + "source": [ + "from cuml.preprocessing.model_selection import train_test_split\n", + "\n", + "# train/test split (70:30)\n", + "sX_train, sX_test, sY_train, sY_test = train_test_split(X, Y, train_size = 0.7)\n", + "\n", + "# see what it looks like\n", + "print(sX_train.shape)\n", + "print(sX_test.shape)\n", + 
"print(sY_train.shape)\n", + "print(sY_test.shape)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "(354,)\n", + "(152,)\n", + "(354,)\n", + "(152,)\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZLVg44gAmJG7", + "colab_type": "text" + }, + "source": [ + "### Predict Values\n", + "1. fit the model with `TAX` (*sX_train*) and corresponding `PRICE` (*sY_train*) values \n", + " - so it can build an understanding of their relationship \n", + "2. predict `PRICE` (*sY_pred*) for a test set of `TAX` (*sX_test*) values\n", + " - and compare `PRICE` predictions to actual median house (*sY_test*) values\n", + " - use `sklearn`'s `mean_squared_error` to do this" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ZGMPloJxGtK3", + "colab_type": "code", + "outputId": "664b54fe-16d5-4140-a657-3dc782574da9", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "source": [ + "from cuml import LinearRegression\n", + "from sklearn.metrics import mean_squared_error\n", + "\n", + "# call Linear Regression model\n", + "slr = LinearRegression()\n", + "\n", + "# train the model\n", + "slr.fit(sX_train, sY_train)\n", + "\n", + "# make predictions for test X values\n", + "sY_pred = slr.predict(sX_test)\n", + "\n", + "# calculate error\n", + "mse = mean_squared_error(sY_test, sY_pred)\n", + "print(mse)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "54.32312606491228\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T7BXjkPSGwqd", + "colab_type": "text" + }, + "source": [ + "3. 
visualize prediction accuracy with `matplotlib`" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "pp9RNPt_Iemk", + "colab_type": "code", + "outputId": "22a22472-50ad-4bb3-d104-35e9e100b8b6", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 305 + } + }, + "source": [ + "import cupy\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# scatter actual and predicted results\n", + "plt.scatter(sY_test, sY_pred)\n", + "\n", + "# label graph\n", + "plt.xlabel(\"Actual Prices: $Y_i$\")\n", + "plt.ylabel(\"Predicted prices: $\\hat{Y}_i$\")\n", + "plt.title(\"Prices vs Predicted prices: $Y_i$ vs $\\hat{Y}_i$\")\n", + "\n", + "plt.show()" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYoAAAEgCAYAAACq+TSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3Xu4XHV97/H3J5sNbi5lQwkom8Qo\nWiiIEAiCgqeAF1REU7yVI5ba1jxaTytesGCtBCsVjULxVJ82p1DFogeRkNLjBaliFRVoQoIhBGqV\n6wa5B4JsYCf5nj9mzWb2ZGbNzJo1M2tmPq/nyZOZNWvW+s6a2eu71u+qiMDMzKyeOb0OwMzMis2J\nwszMUjlRmJlZKicKMzNL5URhZmapnCjMzCyVE4WZmaVyojCzgSXpDZLe0Os4+p3c4c7MBpGkPYDv\nJU9fExEP9zKefuZEYWYDSdIXgSuAEeBNEfH+HofUt5wozMwslesozMwslROFIWm9pGN6HUdRSPqy\npE8lj7tybCr3mdP2/J1abpwoBpCkOyRNSXpC0v3JSWjneutHxIER8cMuhtiWVj9fO5o9NklMr+5E\nDFn08juVtJukTdWJStJXJa2QpEHc9yBzohhcJ0bEzsChwCLg49UrSNqu61HlZ9A/XyZF+MwR8Siw\nHDitvEzSXwO/C5wSHawY7eW+B5kTxYCLiEngO8BLYObK9y8l/Rz4jaTtKq+GJc1LrrwelPSwpL8v\nb0vS3pIuT167XdJfVLz2l5Imk6u52yS9qjqWZJ1vVi27QNIXmt1Gxs9XN+7kPQsl3Zjs91LgORWv\nzbpTqHV8JH0VmA/8W3KX89EmjlfdfdaSxHGmpFskPSrpnyU9J+UzN4y7UYxZv5PEecDxkl4o6W3A\nEkotj55M+Yx5/T5a3rc1EBH+N2D/gDuAVyeP5wHrgb+peG1tsnyscn1KzQhvAs4HdqJ08jo6WWcO\nsBr4BLA98ELgV8DxwH7A3cDeyboLgH1rxPV84Elgl+T5CHAfcGSz22j186XFnay/PXAn8EFgFHgr\nMA18qsa+0o7PzHpNHK/UfaZ85puTz7U78JOqGGt+p2lxN3Fs6n4nwJeALzX4HV4EfBd4EDi0id9t\nLr+PLPv2vwbfTa8D8L8OfKmlk8QTwMbkhPSlqhPIH9dY/9XAy5M/rO
1qbPMI4K6qZWcC/wy8CHgg\n2cZog9iuBf4wefwa4JfJ41a20fTnS4s7efw/gHtJmoony35K7USRdnxm1mvieKXuM+Uzv7fi+Rsq\njl3d7zQt7iaOTdPfSZ2YXwIE8Paq5f8LeHGnfh9Z9+1/9f/1vDzTOmZxRPx7ndfurrN8HnBnRGyu\n8drzgb0lbaxYNgL8OCL+W9JpwFLgQElXAR+KiHtrbOdrwMnAxcD/TJ7T4jZa+Xx1404e7w1MRnIW\nSdxZZ7tpx6da2n5b2Welys91Z7KdWq9Vqxd36rHJ8J1U2x54GlhRuTAi/r726kB+v48s+7Y6XEcx\nnOpV6N0NzK9TIXo3cHtEjFf82yUi3gAQEV+LiKMpnXwC+EydfVwGHCNpH+D3SU4ELW6jkcrPlxo3\npaKNiarWMPPrbDft+FQf07T9trLPSvOq1q88SaZV0taLu9Gxafc7ORi4uTpBSfphynvy+n1k2bfV\n4URhlW6gdBI7V9JOkp4j6aiK1zYlFYpjkkYkvUTS4ZL2k3ScpB2Ap4ApYGutHUTEg8APKRXB3B4R\nGwBa2UaGz1Qz7uT1nwGbgb+QNCrpJOBlKduqd3zup1TG38x+W9lnpfdL2kfS7sBfAZe2cAxqxZ16\nbHL4Tg6hVHcyQ6Xxlx6o94Ycfx8t79vqc6KwGRGxBTiRUnnwXcA9wDsqXnsjpT/A24GHgH8CdgV2\nAM5Nlv0a2JNSWXc9X6NU1vy1imWtbqOVz1QvbiLiGeAk4I+ARyh93hUp26p5fIBPAx+XtFHSR9L2\n28o+q3yN0iB3vwJ+CTTVQa9e3I2ODSnfiaR/kPQPDXZ9MFUna+ClwLoG78vj95F131aDx3oy6wOS\n7gD+NKVepi8k9Qx3RMTKYdp3v/MdhZl100HAz4dw333NrZ7MrGsi4k+Gcd/9zkVPZmaWykVPZmaW\nyonCzMxSDUQdxR577BELFizodRhmZn1l9erVD0XE3EbrDUSiWLBgAatWrep1GGZmfUVSM0PHuOjJ\nzMzSOVGYmVkqJwozM0vVs0SRzLh1TTJj13pJH0iWHyLpOklrJa2S1MxgaWZm1iG9rMzeDHw4Im6U\ntAuwWtLVwGeBsyPiO5LekDw/podxmpkNtZ4lioi4j9LQx0TEJkkbgAlK48z/VrLarswec9/63Mo1\nkyy76jbu3TjF3uNjnH78fixeOJF5PTPrvEI0j5W0AFgIXA+cBlwl6XOUisZe0bvILE8r10xy5op1\nTE1vAWBy4xRnriiN+lyZBJpdz8y6o+eJQtLOwOXAaRHxuKRPAR+MiMslvR24kNLY9NXvWwIsAZg/\nv5nJwazXll1128zJv2xqegvLrrptVgJodj0rhrS7v1qvAS3fLfoOs7d6migkjVJKEpdERHnillOB\nDySPL6M0kco2ImI5sBxg0aJFHtmwD9y7caqp5c2uZ72XdvcHbPPa6ZfdBILpLbHN+vVO/L7D7L1e\ntnoSpbuFDRFxXsVL9wK/lzw+DvhFt2Ozzth7fKyp5c2uZ72XdvdX67XprTGTJKrXz7IP645e9qM4\nCngXcFzSFHZt0srpPcDnJd0E/C1J8ZL1v9OP34+x0ZFZy8ZGR2aKI1pdz3ov7e6vlTvAtHV9h9l7\nvWz1dC2gOi8f1s1YrDvKxQSNypqbXc96b+/xMSZrnLDLd3+1Xqu3naz7sM7reWW2DZfFCyeaOuE3\nu5711unH7zer/gBm3/1VvzY6R7PqKKrXz7IP6zwnCjPLrJm7v3ZbPfkOs/cGYirURYsWhYcZNzNr\njaTVEbGo0Xq+ozCzprk/w3ByojCzprg/w/ByojDrkiJdjWeJxT3mh5cThXVdkU6Y3YqpSFfjWWOp\n19TV/RkGnycusq4qn6QmN04RPHuSWrlmcqBjKlLv4iyxrFwzWbfTk/szDD4nCuuqIp0wy7oRU5F6\nF2eJZdlVt1GrfaTA/RmGgBOFdV
WRTpiN9p1nTEUavypLLPWOReCK7GHgRGFdVaQTZqN95xlTkcav\nyhJLvWMx4WKnoeBEYV1VpBNmWTdiWrxwgk+fdBAT42OI0gn20ycd1JOr8SyxFPF7s+5xz2zrumFs\n9TQIfIwGT7M9s50orLB6eWLySdHHYBh4CA/ra73sd5D3vqtPuMfuP5drbn2w0CfgPI9BPyacfoy5\nk3xHYYV01Lk/qNnBa2J8jJ+ccVzb2087EbS670ZzRlcPkV1tbHSkZ/UV9eR1/Gt9/iJ+3kr9GHNW\nzd5RuDLbCqmTTVYbdbBrZd+NtlWrj0a1XvcjqSWv41/EfjON9GPMneZEYYXUySarjU4Erey70baa\nPbEWbRiMvI5/EfvNNNKPMXeaE4UVUiebYzY6EbSy70bbavbEWrRhMPI6/kXsN9NIP8bcaU4UVkid\n7HfQ6ETQyr4bbavWCbdaEfsj5HX8+7H/RT/G3GmuzLahk2dlZTPb6sdWT3nqxxZE/RhzFu5HYZYi\nzxNBN08qw3ICs+5wojAbMLXuXkRpYL4JJw3LoPDNYyXNk3SNpFskrZf0gYrX/lzSrcnyz/YqRrMi\nqdXCqnyZV4R5PWxw9bJn9mbgwxFxo6RdgNWSrgb2At4MHBwRT0vas4cxWh8btGKaRs0zPS2pdUrP\nEkVE3AfclzzeJGkDMAG8Bzg3Ip5OXnugVzEW1aCdADuhSFOP5mXv8bG605GWDXNbf+ucQoz1JGkB\nsBC4HlgGvFLSOcBTwEci4j97F12xDOIJsBPqdYQ77dK1LL1yPRJsfHK6bqLtRDJud5unH79fw+FA\nhrmtf158IbatnicKSTsDlwOnRcTjkrYDdgeOBA4HviHphVFV6y5pCbAEYP78+V2OunfSegIP+4+5\nUtqV9cap6ZnH5US76s5HZpqsju84yhNPbWZ6a8xaB7In4zwSfHm9ZVfdxuTGqZmK7LJhb+ufB1+I\n1dbTDneSRikliUsiYkWy+B5gRZTcAGwF9qh+b0Qsj4hFEbFo7ty53Qu6xzy8QHNaubKemt7CJdfd\nNTNe06NPTs8kicp12hnrZ+mV63MZP2jxwgl+csZx3HHuCZz/jkMKMRHSIPE4T7X17I5CkoALgQ0R\ncV7FSyuBY4FrJP0OsD3wUA9CLKR65dSdLHLo9K14J7bfTDFNpWYaiWdNxivXTM66i8ljm1BKGvWO\nk4tPsvGFWG29LHo6CngXsE7S2mTZx4CLgIsk3Qw8A5xaXew0zGqdADtZ5NDpW/G07QOZT3bVxTR5\nyJqM065GO5HgXXySXS8uxPqBO9z1oW5eLXZ6Xoh62x8fG+XpzVs7NsxGq9qZj+AFZ3wr9Y4l785y\nnf7OBtkwzUUBnuFuoKUVOeSt07fi9bZTq6imUaV9vQRaeXdxb1IP0cjoiNhp++14bKp+y6hmNWrW\nmvcVv4tPsqv+rbjYrsSJwlJ1+la8mb4Bleqd7BoVYZ39b+t59Mna9QS17LT9dix904G5nCCaqS/J\ns+Wai0/a080LsX7hYcYtVaeHXK63/d12HK25fr2TXb3WKkuvXM/p37yppSQBpTuavIbEqB6yu568\nrvg9TLblzYnCUnVyXoh623/LYRPUqjpLO9mlFWFNb8lWD5dns8hys9bbzz2BiQ5PjNPp78yGjyuz\nrS15V6zXq3jebcdRzjpxdlFQ5b7nSGzpwG9ZwO3nnpDrNoetwtSKy5XZ1nGdaIZZqwgJYMftt9sm\nSZx+2U0zHeNqJYnROaXksbWJ/FHdy7msE+X6rjC1fuNEMQTKV96TG6cYSa68JxrMtNbMnULaeErL\nrrqt6ZNfZXz1VBctLb1y/Ta9p4GZOoBdx0b5zTOb2bq14e6B2kmik+X6eVeY9kMHu36IMW+d/Mzd\nPJ4uehpwrfQhKF9V71Y11hE8WzQCzTczrVWcUmta0MtXTzaMr7oPwIIzvlV33TvOPaFuX4JmjY+N
\nZmr11IuTYT8UZfVDjHnr5GfOa9uFn7jIuqNeUU4t5RN/vbGOzv639Zy5Yt3MmEiNVFcGl3/c5fdP\nbpzikuvuahhfliv7dlsQ7bTDdpmSRPXn68ZkQv0wPlE/xJi3Tn7mbh9PJ4oBl2cnq0efnG65d3Pl\n/tNmaEtT6yqpXvPZOSqdsNutW8hy3Hp1MuyHDnb9EGPeOvmZu308nSgGXK87WVXuP8uPeGJ8rOaV\n/VknHsjoyLa9ErYGnLliHcfuP3ebvgStyHLcenUyrBdrr7/7Sv0QY946+Zm7fTydKAZcrc5XWYyN\njjA+VvsqfmJ8jL97xyENO3nV+xHX64SWVuS0eOEEy956MCPa9t1T01u45tYHt+lLcMqR82eej4+N\n1kw0jfabplcnw37oYNcPMeatk5+528fTrZ4GXPUoqrVaPdWaBKfWWEdA3ZFrm2nyWW/k27ccNjET\nR2V8jSqCFy+c4IOXrq352r0bpxq2LKrXGixrBXS3R/Yt64fmtv0QY946+Zm7fTzd6smA5lvrtNuq\nJ+9WQUUbKXUYm4Ba/2q21ZMTheWq2yfKXjW77Hb7eMjn6jHvuyjrb04U1nWDeNKut79uto8fnSMQ\ns8asyrK/tD41g96nwWpzorCuq1cMNCKxNaIQRVV56GRxVysdBVvdX6Nte2Kj4dO1sZ4kfTLZzlpg\nbUT8V7vbtP5UrxloeRymWmNBpSWCTowllUfi6UX7+Dz212j9Qe7TYO1pqXmspFOql0XEJ4ALgMeA\n35f0f3KKzfpMM81AKzugNerJnHcHtrx6TveifXwe+2u0/iD3abD2tNqP4l2SLpA0qwFvRNwfEVdF\nxGci4j05xmd9pNk+G+Ur10aJIO8r97wST7fbx4/O0TZ9PrLsL+37GfQ+Ddae1EQh6UBJl1Qsej0w\nBfxA0tyORmZ9p3rCnFqd4eDZK9dGiaDZK/eVayY56twf8IIzvsVR5/6g7h1CXomnkxMD1dr2srcd\nzLK3Htz2/iq3Dc9+P57YyBppVEfx78DLy08iYitwhqSTgB9LOo9S3cTNEfFk58K0flHZya1e66Dy\nlWujuZ2b6cDWSj1GnnNJd3Je5XrbzisROSFYqxoVPb0WOKdygaQ3An8KPAMcCnwOuFvSf3ckQutb\nja68GxXhNHPl3kpx0jAOI2GWh9Q7iohYB7yz/FzS7cAtwPkRcXXlupL2aWXHkuYBFwN7URo9YnlE\nXFDx+ocpJaG5EfFQK9u27kprSZR2BdvMMASNroBbKU4axmEkzPLQavPY10fErbVeiIh7WtzWZuDD\nEXGjpF2A1ZKujohbkiTyWuCuFrdpXdZuE9Z2i0JaLU5y0YtZ61pq9VQvSWQREfdFxI3J403ABqD8\nF3w+8FGam67AeqjXE9LUayX05DObG1ZuF0WzlfFmvVKI0WMlLQAWAtdLejMwGRE3qU6rGSuOXk9I\nU12cVJ4r+9Enp4F8Oul1Uic6FZrlrefzUUjaGbgcOI1ScdTHgE808b4lklZJWvXggw92OEqrpwgT\n0ixeOMFPzjiO2889gZ122G7WmEhQ7Ck3e31HZtaMniYKSaOUksQlEbEC2Bd4AXCTpDuAfYAbJT23\n+r0RsTwiFkXEorlz3aWjV4rWkqjXdzit6rd4bThlThTVJ+9aJ/MG7xdwIbAhIs6DUiuriNgzIhZE\nxALgHuDQiPh11jitszrZ+SyLItzhtKLf4rXh1E4dxYXACSnPGzkKeBewTlJ5mrKPRcS324jJeqBI\nLYl6NctcVv0Wrw2nzIkiIk5Ie97E+6+l/nTJ5XUWtB6ZDbN+6yvRb/HacMo0H4WktwHfjYhNkj5O\nqYf230TEmrwDbIbnozAza12z81FkraP46yRJHA28mlKx0z9k3JaZmRVY1kRRLlA9gdLQG98Cts8n\nJDMzK5KsiWJS0j8CfwB8W9IObWzLzMwKLOvJ/e3AVcBrI2Ij
sDtwem5RmZlZYWRt9TQF7AScDHwS\nGAU25hWU9ad25qPOYy5rM+uMrHcUXwKOpJQoADYBX8wlIutL7cxHnddc1mbWGVkTxRER8X7gKYCI\neBRXZg+1dsYs8nhHZsWWtehpWtIIyTDgyfzZW3OLyvpOO2MW1VtncuMUK9dMDmUR1CAU4xUlDmtf\n1kTxBeAKYE9J5wBvBT6eW1TWd9qZj7ree4GhHHK7naHHOzFseZYTvodPHyyZEkVEXCJpNfAqSsNw\nLI6IDblGZn3l2P3ncsl1d82aaaqZMYtWrpnkyWc21329XARVfXJp92q18v3jO44SAY9NTbd15ZvX\nFXRaUVyj7bXz3lqynvDzjsN6q52xnm4FcpvxzvrXyjWTXL56claSEPCWw9IHC6w+CdVTXTTV7tVq\n9fvLkxxl2VZeMVXqRDFe1mHLs57wPXz6YMlUmS3pK5LGK57vJumi/MKyflLrZBLANbemTyhV6321\nVBdftVv53Wi/WSrS86yQb2fo8byHLc96wvfw6YMla6unlyYd7YCZVk8L8wnJ+k0zJ5Na80I3c3VZ\nq/iq3avVdq7MW10/yxV0O5NB5T2RVNYTftEmtLL2ZE0UcyTtVn4iaXcKMv+2dV+jk0m9fhLjO47W\nfN+IlDoJUrtXq+1cmbe6fpYr6HYmg8p7IqmsJ/yiTWhl7cl6cv888DNJlyXP3wack09I1m8aTb5T\nr1hmh+3mMDY6ss37Gp1Q2p3sp9b7KylZpxV5T0DUzmRQeU4k1c58GUWa0Mrak7XV08VJq6djk0Un\nRcQt+YVl/aTRyaRe8ctjU9Oc/45DWj4JtTvZT3m90y5dW/P1oPUK6EGegMgnfMs0cVHReOKiYjvq\n3B/U7CcxMT7GT844rgcRlRQ1LrNu6cjERZKuTf7fJOnxin+bJD2eNVgbbEWt2CxqXGZF01LRU0Qc\nLUnAgRFxV4disgFT1GKZosZlVjRZ58xeFxEHdSCeTFz0ZGbWumaLnrK2erpR0uER8Z8Z329Dot6w\nFh4wzqx/ZE0URwCnSLoD+A2lFoURES/NKzDrf/WGtVh15yNcvnrSA8aZ9YmsieL4dncsaR5wMbAX\npRaJyyPiAknLgBOBZ4BfAu+u7AVu2ZWv4ic3TpUye7J8tx1HOevEA9s6SVffIRy7/1y+fv3dbKkq\n2pya3lJ3uQeMMyumrD2z7wfeApwPnAeclCxrxWbgwxFxAKXZ8t4v6QDgauAlyd3JfwFnZozRKlT2\njgZmDeD36JPTnP7NmzLPKFer5/W/XHfXNsmgrN7yekONm1lvZU0UFwMHAv8b+HvgAOCrrWwgIu6L\niBuTx5uADcBERHwvIsrjTl8H7JMxRqvQaCC86S2ReUa5Zgf3a2REansbZpa/rEVPL0nuBMqukZS5\nZ7akBZQGFby+6qU/Bi6t854lwBKA+fPnZ9310OjEQHhZ3lc9ZEelencaZtZbWe8obpR0ZPmJpCOA\nTO1TJe0MXA6cFhGPVyz/K0rFU5fUel9ELI+IRRGxaO7cuVl2PVQ6MRBeq+8bkWYGiqul3nIz662s\nieIw4KeS7khaPv0MOFzSOkk/b3YjkkYpJYlLImJFxfI/At4IvDMGYYyRAqjVC7nS6Igy90hutG0o\n3Ul8/u0Hs3jhhHtEm/WZrEVPr2t3x0kP7wuBDRFxXsXy1wEfBX4vIp5sdz9WUtkLOe9WT7V6OB+7\n/1yuufXBmv0k3CParL/0bFBASUcDPwbWAVuTxR8DvgDsADycLLsuIt6bti33zDYza12ne2a3LSKu\npdRRr9q3ux2LmZnVl7WOwszMhoQThZmZpWqp6EnSh9Jer6yUNjOzwdBqHcUuyf/7AYcDVybPTwRu\nyCsoMzMrjlYnLjobQNKPgEOToTeQtBT4Vu7RmZlZz2Wto9iL0uiuZc8ky8zMbMBkbR57MXCDpCuS\n54uBr+QTkpmZFUmmRBER
50j6DvDKZNG7I2JNfmGZmVlRZCp6SobfOADYNSIuAB6W9LJcIzMzs0LI\nWkfxJeDlwMnJ803AF3OJyMzMCiXznNkRcaikNQAR8aik7XOMy8zMCiLrHcW0pBGSAUglzeXZgf3M\nzGyAZE0UXwCuAPaUdA5wLfDp3KIyM7PCyNrq6RJJq4FXURoBdnFEbMg1MjMzK4RMiULSZyLiL4Fb\naywzM7MBkrXo6TU1lr2+nUDMzKyYWh099n3AnwH7Vs2NvQvw0zwDMzOzYmi16OlrwHcoVVyfUbF8\nU0Q8kltUZmZWGC0VPUXEYxFxB6VBAB+LiDsj4k4gJF3UiQDNzKy3stZRvDQiNpafRMSjwMJ8QjIz\nsyLJmijmSNqt/ETS7mTv5W1mZgWW9eT+eeBnki5Lnr8NOCefkMzMrEiydri7OOlwd2yy6KSIuCW/\nsMzMrCgyFxdFxHpgfdb3S5pHaQKkvSiNGbU8Ii5IirEuBRYAdwBvT+pAzMysB1qqo5B0bfL/JkmP\nV/zbJOnxFve9GfhwRBwAHAm8X9IBlJrdfj8iXgx8n9nNcM3MrMtauqOIiKOT/3dpd8cRcR9wX/J4\nk6QNwATwZuCYZLWvAD8EPDSImVmPtNoz+0Npr0fEeVmCkLSAUvPa64G9kiQC8GtKRVO13rMEWAIw\nf/78LLs1M7MmtNo8dpfk3yLgfZTuACaA9wKHZglA0s7A5cBpETGr+CoigmTOi2oRsTwiFkXEorlz\n52bZtZmZNaHVoqezAST9CDg0IjYlz5cC32p155JGKSWJSyJiRbL4fknPi4j7JD0PeKDV7ZqZWX6y\ndrjbi9IwHmXPUKeIqB5JAi4ENlQVWV0JnJo8PhX414wxmplZDrI2j70YuEHSFcnzxZQqnltxFPAu\nYJ2ktcmyjwHnAt+Q9CfAncDbM8ZoZmY5yNrh7hxJ3wFemSx6d0SsaXEb11KaHa+WV2WJy8zM8pep\n6CkpNjoA2DUiLgAelvSyXCMzM7NCyFpH8SXg5cDJyfNNwBdzicjMzAolax3FERFxqKQ1UBpmXNL2\nOcZlZmYFkfWOYlrSCEkfB0lzga25RWVmZoWRNVF8AbgC2FPSOcC1wN/mFpWZmRVGy0VPSUX2j4DV\nlFonCVgcERtyjs3MzAqg5UQRESHp2xFxEHBrB2IyM7MCyVr0dKOkw3ONxMzMCilzqyfgFEl3AL+h\nVPwUEfHSvAIzM7NiyJoojs81CjMzK6xW56N4DqUhxV8ErAMujIjNnQjMzMyKodU6iq9QmotiHfB6\n4PO5R2RmZoXSatHTAUlrJyRdCNyQf0hmZlYkrd5RTJcfuMjJzGw4tHpHcbCk8nSlAsaS5+VWT7+V\na3RmZtZzrU6FOtKpQMzMrJiydrgzM7Mh4URhZmapnCjMzCyVE4WZmaVyojAzs1ROFGZmlsqJwszM\nUvU0UUi6SNIDkm6uWHaIpOskrZW0StLLehmjmdmw6/UdxZeB11Ut+yxwdkQcAnwieW5mZj3S00QR\nET8CHqleDJSHAtkVuLerQZmZ2SxZJy7qpNOAqyR9jlIie0WtlSQtAZYAzJ8/v3vRmZkNmV4XPdXy\nPuCDETEP+CBwYa2VImJ5RCyKiEVz587taoBmZsOkiIniVGBF8vgywJXZZmY9VMREcS/we8nj44Bf\n9DAWM7Oh19M6CklfB44B9pB0D3AW8B7gAknbAU+R1EOYmVlv9DRRRMTJdV46rKuBmJlZXUUsejIz\nswJxojAzs1ROFGZmlsqJwszMUjlRmJlZKicKMzNL5URhZmapnCjMzCyVE4WZmaVyojAzs1ROFGZm\nlsqJwszMUjlRmJlZqiJOhdoVK9dMsuyq27h34xR7j49x+vH7Acxaduz+c7nm1gdnrbN44cTMeyc3\nTs1sb0Ti5CPm8anFB/Hxlev4+vV3syVi5vWJqvcvvXI9G6emAdhtx1HOOvHAbbY9R7A12Y
QoTSY+\nPjaKBI8+Oc2IxJaImf/HRufw9OatbI1SPEe+cDfueHhqm89Yve8DnrcLP/3VI1SEy/jYKEvfVIoJ\nqPmZKh217+5c8p6XZzrW4zuOEgEbp7b9TBM1vod630va/ipfb0ZRtmFWTzd/X4o6f/j9ZNGiRbFq\n1aqm11+5ZpIzV6xjanrLzLLkIdRbAAAKXUlEQVTREUHA9Nb6x2NsdIS3HDbB5asnZ7230ov33Ilf\nPPCb1PdfesPd2+xndES84/B5qdtu1+ic0sk35SNus/6ytx3Mqjsf4V+uu6vh+rWSRdZj3Yqx0RE+\nfdJBM4m2en+VrzejKNswqyev35ek1RGxqNF6Q1n0tOyq27Y5GU9viYYnrqnpLXz9+rtTT+T1kkTl\n+2vtZ3pLNNx2u6a3Np8kyusvu+o2vn793U2t/5NfPrLNsqzHuhVT01tYdtVtdfdX+XozirINs3q6\n/fsayqKneyuKjFpVr+glj/e3u+1OuHfjFO1E1c6xzrKfevtrJY6ibMOsnm7/vobyjmLv8bHM7x2R\n2tp32vvb3XYn7D0+1lZc7RzrLPupt79W4ijKNszq6fbvaygTxenH78fY6MisZaMjYnRO+glxbHSE\nk4+Yt817K714z50avr/WfkZH1HDb7RqdIxp8xG3WP/34/Tj5iHlNrX/UvrtvsyzrsW7F2OjITAV5\nrf1Vvt6MomzDrJ5u/75Gli5d2pENd9Py5cuXLlmypOn193/eb7HPbmOsm3yMJ57azMT4GEvfdCCv\nPfC5s5a9+ZC9efiJZ2aef+LEA/izY180895NT22e2eaIxDuPnM+X330EDz3xNOsnH59VZFP5/vm7\n78h1v3qYpzZvBUotj875/YO22facUp0vUGr1BKXWSGPbj/DU9FZGJCLZdwBjo3PYGjGz7BX77s7W\nYNZnPP7A526z78Pmj3NP1S3r+Ngof5tUjB23/141P1Oleq2emjnWu+04ynO2G+Gpzdt+plrfQ63v\npVyBV2t/la9n/X30Yhtm9eT1+zr77LPvW7p06fJG6w1lqyczM3OrJzMzy4kThZmZpeppopB0kaQH\nJN1ctfzPJd0qab2kz/YqPjMz6/0dxZeB11UukHQs8Gbg4Ig4EPhcD+IyM7NETxNFRPwIqO7O+z7g\n3Ih4Olnnga4HZmZmM3p9R1HL7wCvlHS9pP+QdHitlSQtkbRK0qoHH3ywyyGamQ2PIiaK7YDdgSOB\n04FvSNt2DY6I5RGxKCIWzZ07t9sxmpkNjSIminuAFVFyA7AV2KPHMZmZDa0iJoqVwLEAkn4H2B54\nqKcRmZkNsZ6OHivp68AxwB6S7gHOAi4CLkqazD4DnBqD0H28hzyBjpm1o6eJIiJOrvPSKV0NZIBV\nT3AyuXGKM1esA3CyMLOmFLHoyXLkCXTMrF1OFAPOE+iYWbucKAacJ9Axs3Y5UQw4T6BjZu0ayjmz\nh0m5wtqtnswsKyeKIbB44YQTg5ll5qInMzNL5URhZmapnCjMzCyVE4WZmaVyojAzs1QahPH2JD0I\n3NnrONq0Bx4lt5KPx2w+Hs/ysZitnePx/IhoOKHPQCSKQSBpVUQs6nUcReHjMZuPx7N8LGbrxvFw\n0ZOZmaVyojAzs1ROFMWxvNcBFIyPx2w+Hs/ysZit48fDdRRmZpbKdxRmZpbKicLMzFI5UfSApIsk\nPSDp5oplu0u6WtIvkv9362WM3SJpnqRrJN0iab2kDyTLh/V4PEfSDZJuSo7H2cnyF0i6XtJ/S7pU\n0va9jrVbJI1IWiPp/yXPh/lY3CFpnaS1klYlyzr+t+JE0RtfBl5XtewM4PsR8WLg+8nzYbAZ+HBE\nHAAcCbxf0gEM7/F4GjguIg4GDgFeJ+lI4DPA+RHxIuBR4E96GGO3fQDYUPF8mI8FwLERcUhF34mO\n/604UfRARPwIeKRq8ZuBrySPvwIs7mpQPRIR90XEjc
njTZROCBMM7/GIiHgieTqa/AvgOOCbyfKh\nOR6S9gFOAP4peS6G9Fik6PjfihNFcewVEfclj38N7NXLYHpB0gJgIXA9Q3w8kqKWtcADwNXAL4GN\nEbE5WeUeSsl0GPwd8FFga/L8txneYwGli4bvSVotaUmyrON/K57hroAiIiQNVbtlSTsDlwOnRcTj\npQvHkmE7HhGxBThE0jhwBbB/j0PqCUlvBB6IiNWSjul1PAVxdERMStoTuFrSrZUvdupvxXcUxXG/\npOcBJP8/0ON4ukbSKKUkcUlErEgWD+3xKIuIjcA1wMuBcUnlC7t9gMmeBdY9RwFvknQH8H8pFTld\nwHAeCwAiYjL5/wFKFxEvowt/K04UxXElcGry+FTgX3sYS9ckZc4XAhsi4ryKl4b1eMxN7iSQNAa8\nhlK9zTXAW5PVhuJ4RMSZEbFPRCwA/gD4QUS8kyE8FgCSdpK0S/kx8FrgZrrwt+Ke2T0g6evAMZSG\nB74fOAtYCXwDmE9pyPS3R0R1hffAkXQ08GNgHc+WQ3+MUj3FMB6Pl1KqkByhdCH3jYj4pKQXUrqq\n3h1YA5wSEU/3LtLuSoqePhIRbxzWY5F87iuSp9sBX4uIcyT9Nh3+W3GiMDOzVC56MjOzVE4UZmaW\nyonCzMxSOVGYmVkqJwozM0vlRGFmZqmcKGxgSFosKSSlDnkhaVzSn7W5ryfqLN+SDAF9s6TLJO1Y\nZ72ftrP/Zkn63WRo6jnJ8xFJ35P0h93Yvw0GJwobJCcD1yb/pxkH2koUKaaSIaBfAjwDvLfyRZXM\niYhXdGj/s0TEBko9u9+YLDoHuC0iLu7G/m0wOFHYQEgGFTya0twEf1Cx/A8l/TyZCOiryeJzgX2T\nK/9lkhZUTSL1EUlLk8crk5E611eM1tmsHwMvSrZ/m6SLKQ25MK/yjqROjEg6JZnEaK2kf0zuBnaS\n9K1k3ZslvaOJOM4H3ifpLZTGT/pQi5/DhpxHj7VB8WbguxHxX5IelnQY8BTwceAVEfGQpN2Tdc8A\nXhIRh8DM8Ob1/HFEPJKMu/Sfki6PiIcbBZMMWvd64LvJohcDp0bEdcnr5fUOrBWjpN8F3gEcFRHT\nkr4EvBP4DXBvRJyQrLdr8v+3gT+NiHurY4mI70n6PPBp4H9ExHSj+M0q+Y7CBsXJlMb/Ifn/ZEqj\njV4WEQ8BZBz/5i8k3QRcB8yjdMJPM5bMJbEKuIvSgIcAd5aTRJV6Mb4KOIxSclqbPH8hpTGxXiPp\nM5JeGRGPJe97Q60kUeGnwHkR8evyAkl/0+CzmAG+o7ABkFyFHwcclIzFP0JpgpdlTW5iM7Mvmp6T\nbPcY4NXAyyPiSUk/LL+WYqp8p1IRH5TuBFoh4CsRceY2L0iHAm8APiXp+xHxySa2dwDwzxXbeC6l\n2fPMGvIdhQ2CtwJfjYjnR8SCiJgH3A78HHhbMromFUVPm4BdKt5/P7CnpN+WtAPPVvzuCjyaJIn9\nKc3pnbcf1Inx+8BbkwlqkLS7pOdL2ht4MiL+hVIiPLTJ/RxIqX6k7BBgbR4fwAafE4UNgpN5dvjl\nssspVWqfA/xHUnx0HkBSx/CTpDJ4WVJm/0ngBkpTj5ZnDfsusJ2kDZQqwGsVHbUlItbXifEWSnUX\n35P08ySu5wEHATckxVFnAZ+CUh1FkkS2IWkepelDK5v0OlFY0zzMuNkQknQh8J6I2NpwZRt6ThRm\nZpbKRU9mZpbKicLMzFI5UZiZWSonCjMzS+VEYWZmqZwozMwslROFmZmlcqIwM7NUThRmZpbq/wNP\ni6cKUGWQlQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [] + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MqX73B4s5tv", + "colab_type": "text" + }, + "source": [ + "## Multiple Linear Regression \n", + "- Our mean squared error for Simple Linear Regression looks kinda high..\n", + " - Let's try Multiple Linear Regression (predicting based on multiple variables rather than just `TAX`) and see if that produces more accurate predictions\n", + "\n", + "1. Set X to contain all values that are not `PRICE` from the unsplit data\n", + " - i.e. `CRIM`, `ZN`, `INDUS`, `CHAS`, `NOX`, `RM`, `AGE`, `DIS`, `RAD`, `TAX`, `PTRATIO`, `B`, `LSTAT`\n", + " - Y to still represent just 1 target value (`PRICE`)\n", + " - also from the unsplit data\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ZtQK5-f4M0Vg", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# set X to all variables except price\n", + "mX = bos.drop('PRICE', axis=1)\n", + "# and, like in the simple Linear Regression, set Y to price\n", + "mY = bos['PRICE']" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RTYG4-UwNDsK", + "colab_type": "text" + }, + "source": [ + "2. 
Split the data into `mX_train`, `mX_test`, `mY_train`, and `mY_test`\n", + " - Use `cuML`'s `train_test_split`\n", + " - And the same 70:30 train:test ratio" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "EsKxK8u_F7t8", + "colab_type": "code", + "outputId": "673a1a44-4d2f-4a45-8333-8f29782eaf65", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 86 + } + }, + "source": [ + "# train/test split (70:30)\n", + "mX_train, mX_test, mY_train, mY_test = train_test_split(mX, mY, train_size = 0.7)\n", + "\n", + "# see what it looks like\n", + "print(mX_train.shape)\n", + "print(mX_test.shape)\n", + "print(mY_train.shape)\n", + "print(mY_test.shape)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "(354, 13)\n", + "(152, 13)\n", + "(354,)\n", + "(152,)\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_Y40R17LGHsI", + "colab_type": "text" + }, + "source": [ + "3. fit the model with `mX_train` and corresponding `PRICE` (*mY_train*) values \n", + " - so it can build an understanding of their relationships \n", + "4. 
predict `PRICE` (*mY_pred*) for the test set of independent (*mX_test*) values\n", + " - and compare `PRICE` predictions to actual median house (*mY_test*) values\n", + " - use `sklearn`'s `mean_squared_error` to do this" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "N7qm1HuVO-1k", + "colab_type": "code", + "outputId": "7e291cec-e602-4ad9-a5b3-b70d7261f63d", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "source": [ + "# call Linear Regression model\n", + "mlr = LinearRegression()\n", + "\n", + "# train the model for multiple regression\n", + "mlr.fit(mX_train, mY_train)\n", + "\n", + "# make predictions for test X values\n", + "mY_pred = mlr.predict(mX_test)\n", + "\n", + "# calculate error\n", + "mmse = mean_squared_error(mY_test, mY_pred)\n", + "print(mmse)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "16.691811854229723\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jTdmleXCM_Xb", + "colab_type": "text" + }, + "source": [ + "5. 
visualize with `matplotlib`" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Q83NFMK1JKvL", + "colab_type": "code", + "outputId": "569cfa77-a66e-4b1b-9d70-ae4ef8e7936e", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 305 + } + }, + "source": [ + "# scatter actual and predicted results\n", + "plt.scatter(mY_test, mY_pred)\n", + "\n", + "# label graph\n", + "plt.xlabel(\"Actual Prices: $Y_i$\")\n", + "plt.ylabel(\"Predicted prices: $\\hat{Y}_i$\")\n", + "plt.title(\"Prices vs Predicted prices: $Y_i$ vs $\\hat{Y}_i$\")\n", + "\n", + "plt.show()" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYoAAAEgCAYAAACq+TSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3X20HXV97/H3J4cTOCBygkYqAYxP\nCyQiCeJTQ7skSqMimiKKXLXc1tbV1q4K0tRwLy1I5RJXqlivWsuqtqBYAwKRVitYE66KRZuYQIhA\nfUDQg0JUjoIc4CT53j/27GRnZ8/sPbNnP39ea5119syeM/Pbc86Z78zv4ftTRGBmZpZmTq8LYGZm\n/c2BwszMMjlQmJlZJgcKMzPL5EBhZmaZHCjMzCyTA4WZmWVyoDCzoSTpNZJe0+tyDAN5wJ2ZDRtJ\nTwVuShZPiYif97I8g86BwsyGjqSPAtcDY8DrIuKdPS7SQHOgMDOzTG6jMDOzTA4UI07SNkkv73U5\n+oWkf5b0vuR1V85N7TFL2p9/p1YqB4ohI+mHkmYkPSLpgeQi9KS07SNiUUTc3MUitiXv52tHq+cm\nKdMrO1GGInr5O5U0T9LD9YFK0qckXSdJw3jsYedAMZxOi4gnAScAJwIX1G8gab+ul6o8w/75CumH\nzxwRDwGXA+dU10n6K+B5wFujg42ivTz2sHOgGGIRMQX8O/B82H3n+x5JtwO/lrRf7d2wpCOTO6/t\nkn4u6SPVfUk6XNK1yXv3SPrzmvfeI2kquZu7W9Ir6suSbPO5unV/J+nDre6j4OdLLXfyM0skfTs5\n7lrggJr39npSaHR+JH0KOAr41+Qp5y9bOF+px2wkKcf5kr4j6SFJ/yTpgIzP3LTczcpY9HeS+CCw\nXNKzJL0ReAeVnkePZnzGsv4+ch/bWhAR/hqiL+CHwCuT10cC24C/qXlvS7J+onZ7Kt0IbwMuAw6i\ncvE6KdlmDrAJ+GtgLvAs4AfAcuBo4EfA4cm2C4FnNyjXM4BHgYOT5THgJ8BLW91H3s+XVe5k+7nA\nvcC5wDhwBjALvK/BsbLOz+7tWjhfmcfM+Mx3JJ/rUOCWujI2/J1mlbuFc5P6OwE+Bnysyd/hJ4Ev\nAduBE1r4uy3l76PIsf3VwnWl1wXwV8m/0MpF4hFgOrkgfazuAvIHDbZ/JfCy5B9rvwb7fAlwX926\n84F/Ap4DPJjsY7xJ2b4O/F7y+hTg+8nrPPto+fNllTt5/dvA/STdxJN136BxoM
g6P7u3a+F8ZR4z\n4zP/cc3ya2rOXervNKvcLZybln8nKWV+PhDAm+rW/xnw3E79fRQ9tr+yv3pep2kdsSIi/iPlvR+l\nrD8SuDcidjR47xnA4ZKma9aNAV+LiO9JOge4CFgk6Ubg3RFxf4P9fAY4C7gS+B/JMjn3kefzpZY7\neX04MBXJVSRxb8p+s85Pvazj5jlmrdrPdW+yn0bv1Usrd+a5KfA7qTcXeBy4rnZlRHyk8eZAeX8f\nRY5tGdxGMXrSGvR+BByV0iD6I+CeiJis+To4Il4DEBGfiYiTqFx8Anh/yjGuAV4u6Qjgd0kuBDn3\n0Uzt58ssN5WqjQV1vWGOStlv1vmpP6dZx81zzFpH1m1fe5HMaqRNK3ezc9Pu7+R44I76ACXp5oyf\nKevvo8ixLYMDhVV9i8pFbLWkgyQdIGlpzXsPJw2KE5LGJD1f0oskHS1pmaT9gceAGWBXowNExHbg\nZipVMPdExJ0AefZR4DM1LHfy/n8CO4A/lzQu6XTgxRn7Sjs/D1Cp42/luHmOWeudko6QdCjwv4G1\nOc5Bo3JnnpsSfieLqbSd7KZK/qUH036gxL+P3Me2bA4UBkBE7AROo1IffB/wY+DMmvdeS+Uf8B7g\nZ8A/AocA+wOrk3U/BZ5Gpa47zWeo1DV/pmZd3n3k+Uxp5SYingBOB/4n8Asqn/e6jH01PD/ApcAF\nkqYl/UXWcfMcs85nqCS5+wHwfaClAXpp5W52bsj4nUj6uKSPNzn08dRdrIEXAFub/FwZfx9Fj20p\nnOvJrM9J+iHwhxntMgMhaWf4YUSsG6VjDwM/UZhZtxwH3D6Cxx547vVkZl0REW8fxWMPA1c9mZlZ\nJlc9mZlZJgcKMzPLNBRtFE996lNj4cKFvS6GmdlA2bRp088iYn6z7YYiUCxcuJCNGzf2uhhmZgNF\nUivpY1z1ZGZm2RwozMwskwOFmZllcqAwM7NMDhRmZpZpKHo9mZn1g3Wbp1hz493cPz3D4ZMTrFx+\nNCuWLOjIfso6ViscKMzMSrBu8xTnX7eVmdmdAExNz3D+dZXM5nku4K3sp6xjtarnVU/JhCmbJf1b\nsvxMSd+U9D1JayXN7XUZzcyaWXPj3bsv3FUzsztZc+Pdpe+nrGO1queBAngXcGfN8vuByyLiOcBD\ngLM+mlnfu396Jtf6dvZT1rFa1dNAkcyNeyqVmbVI5hFeBnwu2eQKYEVvSmdm1rrDJydyrW9nP2Ud\nq1W9fqL4EPCX7Jn/9inAdM2k6D8GGla4SXqHpI2SNm7fvr3zJTUzy7By+dFMjI/ttW5ifIyVy48u\nfT9lHatVPQsUkl4LPBgRm4r8fERcHhEnRsSJ8+c3zWllZtZRK5Ys4NLTj2PB5AQCFkxOcOnpx+Vu\nXG5lP2Udq1U9m7hI0qXA24AdwAHAk4HrgeXAb0TEDkkvAy6KiOVZ+zrxxBPDSQHNzPKRtCkiTmy2\nXc+eKCLi/Ig4IiIWAm8G1kfEW4ANwBnJZmcDn+9REc3MjN63UTTyHuDdkr5Hpc3iEz0uj5nZSOuL\nAXcRcTNwc/L6B8CLe1keMzPboy8ChZlZv+tmyox+40BhZtZEt1Nm9Jt+bKMwM+sr3U6Z0W8cKMzM\nmuh2yox+40BhZtZEt1Nm9Bu3UZhZYaPSwLty+dF7tVFAZ1NmtMLzUZhZ3xulBt7q5+mXoNjtc+9A\nYTbEOnnXmdXAO2yBAioX4H75XN0+926jMBtS1bvOqekZgj13nes2T5Wy/7SG3KnpGZauXl/acWxf\nIzUfhZl1Tqe7dGY15JYdlGxvozYfhZllWLd5iqWr1/PMVV/IfZfe7I6/yD5rNZoTodYojTPotm7P\nR+E2CrM+1W6D5eGTE0w1CBZK9lVkn7VqG3gbHQdGZ5xBt3W7cb1n81GUyfNR2DBaunp9wwvwgskJ\nblm1rOnP1wcaqASJRv/xre6zU2W13uj7+S
jMLFu7DZaNZkFLuy1s986/21Uh1l2uejLrU2lVR3ka\nLGu7dK7bPMW5a7c0DBbtNoL22ziDfjEsAxIdKMz6VNmjgdfceHfDIKHkWO3qp3EG/WCYBiQ6UJj1\nqbLv0tOql4LsC1e7d8XDcled1zANSHSgMOtjZd6lp1VlLciodlq3eYqV19zG7K7Ks8jU9Awrr7lt\nd9maGaa76ryGKeOsG7PNRkSRBueLbti2O0hUze4KLrphW0vHHOV5HIYp46yfKMyGQCvVO0WqsqZn\nZnOtrzdMd9V5dTrjrLPHmlnLGlXvnLt2C+es3cKCugtItxucy+i5Nag62RPM2WPNLJdG1TvVyqJ2\nLyDzDhznoUf3fXqYd+D47tdZd7b9OI9DN3UqMDt7rJnl0qwap502gQtPW8T4mPZaNz4mLjxtEdA8\nQ22jQX+Xnn7c0Ddkd1q3q/T8RGE24NKqd2oVvYA0qz5Ju7M97+o9PaM8vqJ83a7Sc6AwG3CNqnfq\ntXMBybrQpwWgnREj0w22F7pdpedAYdZj7fZeqc/iWp/4r5MXkKynmUEdXDYInD22AGePtUHVKMPr\nxPhYW/X43ew22aj8tQTcs/rUjhzb2tdq9lg/UZj1ULMBaUUu+N1sE6ge57yrb2Nng5vOUegGOwoc\nKMx6KGsWurR+8tBfWVqrxx7lbrDDzoHCrIcOmRhvOMpZ0PBJ46IbtvH4jl19lzvJacaHmwOFjZR+\ny2QqNV6f1nLYKKj0S6Oxu8EOLwcKGxn9mMl0usGo5yJGIXeS9Y5HZtvIyJPJdN3mKZauXs8zV32B\npavX7x5pXLa0xt55B443zPRamzqjlf2YlcFPFDbUaquaWp0vuptPHmkDp6opMuqrycCNxtZ9PQsU\nkg4Avgrsn5TjcxFxoaRnAp8FngJsAt4WEU/0qpw2uJr18a+qvxvvZsK1Zo3AacfL286ybvMUF92w\nbXcbx7wDx7nwtEVuU7CW9PKJ4nFgWUQ8Imkc+LqkfwfeDVwWEZ+V9HHg7cDf97CcNqAaXfDrNbob\n73bCtbyNwHm3r5+lDuChR2dZ+bnWZ6qz0dazNoqoeCRZHE++AlgGfC5ZfwWwogfFsyGQdWEXlbvq\n/febw7lrt+zVDjFMM5NBJWDWz1IHMLszRmKmOWtfTxuzJY1J2gI8CHwZ+D4wHRE7kk1+DPh2xwpJ\nu7AvmJzgsjMX89jsLqZnZvdJj11kytB+lhUw3VvKWtHTQBEROyNiMXAE8GLgmFZ/VtI7JG2UtHH7\n9u0dK6MNrqwLfrN2iGGaQyHrSWhQn5Ksu/qi11NETEvaALwMmJS0X/JUcQTQsF9iRFwOXA6VpIBd\nK6wNlAPG5+wOCJMT41z0ukoD7rlrtzTcvnqHPUyDx1YuP3qfNgqoTEA0qE9J1l09e6KQNF/SZPJ6\nAjgFuBPYAJyRbHY28PnelNAGWbXHU+00no/v2LX7ddqddEBHx01Uy9aNMRpVK5YsYM0bj2dyYs8Y\njHkHjrPmjOOHJhhaZ/UszbikF1BprB6jErCujoiLJT2LSvfYQ4HNwFsj4vGsfTnNuNVbunp9w3kS\nFkxOcMuqZU27zrab6jtNJ9KKmxXV92nGI+J2YEmD9T+g0l5hVlizLq71k/3U69S4iW6O0TAri1N4\n2FBqpYvriiULuGXVMlLy8nWkR1C3x2iYlcGBwoZSsy6ute0Ec1JSuHaiR9CwjdGw0dB21ZOki5P9\nbAG2RMR/t10qs0SetOD1277hhQvYcNf2fX62vp2g0cxsnRo3kZbbyb2PrJ/lChSS3hoRn65dFxF/\nLekwYDHwu5KeExF/VGYhbTTlSc7XaNtrN001bCROS+0xJrEroq15KpoFNk/wY4Mo7xPF2yS9CHh3\nROz+T4uIB4Abky+zUuRp+E3b9ryr981nlNYesCuCe1afWri8rQa2YRqjYaMhs41C0iJJV9WsejUw\nA6yXNL
+jJbORl6fhN23bnRGcf91WLli3teNtEnnmuzAbJM2eKP6DymhpACJiF7BK0unA1yR9kErb\nxB0R8Wjnimmj6PDJiYZdVxtd0NPmnobKxfqqW+/bPR9Fp9ok3KPJhlWzXk+/A1xSu0LSa4E/BJ4A\nTgD+FviRpO91pIQ2svIk50ube7oqa1ipgDe8sP3qIPdosmGV+UQREVuBt1SXJd0DfIfKfBFfrt1W\n0hEdKaGNrOqF+73/um13Ko7992t8b9PO3NMBbLirklgyTy+reu7RZMMqb2P2qyPirkZvRMSPSyiP\n2T4em92To2l6ZnZ3AzHs6T00R2pYpQSVJ4ZmiWrun55pewpU92iyYdWzXE9lcq6n4ZWWs2lyYpzH\nd+xqOoPd5MQ4iw4/mFu+/4vM7RYk1UONjjUm8YE3OYGeDZ++z/Vko6VolU5aQ3Baw3X9WAhg95Sf\nWU4+Zj5X3Xpfw/eqPaeg8fgNP0HYsHMKD+u4apXO1PTMPrPJNZO3Ibg6FuKWVcsAOO/q25jd2fyp\necNd2zOP1aibazufy2yQOFBYx7UzviCt59O8A8cbbl+92Fcv4mntFvWmpmcaVjvVqn+68bgJGxWF\nq54k/UZE/DRt2ayqnfEFaQ3EQGYPo7Q0He2of+LI+7lcTWWDqp02ik8Ap2YsmwH5Bs41kpXyIu3C\nW/Ygt/E5+04bmvW56oPCycfM59pNU4V7VJn1UuGqp4g4NWvZrKpR9RHArx/f0VZ9fnU+icvOXAzA\nOWu38Ozzv8jCjDQdhTXYXVq12MnHzN+n7eKqW+9zNZUNrEKBQtIbJR2cvL5A0nWS9pmtzgwqF/RL\nTz9un3aF6ZlZzlm7hSUX31Q4YNQ2KMOe9BxpaTo+dObi1ImKsszujH0u6tXPtWByAlHpYnvp6cex\n4a7t+wSFtJYSp/ewQVD0ieKvIuJhSScBr6RS7fTx8oplw2bFkgUcOLdxTedDj84W7i3UrC1iTNrr\nIr5iyYLCKTUaXdSrTzXVnlYrlizIdfF3eg8bBEUDRfU/81Tg8oj4AjC3nCLZsMq6gBathml2Ud4Z\nwWVnLt59EYf0qrBm6i/qtbPkLV29fnegS7v41z/JOL2HDYqijdlTkv6BStLA90vaH3e1tSbSGn+r\nWrkTr28kzsoaW1XfaFzfk2rywHEeeWwHs7vSu9LWX9Sz0n2k5XxKm3HPrN8VDRRvAl4F/G1ETEt6\nOrCyvGLZMGp0Aa2VdideDQ5T0zN75W1qNu6hqtFkR7U9qZauXr876WCtrBnvssZQVAf7uSusDYui\ngWIGOAg4C7gYGAemyyqUDafqhfKiG7bt8xSQVg1Tf+deNDNZ1tNKkRnvmo2h8Cx2NkyKBoqPAbuA\nZVQCxcPAtcCLSiqXDZhWB5NVL6BZ29e+l5UVNo9DJhqP5Ib0KrE5Eus2TzX8HO2ODTEbJEUDxUsi\n4gRJmwEi4iFJbsweUUXSc6fdcdfvq4wgAfDrJ3akXvTTqsSykgF67gkbJUUboGcljZHUBCTzZ+/K\n/hEbVmXmPOpE6g1oPA6iqjoeYqzBIL20z5E2hsLVTTaMij5RfBi4HniapEuAM4ALSiuVDZQy54ru\n5AC0rH2vWLKAc9duyfVzboewUVEoUETEVZI2Aa+g0j18RUTcWWrJbGCUWV/frAttO5qVx+0OZo21\nk+vproj4aER8xEFitKXlPCpSX9/qYDhRmb1uTov5OFopT5mfw2yYFM31dIWkyZrleZI+WV6xbJDk\nra9PG9Fcv680CyYnuOzMxTy+YxcZY+Rytx+43cGssUJzZkvaHBFLmq3rFs+ZPTguWLeVq269b6/x\nEBPjYw0vyPU9oGq3rQ7AS7NgcmL3wDcza6zVObOLVj3NkTSv5mCH4vm3rYkL1m3l03VBAio9i85Z\nu4UL1m3dva46lmJmdufu3khztGfbZu0YJx8zv+zim42sohf3DwD/Kema
ZPmNwCXlFMmG0brNU3z6\n1vsyt6m+f+IzDm04liKrmqnehru2Fyuome2jaK+nK5NeTycnq06PiO+UVywrW6+n4Wx1TMVVt97H\nv3zzR20PtPM8D2blKVxdFBHbgG0llsU6pMjI6bK1euEOyhmNnbdLa68DqVk/y9VGIenryfeHJf2q\n5uthSb/Kua8jJW2Q9B1J2yS9K1l/qKQvS/pu8n1es31ZtjJHThc1eWB6rqVOyNOltXaWvOrUpUUn\nUjIbRrkCRUScJEnAooh4cs3XwRHx5JzH3gGcFxHHAi8F3inpWGAV8JWIeC7wlWTZ2lDmyOki1m2e\n4pHHdnTlWADzDhzP9TRQZiDN6vprNqhyVz1FREj6AnBcOweOiJ8AP0lePyzpTmAB8Hrg5clmVwA3\nA+9p51ijLm3E8SET4yxdvb7j1S1rbrw7c1KgMo2PiQtPW5TrZ8oKpP1QxWfWCUW7x35bUmkpxSUt\nBJYA3wQOS4IIwE+Bw1J+5h2SNkrauH27e7hkaTTieHyO+PUTO/aqbll5zW0sufimUu+G122eKjUl\nx9gcMZ4yHPuguWMcNHc/zl27JVf509oz8rZzpD2ZnHf1bX6ysIFWNFC8BLhV0vcl3S5pq6Tbi+xI\n0pOozGVxTkTs1c4RldGADW9FI+LyiDgxIk6cP9995rM0GnH8pAP2Y3bn3qd2dlfw0KOzpdXTV++w\ny7RzVzB3vzl7fZYPnbmYD525mF0B0zP5y19W6o60J5BqunIHCxtURXs9LS/j4JLGqQSJqyLiumT1\nA5KeHhE/SaZYfbCMY426+kynz1z1haY/Uz+FaLOeQfXvP/rEjo6kDP/1Ezu55Hf3PvbS1etT2xla\nSd0B7U9dmpXQsNWymPWjooHiAeBPgZOo3PF/Hfj7PDtIGsU/AdwZER+seesG4GxgdfL98wXLaBla\nzdJavUtuVv/e6P0sotITqtFc1a2ov+i2285QRsrwZnOCe2yHDaqiVU9XAouA/wt8BDgW+FTOfSwF\n3gYsk7Ql+XoNlQBxiqTvAq9Mlq1kraa4qNbTN6t/zzvhUACPze5iXsFus/UX3bLaGdqRNQFSt8ti\nVqaiTxTPT7q1Vm2QlGtkdkR8ncqNZSOvKFgua1ErKS5q6+mb1b8XqWKamd3J/vvNYWJ8LPfP1190\n+2Vq0upTST+Uxaws7fR6eml1QdJLAKdvHSBZ1SCNUmxn3Q3XJu7L65czs03TitfvudFFt59ShPdT\nWczKUDTN+J3A0UA1y9tRwN1UBtFFRLygtBK2oNtpxoch3cPS1esbtiOMSXzgTce3lPK7Xv2TwfiY\nIMgcQ1GbDjwtrfgbXriADXdtH+jzbdaPWk0zXrTq6VUFf27gDcugqrSG12pVEuz9eaqvz7v6toa5\nmBYkF/D6AArsnjtCsM88FCcfM3+vQX8OCmb9p9ATRb/p5hNF2p34IE6Us27zVOaFv9HnyZpMqNkF\nvf5J7ORj5nPtpqlC+zKz9nV64qKR1eu8Sc3kyTW0YskCdqXcKKR9nqL1742q6zbctb3nyQrNrDnP\nSpdT2viDfuj6WKRarMjnyTvmIK1cHm9gNhj8RJFTWekeOqFIFtQyP0/a00xauTzewGww5HqikPTu\nrPfrRlgPpbLSPXRCkWqxsj7PBeu2clXNfNi1TzNZYzDqe0r1S9A1sz3yVj0dnHw/GngRlXQbAKcB\n3yqrUP2ujHQPnVC0Wqzdz7Nu89ReQaKq+jSTVq60nlL9eG7NRlmuQBER7wWQ9FXghIh4OFm+CGie\nZc46qlejk9fceHfjFL9UniYuO3Nxarn6Neia2R5FG7MPA56oWX6ClHkjrHt6VS2WVbV1+OREX1fX\nmVlzRQPFlcC3JF2fLK+gMhud9Vgv7tDTqpbEnrmr/eRgNrgK9XqKiEuA3wceSr5+PyL+T5kFs8HR\nqOeUgLe89KiGqUA8p7TZYCn0RJHM
JXEscEhEXCzpKEkvjoiRadC2PVqtWhqW9Cdmo6ZoUsC/B3YB\nyyLieZLmATdFRGnzaOfR7aSAvVAd2Tw1PcOYxM6I3b2G2r3IdivJ4TClPzEbBp1OCviSiDhB0maA\niHhI0tyC+7Im6u/Eq7mZyrgjX7d5ipXX3LY7w+vU9Awrr7mtrX2m6ff0J2bWWNGR2bOSxkiSgUqa\nT+UJwzoga/a4dnMjXXTDtn3SgM/uCi66YVvhfabph1nozCy/ok8UHwauB54m6RLgDOCvSivVkMtb\n1dPsjjvvHXnt8dMqHqdnis1lnSVrnMcwzPFhNqwKBYqIuErSJipTlgpYERF3llqyIVVm4r7a91s9\n9kU3bCs9CLR6kU9r9AbcyG3Wx4r2enp/RLwHuKvBOsuQlbgv7aKYNskQpI+8bmXuhyzzDhxvabu8\nga/ReIqlq9fnPidm1j1Fq55OAeqDwqsbrBtJWXfY7Sbuq+/1dPIx81lz492cu3ZL5h16o1xMacbH\nxIWnLWpp2yKBr54buc36W97ssX8C/CnwbEm317x1MPCNMgs2qJrdYZeZuC/tWAeMz9nn4t0sSIxJ\n7IrI3T5QxkW+n+f4MLP8vZ4+QyVT7OeT79WvF0bEW0ou20BqNidEmfM/pB3roUfztUFMjI/xgTcd\nzz2rT+WWVctyVfeU0ZOpn+f4MLOcgSIifhkRP6SSBPCXEXFvRNwLhKRPdqKAg6bZHXazqUTzpLgo\no2rmoLntzVFdxkW+6PSqZtYdRdsoXhAR09WFZMDdkpLKNNBaqUZJS5CXt2E47ViTE+P8+okdzO5s\n3ioxeeDcti7IZWWGddJAs/5VdMDdnCRtBwCSDsXzbwPt3WHnnco07VgXvW4RB81t7deR56kk7Wln\nxZIF3LJqWaGqKzPrf0Uv7h8A/lPSNcnyG4FLyinSYGvnDjtvw3DWsc5du6Wl8uYZg+GxDmajqeiA\nuyuTAXcnJ6tOj4jvlFeswVa0GqVI75+0YzUbpAf52hLK6AZrZoOpaNUTEbEtIj6SfA18kOiHeRLK\n7P2zcvnRjM9R6vt5G4w91sFsdOUdR/H1iDhJ0sPs3TVfQETEk0stXZf0S7VKmVOGVn+mNmXHvAPH\nufC0RV172jGz4VBoPop+0+58FKMwT0K7SffqgylUnnbcjdVscHVkPgpJ7856PyI+mGd//WLYq1XK\neGIq82nHzAZL3sbsg5PvRwMvAm5Ilk8DBnYa1GGvVimrIdpjHcxGU96R2e+NiPcCRwAnRMR5EXEe\n8ELgqE4UsBuGPYXEsD8xmVlnFe31dBiVNB5VTyTrcpH0SUkPSrqjZt2hkr4s6bvJ93lZ+yjDsKeQ\n8MxyZtaOogPurgS+Jen6ZHkFcEWB/fwz8JFkf1WrgK9ExGpJq5LljqcvH+ZqlayZ5czMmik64O4S\nSf8O/Fay6vcjYnOB/XxV0sK61a8HXp68vgK4Gc9z0RY3RJtZO4rOcCfgWOCQiLhY0lGSXhwRZTRo\nHxYRP0le/5QCVVq2r24+MXn+a7PhUrSN4mPAy4CzkuWHgY+WUqIaURnk0XCgh6R3SNooaeP27dvL\nPrQVVO2KOzU9Q7CnK24vRrqbWTmKBoqXRMQ7gcegkmYcmFtSmR6Q9HSA5PuDjTaKiMsj4sSIOHH+\n/PklHdralTcDrpn1v6KBYlbSGMndvqT5wK6SynQDcHby+mwqs+nZgHBXXLPhU7TX04eB64GnSboE\nOAO4IO9OJP0LlYbrp0r6MXAhsBq4WtLbgXuBNxUs41Dq9/r/YR+8aDaKcgeKpCH7q8Am4BVUEgKu\niIg78+4rIs5KeesVefc1CvoleWEWd8U1Gz65A0VEhKQvRsRxwF0dKFPf69VdfZ5UHL0qo7vimg2f\nolVP35b0ooj4r1JLMwB6eVffav1/r588hnnwotkoKtzrCbhV0vcl3S5pq6TbyyxYv+plr55WU3G4\n
55GZlanoE8XyUksxQHrZq6fV+n/3PDKzMuWdj+IA4I+B5wBbgU9ExI5OFKxf9bJXT6v1/+55ZGZl\nyvtEcQUwC3wNeDWVNB7vKrtu6ZQDAAAJq0lEQVRQ/azVu/pONSa3Uv/vnkdmVqa8geLYpLcTkj7B\nAE9WVFQrd/X90JjcrIxmZq3KNWe2pG9HxAlpy73S7pzZZRuFObjNbPB1ZM5s4HhJv6oeA5hIlkVl\niMWTc+6vZzo5zsCNyWY2THIFiogYa75V/+t01ZAbk81smBQdRzHQOj3OYNjn4Daz0VJ0HMVAK6tq\nKK36yo3JZjZMRjJQlFE11Kz6ymkszGxYjGTVUxlVQ06TYWajYiSfKMqoGnLPJjMbFSMZKKD9DKfu\n2WRmo2Ikq57K4J5NZjYqRvaJol3u2WRmo8KBog3u2WRmo8BVT2ZmlsmBwszMMjlQmJlZJrdRlKiT\nGWnNzHrFgaIkvZ6syMysU1z1VBKn9DCzYeVAURKn9DCzYeVAUZK01B1O6WFmg86BoiRO6WFmw8qN\n2SVxSg8zG1YOFCVySg8zG0auejIzs0wOFGZmlsmBwszMMjlQmJlZJgcKMzPL1JeBQtKrJN0t6XuS\nVvW6PGZmo6zvAoWkMeCjwKuBY4GzJB3b21KZmY2uvgsUwIuB70XEDyLiCeCzwOt7XCYzs5HVj4Fi\nAfCjmuUfJ+v2IukdkjZK2rh9+/auFc7MbNT0Y6BoSURcHhEnRsSJ8+fP73VxzMyGVj8GiingyJrl\nI5J1ZmbWA/0YKP4LeK6kZ0qaC7wZuKHHZTIzG1l9lxQwInZI+jPgRmAM+GREbOtxsczMRlbfBQqA\niPgi8MVel8PMzPqz6snMzPqIA4WZmWVyoDAzs0wOFGZmlsmBwszMMjlQmJlZJgcKMzPL5EBhZmaZ\nHCjMzCyTA4WZmWVyoDAzs0wOFGZmlqkvkwL2s3Wbp1hz493cPz3D4ZMTrFx+NCuW7DMBn5nZ0HCg\nyGHd5inOv24rM7M7AZianuH867YCOFiY2dBy1VMOa268e3eQqJqZ3cmaG+/uUYnMzDrPgSKH+6dn\ncq03MxsGDhQ5HD45kWu9mdkwcKDIYeXyo5kYH9tr3cT4GCuXH92jEpmZdZ4bs3OoNli715OZjRIH\nipxWLFngwGBmI8VVT2ZmlsmBwszMMjlQmJlZJgcKMzPL5EBhZmaZFBG9LkPbJG0H7u11Odr0VOBn\nvS5EH/H52MPnYm8+H3tr53w8IyLmN9toKALFMJC0MSJO7HU5+oXPxx4+F3vz+dhbN86Hq57MzCyT\nA4WZmWVyoOgfl/e6AH3G52MPn4u9+XzsrePnw20UZmaWyU8UZmaWyYHCzMwyOVD0gKRPSnpQ0h01\n6w6V9GVJ302+z+tlGbtF0pGSNkj6jqRtkt6VrB/V83GApG9Jui05H+9N1j9T0jclfU/SWklze13W\nbpE0JmmzpH9Llkf5XPxQ0lZJWyRtTNZ1/H/FgaI3/hl4Vd26VcBXIuK5wFeS5VGwAzgvIo4FXgq8\nU9KxjO75eBxYFhHHA4uBV0l6KfB+4LKIeA7wEPD2Hpax294F3FmzPMrnAuDkiFhcM3ai4/8rDhQ9\nEBFfBX5Rt/r1wBXJ6yuAFV0tVI9ExE8i4tvJ64epXBAWMLrnIyLikWRxPPkKYBnwuWT9yJwPSUcA\npwL/mCyLET0XGTr+v+JA0T8Oi4ifJK9/ChzWy8L0gqSFwBLgm4zw+UiqWrYADwJfBr4PTEfEjmST\nH1MJpqPgQ8BfAruS5acwuucCKjcNN0naJOkdybqO/694hrs+FBEhaaT6LUt6EnAtcE5E/Kpy41gx\naucjInYCiyVNAtcDx/S4SD0h6bXAgxGxSdLLe12ePnFSRExJehrwZUl31b7Zqf8VP1H0jwckPR0g\n+f5gj8vTNZLGqQSJqyLiumT1yJ6PqoiYBjYALwMmJVVv7I4Apn
pWsO5ZCrxO0g+Bz1Kpcvo7RvNc\nABARU8n3B6ncRLyYLvyvOFD0jxuAs5PXZwOf72FZuiapc/4EcGdEfLDmrVE9H/OTJwkkTQCnUGm3\n2QCckWw2EucjIs6PiCMiYiHwZmB9RLyFETwXAJIOknRw9TXwO8AddOF/xSOze0DSvwAvp5Ie+AHg\nQmAdcDVwFJWU6W+KiPoG76Ej6STga8BW9tRD/y8q7RSjeD5eQKVBcozKjdzVEXGxpGdRuas+FNgM\nvDUiHu9dSbsrqXr6i4h47aiei+RzX58s7gd8JiIukfQUOvy/4kBhZmaZXPVkZmaZHCjMzCyTA4WZ\nmWVyoDAzs0wOFGZmlsmBwszMMjlQ2NCQtEJSSMpMeSFpUtKftnmsR1LW70xSQN8h6RpJB6Zs9412\njt8qSc9LUlPPSZbHJN0k6fe6cXwbDg4UNkzOAr6efM8yCbQVKDLMJCmgnw88Afxx7ZuqmBMRv9mh\n4+8lIu6kMrL7tcmqS4C7I+LKbhzfhoMDhQ2FJKngSVTmJnhzzfrfk3R7MhHQp5LVq4FnJ3f+ayQt\nrJtE6i8kXZS8Xpdk6txWk62zVV8DnpPs/25JV1JJuXBk7RNJShmR9NZkEqMtkv4heRo4SNIXkm3v\nkHRmC+W4DPgTSW+gkj/p3Tk/h404Z4+1YfF64EsR8d+Sfi7phcBjwAXAb0bEzyQdmmy7Cnh+RCyG\n3enN0/xBRPwiybv0X5KujYifNytMkrTu1cCXklXPBc6OiFuT96vbLWpURknPA84ElkbErKSPAW8B\nfg3cHxGnJtsdknz/IvCHEXF/fVki4iZJHwAuBX47Imabld+slp8obFicRSX/D8n3s6hkG70mIn4G\nUDD/zZ9Lug24FTiSygU/y0Qyl8RG4D4qCQ8B7q0GiTppZXwF8EIqwWlLsvwsKjmxTpH0fkm/FRG/\nTH7uNY2CRI1vAB+MiJ9WV0j6myafxQzwE4UNgeQufBlwXJKLf4zKBC9rWtzFDva+aTog2e/LgVcC\nL4uIRyXdXH0vw0z1SaWmfFB5EshDwBURcf4+b0gnAK8B3ifpKxFxcQv7Oxb4p5p9/AaV2fPMmvIT\nhQ2DM4BPRcQzImJhRBwJ3APcDrwxya5JTdXTw8DBNT//APA0SU+RtD97Gn4PAR5KgsQxVOb0Ltv6\nlDJ+BTgjmaAGSYdKeoakw4FHI+LTVALhCS0eZxGV9pGqxcCWMj6ADT8HChsGZ7En/XLVtVQatS8B\n/l9SffRBgKSN4ZakMXhNUmd/MfAtKlOPVmcN+xKwn6Q7qTSAN6o6aktEbEsp43eotF3cJOn2pFxP\nB44DvpVUR10IvA8qbRRJENmHpCOpTB9a26XXgcJa5jTjZiNI0ieAP4qIXU03tpHnQGFmZplc9WRm\nZpkcKMzMLJMDhZmZZXKgMDOzTA4UZmaWyYHCzMwyOVCYmVkmBwozM8vkQGFmZpn+P0oQ58T6BoBj\nAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [] + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2X1RA6sgtZQ6", + "colab_type": "text" + }, + "source": [ + "## Conclusion\n", + "- looks like the multiple regression we ran does provide more accurate predictions than the simple linear regression\n", + " - this will not always be the case, so always be sure to check and confirm if the extra computing is worth it\n", + "\n", + "Anyways, that's how you implement both Simple and Multiple Linear Regression with `cuML`. Go forth and do great things. Thanks for stopping by!" + ] + } + ] +} From cd659b32ff7e34f183e7d8be617ef563c93dac8a Mon Sep 17 00:00:00 2001 From: Winston <43570913+gumdropsteve@users.noreply.github.com> Date: Fri, 4 Oct 2019 00:04:09 -0700 Subject: [PATCH 6/7] Delete linear_regression_boston_demo.ipynb Incorrect branch, find in patch-3 --- .../linear_regression_boston_demo.ipynb | 768 ------------------ 1 file changed, 768 deletions(-) delete mode 100644 intermediate_notebooks/examples/linear_regression_boston_demo.ipynb diff --git a/intermediate_notebooks/examples/linear_regression_boston_demo.ipynb b/intermediate_notebooks/examples/linear_regression_boston_demo.ipynb deleted file mode 100644 index 53b868d2..00000000 --- a/intermediate_notebooks/examples/linear_regression_boston_demo.ipynb +++ /dev/null @@ -1,768 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "LOCAL_intro_lin_reg_cuml", - "provenance": [], - "collapsed_sections": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU" - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "2tZ3RLnlkrkg", - "colab_type": "text" - }, - "source": [ - "# Intro to Linear Regression with cuML\n", - "Corresponding notebook to [*Beginner’s Guide to Linear Regression in Python with cuML*](https://medium.com/future-vision/beginners-guide-to-linear-regression-in-python-with-cuml-30e2709c761) 
story on Medium\n", - "\n", - "Linear Regression is a simple machine learning model where the response `y` is modelled by a linear combination of the predictors in `X`. The `LinearRegression` function implemented in the `cuML` library allows users to change the `fit_intercept`, `normalize`, and `algorithm` parameters. \n", - "\n", - "Here is a brief on RAPIDS' Linear Regression parameters:\n", - "\n", - "- `algorithm`: 'eig' or 'svd' (default = 'eig')\n", - " - `Eig` uses an eigendecomposition of the covariance matrix, and is much faster\n", - " - `SVD` is slower, but guaranteed to be stable\n", - "- `fit_intercept`: boolean (default = True)\n", - " - If `True`, `LinearRegression` tries to correct for the global mean of `y`\n", - " - If `False`, the model expects that you have centered the data.\n", - "- `normalize`: boolean (default = False)\n", - " - If True, the predictors in X will be normalized by dividing by its L2 norm\n", - " - If False, no scaling will be done\n", - "\n", - "Methods that can be used with `LinearRegression` are:\n", - "\n", - "- `fit`: Fit the model with `X` and `y`\n", - "- `get_params`: Sklearn style return parameter state\n", - "- `predict`: Predicts the `y` for `X`\n", - "- `set_params`: Sklearn style set parameter state to dictionary of params\n", - "\n", - "`cuML`'s `LinearRegression` expects either `cuDF` DataFrame or `NumPy` matrix inputs\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-tG6ezqKh1Z0", - "colab_type": "text" - }, - "source": [ - "Note: `CuPy` is not installed by default with RAPIDS `Conda` or `Docker` packages, but is needed for visualizing results in this notebook.\n", - "- install with `pip` via the cell below " - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "pxBcXor_0-Jd", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# install cupy\n", - "!pip install cupy" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": 
{ - "id": "N20le3_KlP3O", - "colab_type": "text" - }, - "source": [ - "## Load data\n", - "- for this demo, we will be utilizing the Boston housing dataset from `sklearn`\n", - " - start by loading in the set and printing a map of the contents" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "RFE-nxxlTajg", - "colab_type": "code", - "outputId": "04f89e88-61a3-4dd2-9088-123b410e508c", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - } - }, - "source": [ - "from sklearn.datasets import load_boston\n", - "\n", - "# load Boston dataset\n", - "boston = load_boston()\n", - "\n", - "# let's see what's inside\n", - "print(boston.keys())" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - "dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wmcO8dxO0uOB", - "colab_type": "text" - }, - "source": [ - "#### Boston house prices dataset\n", - "- a description of the dataset is provided in `DESCR`\n", - " - let's explore " - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "c3kLHAsP-Al2", - "colab_type": "code", - "outputId": "02518c3c-7767-42a7-b6f4-6756ace741cc", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 923 - } - }, - "source": [ - "# what do we know about this dataset?\n", - "print(boston.DESCR)" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - ".. _boston_dataset:\n", - "\n", - "Boston house prices dataset\n", - "---------------------------\n", - "\n", - "**Data Set Characteristics:** \n", - "\n", - " :Number of Instances: 506 \n", - "\n", - " :Number of Attributes: 13 numeric/categorical predictive. 
Median Value (attribute 14) is usually the target.\n", - "\n", - " :Attribute Information (in order):\n", - " - CRIM per capita crime rate by town\n", - " - ZN proportion of residential land zoned for lots over 25,000 sq.ft.\n", - " - INDUS proportion of non-retail business acres per town\n", - " - CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n", - " - NOX nitric oxides concentration (parts per 10 million)\n", - " - RM average number of rooms per dwelling\n", - " - AGE proportion of owner-occupied units built prior to 1940\n", - " - DIS weighted distances to five Boston employment centres\n", - " - RAD index of accessibility to radial highways\n", - " - TAX full-value property-tax rate per $10,000\n", - " - PTRATIO pupil-teacher ratio by town\n", - " - B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n", - " - LSTAT % lower status of the population\n", - " - MEDV Median value of owner-occupied homes in $1000's\n", - "\n", - " :Missing Attribute Values: None\n", - "\n", - " :Creator: Harrison, D. and Rubinfeld, D.L.\n", - "\n", - "This is a copy of UCI ML housing dataset.\n", - "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/\n", - "\n", - "\n", - "This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\n", - "\n", - "The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\n", - "prices and the demand for clean air', J. Environ. Economics & Management,\n", - "vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics\n", - "...', Wiley, 1980. N.B. Various transformations are used in the table on\n", - "pages 244-261 of the latter.\n", - "\n", - "The Boston house-price data has been used in many machine learning papers that address regression\n", - "problems. \n", - " \n", - ".. 
topic:: References\n", - "\n", - " - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\n", - " - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\n", - "\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wI_sB78vE297", - "colab_type": "text" - }, - "source": [ - "### Build Dataframe\n", - "- Import `cuDF` and input the data into a DataFrame \n", - " - Then add a `PRICE` column equal to the `target` key" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "xiMmIZ8O5scJ", - "colab_type": "code", - "outputId": "fd09db1f-fb41-4494-bb8b-eab6e18c258f", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 206 - } - }, - "source": [ - "import cudf\n", - "\n", - "# build dataframe from data key\n", - "bos = cudf.DataFrame(list(boston.data))\n", - "# set column names to feature_names\n", - "bos.columns = boston.feature_names\n", - "\n", - "# add PRICE column from target\n", - "bos['PRICE'] = boston.target\n", - "\n", - "# let's see what we're working with\n", - "bos.head()" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTATPRICE
00.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.904.9824.0
10.027310.07.070.00.4696.42178.94.96712.0242.017.8396.909.1421.6
20.027290.07.070.00.4697.18561.14.96712.0242.017.8392.834.0334.7
30.032370.02.180.00.4586.99845.86.06223.0222.018.7394.632.9433.4
40.069050.02.180.00.4587.14754.26.06223.0222.018.7396.905.3336.2
\n", - "
" - ], - "text/plain": [ - " CRIM ZN INDUS CHAS NOX ... TAX PTRATIO B LSTAT PRICE\n", - "0 0.00632 18.0 2.31 0.0 0.538 ... 296.0 15.3 396.90 4.98 24.0\n", - "1 0.02731 0.0 7.07 0.0 0.469 ... 242.0 17.8 396.90 9.14 21.6\n", - "2 0.02729 0.0 7.07 0.0 0.469 ... 242.0 17.8 392.83 4.03 34.7\n", - "3 0.03237 0.0 2.18 0.0 0.458 ... 222.0 18.7 394.63 2.94 33.4\n", - "4 0.06905 0.0 2.18 0.0 0.458 ... 222.0 18.7 396.90 5.33 36.2\n", - "\n", - "[5 rows x 14 columns]" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 5 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "r2qrTxo4ljZp", - "colab_type": "text" - }, - "source": [ - "### Split Train from Test\n", - "- For basic Linear Regression, we will predict `PRICE` (Median value of owner-occupied homes) based on `TAX` (full-value property-tax rate per $10,000)\n", - " - Go ahead and trim data to just these columns" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "spaDB10E3okF", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# simple linear regression X and Y\n", - "X = bos['TAX']\n", - "Y = bos['PRICE']" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4TKLv8FjIBuI", - "colab_type": "text" - }, - "source": [ - "We can now set training and testing sets for our model\n", - "- Use `cuML`'s `train_test_split` to do this\n", - " - Train on 70% of data\n", - " - Test on 30% of data" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "1DC6FHsNIKH_", - "colab_type": "code", - "outputId": "4c932268-7a82-4ac3-c7b9-9966ffc2b12e", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 86 - } - }, - "source": [ - "from cuml.preprocessing.model_selection import train_test_split\n", - "\n", - "# train/test split (70:30)\n", - "sX_train, sX_test, sY_train, sY_test = train_test_split(X, Y, train_size = 0.7)\n", - "\n", - "# see what it looks like\n", - "print(sX_train.shape)\n", - "print(sX_test.shape)\n", - 
"print(sY_train.shape)\n", - "print(sY_test.shape)" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - "(354,)\n", - "(152,)\n", - "(354,)\n", - "(152,)\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZLVg44gAmJG7", - "colab_type": "text" - }, - "source": [ - "### Predict Values\n", - "1. fit the model with `TAX` (*X_train*) and corresponding `PRICE` (*y_train*) values \n", - " - so it can build an understanding of their relationship \n", - "2. predict `PRICE` (*y_test*) for a test set of `TAX` (*X_test*) values\n", - " - and compare `PRICE` predictions to actual median house (*y_test*) values\n", - " - use `sklearn`'s `mean_squared_error` to do this" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ZGMPloJxGtK3", - "colab_type": "code", - "outputId": "664b54fe-16d5-4140-a657-3dc782574da9", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - } - }, - "source": [ - "from cuml import LinearRegression\n", - "from sklearn.metrics import mean_squared_error\n", - "\n", - "# call Linear Regression model\n", - "slr = LinearRegression()\n", - "\n", - "# train the model\n", - "slr.fit(sX_train, sY_train)\n", - "\n", - "# make predictions for test X values\n", - "sY_pred = slr.predict(sX_test)\n", - "\n", - "# calculate error\n", - "mse = mean_squared_error(sY_test, sY_pred)\n", - "print(mse)" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - "54.32312606491228\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T7BXjkPSGwqd", - "colab_type": "text" - }, - "source": [ - "3. 
visualize prediction accuracy with `matplotlib`" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "pp9RNPt_Iemk", - "colab_type": "code", - "outputId": "22a22472-50ad-4bb3-d104-35e9e100b8b6", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 305 - } - }, - "source": [ - "import cupy\n", - "import matplotlib.pyplot as plt\n", - "\n", - "# scatter actual and predicted results\n", - "plt.scatter(sY_test, sY_pred)\n", - "\n", - "# label graph\n", - "plt.xlabel(\"Actual Prices: $Y_i$\")\n", - "plt.ylabel(\"Predicted prices: $\\hat{Y}_i$\")\n", - "plt.title(\"Prices vs Predicted prices: $Y_i$ vs $\\hat{Y}_i$\")\n", - "\n", - "plt.show()" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "display_data", - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYoAAAEgCAYAAACq+TSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3Xu4XHV97/H3J5sNbi5lQwkom8Qo\nWiiIEAiCgqeAF1REU7yVI5ba1jxaTytesGCtBCsVjULxVJ82p1DFogeRkNLjBaliFRVoQoIhBGqV\n6wa5B4JsYCf5nj9mzWb2ZGbNzJo1M2tmPq/nyZOZNWvW+s6a2eu71u+qiMDMzKyeOb0OwMzMis2J\nwszMUjlRmJlZKicKMzNL5URhZmapnCjMzCyVE4WZmaVyojCzgSXpDZLe0Os4+p3c4c7MBpGkPYDv\nJU9fExEP9zKefuZEYWYDSdIXgSuAEeBNEfH+HofUt5wozMwslesozMwslROFIWm9pGN6HUdRSPqy\npE8lj7tybCr3mdP2/J1abpwoBpCkOyRNSXpC0v3JSWjneutHxIER8cMuhtiWVj9fO5o9NklMr+5E\nDFn08juVtJukTdWJStJXJa2QpEHc9yBzohhcJ0bEzsChwCLg49UrSNqu61HlZ9A/XyZF+MwR8Siw\nHDitvEzSXwO/C5wSHawY7eW+B5kTxYCLiEngO8BLYObK9y8l/Rz4jaTtKq+GJc1LrrwelPSwpL8v\nb0vS3pIuT167XdJfVLz2l5Imk6u52yS9qjqWZJ1vVi27QNIXmt1Gxs9XN+7kPQsl3Zjs91LgORWv\nzbpTqHV8JH0VmA/8W3KX89EmjlfdfdaSxHGmpFskPSrpnyU9J+UzN4y7UYxZv5PEecDxkl4o6W3A\nEkotj55M+Yx5/T5a3rc1EBH+N2D/gDuAVyeP5wHrgb+peG1tsnyscn1KzQhvAs4HdqJ08jo6WWcO\nsBr4BLA98ELgV8DxwH7A3cDeyboLgH1rxPV84Elgl+T5CHAfcGSz22j186XFnay/PXAn8EFgFHgr\nMA18qsa+0o7PzHpNHK/UfaZ85puTz7U78JOqGGt+p2lxN3Fs6n4nwJeALzX4HV4EfBd4EDi0id9t\nLr+PLPv2vwbfTa8D8L8OfKmlk8QTwMbkhPSlqhPIH9dY/9XAy5M/rO
1qbPMI4K6qZWcC/wy8CHgg\n2cZog9iuBf4wefwa4JfJ41a20fTnS4s7efw/gHtJmoony35K7USRdnxm1mvieKXuM+Uzv7fi+Rsq\njl3d7zQt7iaOTdPfSZ2YXwIE8Paq5f8LeHGnfh9Z9+1/9f/1vDzTOmZxRPx7ndfurrN8HnBnRGyu\n8drzgb0lbaxYNgL8OCL+W9JpwFLgQElXAR+KiHtrbOdrwMnAxcD/TJ7T4jZa+Xx1404e7w1MRnIW\nSdxZZ7tpx6da2n5b2Welys91Z7KdWq9Vqxd36rHJ8J1U2x54GlhRuTAi/r726kB+v48s+7Y6XEcx\nnOpV6N0NzK9TIXo3cHtEjFf82yUi3gAQEV+LiKMpnXwC+EydfVwGHCNpH+D3SU4ELW6jkcrPlxo3\npaKNiarWMPPrbDft+FQf07T9trLPSvOq1q88SaZV0taLu9Gxafc7ORi4uTpBSfphynvy+n1k2bfV\n4URhlW6gdBI7V9JOkp4j6aiK1zYlFYpjkkYkvUTS4ZL2k3ScpB2Ap4ApYGutHUTEg8APKRXB3B4R\nGwBa2UaGz1Qz7uT1nwGbgb+QNCrpJOBlKduqd3zup1TG38x+W9lnpfdL2kfS7sBfAZe2cAxqxZ16\nbHL4Tg6hVHcyQ6Xxlx6o94Ycfx8t79vqc6KwGRGxBTiRUnnwXcA9wDsqXnsjpT/A24GHgH8CdgV2\nAM5Nlv0a2JNSWXc9X6NU1vy1imWtbqOVz1QvbiLiGeAk4I+ARyh93hUp26p5fIBPAx+XtFHSR9L2\n28o+q3yN0iB3vwJ+CTTVQa9e3I2ODSnfiaR/kPQPDXZ9MFUna+ClwLoG78vj95F131aDx3oy6wOS\n7gD+NKVepi8k9Qx3RMTKYdp3v/MdhZl100HAz4dw333NrZ7MrGsi4k+Gcd/9zkVPZmaWykVPZmaW\nyonCzMxSDUQdxR577BELFizodRhmZn1l9erVD0XE3EbrDUSiWLBgAatWrep1GGZmfUVSM0PHuOjJ\nzMzSOVGYmVkqJwozM0vVs0SRzLh1TTJj13pJH0iWHyLpOklrJa2S1MxgaWZm1iG9rMzeDHw4Im6U\ntAuwWtLVwGeBsyPiO5LekDw/podxmpkNtZ4lioi4j9LQx0TEJkkbgAlK48z/VrLarswec9/63Mo1\nkyy76jbu3TjF3uNjnH78fixeOJF5PTPrvEI0j5W0AFgIXA+cBlwl6XOUisZe0bvILE8r10xy5op1\nTE1vAWBy4xRnriiN+lyZBJpdz8y6o+eJQtLOwOXAaRHxuKRPAR+MiMslvR24kNLY9NXvWwIsAZg/\nv5nJwazXll1128zJv2xqegvLrrptVgJodj0rhrS7v1qvAS3fLfoOs7d6migkjVJKEpdERHnillOB\nDySPL6M0kco2ImI5sBxg0aJFHtmwD9y7caqp5c2uZ72XdvcHbPPa6ZfdBILpLbHN+vVO/L7D7L1e\ntnoSpbuFDRFxXsVL9wK/lzw+DvhFt2Ozzth7fKyp5c2uZ72XdvdX67XprTGTJKrXz7IP645e9qM4\nCngXcFzSFHZt0srpPcDnJd0E/C1J8ZL1v9OP34+x0ZFZy8ZGR2aKI1pdz3ov7e6vlTvAtHV9h9l7\nvWz1dC2gOi8f1s1YrDvKxQSNypqbXc96b+/xMSZrnLDLd3+1Xqu3naz7sM7reWW2DZfFCyeaOuE3\nu5711unH7zer/gBm3/1VvzY6R7PqKKrXz7IP6zwnCjPLrJm7v3ZbPfkOs/cGYirURYsWhYcZNzNr\njaTVEbGo0Xq+ozCzprk/w3ByojCzprg/w/ByojDrkiJdjWeJxT3mh5cThXVdkU6Y3YqpSFfjWWOp\n19TV/RkGnycusq4qn6QmN04RPHuSWrlmcqBjKlLv4iyxrFwzWbfTk/szDD4nCuuqIp0wy7oRU5F6\nF2eJZdlVt1GrfaTA/RmGgBOFdV
WRTpiN9p1nTEUavypLLPWOReCK7GHgRGFdVaQTZqN95xlTkcav\nyhJLvWMx4WKnoeBEYV1VpBNmWTdiWrxwgk+fdBAT42OI0gn20ycd1JOr8SyxFPF7s+5xz2zrumFs\n9TQIfIwGT7M9s50orLB6eWLySdHHYBh4CA/ra73sd5D3vqtPuMfuP5drbn2w0CfgPI9BPyacfoy5\nk3xHYYV01Lk/qNnBa2J8jJ+ccVzb2087EbS670ZzRlcPkV1tbHSkZ/UV9eR1/Gt9/iJ+3kr9GHNW\nzd5RuDLbCqmTTVYbdbBrZd+NtlWrj0a1XvcjqSWv41/EfjON9GPMneZEYYXUySarjU4Erey70baa\nPbEWbRiMvI5/EfvNNNKPMXeaE4UVUiebYzY6EbSy70bbavbEWrRhMPI6/kXsN9NIP8bcaU4UVkid\n7HfQ6ETQyr4bbavWCbdaEfsj5HX8+7H/RT/G3GmuzLahk2dlZTPb6sdWT3nqxxZE/RhzFu5HYZYi\nzxNBN08qw3ICs+5wojAbMLXuXkRpYL4JJw3LoPDNYyXNk3SNpFskrZf0gYrX/lzSrcnyz/YqRrMi\nqdXCqnyZV4R5PWxw9bJn9mbgwxFxo6RdgNWSrgb2At4MHBwRT0vas4cxWh8btGKaRs0zPS2pdUrP\nEkVE3AfclzzeJGkDMAG8Bzg3Ip5OXnugVzEW1aCdADuhSFOP5mXv8bG605GWDXNbf+ucQoz1JGkB\nsBC4HlgGvFLSOcBTwEci4j97F12xDOIJsBPqdYQ77dK1LL1yPRJsfHK6bqLtRDJud5unH79fw+FA\nhrmtf158IbatnicKSTsDlwOnRcTjkrYDdgeOBA4HviHphVFV6y5pCbAEYP78+V2OunfSegIP+4+5\nUtqV9cap6ZnH5US76s5HZpqsju84yhNPbWZ6a8xaB7In4zwSfHm9ZVfdxuTGqZmK7LJhb+ufB1+I\n1dbTDneSRikliUsiYkWy+B5gRZTcAGwF9qh+b0Qsj4hFEbFo7ty53Qu6xzy8QHNaubKemt7CJdfd\nNTNe06NPTs8kicp12hnrZ+mV63MZP2jxwgl+csZx3HHuCZz/jkMKMRHSIPE4T7X17I5CkoALgQ0R\ncV7FSyuBY4FrJP0OsD3wUA9CLKR65dSdLHLo9K14J7bfTDFNpWYaiWdNxivXTM66i8ljm1BKGvWO\nk4tPsvGFWG29LHo6CngXsE7S2mTZx4CLgIsk3Qw8A5xaXew0zGqdADtZ5NDpW/G07QOZT3bVxTR5\nyJqM065GO5HgXXySXS8uxPqBO9z1oW5eLXZ6Xoh62x8fG+XpzVs7NsxGq9qZj+AFZ3wr9Y4l785y\nnf7OBtkwzUUBnuFuoKUVOeSt07fi9bZTq6imUaV9vQRaeXdxb1IP0cjoiNhp++14bKp+y6hmNWrW\nmvcVv4tPsqv+rbjYrsSJwlJ1+la8mb4Bleqd7BoVYZ39b+t59Mna9QS17LT9dix904G5nCCaqS/J\ns+Wai0/a080LsX7hYcYtVaeHXK63/d12HK25fr2TXb3WKkuvXM/p37yppSQBpTuavIbEqB6yu568\nrvg9TLblzYnCUnVyXoh623/LYRPUqjpLO9mlFWFNb8lWD5dns8hys9bbzz2BiQ5PjNPp78yGjyuz\nrS15V6zXq3jebcdRzjpxdlFQ5b7nSGzpwG9ZwO3nnpDrNoetwtSKy5XZ1nGdaIZZqwgJYMftt9sm\nSZx+2U0zHeNqJYnROaXksbWJ/FHdy7msE+X6rjC1fuNEMQTKV96TG6cYSa68JxrMtNbMnULaeErL\nrrqt6ZNfZXz1VBctLb1y/Ta9p4GZOoBdx0b5zTOb2bq14e6B2kmik+X6eVeY9kMHu36IMW+d/Mzd\nPJ4uehpwrfQhKF9V71Y11hE8WzQCzTczrVWcUmta0MtXTzaMr7oPwIIzvlV33TvOPaFuX4JmjY+N
\nZmr11IuTYT8UZfVDjHnr5GfOa9uFn7jIuqNeUU4t5RN/vbGOzv639Zy5Yt3MmEiNVFcGl3/c5fdP\nbpzikuvuahhfliv7dlsQ7bTDdpmSRPXn68ZkQv0wPlE/xJi3Tn7mbh9PJ4oBl2cnq0efnG65d3Pl\n/tNmaEtT6yqpXvPZOSqdsNutW8hy3Hp1MuyHDnb9EGPeOvmZu308nSgGXK87WVXuP8uPeGJ8rOaV\n/VknHsjoyLa9ErYGnLliHcfuP3ebvgStyHLcenUyrBdrr7/7Sv0QY946+Zm7fTydKAZcrc5XWYyN\njjA+VvsqfmJ8jL97xyENO3nV+xHX64SWVuS0eOEEy956MCPa9t1T01u45tYHt+lLcMqR82eej4+N\n1kw0jfabplcnw37oYNcPMeatk5+528fTrZ4GXPUoqrVaPdWaBKfWWEdA3ZFrm2nyWW/k27ccNjET\nR2V8jSqCFy+c4IOXrq352r0bpxq2LKrXGixrBXS3R/Yt64fmtv0QY946+Zm7fTzd6smA5lvrtNuq\nJ+9WQUUbKXUYm4Ba/2q21ZMTheWq2yfKXjW77Hb7eMjn6jHvuyjrb04U1nWDeNKut79uto8fnSMQ\ns8asyrK/tD41g96nwWpzorCuq1cMNCKxNaIQRVV56GRxVysdBVvdX6Nte2Kj4dO1sZ4kfTLZzlpg\nbUT8V7vbtP5UrxloeRymWmNBpSWCTowllUfi6UX7+Dz212j9Qe7TYO1pqXmspFOql0XEJ4ALgMeA\n35f0f3KKzfpMM81AKzugNerJnHcHtrx6TveifXwe+2u0/iD3abD2tNqP4l2SLpA0qwFvRNwfEVdF\nxGci4j05xmd9pNk+G+Ur10aJIO8r97wST7fbx4/O0TZ9PrLsL+37GfQ+Ddae1EQh6UBJl1Qsej0w\nBfxA0tyORmZ9p3rCnFqd4eDZK9dGiaDZK/eVayY56twf8IIzvsVR5/6g7h1CXomnkxMD1dr2srcd\nzLK3Htz2/iq3Dc9+P57YyBppVEfx78DLy08iYitwhqSTgB9LOo9S3cTNEfFk58K0flHZya1e66Dy\nlWujuZ2b6cDWSj1GnnNJd3Je5XrbzisROSFYqxoVPb0WOKdygaQ3An8KPAMcCnwOuFvSf3ckQutb\nja68GxXhNHPl3kpx0jAOI2GWh9Q7iohYB7yz/FzS7cAtwPkRcXXlupL2aWXHkuYBFwN7URo9YnlE\nXFDx+ocpJaG5EfFQK9u27kprSZR2BdvMMASNroBbKU4axmEkzPLQavPY10fErbVeiIh7WtzWZuDD\nEXGjpF2A1ZKujohbkiTyWuCuFrdpXdZuE9Z2i0JaLU5y0YtZ61pq9VQvSWQREfdFxI3J403ABqD8\nF3w+8FGam67AeqjXE9LUayX05DObG1ZuF0WzlfFmvVKI0WMlLQAWAtdLejMwGRE3qU6rGSuOXk9I\nU12cVJ4r+9Enp4F8Oul1Uic6FZrlrefzUUjaGbgcOI1ScdTHgE808b4lklZJWvXggw92OEqrpwgT\n0ixeOMFPzjiO2889gZ122G7WmEhQ7Ck3e31HZtaMniYKSaOUksQlEbEC2Bd4AXCTpDuAfYAbJT23\n+r0RsTwiFkXEorlz3aWjV4rWkqjXdzit6rd4bThlThTVJ+9aJ/MG7xdwIbAhIs6DUiuriNgzIhZE\nxALgHuDQiPh11jitszrZ+SyLItzhtKLf4rXh1E4dxYXACSnPGzkKeBewTlJ5mrKPRcS324jJeqBI\nLYl6NctcVv0Wrw2nzIkiIk5Ie97E+6+l/nTJ5XUWtB6ZDbN+6yvRb/HacMo0H4WktwHfjYhNkj5O\nqYf230TEmrwDbIbnozAza12z81FkraP46yRJHA28mlKx0z9k3JaZmRVY1kRRLlA9gdLQG98Cts8n\nJDMzK5KsiWJS0j8CfwB8W9IObWzLzMwKLOvJ/e3AVcBrI2Ij
sDtwem5RmZlZYWRt9TQF7AScDHwS\nGAU25hWU9ad25qPOYy5rM+uMrHcUXwKOpJQoADYBX8wlIutL7cxHnddc1mbWGVkTxRER8X7gKYCI\neBRXZg+1dsYs8nhHZsWWtehpWtIIyTDgyfzZW3OLyvpOO2MW1VtncuMUK9dMDmUR1CAU4xUlDmtf\n1kTxBeAKYE9J5wBvBT6eW1TWd9qZj7ree4GhHHK7naHHOzFseZYTvodPHyyZEkVEXCJpNfAqSsNw\nLI6IDblGZn3l2P3ncsl1d82aaaqZMYtWrpnkyWc21329XARVfXJp92q18v3jO44SAY9NTbd15ZvX\nFXRaUVyj7bXz3lqynvDzjsN6q52xnm4FcpvxzvrXyjWTXL56claSEPCWw9IHC6w+CdVTXTTV7tVq\n9fvLkxxl2VZeMVXqRDFe1mHLs57wPXz6YMlUmS3pK5LGK57vJumi/MKyflLrZBLANbemTyhV6321\nVBdftVv53Wi/WSrS86yQb2fo8byHLc96wvfw6YMla6unlyYd7YCZVk8L8wnJ+k0zJ5Na80I3c3VZ\nq/iq3avVdq7MW10/yxV0O5NB5T2RVNYTftEmtLL2ZE0UcyTtVn4iaXcKMv+2dV+jk0m9fhLjO47W\nfN+IlDoJUrtXq+1cmbe6fpYr6HYmg8p7IqmsJ/yiTWhl7cl6cv888DNJlyXP3wack09I1m8aTb5T\nr1hmh+3mMDY6ss37Gp1Q2p3sp9b7KylZpxV5T0DUzmRQeU4k1c58GUWa0Mrak7XV08VJq6djk0Un\nRcQt+YVl/aTRyaRe8ctjU9Oc/45DWj4JtTvZT3m90y5dW/P1oPUK6EGegMgnfMs0cVHReOKiYjvq\n3B/U7CcxMT7GT844rgcRlRQ1LrNu6cjERZKuTf7fJOnxin+bJD2eNVgbbEWt2CxqXGZF01LRU0Qc\nLUnAgRFxV4disgFT1GKZosZlVjRZ58xeFxEHdSCeTFz0ZGbWumaLnrK2erpR0uER8Z8Z329Dot6w\nFh4wzqx/ZE0URwCnSLoD+A2lFoURES/NKzDrf/WGtVh15yNcvnrSA8aZ9YmsieL4dncsaR5wMbAX\npRaJyyPiAknLgBOBZ4BfAu+u7AVu2ZWv4ic3TpUye7J8tx1HOevEA9s6SVffIRy7/1y+fv3dbKkq\n2pya3lJ3uQeMMyumrD2z7wfeApwPnAeclCxrxWbgwxFxAKXZ8t4v6QDgauAlyd3JfwFnZozRKlT2\njgZmDeD36JPTnP7NmzLPKFer5/W/XHfXNsmgrN7yekONm1lvZU0UFwMHAv8b+HvgAOCrrWwgIu6L\niBuTx5uADcBERHwvIsrjTl8H7JMxRqvQaCC86S2ReUa5Zgf3a2REansbZpa/rEVPL0nuBMqukZS5\nZ7akBZQGFby+6qU/Bi6t854lwBKA+fPnZ9310OjEQHhZ3lc9ZEelencaZtZbWe8obpR0ZPmJpCOA\nTO1TJe0MXA6cFhGPVyz/K0rFU5fUel9ELI+IRRGxaO7cuVl2PVQ6MRBeq+8bkWYGiqul3nIz662s\nieIw4KeS7khaPv0MOFzSOkk/b3YjkkYpJYlLImJFxfI/At4IvDMGYYyRAqjVC7nS6Igy90hutG0o\n3Ul8/u0Hs3jhhHtEm/WZrEVPr2t3x0kP7wuBDRFxXsXy1wEfBX4vIp5sdz9WUtkLOe9WT7V6OB+7\n/1yuufXBmv0k3CParL/0bFBASUcDPwbWAVuTxR8DvgDsADycLLsuIt6bti33zDYza12ne2a3LSKu\npdRRr9q3ux2LmZnVl7WOwszMhoQThZmZpWqp6EnSh9Jer6yUNjOzwdBqHcUuyf/7AYcDVybPTwRu\nyCsoMzMrjlYnLjobQNKPgEOToTeQtBT4Vu7RmZlZz2Wto9iL0uiuZc8ky8zMbMBkbR57MXCDpCuS\n54uBr+QTkpmZFUmmRBER
50j6DvDKZNG7I2JNfmGZmVlRZCp6SobfOADYNSIuAB6W9LJcIzMzs0LI\nWkfxJeDlwMnJ803AF3OJyMzMCiXznNkRcaikNQAR8aik7XOMy8zMCiLrHcW0pBGSAUglzeXZgf3M\nzGyAZE0UXwCuAPaUdA5wLfDp3KIyM7PCyNrq6RJJq4FXURoBdnFEbMg1MjMzK4RMiULSZyLiL4Fb\naywzM7MBkrXo6TU1lr2+nUDMzKyYWh099n3AnwH7Vs2NvQvw0zwDMzOzYmi16OlrwHcoVVyfUbF8\nU0Q8kltUZmZWGC0VPUXEYxFxB6VBAB+LiDsj4k4gJF3UiQDNzKy3stZRvDQiNpafRMSjwMJ8QjIz\nsyLJmijmSNqt/ETS7mTv5W1mZgWW9eT+eeBnki5Lnr8NOCefkMzMrEiydri7OOlwd2yy6KSIuCW/\nsMzMrCgyFxdFxHpgfdb3S5pHaQKkvSiNGbU8Ii5IirEuBRYAdwBvT+pAzMysB1qqo5B0bfL/JkmP\nV/zbJOnxFve9GfhwRBwAHAm8X9IBlJrdfj8iXgx8n9nNcM3MrMtauqOIiKOT/3dpd8cRcR9wX/J4\nk6QNwATwZuCYZLWvAD8EPDSImVmPtNoz+0Npr0fEeVmCkLSAUvPa64G9kiQC8GtKRVO13rMEWAIw\nf/78LLs1M7MmtNo8dpfk3yLgfZTuACaA9wKHZglA0s7A5cBpETGr+CoigmTOi2oRsTwiFkXEorlz\n52bZtZmZNaHVoqezAST9CDg0IjYlz5cC32p155JGKSWJSyJiRbL4fknPi4j7JD0PeKDV7ZqZWX6y\ndrjbi9IwHmXPUKeIqB5JAi4ENlQVWV0JnJo8PhX414wxmplZDrI2j70YuEHSFcnzxZQqnltxFPAu\nYJ2ktcmyjwHnAt+Q9CfAncDbM8ZoZmY5yNrh7hxJ3wFemSx6d0SsaXEb11KaHa+WV2WJy8zM8pep\n6CkpNjoA2DUiLgAelvSyXCMzM7NCyFpH8SXg5cDJyfNNwBdzicjMzAolax3FERFxqKQ1UBpmXNL2\nOcZlZmYFkfWOYlrSCEkfB0lzga25RWVmZoWRNVF8AbgC2FPSOcC1wN/mFpWZmRVGy0VPSUX2j4DV\nlFonCVgcERtyjs3MzAqg5UQRESHp2xFxEHBrB2IyM7MCyVr0dKOkw3ONxMzMCilzqyfgFEl3AL+h\nVPwUEfHSvAIzM7NiyJoojs81CjMzK6xW56N4DqUhxV8ErAMujIjNnQjMzMyKodU6iq9QmotiHfB6\n4PO5R2RmZoXSatHTAUlrJyRdCNyQf0hmZlYkrd5RTJcfuMjJzGw4tHpHcbCk8nSlAsaS5+VWT7+V\na3RmZtZzrU6FOtKpQMzMrJiydrgzM7Mh4URhZmapnCjMzCyVE4WZmaVyojAzs1ROFGZmlsqJwszM\nUvU0UUi6SNIDkm6uWHaIpOskrZW0StLLehmjmdmw6/UdxZeB11Ut+yxwdkQcAnwieW5mZj3S00QR\nET8CHqleDJSHAtkVuLerQZmZ2SxZJy7qpNOAqyR9jlIie0WtlSQtAZYAzJ8/v3vRmZkNmV4XPdXy\nPuCDETEP+CBwYa2VImJ5RCyKiEVz587taoBmZsOkiIniVGBF8vgywJXZZmY9VMREcS/we8nj44Bf\n9DAWM7Oh19M6CklfB44B9pB0D3AW8B7gAknbAU+R1EOYmVlv9DRRRMTJdV46rKuBmJlZXUUsejIz\nswJxojAzs1ROFGZmlsqJwszMUjlRmJlZKicKMzNL5URhZmapnCjMzCyVE4WZmaVyojAzs1ROFGZm\nlsqJwszMUjlRmJlZqiJOhdoVK9dMsuyq27h34xR7j49x+vH7Acxaduz+c7nm1gdnrbN44cTMeyc3\nTs1sb0Ti5CPm8anFB/Hxlev4+vV3syVi5vWJqvcvvXI9G6emAdhtx1HOOvHAbbY9R7A12Y
QoTSY+\nPjaKBI8+Oc2IxJaImf/HRufw9OatbI1SPEe+cDfueHhqm89Yve8DnrcLP/3VI1SEy/jYKEvfVIoJ\nqPmZKh217+5c8p6XZzrW4zuOEgEbp7b9TBM1vod630va/ipfb0ZRtmFWTzd/X4o6f/j9ZNGiRbFq\n1aqm11+5ZpIzV6xjanrLzLLkIdRbAAAKXUlEQVTREUHA9Nb6x2NsdIS3HDbB5asnZ7230ov33Ilf\nPPCb1PdfesPd2+xndES84/B5qdtu1+ic0sk35SNus/6ytx3Mqjsf4V+uu6vh+rWSRdZj3Yqx0RE+\nfdJBM4m2en+VrzejKNswqyev35ek1RGxqNF6Q1n0tOyq27Y5GU9viYYnrqnpLXz9+rtTT+T1kkTl\n+2vtZ3pLNNx2u6a3Np8kyusvu+o2vn793U2t/5NfPrLNsqzHuhVT01tYdtVtdfdX+XozirINs3q6\n/fsayqKneyuKjFpVr+glj/e3u+1OuHfjFO1E1c6xzrKfevtrJY6ibMOsnm7/vobyjmLv8bHM7x2R\n2tp32vvb3XYn7D0+1lZc7RzrLPupt79W4ijKNszq6fbvaygTxenH78fY6MisZaMjYnRO+glxbHSE\nk4+Yt817K714z50avr/WfkZH1HDb7RqdIxp8xG3WP/34/Tj5iHlNrX/UvrtvsyzrsW7F2OjITAV5\nrf1Vvt6MomzDrJ5u/75Gli5d2pENd9Py5cuXLlmypOn193/eb7HPbmOsm3yMJ57azMT4GEvfdCCv\nPfC5s5a9+ZC9efiJZ2aef+LEA/izY180895NT22e2eaIxDuPnM+X330EDz3xNOsnH59VZFP5/vm7\n78h1v3qYpzZvBUotj875/YO22facUp0vUGr1BKXWSGPbj/DU9FZGJCLZdwBjo3PYGjGz7BX77s7W\nYNZnPP7A526z78Pmj3NP1S3r+Ngof5tUjB23/141P1Oleq2emjnWu+04ynO2G+Gpzdt+plrfQ63v\npVyBV2t/la9n/X30Yhtm9eT1+zr77LPvW7p06fJG6w1lqyczM3OrJzMzy4kThZmZpeppopB0kaQH\nJN1ctfzPJd0qab2kz/YqPjMz6/0dxZeB11UukHQs8Gbg4Ig4EPhcD+IyM7NETxNFRPwIqO7O+z7g\n3Ih4Olnnga4HZmZmM3p9R1HL7wCvlHS9pP+QdHitlSQtkbRK0qoHH3ywyyGamQ2PIiaK7YDdgSOB\n04FvSNt2DY6I5RGxKCIWzZ07t9sxmpkNjSIminuAFVFyA7AV2KPHMZmZDa0iJoqVwLEAkn4H2B54\nqKcRmZkNsZ6OHivp68AxwB6S7gHOAi4CLkqazD4DnBqD0H28hzyBjpm1o6eJIiJOrvPSKV0NZIBV\nT3AyuXGKM1esA3CyMLOmFLHoyXLkCXTMrF1OFAPOE+iYWbucKAacJ9Axs3Y5UQw4T6BjZu0ayjmz\nh0m5wtqtnswsKyeKIbB44YQTg5ll5qInMzNL5URhZmapnCjMzCyVE4WZmaVyojAzs1QahPH2JD0I\n3NnrONq0Bx4lt5KPx2w+Hs/ysZitnePx/IhoOKHPQCSKQSBpVUQs6nUcReHjMZuPx7N8LGbrxvFw\n0ZOZmaVyojAzs1ROFMWxvNcBFIyPx2w+Hs/ysZit48fDdRRmZpbKdxRmZpbKicLMzFI5UfSApIsk\nPSDp5oplu0u6WtIvkv9362WM3SJpnqRrJN0iab2kDyTLh/V4PEfSDZJuSo7H2cnyF0i6XtJ/S7pU\n0va9jrVbJI1IWiPp/yXPh/lY3CFpnaS1klYlyzr+t+JE0RtfBl5XtewM4PsR8WLg+8nzYbAZ+HBE\nHAAcCbxf0gEM7/F4GjguIg4GDgFeJ+lI4DPA+RHxIuBR4E96GGO3fQDYUPF8mI8FwLERcUhF34mO\n/604UfRARPwIeKRq8ZuBrySPvwIs7mpQPRIR90XEjc
njTZROCBMM7/GIiHgieTqa/AvgOOCbyfKh\nOR6S9gFOAP4peS6G9Fik6PjfihNFcewVEfclj38N7NXLYHpB0gJgIXA9Q3w8kqKWtcADwNXAL4GN\nEbE5WeUeSsl0GPwd8FFga/L8txneYwGli4bvSVotaUmyrON/K57hroAiIiQNVbtlSTsDlwOnRcTj\npQvHkmE7HhGxBThE0jhwBbB/j0PqCUlvBB6IiNWSjul1PAVxdERMStoTuFrSrZUvdupvxXcUxXG/\npOcBJP8/0ON4ukbSKKUkcUlErEgWD+3xKIuIjcA1wMuBcUnlC7t9gMmeBdY9RwFvknQH8H8pFTld\nwHAeCwAiYjL5/wFKFxEvowt/K04UxXElcGry+FTgX3sYS9ckZc4XAhsi4ryKl4b1eMxN7iSQNAa8\nhlK9zTXAW5PVhuJ4RMSZEbFPRCwA/gD4QUS8kyE8FgCSdpK0S/kx8FrgZrrwt+Ke2T0g6evAMZSG\nB74fOAtYCXwDmE9pyPS3R0R1hffAkXQ08GNgHc+WQ3+MUj3FMB6Pl1KqkByhdCH3jYj4pKQXUrqq\n3h1YA5wSEU/3LtLuSoqePhIRbxzWY5F87iuSp9sBX4uIcyT9Nh3+W3GiMDOzVC56MjOzVE4UZmaW\nyonCzMxSOVGYmVkqJwozM0vlRGFmZqmcKGxgSFosKSSlDnkhaVzSn7W5ryfqLN+SDAF9s6TLJO1Y\nZ72ftrP/Zkn63WRo6jnJ8xFJ35P0h93Yvw0GJwobJCcD1yb/pxkH2koUKaaSIaBfAjwDvLfyRZXM\niYhXdGj/s0TEBko9u9+YLDoHuC0iLu7G/m0wOFHYQEgGFTya0twEf1Cx/A8l/TyZCOiryeJzgX2T\nK/9lkhZUTSL1EUlLk8crk5E611eM1tmsHwMvSrZ/m6SLKQ25MK/yjqROjEg6JZnEaK2kf0zuBnaS\n9K1k3ZslvaOJOM4H3ifpLZTGT/pQi5/DhpxHj7VB8WbguxHxX5IelnQY8BTwceAVEfGQpN2Tdc8A\nXhIRh8DM8Ob1/HFEPJKMu/Sfki6PiIcbBZMMWvd64LvJohcDp0bEdcnr5fUOrBWjpN8F3gEcFRHT\nkr4EvBP4DXBvRJyQrLdr8v+3gT+NiHurY4mI70n6PPBp4H9ExHSj+M0q+Y7CBsXJlMb/Ifn/ZEqj\njV4WEQ8BZBz/5i8k3QRcB8yjdMJPM5bMJbEKuIvSgIcAd5aTRJV6Mb4KOIxSclqbPH8hpTGxXiPp\nM5JeGRGPJe97Q60kUeGnwHkR8evyAkl/0+CzmAG+o7ABkFyFHwcclIzFP0JpgpdlTW5iM7Mvmp6T\nbPcY4NXAyyPiSUk/LL+WYqp8p1IRH5TuBFoh4CsRceY2L0iHAm8APiXp+xHxySa2dwDwzxXbeC6l\n2fPMGvIdhQ2CtwJfjYjnR8SCiJgH3A78HHhbMromFUVPm4BdKt5/P7CnpN+WtAPPVvzuCjyaJIn9\nKc3pnbcf1Inx+8BbkwlqkLS7pOdL2ht4MiL+hVIiPLTJ/RxIqX6k7BBgbR4fwAafE4UNgpN5dvjl\nssspVWqfA/xHUnx0HkBSx/CTpDJ4WVJm/0ngBkpTj5ZnDfsusJ2kDZQqwGsVHbUlItbXifEWSnUX\n35P08ySu5wEHATckxVFnAZ+CUh1FkkS2IWkepelDK5v0OlFY0zzMuNkQknQh8J6I2NpwZRt6ThRm\nZpbKRU9mZpbKicLMzFI5UZiZWSonCjMzS+VEYWZmqZwozMwslROFmZmlcqIwM7NUThRmZpbq/wNP\ni6cKUGWQlQAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [] - } - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8MqX73B4s5tv", - "colab_type": "text" - }, - "source": [ - "## Multiple Linear Regression \n", - "- Our mean squared error for Simple Linear Regression looks kinda high..\n", - " - Let's try Multiple Linear Regression (predicting based on multiple variables rather than just `TAX`) and see if that produces more accurate predictions\n", - "\n", - "1. Set X to contain all values that are not `PRICE` from the unsplit data\n", - " - i.e. `CRIM`, `ZN`, `INDUS`, `CHAS`, `NOX`, `RM`, `AGE`, `DIS`, `RAD`, `TAX`, `PTRATIO`, `B`, `LSTAT`\n", - " - Y to still represent just 1 target value (`PRICE`)\n", - " - also from the unsplit data\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ZtQK5-f4M0Vg", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# set X to all variables except price\n", - "mX = bos.drop('PRICE', axis=1)\n", - "# and, like in the simple Linear Regression, set Y to price\n", - "mY = bos['PRICE']" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RTYG4-UwNDsK", - "colab_type": "text" - }, - "source": [ - "2. 
Split the data into `multi_X_train`, `multi_X_test`, `Y_train`, and `Y_test`\n", - " - Use `cuML`'s `train_test_split`\n", - " - And the same 70:30 train:test ratio" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "EsKxK8u_F7t8", - "colab_type": "code", - "outputId": "673a1a44-4d2f-4a45-8333-8f29782eaf65", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 86 - } - }, - "source": [ - "# train/test split (70:30)\n", - "mX_train, mX_test, mY_train, mY_test = train_test_split(mX, mY, train_size = 0.7)\n", - "\n", - "# see what it looks like\n", - "print(mX_train.shape)\n", - "print(mX_test.shape)\n", - "print(mY_train.shape)\n", - "print(mY_test.shape)" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - "(354, 13)\n", - "(152, 13)\n", - "(354,)\n", - "(152,)\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_Y40R17LGHsI", - "colab_type": "text" - }, - "source": [ - "3. fit the model with `multi_X_train` and corresponding `PRICE` (*y_train*) values \n", - " - so it can build an understanding of their relationships \n", - "4. 
predict `PRICE` (*y_test*) for the test set of independent (*multi_X_test*) values\n", - " - and compare `PRICE` predictions to actual median house (*y_test*) values\n", - " - use `sklearn`'s `mean_squared_error` to do this" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "N7qm1HuVO-1k", - "colab_type": "code", - "outputId": "7e291cec-e602-4ad9-a5b3-b70d7261f63d", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - } - }, - "source": [ - "# call Linear Regression model\n", - "mlr = LinearRegression()\n", - "\n", - "# train the model for multiple regression\n", - "mlr.fit(mX_train, mY_train)\n", - "\n", - "# make predictions for test X values\n", - "mY_pred = mlr.predict(mX_test)\n", - "\n", - "# calculate error\n", - "mmse = mean_squared_error(mY_test, mY_pred)\n", - "print(mmse)" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - "16.691811854229723\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jTdmleXCM_Xb", - "colab_type": "text" - }, - "source": [ - "5. 
visualize with `matplotlib`" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Q83NFMK1JKvL", - "colab_type": "code", - "outputId": "569cfa77-a66e-4b1b-9d70-ae4ef8e7936e", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 305 - } - }, - "source": [ - "# scatter actual and predicted results\n", - "plt.scatter(mY_test, mY_pred)\n", - "\n", - "# label graph\n", - "plt.xlabel(\"Actual Prices: $Y_i$\")\n", - "plt.ylabel(\"Predicted prices: $\\hat{Y}_i$\")\n", - "plt.title(\"Prices vs Predicted prices: $Y_i$ vs $\\hat{Y}_i$\")\n", - "\n", - "plt.show()" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "display_data", - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYoAAAEgCAYAAACq+TSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3X20HXV97/H3J4cTOCBygkYqAYxP\nCyQiCeJTQ7skSqMimiKKXLXc1tbV1q4K0tRwLy1I5RJXqlivWsuqtqBYAwKRVitYE66KRZuYQIhA\nfUDQg0JUjoIc4CT53j/27GRnZ8/sPbNnP39ea5119syeM/Pbc86Z78zv4ftTRGBmZpZmTq8LYGZm\n/c2BwszMMjlQmJlZJgcKMzPL5EBhZmaZHCjMzCyTA4WZmWVyoDCzoSTpNZJe0+tyDAN5wJ2ZDRtJ\nTwVuShZPiYif97I8g86BwsyGjqSPAtcDY8DrIuKdPS7SQHOgMDOzTG6jMDOzTA4UI07SNkkv73U5\n+oWkf5b0vuR1V85N7TFL2p9/p1YqB4ohI+mHkmYkPSLpgeQi9KS07SNiUUTc3MUitiXv52tHq+cm\nKdMrO1GGInr5O5U0T9LD9YFK0qckXSdJw3jsYedAMZxOi4gnAScAJwIX1G8gab+ul6o8w/75CumH\nzxwRDwGXA+dU10n6K+B5wFujg42ivTz2sHOgGGIRMQX8O/B82H3n+x5JtwO/lrRf7d2wpCOTO6/t\nkn4u6SPVfUk6XNK1yXv3SPrzmvfeI2kquZu7W9Ir6suSbPO5unV/J+nDre6j4OdLLXfyM0skfTs5\n7lrggJr39npSaHR+JH0KOAr41+Qp5y9bOF+px2wkKcf5kr4j6SFJ/yTpgIzP3LTczcpY9HeS+CCw\nXNKzJL0ReAeVnkePZnzGsv4+ch/bWhAR/hqiL+CHwCuT10cC24C/qXlvS7J+onZ7Kt0IbwMuAw6i\ncvE6KdlmDrAJ+GtgLvAs4AfAcuBo4EfA4cm2C4FnNyjXM4BHgYOT5THgJ8BLW91H3s+XVe5k+7nA\nvcC5wDhwBjALvK/BsbLOz+7tWjhfmcfM+Mx3JJ/rUOCWujI2/J1mlbuFc5P6OwE+Bnysyd/hJ4Ev\nAduBE1r4uy3l76PIsf3VwnWl1wXwV8m/0MpF4hFgOrkgfazuAvIHDbZ/JfCy5B9rvwb7fAlwX926\n84F/Ap4DPJjsY7xJ2b4O/F7y+hTg+8nrPPto+fNllTt5/dvA/STdxJN136BxoM
g6P7u3a+F8ZR4z\n4zP/cc3ya2rOXervNKvcLZybln8nKWV+PhDAm+rW/xnw3E79fRQ9tr+yv3pep2kdsSIi/iPlvR+l\nrD8SuDcidjR47xnA4ZKma9aNAV+LiO9JOge4CFgk6Ubg3RFxf4P9fAY4C7gS+B/JMjn3kefzpZY7\neX04MBXJVSRxb8p+s85Pvazj5jlmrdrPdW+yn0bv1Usrd+a5KfA7qTcXeBy4rnZlRHyk8eZAeX8f\nRY5tGdxGMXrSGvR+BByV0iD6I+CeiJis+To4Il4DEBGfiYiTqFx8Anh/yjGuAV4u6Qjgd0kuBDn3\n0Uzt58ssN5WqjQV1vWGOStlv1vmpP6dZx81zzFpH1m1fe5HMaqRNK3ezc9Pu7+R44I76ACXp5oyf\nKevvo8ixLYMDhVV9i8pFbLWkgyQdIGlpzXsPJw2KE5LGJD1f0oskHS1pmaT9gceAGWBXowNExHbg\nZipVMPdExJ0AefZR4DM1LHfy/n8CO4A/lzQu6XTgxRn7Sjs/D1Cp42/luHmOWeudko6QdCjwv4G1\nOc5Bo3JnnpsSfieLqbSd7KZK/qUH036gxL+P3Me2bA4UBkBE7AROo1IffB/wY+DMmvdeS+Uf8B7g\nZ8A/AocA+wOrk3U/BZ5Gpa47zWeo1DV/pmZd3n3k+Uxp5SYingBOB/4n8Asqn/e6jH01PD/ApcAF\nkqYl/UXWcfMcs85nqCS5+wHwfaClAXpp5W52bsj4nUj6uKSPNzn08dRdrIEXAFub/FwZfx9Fj20p\nnOvJrM9J+iHwhxntMgMhaWf4YUSsG6VjDwM/UZhZtxwH3D6Cxx547vVkZl0REW8fxWMPA1c9mZlZ\nJlc9mZlZJgcKMzPLNBRtFE996lNj4cKFvS6GmdlA2bRp088iYn6z7YYiUCxcuJCNGzf2uhhmZgNF\nUivpY1z1ZGZm2RwozMwskwOFmZllcqAwM7NMDhRmZpZpKHo9mZn1g3Wbp1hz493cPz3D4ZMTrFx+\nNCuWLOjIfso6ViscKMzMSrBu8xTnX7eVmdmdAExNz3D+dZXM5nku4K3sp6xjtarnVU/JhCmbJf1b\nsvxMSd+U9D1JayXN7XUZzcyaWXPj3bsv3FUzsztZc+Pdpe+nrGO1queBAngXcGfN8vuByyLiOcBD\ngLM+mlnfu396Jtf6dvZT1rFa1dNAkcyNeyqVmbVI5hFeBnwu2eQKYEVvSmdm1rrDJydyrW9nP2Ud\nq1W9fqL4EPCX7Jn/9inAdM2k6D8GGla4SXqHpI2SNm7fvr3zJTUzy7By+dFMjI/ttW5ifIyVy48u\nfT9lHatVPQsUkl4LPBgRm4r8fERcHhEnRsSJ8+c3zWllZtZRK5Ys4NLTj2PB5AQCFkxOcOnpx+Vu\nXG5lP2Udq1U9m7hI0qXA24AdwAHAk4HrgeXAb0TEDkkvAy6KiOVZ+zrxxBPDSQHNzPKRtCkiTmy2\nXc+eKCLi/Ig4IiIWAm8G1kfEW4ANwBnJZmcDn+9REc3MjN63UTTyHuDdkr5Hpc3iEz0uj5nZSOuL\nAXcRcTNwc/L6B8CLe1keMzPboy8ChZlZv+tmyox+40BhZtZEt1Nm9Jt+bKMwM+sr3U6Z0W8cKMzM\nmuh2yox+40BhZtZEt1Nm9Bu3UZhZYaPSwLty+dF7tVFAZ1NmtMLzUZhZ3xulBt7q5+mXoNjtc+9A\nYTbEOnnXmdXAO2yBAioX4H75XN0+926jMBtS1bvOqekZgj13nes2T5Wy/7SG3KnpGZauXl/acWxf\nIzUfhZl1Tqe7dGY15JYdlGxvozYfhZllWLd5iqWr1/PMVV/IfZfe7I6/yD5rNZoTodYojTPotm7P\nR+E2CrM+1W6D5eGTE0w1CBZK9lVkn7VqG3gbHQdGZ5xBt3W7cb1n81GUyfNR2DBaunp9wwvwgskJ\nblm1rOnP1wcaqASJRv/xre6zU2W13uj7+S
jMLFu7DZaNZkFLuy1s986/21Uh1l2uejLrU2lVR3ka\nLGu7dK7bPMW5a7c0DBbtNoL22ziDfjEsAxIdKMz6VNmjgdfceHfDIKHkWO3qp3EG/WCYBiQ6UJj1\nqbLv0tOql4LsC1e7d8XDcled1zANSHSgMOtjZd6lp1VlLciodlq3eYqV19zG7K7Ks8jU9Awrr7lt\nd9maGaa76ryGKeOsG7PNRkSRBueLbti2O0hUze4KLrphW0vHHOV5HIYp46yfKMyGQCvVO0WqsqZn\nZnOtrzdMd9V5dTrjrLPHmlnLGlXvnLt2C+es3cKCugtItxucy+i5Nag62RPM2WPNLJdG1TvVyqJ2\nLyDzDhznoUf3fXqYd+D47tdZd7b9OI9DN3UqMDt7rJnl0qwap502gQtPW8T4mPZaNz4mLjxtEdA8\nQ22jQX+Xnn7c0Ddkd1q3q/T8RGE24NKqd2oVvYA0qz5Ju7M97+o9PaM8vqJ83a7Sc6AwG3CNqnfq\ntXMBybrQpwWgnREj0w22F7pdpedAYdZj7fZeqc/iWp/4r5MXkKynmUEdXDYInD22AGePtUHVKMPr\nxPhYW/X43ew22aj8tQTcs/rUjhzb2tdq9lg/UZj1ULMBaUUu+N1sE6ge57yrb2Nng5vOUegGOwoc\nKMx6KGsWurR+8tBfWVqrxx7lbrDDzoHCrIcOmRhvOMpZ0PBJ46IbtvH4jl19lzvJacaHmwOFjZR+\ny2QqNV6f1nLYKKj0S6Oxu8EOLwcKGxn9mMl0usGo5yJGIXeS9Y5HZtvIyJPJdN3mKZauXs8zV32B\npavX7x5pXLa0xt55B443zPRamzqjlf2YlcFPFDbUaquaWp0vuptPHmkDp6opMuqrycCNxtZ9PQsU\nkg4Avgrsn5TjcxFxoaRnAp8FngJsAt4WEU/0qpw2uJr18a+qvxvvZsK1Zo3AacfL286ybvMUF92w\nbXcbx7wDx7nwtEVuU7CW9PKJ4nFgWUQ8Imkc+LqkfwfeDVwWEZ+V9HHg7cDf97CcNqAaXfDrNbob\n73bCtbyNwHm3r5+lDuChR2dZ+bnWZ6qz0dazNoqoeCRZHE++AlgGfC5ZfwWwogfFsyGQdWEXlbvq\n/febw7lrt+zVDjFMM5NBJWDWz1IHMLszRmKmOWtfTxuzJY1J2gI8CHwZ+D4wHRE7kk1+DPh2xwpJ\nu7AvmJzgsjMX89jsLqZnZvdJj11kytB+lhUw3VvKWtHTQBEROyNiMXAE8GLgmFZ/VtI7JG2UtHH7\n9u0dK6MNrqwLfrN2iGGaQyHrSWhQn5Ksu/qi11NETEvaALwMmJS0X/JUcQTQsF9iRFwOXA6VpIBd\nK6wNlAPG5+wOCJMT41z0ukoD7rlrtzTcvnqHPUyDx1YuP3qfNgqoTEA0qE9J1l09e6KQNF/SZPJ6\nAjgFuBPYAJyRbHY28PnelNAGWbXHU+00no/v2LX7ddqddEBHx01Uy9aNMRpVK5YsYM0bj2dyYs8Y\njHkHjrPmjOOHJhhaZ/UszbikF1BprB6jErCujoiLJT2LSvfYQ4HNwFsj4vGsfTnNuNVbunp9w3kS\nFkxOcMuqZU27zrab6jtNJ9KKmxXV92nGI+J2YEmD9T+g0l5hVlizLq71k/3U69S4iW6O0TAri1N4\n2FBqpYvriiULuGXVMlLy8nWkR1C3x2iYlcGBwoZSsy6ute0Ec1JSuHaiR9CwjdGw0dB21ZOki5P9\nbAG2RMR/t10qs0SetOD1277hhQvYcNf2fX62vp2g0cxsnRo3kZbbyb2PrJ/lChSS3hoRn65dFxF/\nLekwYDHwu5KeExF/VGYhbTTlSc7XaNtrN001bCROS+0xJrEroq15KpoFNk/wY4Mo7xPF2yS9CHh3\nROz+T4uIB4Abky+zUuRp+E3b9ryr981nlNYesCuCe1afWri8rQa2YRqjYaMhs41C0iJJV9WsejUw\nA6yXNL
+jJbORl6fhN23bnRGcf91WLli3teNtEnnmuzAbJM2eKP6DymhpACJiF7BK0unA1yR9kErb\nxB0R8Wjnimmj6PDJiYZdVxtd0NPmnobKxfqqW+/bPR9Fp9ok3KPJhlWzXk+/A1xSu0LSa4E/BJ4A\nTgD+FviRpO91pIQ2svIk50ube7oqa1ipgDe8sP3qIPdosmGV+UQREVuBt1SXJd0DfIfKfBFfrt1W\n0hEdKaGNrOqF+73/um13Ko7992t8b9PO3NMBbLirklgyTy+reu7RZMMqb2P2qyPirkZvRMSPSyiP\n2T4em92To2l6ZnZ3AzHs6T00R2pYpQSVJ4ZmiWrun55pewpU92iyYdWzXE9lcq6n4ZWWs2lyYpzH\nd+xqOoPd5MQ4iw4/mFu+/4vM7RYk1UONjjUm8YE3OYGeDZ++z/Vko6VolU5aQ3Baw3X9WAhg95Sf\nWU4+Zj5X3Xpfw/eqPaeg8fgNP0HYsHMKD+u4apXO1PTMPrPJNZO3Ibg6FuKWVcsAOO/q25jd2fyp\necNd2zOP1aibazufy2yQOFBYx7UzviCt59O8A8cbbl+92Fcv4mntFvWmpmcaVjvVqn+68bgJGxWF\nq54k/UZE/DRt2ayqnfEFaQ3EQGYPo7Q0He2of+LI+7lcTWWDqp02ik8Ap2YsmwH5Bs41kpXyIu3C\nW/Ygt/E5+04bmvW56oPCycfM59pNU4V7VJn1UuGqp4g4NWvZrKpR9RHArx/f0VZ9fnU+icvOXAzA\nOWu38Ozzv8jCjDQdhTXYXVq12MnHzN+n7eKqW+9zNZUNrEKBQtIbJR2cvL5A0nWS9pmtzgwqF/RL\nTz9un3aF6ZlZzlm7hSUX31Q4YNQ2KMOe9BxpaTo+dObi1ImKsszujH0u6tXPtWByAlHpYnvp6cex\n4a7t+wSFtJYSp/ewQVD0ieKvIuJhSScBr6RS7fTx8oplw2bFkgUcOLdxTedDj84W7i3UrC1iTNrr\nIr5iyYLCKTUaXdSrTzXVnlYrlizIdfF3eg8bBEUDRfU/81Tg8oj4AjC3nCLZsMq6gBathml2Ud4Z\nwWVnLt59EYf0qrBm6i/qtbPkLV29fnegS7v41z/JOL2HDYqijdlTkv6BStLA90vaH3e1tSbSGn+r\nWrkTr28kzsoaW1XfaFzfk2rywHEeeWwHs7vSu9LWX9Sz0n2k5XxKm3HPrN8VDRRvAl4F/G1ETEt6\nOrCyvGLZMGp0Aa2VdideDQ5T0zN75W1qNu6hqtFkR7U9qZauXr876WCtrBnvssZQVAf7uSusDYui\ngWIGOAg4C7gYGAemyyqUDafqhfKiG7bt8xSQVg1Tf+deNDNZ1tNKkRnvmo2h8Cx2NkyKBoqPAbuA\nZVQCxcPAtcCLSiqXDZhWB5NVL6BZ29e+l5UVNo9DJhqP5Ib0KrE5Eus2TzX8HO2ODTEbJEUDxUsi\n4gRJmwEi4iFJbsweUUXSc6fdcdfvq4wgAfDrJ3akXvTTqsSykgF67gkbJUUboGcljZHUBCTzZ+/K\n/hEbVmXmPOpE6g1oPA6iqjoeYqzBIL20z5E2hsLVTTaMij5RfBi4HniapEuAM4ALSiuVDZQy54ru\n5AC0rH2vWLKAc9duyfVzboewUVEoUETEVZI2Aa+g0j18RUTcWWrJbGCUWV/frAttO5qVx+0OZo21\nk+vproj4aER8xEFitKXlPCpSX9/qYDhRmb1uTov5OFopT5mfw2yYFM31dIWkyZrleZI+WV6xbJDk\nra9PG9Fcv680CyYnuOzMxTy+YxcZY+Rytx+43cGssUJzZkvaHBFLmq3rFs+ZPTguWLeVq269b6/x\nEBPjYw0vyPU9oGq3rQ7AS7NgcmL3wDcza6zVObOLVj3NkTSv5mCH4vm3rYkL1m3l03VBAio9i85Z\nu4UL1m3dva46lmJmdufu3khztGfbZu0YJx8zv+zim42sohf3DwD/Kema
ZPmNwCXlFMmG0brNU3z6\n1vsyt6m+f+IzDm04liKrmqnehru2Fyuome2jaK+nK5NeTycnq06PiO+UVywrW6+n4Wx1TMVVt97H\nv3zzR20PtPM8D2blKVxdFBHbgG0llsU6pMjI6bK1euEOyhmNnbdLa68DqVk/y9VGIenryfeHJf2q\n5uthSb/Kua8jJW2Q9B1J2yS9K1l/qKQvS/pu8n1es31ZtjJHThc1eWB6rqVOyNOltXaWvOrUpUUn\nUjIbRrkCRUScJEnAooh4cs3XwRHx5JzH3gGcFxHHAi8F3inpWGAV8JWIeC7wlWTZ2lDmyOki1m2e\n4pHHdnTlWADzDhzP9TRQZiDN6vprNqhyVz1FREj6AnBcOweOiJ8AP0lePyzpTmAB8Hrg5clmVwA3\nA+9p51ijLm3E8SET4yxdvb7j1S1rbrw7c1KgMo2PiQtPW5TrZ8oKpP1QxWfWCUW7x35bUmkpxSUt\nBJYA3wQOS4IIwE+Bw1J+5h2SNkrauH27e7hkaTTieHyO+PUTO/aqbll5zW0sufimUu+G122eKjUl\nx9gcMZ4yHPuguWMcNHc/zl27JVf509oz8rZzpD2ZnHf1bX6ysIFWNFC8BLhV0vcl3S5pq6Tbi+xI\n0pOozGVxTkTs1c4RldGADW9FI+LyiDgxIk6cP9995rM0GnH8pAP2Y3bn3qd2dlfw0KOzpdXTV++w\ny7RzVzB3vzl7fZYPnbmYD525mF0B0zP5y19W6o60J5BqunIHCxtURXs9LS/j4JLGqQSJqyLiumT1\nA5KeHhE/SaZYfbCMY426+kynz1z1haY/Uz+FaLOeQfXvP/rEjo6kDP/1Ezu55Hf3PvbS1etT2xla\nSd0B7U9dmpXQsNWymPWjooHiAeBPgZOo3PF/Hfj7PDtIGsU/AdwZER+seesG4GxgdfL98wXLaBla\nzdJavUtuVv/e6P0sotITqtFc1a2ov+i2285QRsrwZnOCe2yHDaqiVU9XAouA/wt8BDgW+FTOfSwF\n3gYsk7Ql+XoNlQBxiqTvAq9Mlq1kraa4qNbTN6t/zzvhUACPze5iXsFus/UX3bLaGdqRNQFSt8ti\nVqaiTxTPT7q1Vm2QlGtkdkR8ncqNZSOvKFgua1ErKS5q6+mb1b8XqWKamd3J/vvNYWJ8LPfP1190\n+2Vq0upTST+Uxaws7fR6eml1QdJLAKdvHSBZ1SCNUmxn3Q3XJu7L65czs03TitfvudFFt59ShPdT\nWczKUDTN+J3A0UA1y9tRwN1UBtFFRLygtBK2oNtpxoch3cPS1esbtiOMSXzgTce3lPK7Xv2TwfiY\nIMgcQ1GbDjwtrfgbXriADXdtH+jzbdaPWk0zXrTq6VUFf27gDcugqrSG12pVEuz9eaqvz7v6toa5\nmBYkF/D6AArsnjtCsM88FCcfM3+vQX8OCmb9p9ATRb/p5hNF2p34IE6Us27zVOaFv9HnyZpMqNkF\nvf5J7ORj5nPtpqlC+zKz9nV64qKR1eu8Sc3kyTW0YskCdqXcKKR9nqL1742q6zbctb3nyQrNrDnP\nSpdT2viDfuj6WKRarMjnyTvmIK1cHm9gNhj8RJFTWekeOqFIFtQyP0/a00xauTzewGww5HqikPTu\nrPfrRlgPpbLSPXRCkWqxsj7PBeu2clXNfNi1TzNZYzDqe0r1S9A1sz3yVj0dnHw/GngRlXQbAKcB\n3yqrUP2ujHQPnVC0Wqzdz7Nu89ReQaKq+jSTVq60nlL9eG7NRlmuQBER7wWQ9FXghIh4OFm+CGie\nZc46qlejk9fceHfjFL9UniYuO3Nxarn6Neia2R5FG7MPA56oWX6ClHkjrHt6VS2WVbV1+OREX1fX\nmVlzRQPFlcC3JF2fLK+gMhud9Vgv7tDTqpbEnrmr/eRgNrgK9XqKiEuA3wceSr5+PyL+T5kFs8HR\nqOeUgLe89KiGqUA8p7TZYCn0RJHM
JXEscEhEXCzpKEkvjoiRadC2PVqtWhqW9Cdmo6ZoUsC/B3YB\nyyLieZLmATdFRGnzaOfR7aSAvVAd2Tw1PcOYxM6I3b2G2r3IdivJ4TClPzEbBp1OCviSiDhB0maA\niHhI0tyC+7Im6u/Eq7mZyrgjX7d5ipXX3LY7w+vU9Awrr7mtrX2m6ff0J2bWWNGR2bOSxkiSgUqa\nT+UJwzoga/a4dnMjXXTDtn3SgM/uCi66YVvhfabph1nozCy/ok8UHwauB54m6RLgDOCvSivVkMtb\n1dPsjjvvHXnt8dMqHqdnis1lnSVrnMcwzPFhNqwKBYqIuErSJipTlgpYERF3llqyIVVm4r7a91s9\n9kU3bCs9CLR6kU9r9AbcyG3Wx4r2enp/RLwHuKvBOsuQlbgv7aKYNskQpI+8bmXuhyzzDhxvabu8\nga/ReIqlq9fnPidm1j1Fq55OAeqDwqsbrBtJWXfY7Sbuq+/1dPIx81lz492cu3ZL5h16o1xMacbH\nxIWnLWpp2yKBr54buc36W97ssX8C/CnwbEm317x1MPCNMgs2qJrdYZeZuC/tWAeMz9nn4t0sSIxJ\n7IrI3T5QxkW+n+f4MLP8vZ4+QyVT7OeT79WvF0bEW0ou20BqNidEmfM/pB3roUfztUFMjI/xgTcd\nzz2rT+WWVctyVfeU0ZOpn+f4MLOcgSIifhkRP6SSBPCXEXFvRNwLhKRPdqKAg6bZHXazqUTzpLgo\no2rmoLntzVFdxkW+6PSqZtYdRdsoXhAR09WFZMDdkpLKNNBaqUZJS5CXt2E47ViTE+P8+okdzO5s\n3ioxeeDcti7IZWWGddJAs/5VdMDdnCRtBwCSDsXzbwPt3WHnnco07VgXvW4RB81t7deR56kk7Wln\nxZIF3LJqWaGqKzPrf0Uv7h8A/lPSNcnyG4FLyinSYGvnDjtvw3DWsc5du6Wl8uYZg+GxDmajqeiA\nuyuTAXcnJ6tOj4jvlFeswVa0GqVI75+0YzUbpAf52hLK6AZrZoOpaNUTEbEtIj6SfA18kOiHeRLK\n7P2zcvnRjM9R6vt5G4w91sFsdOUdR/H1iDhJ0sPs3TVfQETEk0stXZf0S7VKmVOGVn+mNmXHvAPH\nufC0RV172jGz4VBoPop+0+58FKMwT0K7SffqgylUnnbcjdVscHVkPgpJ7856PyI+mGd//WLYq1XK\neGIq82nHzAZL3sbsg5PvRwMvAm5Ilk8DBnYa1GGvVimrIdpjHcxGU96R2e+NiPcCRwAnRMR5EXEe\n8ELgqE4UsBuGPYXEsD8xmVlnFe31dBiVNB5VTyTrcpH0SUkPSrqjZt2hkr4s6bvJ93lZ+yjDsKeQ\n8MxyZtaOogPurgS+Jen6ZHkFcEWB/fwz8JFkf1WrgK9ExGpJq5LljqcvH+ZqlayZ5czMmik64O4S\nSf8O/Fay6vcjYnOB/XxV0sK61a8HXp68vgK4Gc9z0RY3RJtZO4rOcCfgWOCQiLhY0lGSXhwRZTRo\nHxYRP0le/5QCVVq2r24+MXn+a7PhUrSN4mPAy4CzkuWHgY+WUqIaURnk0XCgh6R3SNooaeP27dvL\nPrQVVO2KOzU9Q7CnK24vRrqbWTmKBoqXRMQ7gcegkmYcmFtSmR6Q9HSA5PuDjTaKiMsj4sSIOHH+\n/PklHdralTcDrpn1v6KBYlbSGMndvqT5wK6SynQDcHby+mwqs+nZgHBXXLPhU7TX04eB64GnSboE\nOAO4IO9OJP0LlYbrp0r6MXAhsBq4WtLbgXuBNxUs41Dq9/r/YR+8aDaKcgeKpCH7q8Am4BVUEgKu\niIg78+4rIs5KeesVefc1CvoleWEWd8U1Gz65A0VEhKQvRsRxwF0dKFPf69VdfZ5UHL0qo7vimg2f\nolVP35b0ooj4r1JLMwB6eVffav1/r588hnnwotkoKtzrCbhV0vcl3S5pq6TbyyxYv+plr55WU3G4\n
55GZlanoE8XyUksxQHrZq6fV+n/3PDKzMuWdj+IA4I+B5wBbgU9ExI5OFKxf9bJXT6v1/+55ZGZl\nyvtEcQUwC3wNeDWVNB7vKrtu6ZQDAAAJq0lEQVRQ/azVu/pONSa3Uv/vnkdmVqa8geLYpLcTkj7B\nAE9WVFQrd/X90JjcrIxmZq3KNWe2pG9HxAlpy73S7pzZZRuFObjNbPB1ZM5s4HhJv6oeA5hIlkVl\niMWTc+6vZzo5zsCNyWY2THIFiogYa75V/+t01ZAbk81smBQdRzHQOj3OYNjn4Daz0VJ0HMVAK6tq\nKK36yo3JZjZMRjJQlFE11Kz6ymkszGxYjGTVUxlVQ06TYWajYiSfKMqoGnLPJjMbFSMZKKD9DKfu\n2WRmo2Ikq57K4J5NZjYqRvaJol3u2WRmo8KBog3u2WRmo8BVT2ZmlsmBwszMMjlQmJlZJrdRlKiT\nGWnNzHrFgaIkvZ6syMysU1z1VBKn9DCzYeVAURKn9DCzYeVAUZK01B1O6WFmg86BoiRO6WFmw8qN\n2SVxSg8zG1YOFCVySg8zG0auejIzs0wOFGZmlsmBwszMMjlQmJlZJgcKMzPL1JeBQtKrJN0t6XuS\nVvW6PGZmo6zvAoWkMeCjwKuBY4GzJB3b21KZmY2uvgsUwIuB70XEDyLiCeCzwOt7XCYzs5HVj4Fi\nAfCjmuUfJ+v2IukdkjZK2rh9+/auFc7MbNT0Y6BoSURcHhEnRsSJ8+fP73VxzMyGVj8GiingyJrl\nI5J1ZmbWA/0YKP4LeK6kZ0qaC7wZuKHHZTIzG1l9lxQwInZI+jPgRmAM+GREbOtxsczMRlbfBQqA\niPgi8MVel8PMzPqz6snMzPqIA4WZmWVyoDAzs0wOFGZmlsmBwszMMjlQmJlZJgcKMzPL5EBhZmaZ\nHCjMzCyTA4WZmWVyoDAzs0wOFGZmlqkvkwL2s3Wbp1hz493cPz3D4ZMTrFx+NCuW7DMBn5nZ0HCg\nyGHd5inOv24rM7M7AZianuH867YCOFiY2dBy1VMOa268e3eQqJqZ3cmaG+/uUYnMzDrPgSKH+6dn\ncq03MxsGDhQ5HD45kWu9mdkwcKDIYeXyo5kYH9tr3cT4GCuXH92jEpmZdZ4bs3OoNli715OZjRIH\nipxWLFngwGBmI8VVT2ZmlsmBwszMMjlQmJlZJgcKMzPL5EBhZmaZFBG9LkPbJG0H7u11Odr0VOBn\nvS5EH/H52MPnYm8+H3tr53w8IyLmN9toKALFMJC0MSJO7HU5+oXPxx4+F3vz+dhbN86Hq57MzCyT\nA4WZmWVyoOgfl/e6AH3G52MPn4u9+XzsrePnw20UZmaWyU8UZmaWyYHCzMwyOVD0gKRPSnpQ0h01\n6w6V9GVJ302+z+tlGbtF0pGSNkj6jqRtkt6VrB/V83GApG9Jui05H+9N1j9T0jclfU/SWklze13W\nbpE0JmmzpH9Llkf5XPxQ0lZJWyRtTNZ1/H/FgaI3/hl4Vd26VcBXIuK5wFeS5VGwAzgvIo4FXgq8\nU9KxjO75eBxYFhHHA4uBV0l6KfB+4LKIeA7wEPD2Hpax294F3FmzPMrnAuDkiFhcM3ai4/8rDhQ9\nEBFfBX5Rt/r1wBXJ6yuAFV0tVI9ExE8i4tvJ64epXBAWMLrnIyLikWRxPPkKYBnwuWT9yJwPSUcA\npwL/mCyLET0XGTr+v+JA0T8Oi4ifJK9/ChzWy8L0gqSFwBLgm4zw+UiqWrYADwJfBr4PTEfEjmST\nH1MJpqPgQ8BfAruS5acwuucCKjcNN0naJOkdybqO/694hrs+FBEhaaT6LUt6EnAtcE5E/Kpy41gx\naucjInYCiyVNAtcDx/S4SD0h6bXAgxGxSdLLe12ePnFSRExJehrwZUl31b7Zqf8VP1H0jwckPR0g\n+f5gj8vTNZLGqQSJqyLiumT1yJ6PqoiYBjYALwMmJVVv7I4Apn
pWsO5ZCrxO0g+Bz1Kpcvo7RvNc\nABARU8n3B6ncRLyYLvyvOFD0jxuAs5PXZwOf72FZuiapc/4EcGdEfLDmrVE9H/OTJwkkTQCnUGm3\n2QCckWw2EucjIs6PiCMiYiHwZmB9RLyFETwXAJIOknRw9TXwO8AddOF/xSOze0DSvwAvp5Ie+AHg\nQmAdcDVwFJWU6W+KiPoG76Ej6STga8BW9tRD/y8q7RSjeD5eQKVBcozKjdzVEXGxpGdRuas+FNgM\nvDUiHu9dSbsrqXr6i4h47aiei+RzX58s7gd8JiIukfQUOvy/4kBhZmaZXPVkZmaZHCjMzCyTA4WZ\nmWVyoDAzs0wOFGZmlsmBwszMMjlQ2NCQtEJSSMpMeSFpUtKftnmsR1LW70xSQN8h6RpJB6Zs9412\njt8qSc9LUlPPSZbHJN0k6fe6cXwbDg4UNkzOAr6efM8yCbQVKDLMJCmgnw88Afxx7ZuqmBMRv9mh\n4+8lIu6kMrL7tcmqS4C7I+LKbhzfhoMDhQ2FJKngSVTmJnhzzfrfk3R7MhHQp5LVq4FnJ3f+ayQt\nrJtE6i8kXZS8Xpdk6txWk62zVV8DnpPs/25JV1JJuXBk7RNJShmR9NZkEqMtkv4heRo4SNIXkm3v\nkHRmC+W4DPgTSW+gkj/p3Tk/h404Z4+1YfF64EsR8d+Sfi7phcBjwAXAb0bEzyQdmmy7Cnh+RCyG\n3enN0/xBRPwiybv0X5KujYifNytMkrTu1cCXklXPBc6OiFuT96vbLWpURknPA84ElkbErKSPAW8B\nfg3cHxGnJtsdknz/IvCHEXF/fVki4iZJHwAuBX47Imabld+slp8obFicRSX/D8n3s6hkG70mIn4G\nUDD/zZ9Lug24FTiSygU/y0Qyl8RG4D4qCQ8B7q0GiTppZXwF8EIqwWlLsvwsKjmxTpH0fkm/FRG/\nTH7uNY2CRI1vAB+MiJ9WV0j6myafxQzwE4UNgeQufBlwXJKLf4zKBC9rWtzFDva+aTog2e/LgVcC\nL4uIRyXdXH0vw0z1SaWmfFB5EshDwBURcf4+b0gnAK8B3ifpKxFxcQv7Oxb4p5p9/AaV2fPMmvIT\nhQ2DM4BPRcQzImJhRBwJ3APcDrwxya5JTdXTw8DBNT//APA0SU+RtD97Gn4PAR5KgsQxVOb0Ltv6\nlDJ+BTgjmaAGSYdKeoakw4FHI+LTVALhCS0eZxGV9pGqxcCWMj6ADT8HChsGZ7En/XLVtVQatS8B\n/l9SffRBgKSN4ZakMXhNUmd/MfAtKlOPVmcN+xKwn6Q7qTSAN6o6aktEbEsp43eotF3cJOn2pFxP\nB44DvpVUR10IvA8qbRRJENmHpCOpTB9a26XXgcJa5jTjZiNI0ieAP4qIXU03tpHnQGFmZplc9WRm\nZpkcKMzMLJMDhZmZZXKgMDOzTA4UZmaWyYHCzMwyOVCYmVkmBwozM8vkQGFmZpn+P0oQ58T6BoBj\nAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [] - } - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2X1RA6sgtZQ6", - "colab_type": "text" - }, - "source": [ - "## Conclusion\n", - "- looks like the multiple regression we ran does provide more accurate predictions than the simple linear regression\n", - " - this will not always be the case, so always be sure to check and confirm if the extra computing is worth it\n", - "\n", - "Anyways, that's how you implement both Simple and Multiple Linear Regression with `cuML`. Go forth and do great things. Thanks for stopping by!" - ] - } - ] -} From 533f9238b768442f154a4cf338dec04ecf7fde28 Mon Sep 17 00:00:00 2001 From: Winston Robson <43570913+gumdropsteve@users.noreply.github.com> Date: Thu, 26 Mar 2020 14:34:51 -0700 Subject: [PATCH 7/7] [WIP] Zestimate (#2) * running locally; general update to allow current breaks to be more understandable * making issues easier to understand; working on flow of notebook --- .../zillow_kaggle_zestimate_comp.ipynb | 7756 +++++++++-------- 1 file changed, 4026 insertions(+), 3730 deletions(-) diff --git a/colab_notebooks/zillow_kaggle_zestimate_comp.ipynb b/colab_notebooks/zillow_kaggle_zestimate_comp.ipynb index c8d68291..4dfce9de 100644 --- a/colab_notebooks/zillow_kaggle_zestimate_comp.ipynb +++ b/colab_notebooks/zillow_kaggle_zestimate_comp.ipynb @@ -1,3734 +1,4030 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "scfLT2i0MLyD" + }, + "source": [ + "# Environment Sanity Check #\n", + "\n", + "Click the _Runtime_ dropdown at the top of the page, then _Change Runtime Type_ and confirm the instance type is _GPU_.\n", + "\n", + "Check the output of `!nvidia-smi` to make sure you've been allocated a Tesla T4.\n", + "\n", + "#Setup:\n", + "\n", + "1. Install most recent Miniconda release compatible with Google Colab's Python install (3.6.7)\n", + "2. 
Install RAPIDS libraries\n", + "3. Set necessary environment variables\n", + "4. Copy RAPIDS .so files into current working directory, a workaround for conda/colab interactions\n", + "- **TLDR**\n", + " - Hit `Shift` + `Enter`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { "colab": { - "name": "zillow_kaggle_zestimate_comp.ipynb", - "version": "0.3.2", - "provenance": [], - "collapsed_sections": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU" + "base_uri": "https://localhost:8080/", + "height": 312 + }, + "colab_type": "code", + "id": "W-um5d-x7o46", + "outputId": "a604e66b-95d7-44fb-f8d3-848fcedaf796" + }, + "outputs": [], + "source": [ + "\"\"\"make sure we have the right GPU\n", + "> column 1 row 3 == Tesla T4\n", + "\"\"\"\n", + "# display gpu specs\n", + "!nvidia-smi" + ] }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "scfLT2i0MLyD", - "colab_type": "text" - }, - "source": [ - "# Environment Sanity Check #\n", - "\n", - "Click the _Runtime_ dropdown at the top of the page, then _Change Runtime Type_ and confirm the instance type is _GPU_.\n", - "\n", - "Check the output of `!nvidia-smi` to make sure you've been allocated a Tesla T4.\n", - "\n", - "#Setup:\n", - "\n", - "1. Install most recent Miniconda release compatible with Google Colab's Python install (3.6.7)\n", - "2. Install RAPIDS libraries\n", - "3. Set necessary environment variables\n", - "4. 
Copy RAPIDS .so files into current working directory, a workaround for conda/colab interactions\n", - "- **TLDR**\n", - " - Hit `Shift` + `Enter`" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "W-um5d-x7o46", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 312 - }, - "outputId": "a604e66b-95d7-44fb-f8d3-848fcedaf796" - }, - "source": [ - "\"\"\"make sure we have the right GPU\n", - "> column 1 row 3 == Tesla T4\n", - "\"\"\"\n", - "# display gpu specs\n", - "!nvidia-smi" - ], - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Mon Sep 9 14:17:51 2019 \n", - "+-----------------------------------------------------------------------------+\n", - "| NVIDIA-SMI 430.40 Driver Version: 418.67 CUDA Version: 10.1 |\n", - "|-------------------------------+----------------------+----------------------+\n", - "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", - "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. 
|\n", - "|===============================+======================+======================|\n", - "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", - "| N/A 68C P0 28W / 70W | 0MiB / 15079MiB | 0% Default |\n", - "+-------------------------------+----------------------+----------------------+\n", - " \n", - "+-----------------------------------------------------------------------------+\n", - "| Processes: GPU Memory |\n", - "| GPU PID Type Process name Usage |\n", - "|=============================================================================|\n", - "| No running processes found |\n", - "+-----------------------------------------------------------------------------+\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kkEdr1VmigyU", - "colab_type": "text" - }, - "source": [ - "### Install RAPIDS AI" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "p129YxxnihcV", - "colab_type": "code", - "colab": {} - }, - "source": [ - "!wget -nc https://raw.githubusercontent.com/rapidsai/notebooks-contrib/master/utils/rapids-colab.sh\n", - "# RAPIDS 0.10 nightly\n", - "!bash rapids-colab.sh \n", - "\n", - "import sys, os\n", - "\n", - "sys.path.append('/usr/local/lib/python3.6/site-packages/')\n", - "os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'\n", - "os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1CsdVW7SU9Li", - "colab_type": "text" - }, - "source": [ - "# Zillow Kaggle Competition RAPIDS Conversion\n", - "- initially based off eswar3's [Zillow prediction models]( https://github.com/eswar3/Zillow-prediction-models) repo\n", - "## Download Data\n", - "- to download the data, please plug in your kaggle api username & key\n", - " - you can set up your kaggle api at `https://www.kaggle.com/YOUR USERNAME HERE/account`" - ] - }, - { - "cell_type": "code", - "metadata": { - 
"id": "x1dLRTm168Tk", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# 5b4ecdb3cb122fb692a8349124960424\n", - "# Info on how to get your api key (kaggle.json) here: https://github.com/Kaggle/kaggle-api#api-credentials\n", - "!pip install kaggle\n", - "!mkdir /root/.kaggle\n", - "# plug api -- get your own API key\n", - "!echo '{\"username\":\"warobson\",\"key\":\"\"}' > /root/.kaggle/kaggle.json\n", - "!chmod 600 /root/.kaggle/kaggle.json\n", - "# !kaggle datasets download\n", - "!kaggle competitions download -c zillow-prize-1\n", - "\n", - "# unzip kaggle data\n", - "!unzip -q \"/content/sample_submission.csv.zip\"\n", - "!unzip -q \"/content/train_2016_v2.csv.zip\"\n", - "!unzip -q \"/content/properties_2016.csv.zip\"\n", - "!unzip -q \"/content/train_2017.csv.zip\"\n", - "!unzip -q \"/content/properties_2017.csv.zip\"" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LICr9uz8do9K", - "colab_type": "text" - }, - "source": [ - "#### How is the data saved?\n", - "- inside content directory " - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "6n75DyJ-dm4B", - "colab_type": "code", - "outputId": "64ac687e-39d6-4bb1-f4b7-5476c9de3b84", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 173 - } - }, - "source": [ - "# display content folder contents\n", - "!ls \"/content/\"" - ], - "execution_count": 4, - "outputs": [ - { - "output_type": "stream", - "text": [ - "0.9\t\t\t\t sample_data\n", - "env-check.py\t\t\t sample_submission.csv\n", - "__MACOSX\t\t\t sample_submission.csv.zip\n", - "Miniconda3-4.5.4-Linux-x86_64.sh train_2016_v2.csv\n", - "properties_2016.csv\t\t train_2016_v2.csv.zip\n", - "properties_2016.csv.zip\t\t train_2017.csv\n", - "properties_2017.csv\t\t train_2017.csv.zip\n", - "properties_2017.csv.zip\t\t zillow_data_dictionary.xlsx.zip\n", - "rapids-colab.sh\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": 
"Lpa1b4edIXuT", - "colab_type": "text" - }, - "source": [ - "# Imports\n", - "### RAPIDS\n", - "* `cuDf`\n", - " - words here\n", - "* `cuML`\n", - " - words here\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ZKN5zuROroJD", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# rapids \n", - "import cudf, cuml \n", - "# switch to cupy next update (once docker has it)\n", - "import numpy as np\n", - "# general \n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YJeywzd2efw7", - "colab_type": "text" - }, - "source": [ - "## Data\n", - "* `properties_2016`\n", - " - aprox. 27,000,000 residential properties \n", - " - 58 attributes each\n", - "* `train_2016_v2`\n", - " - 90,000 transaction records for closings in the year 2016\n", - " * Merge datasets on `property_id`" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "2EfApIzCfEtr", - "colab_type": "code", - "outputId": "bc1e37d1-9ab8-4561-fa39-5af420480a72", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 156 - } - }, - "source": [ - "# import 2016 properties\n", - "prop2016 = cudf.read_csv('/content/properties_2016.csv')\n", - "# peek display 2016 properties\n", - "print(prop2016.head())" - ], - "execution_count": 154, - "outputs": [ - { - "output_type": "stream", - "text": [ - " parcelid airconditioningtypeid ... taxdelinquencyyear censustractandblock\n", - "0 10754147 null ... null null\n", - "1 10759547 null ... null null\n", - "2 10843547 null ... null null\n", - "3 10859147 null ... null null\n", - "4 10879947 null ... 
null null\n", - "\n", - "[5 rows x 58 columns]\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "uynoUxpx8Xsn", - "colab_type": "code", - "outputId": "b64b7b32-c1f9-4cf3-c50d-36e90dc51a64", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 121 - } - }, - "source": [ - "# import train 2016 data\n", - "train2016 = cudf.read_csv('/content/train_2016_v2.csv',\n", - " parse_dates=[\"transactiondate\"])\n", - "# peek display 2016 train\n", - "print(train2016.head())" - ], - "execution_count": 155, - "outputs": [ - { - "output_type": "stream", - "text": [ - " parcelid logerror transactiondate\n", - "0 11016594 0.0276 2016-01-01\n", - "1 14366692 -0.1684 2016-01-01\n", - "2 12098116 -0.0040 2016-01-01\n", - "3 12643413 0.0218 2016-01-02\n", - "4 14432541 -0.0050 2016-01-02\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gGiscxESJDrl", - "colab_type": "text" - }, - "source": [ - "## [Zillow Prediction Model](https://github.com/eswar3/Zillow-prediction-models/blob/master/Step%202a-Approach1.ipynb)\n", - "\n", - " In this approach the properties data and transaction data are merged together before adressing any missing values\n", - "\n", - "\n", - "#### Merging Data \n", - " - we will start by merging the two dataframes\n", - " - then rename the new dataframe's attributes to be meaningful \n", - " - e.g. 
from `pooltypeid7` to `pool_with_spa_tub_no` and `structuretaxvaluedollarcnt` to `structure_tax`" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "o4CvSIcwm4B2", - "colab_type": "code", - "outputId": "4e59a51a-ebd6-4fe5-b037-3165e57e3b85", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 156 - } - }, - "source": [ - "# merge 2016 train and property dataframes by parcel id\n", - "train = train2016.merge(prop2016, how='left', on='parcelid')\n", - "\n", - "# work on a copy\n", - "df_train = train.copy() # [:int(0.5*len(train))]\n", - "\n", - "# add column indicating month of transaction\n", - "df_train['transaction_month'] = df_train['transactiondate'].dt.month\n", - "\n", - "# set columns to be renamed for general english understandability \n", - "rename_these = {\"bathroomcnt\": \"total_bath\",\n", - " \"fullbathcnt\": \"full_bath\",\n", - " \"threequarterbathnbr\": \"half_bath\",\n", - " \"yardbuildingsqft17\": \"patio_sqft\",\n", - " \"yardbuildingsqft26\":\"storage_sqft\",\n", - " \"decktypeid\": \"deck_flag\",\n", - " \"pooltypeid7\": \"pool_with_spa_tub_no\", \n", - " \"pooltypeid2\": \"pool_with_spa_tub_yes\",\n", - " \"hashottuborspa\": \"has_hottub_or_spa\", \n", - " \"pooltypeid10\": \"just_hottub_or_spa\",\n", - " \"calculatedfinishedsquarefeet\":\"total_finished_living_area_sqft\", \n", - " \"finishedsquarefeet12\": \"finished_living_area_sqft\",\n", - " \"lotsizesquarefeet\": \"lot_area_sqft\",\n", - " \"finishedsquarefeet50\":\"finished_living_area_entryfloor_sqft1\",\n", - " \"finishedfloor1squarefeet\":\"finished_living_area_entryfloor_sqft2\",\n", - " \"finishedsquarefeet6\": \"base_unfinished_and_finished_area_sqft\",\n", - " \"finishedsquarefeet15\": \"total_area_sqft\",\n", - " \"finishedsquarefeet13\": \"preimeter_living_area_sqft\",\n", - " \"taxvaluedollarcnt\":\"total_parcel_tax\",\n", - " \"landtaxvaluedollarcnt\":\"land_tax\",\n", - " \"taxamount\":\"total_property_tax_2016\",\n", - " 
\"structuretaxvaluedollarcnt\":\"structure_tax\",\n", - " \"garagetotalsqft\":\"garage_sqft\",\n", - " \"fireplacecnt\":\"fireplace_count\",\n", - " \"buildingqualitytypeid \":\"building_quality_id\",\n", - " \"heatingorsystemtypeid\":\"heating_system_id\",\n", - " \"airconditioningtypeid\":\"ac_id\",\n", - " \"storytypeid\": \"basement_flag\",\n", - " \"basementsqft\": \"basement_sqft\",\n", - " \"poolsizesum\": \"pool_sqft\",\n", - " \"poolcnt\": \"pool_count\"}\n", - "# rename columns \n", - "df_train = df_train.rename(columns = rename_these)\n", - "\n", - "# what's the data frame look like?\n", - "print(df_train.head())" - ], - "execution_count": 156, - "outputs": [ - { - "output_type": "stream", - "text": [ - " parcelid logerror ... censustractandblock transaction_month\n", - "0 11827818 0.0402 ... 6.037532e+13 3\n", - "1 12123024 0.0296 ... 6.037463e+13 3\n", - "2 13867327 0.0344 ... 6.059011e+13 3\n", - "3 12681894 0.0060 ... 6.037651e+13 3\n", - "4 12848541 0.0695 ... 6.037409e+13 3\n", - "\n", - "[5 rows x 61 columns]\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YdtyBI2jFnJv", - "colab_type": "text" - }, - "source": [ - "## Conforming Attribute Values\n", - "### #0 boolean columns & null = 0s cases \n", - "* `pool_count`, `pool_with_spa_tub_no` and `pool_with_spa_tub_yes` are all binary variables, replace all NULL values with zero\n", - "* `basement_flag` has values 7 & `Null` but is supposed to be bool, convert the `7`s to `1`s and the `Null`s to `0`s \n", - "* patio and shed variables with null values are assumed to have none\n", - "* deck_flag has only 2 values, `66` and `null`\n", - " - convert it into binary flag\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "z3bPdNONHTYI", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# replace missing pool count values so we booling\n", - "the_bool_club = ['pool_count','pool_with_spa_tub_no','pool_with_spa_tub_yes',\n", - " 
'basement_flag','patio_sqft','storage_sqft', 'deck_flag']\n", - "for col in the_bool_club:\n", - " # convert null values to 0\n", - " df_train[col]=df_train[col].fillna(0)\n", - "# convert 7s and 66s to 1s\n", - "df_train['basement_flag'] = df_train['basement_flag'].replace(7, 1)\n", - "df_train['deck_flag'] = df_train['deck_flag'].replace(66, 1)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5MbGy6r7JLLD", - "colab_type": "text" - }, - "source": [ - "### #1 The pool\n", - "* When pool is present and if it has tub/spa then `just_hottub_or_spa` = 0" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "B3-1V93smA9A", - "colab_type": "code", - "outputId": "52e1a5d7-869a-443f-ac2d-40504992dc14", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 156 - } - }, - "source": [ - "print(f'before\\n{df_train.just_hottub_or_spa.value_counts()}\\n')\n", - "\n", - "# if poolcnt=1 and has_hottub_or_spa=1 and just_hottub_or_spa is null\n", - "conditions = ((df_train['pool_count'] == 1) \n", - " & (df_train['has_hottub_or_spa'] == 1) \n", - " & (df_train['just_hottub_or_spa'].isna() == True))\n", - "# then just_hottub_or_spa = 0\n", - "df_train.just_hottub_or_spa.loc[conditions] = 0\n", - "\n", - "print(f'after\\n{df_train.just_hottub_or_spa.value_counts()}')\n" - ], - "execution_count": 158, - "outputs": [ - { - "output_type": "stream", - "text": [ - "before\n", - "1.0 1161\n", - "Name: just_hottub_or_spa, dtype: int32\n", - "\n", - "after\n", - "0.0 1204\n", - "1.0 1161\n", - "Name: just_hottub_or_spa, dtype: int32\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "v6E3-_XlSGBs", - "colab_type": "text" - }, - "source": [ - "\n", - "- when `has_hottub_or_spa` is null and `just_hottub_or_spa` is null\n", - " - both should be zero\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Xa12WFccSGM6", - "colab_type": "code", - "colab": {} - }, - 
"source": [ - "# if both has hottub and just hottub are null\n", - "conditions = ((df_train['has_hottub_or_spa'].isna() == True) \n", - " & (df_train['just_hottub_or_spa'].isna() == True))\n", - "# just hottub or spa = 0 \n", - "df_train.just_hottub_or_spa.loc[conditions] = 0\n", - "\n", - "# now, if has hottub is null and just hottub is 0 \n", - "conditions = ((df_train['has_hottub_or_spa'].isna() == True) \n", - " & (df_train['just_hottub_or_spa'] == 0))\n", - "# has hottub or spa = 0 \n", - "df_train.has_hottub_or_spa.loc[conditions] = 0" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5umCCWN73qxw", - "colab_type": "text" - }, - "source": [ - "- when there is no pool\n", - " - if there is tub/spa \n", - " - then `just_hottub_or_spa` = 1" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "FBgs7zJm3qk-", - "colab_type": "code", - "outputId": "78c76ac5-2b7f-4f98-9615-8a335bc3214e", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 69 - } - }, - "source": [ - "# when poolcnt=0, has_hottub_or_spa=1\n", - "conditions = ((df_train['pool_count'] == 0) \n", - " & (df_train['has_hottub_or_spa'] == 1))\n", - "# just_hottub_or_spa=1\n", - "df_train.just_hottub_or_spa.loc[conditions] = 1\n", - "\n", - "print(df_train.just_hottub_or_spa.value_counts())" - ], - "execution_count": 160, - "outputs": [ - { - "output_type": "stream", - "text": [ - "0.0 89114\n", - "1.0 1161\n", - "Name: just_hottub_or_spa, dtype: int32\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3LsRr1aoSCVx", - "colab_type": "text" - }, - "source": [ - "* When there is no pool, set pool size to zero instead of na" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "NtdyXCbx0TKx", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# where there is no pool\n", - "conditions = df_train['pool_count']==0\n", - "# square footage of non existant pool is 0 \n", - 
"df_train.pool_sqft.loc[conditions] = 0" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3hQFkXmAgQPY", - "colab_type": "text" - }, - "source": [ - "### #2 The basement\n", - "* Where `basement_flag` is zero, `basement_sqft` should also be zero\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "kMuCOqAmLTmY", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# where there is no basement\n", - "conditions = df_train['basement_flag'] == 0\n", - "# fun fact: we just did this with the pool\n", - "df_train.basement_sqft.loc[conditions] = 0" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wU6Uohb-PDYB", - "colab_type": "text" - }, - "source": [ - "### #3 The fireplace\n", - "There seems to be inconsistency between the `fireplace_flag` and `fireplace_count`\n", - "- 90,053 flag values are null\n", - "- 80,688 `fireplace_count` values are null\n", - " * 9,385 (-11.5%) difference, but a boatload either way" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "OZM6lXmmpj5k", - "colab_type": "code", - "outputId": "ecf62d1d-b036-41ad-8052-a3090ae590ef", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 52 - } - }, - "source": [ - "print(f\"there are {df_train['fireplace_count'].isna().sum()} fireplace_count \\\n", - "nulls\\nthere are {df_train['fireplaceflag'].isna().sum()} fireplaceflag nulls\")" - ], - "execution_count": 163, - "outputs": [ - { - "output_type": "stream", - "text": [ - "there are 80668 fireplace_count nulls\n", - "there are 90053 fireplaceflag nulls\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "v9ZAzFoIpkSF", - "colab_type": "text" - }, - "source": [ - "* context driven solutions\n", - " * where neither flag nor count exists, `fireplaceflag == False`\n", - " * when `fireplace_count` is more than zero `fireplaceflag` should be `True`\n", - " * if 
`fireplaceflag == False`, the `fireplace_count` is logically `0`" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "i3YRZgU_qZhA", - "colab_type": "code", - "outputId": "e45a7a96-2e1d-47d2-a0bd-48ece42cbb6e", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 52 - } - }, - "source": [ - "# null flags with null counts are zero\n", - "conditions = ((df_train['fireplace_count'].isna()==True) \n", - " & (df_train['fireplaceflag'].isna()==True))\n", - "df_train.fireplaceflag.loc[conditions] = False\n", - "\n", - "# true flags for positive fireplace counts\n", - "conditions = df_train['fireplace_count'] > 0\n", - "df_train.fireplaceflag.loc[conditions] = True\n", - "\n", - "# set fireplace count nulls to 0 where false flags are\n", - "conditions = ((df_train['fireplace_count'].isna()==True) \n", - " & (df_train['fireplaceflag']==False))\n", - "df_train.fireplace_count.loc[conditions] = 0\n", - "\n", - "print(f\"there are {df_train['fireplace_count'].isna().sum()} fireplace_count \\\n", - "nulls\\nthere are {df_train['fireplaceflag'].isna().sum()} fireplaceflag nulls\")" - ], - "execution_count": 164, - "outputs": [ - { - "output_type": "stream", - "text": [ - "there are 222 fireplace_count nulls\n", - "there are 0 fireplaceflag nulls\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "pYntUejosOn3" - }, - "source": [ - "### #4 The garage\n", - "* Properties with no garages would have NA values for both " - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "L9mGs-mK9E0Q", - "colab_type": "code", - "colab": {} - }, - "source": [ - "garage = ['garagecarcnt', 'garage_sqft']\n", - "# where garage car count and garage square feet are null\n", - "conditions = ((df_train['garagecarcnt'].isna()==True) \n", - " & (df_train['garage_sqft'].isna()==True))\n", - "# set both to 0, one column at a time: `df_train[garage]` builds a separate\n", - "# two-column frame, so assigning through it is not guaranteed to reach df_train\n", - "for col in garage:\n", - "    df_train[col].loc[conditions] = 0" - ], - "execution_count": 0, - "outputs": [] - }, - { - 
"cell_type": "markdown", - "metadata": { - "id": "0uV115W6-ohW", - "colab_type": "text" - }, - "source": [ - "Exploring the data farther, we see\n", - "- `garage_sqft` holds over 8,900 measurements of 0 despite the garage's car count being 1 or more \n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "gbbUIbwJ-ouS", - "colab_type": "code", - "outputId": "310a4cdf-01a0-4fc3-ed1b-0e2f5e668518", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 121 - } - }, - "source": [ - "# show rows where garage count and square feet don't add up\n", - "conditions = (df_train.garagecarcnt > 0) & (df_train.garage_sqft == 0)\n", - "print(df_train.loc[conditions][garage].head())" - ], - "execution_count": 166, - "outputs": [ - { - "output_type": "stream", - "text": [ - " garagecarcnt garage_sqft\n", - "16 2.0 0.0\n", - "29 1.0 0.0\n", - "32 2.0 0.0\n", - "49 1.0 0.0\n", - "52 2.0 0.0\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5I1O76QKA8Cb", - "colab_type": "text" - }, - "source": [ - "- these 0 values need to be null\n", - " - because no garage holding 1 or more cars in 2016 measured 0sqft" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "eWVtoty0A9Jt", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# where garage count and square feet don't add up\n", - "conditions = (df_train.garagecarcnt>0) & (df_train.garage_sqft==0)\n", - "# insert a NaN value\n", - "df_train.garage_sqft.loc[conditions] = np.nan" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "seb6r5wx5Bbz" - }, - "source": [ - "### #5 The bath\n", - "* `total_bath` & `calculatedbathnbr` are near-duplicates w/ `calculated` having more nulls\n", - " - let's drop it\n", - "* if `full_bath` is null and `half_bath` is also null\n", - " - let's make `total_bath` = 0 \n", - " - because we can't truthfully assume it's any more " - ] - }, - { - 
"cell_type": "code", - "metadata": { - "colab_type": "code", - "id": "EgMNToed5BMu", - "colab": {} - }, - "source": [ - "# drop calculated bath column\n", - "df_train = df_train.drop('calculatedbathnbr', axis=1)\n", - "\n", - "# if full_bath is null & half_bath is null\n", - "conditions = ((df_train['full_bath'].isnull()==True) \n", - " & (df_train['half_bath'].isnull()==True) \n", - " & (df_train['total_bath']==0))\n", - "# total_bath=0\n", - "df_train.total_bath.loc[conditions] = np.nan\n", - "\n", - "# when full_bath==total_bath, half_bath=0 \n", - "df_train.half_bath.loc[df_train.full_bath == df_train.total_bath] = 0" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Sh8cG0pr4_hl" - }, - "source": [ - "### #6 Mode Imputation \n", - "* scaling down the latitude and longitide\n", - " - knn imput takes more time due to the larger numbers\n", - " - standardizing gives better results on most algorithms\n", - " - this is a competition, we came to win" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "kitrNxKgLWUd", - "colab_type": "code", - "colab": {} - }, - "source": [ - "df_train['latitude'] = df_train.latitude / 100000\n", - "df_train['longitude'] = df_train.longitude / 100000" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "y6bhRhu5YZ1d", - "colab_type": "text" - }, - "source": [ - "### #7 numberofstories & unitcnt & roomcnt\n", - "* we can devise unit count based on property land type\n", - " - so we can now go ahead and correct the unit counts for each given property" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "yHZH4rMNLfBA", - "colab_type": "code", - "outputId": "97106bb4-10f2-49a9-f821-03a3972db136", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 208 - } - }, - "source": [ - "# where room count is 0, go ahead and NaN it\n", - "df_train.roomcnt.loc[df_train['roomcnt'] == 0] = 
np.nan\n", - "\n", - "\"\"\"\n", - "propertylandusetypeid & unitcnt are related \n", - " these are the propertylandusetypeid codes & their definitions\n", - " \n", - "#246 -Duplex (2 Units, Any Combination)\n", - "#247 -Triplex (3 Units, Any Combination)\n", - "#248 -Quadruplex (4 Units, Any Combination)\n", - "#260 -Residential General\n", - "#261 -Single Family Residential\n", - "#263 -Mobile Home\n", - "#264 -Townhouse\n", - "#266 -Condominium\n", - "#267 -Cooperative\n", - "#269 -Planned Unit Development\n", - "#275 -Residential Common Area \n", - "#31 - Commercial/Office/Residential Mixed Used\n", - "#47 -Store/Office (Mixed Use)\n", - "#265 -Cluster Home\n", - "\"\"\"\n", - "\n", - "# one unit \n", - "ones = [260,261,263,264,266,267,269,275]\n", - "for one in ones:\n", - " # adjust conditions to one unit indicator\n", - " conditions = ((df_train['propertylandusetypeid'] == one) \n", - " & (df_train['unitcnt'].isna()))\n", - " df_train.unitcnt.loc[conditions] = 1\n", - "\n", - "# two units \n", - "twos = [31,47,246]\n", - "for two in twos:\n", - " # adjust conditions to two unit indicator\n", - " conditions = ((df_train['propertylandusetypeid'] == two) \n", - " & (df_train['unitcnt'].isna()))\n", - " df_train.unitcnt.loc[conditions] = 2\n", - "\n", - "# three units\n", - "conditions = ((df_train['propertylandusetypeid'] == 247) \n", - " & (df_train['unitcnt'].isna()))\n", - "df_train.unitcnt.loc[conditions] = 3\n", - "\n", - "# four units\n", - "conditions = ((df_train['propertylandusetypeid'] == 248) \n", - " & (df_train['unitcnt'].isna()))\n", - "df_train.unitcnt.loc[conditions] = 4\n", - "\n", - "# let's see how out unit counts look\n", - "print(df_train.unitcnt.value_counts())" - ], - "execution_count": 170, - "outputs": [ - { - "output_type": "stream", - "text": [ - "1.0 86035\n", - "2.0 2372\n", - "4.0 884\n", - "3.0 622\n", - "5.0 1\n", - "6.0 1\n", - "9.0 1\n", - "11.0 1\n", - "70.0 1\n", - "143.0 1\n", - "Name: unitcnt, dtype: int32\n" - ], - "name": 
"stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "02yLicmxLs3C", - "colab_type": "text" - }, - "source": [ - "### #8 Time to Cut\n", - "**Because of the adjustments made so far a number of columns are no longer needed**\n", - "* transaction date column is no longer of use\n", - " - and can be dropped \n", - "* `preimeter_living_area_sqft` and `total_finished_living_area_sqft` have the same values \n", - " - except that `preimeter_living_area_sqft` has more duplicates\n", - "* `total_area_sqft` and `total_finished_living_area_sqft` have the same values \n", - " - except that \"total_area_sqft\" has more duplicates\n", - "* `total_finished_living_area_sqft` and `finished_living_area_sqft` have the same values \n", - " - except that `finished_living_area_sqft` has more duplicates\n", - "* `base_unfinished_and_finished_area_sqft` and `total_finished_living_area_sqft` have the same values \n", - " - except that `base_unfinished_and_finished_area_sqft` has more duplicates\n", - "* different counties follow different land use code\n", - " - to compare different counties, zillow has created it's own `propertylandusetypeid`\n", - " - hence we can drop `propertycountylandusecode`\n", - " - the same applies to `propertyzoningdesc`\n", - "* Most zip id's either invalid or out of city\n", - " - since enough information about location is given in latitude and longitude \n", - " - let's drop other location related fields\n", - " - `regionidcity`\n", - " - `regionidzip`\n", - " - `regionidneighborhood`\n", - "* `assessmentyear` has a constant value for all rows\n", - " - let's drop it" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "OtOgzOqHLyid", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# collect columns to drop\n", - "cut = ['propertyzoningdesc','propertycountylandusecode',\n", - " 'base_unfinished_and_finished_area_sqft','finished_living_area_sqft',\n", - " 
'total_area_sqft','preimeter_living_area_sqft','regionidzip',\n", - " 'regionidcity','regionidneighborhood','assessmentyear','transactiondate',\n", - " 'censustractandblock']\n", - "# cut columns form dataframe\n", - "df_train = df_train.drop(cut, axis=1)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "icDvpvSD6BSb", - "colab_type": "text" - }, - "source": [ - "### #9 Tax, Year, & Census\n", - "- if tax deliquency flag is null, assume there is no unpaid tax on the property\n", - " - an issue arrises here because `taxdelinquencyflag` is a `StringColumn`\n", - " - i.e. null values indicate no tax delinquency, all other values are `Y` for yes\n", - " - because of this, the normal method of.." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "8lYcO_T5XKNN", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 311 - }, - "outputId": "596cfad3-890d-4241-b8b8-347673082a7f" - }, - "source": [ - "# how we'd normally take care of this\n", - "df_train['taxdelinquencyflag'].fillna(0)" - ], - "execution_count": 172, - "outputs": [ - { - "output_type": "error", - "ename": "TypeError", - "evalue": "ignored", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'taxdelinquencyflag'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/core/series.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, value, method, axis, inplace, limit)\u001b[0m\n\u001b[1;32m 1165\u001b[0m 
\u001b[0;32mraise\u001b[0m \u001b[0mNotImplementedError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"The axis keyword is not supported\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1166\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1167\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minplace\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1168\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1169\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/core/column/string.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, fill_value, inplace)\u001b[0m\n\u001b[1;32m 720\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfill_value\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStringColumn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 721\u001b[0m ):\n\u001b[0;32m--> 722\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"fill_value must be a string or a string series\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 723\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 724\u001b[0m \u001b[0;31m# replace fill_value with nvstrings\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mTypeError\u001b[0m: fill_value must be a string or a string series" - ] - } 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tA6xG6h59rLi", - "colab_type": "text" - }, - "source": [ - "- ...comes with an error. \n", - " - Why?\n", - " - the series we are trying to fill the null values of is a string series\n", - " - because of this `.fillna()` requires a string value (e.g. '0') instead of an int value (e.g. 0)\n", - " - So, what now?\n", - " - there is an easy and straightforward solution with `.isna()` and `.replace()`!! \n", - " - First\n", - " - switch 1 (current True, actual False) to -1\n", - " - Then\n", - " - switch 0 (current False, actual True) to 1 to reflect True status\n", - " - Finally\n", - " - switch -1 (old True, actual False) to 0 to reflect False status" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Svp6J0cJ5dL0", - "colab_type": "code", - "outputId": "03862711-e104-4954-bf9c-61bd51b3a9e3", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 69 - } - }, - "source": [ - "# if bool 'Y'/None is already set, change string to int bool column via .isna()\n", - "df_train['taxdelinquencyflag'] = df_train['taxdelinquencyflag'].isna()\n", - "\n", - "# next we must correct the values, with 1 (True) for 'Y' and 0 for no\n", - "switcharoo = [(1,-1),(0,1),(-1,0)]\n", - "# switch values in order\n", - "for pair in switcharoo:\n", - " # tag old value and new value it will be replaced with\n", - " old, new = pair\n", - " # replace old value with new value\n", - " df_train['taxdelinquencyflag'] = df_train['taxdelinquencyflag'].replace(old, \n", - " new)\n", - "# display values in tax delinquency flag column\n", - "print(df_train['taxdelinquencyflag'].value_counts())" - ], - "execution_count": 173, - "outputs": [ - { - "output_type": "stream", - "text": [ - "0 88492\n", - "1 1783\n", - "Name: taxdelinquencyflag, dtype: int32\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "w5EAdWXaCTRU", - "colab_type": "text" - }, - "source": [ - "- Convert years\n", - " - from 
yy\n", - " - to 2016 - yyyy \n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "lHh95mAIMrMy", - "colab_type": "code", - "outputId": "832c405d-d89f-4b85-d77d-7a6726a61907", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 69 - } - }, - "source": [ - "print(df_train.taxdelinquencyflag.value_counts())" - ], - "execution_count": 174, - "outputs": [ - { - "output_type": "stream", - "text": [ - "0 88492\n", - "1 1783\n", - "Name: taxdelinquencyflag, dtype: int32\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "6Bic66I9LfGC", - "colab_type": "code", - "outputId": "baaa5387-bbd7-4242-a336-0b6b90606935", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 243 - } - }, - "source": [ - "# no delinquency? set year to 0\n", - "df_train.taxdelinquencyyear.loc[df_train.taxdelinquencyflag == 0] = 0\n", - "# collect x and xx formatted delinquency years w/ matching xxxx year format pair\n", - "year_pairs = [(99,1999), (6,2006), (7,2007), (8,2008), (9,2009), (10,2010),\n", - " (11,2011), (12,2012), (13,2013), (14,2014), (15,2015)]\n", - "# go through the pairs individually \n", - "for year in year_pairs:\n", - " # split the pair in question \n", - " old, new = year\n", - " # replace old year (e.g. 99) with new year (e.g. 1999)\n", - " df_train.taxdelinquencyyear.loc[df_train.taxdelinquencyyear == old] = new\n", - "\n", - "# adjust delinquency year relative to training year (2016) \n", - "df_train.taxdelinquencyyear.loc[df_train.taxdelinquencyyear>0] = 2016 - df_train.taxdelinquencyyear.loc[df_train.taxdelinquencyyear>0]\n", - "# what've we got? 
\n", - "print(df_train.taxdelinquencyyear.value_counts())" - ], - "execution_count": 175, - "outputs": [ - { - "output_type": "stream", - "text": [ - "0.0 88492\n", - "2.0 628\n", - "1.0 518\n", - "3.0 210\n", - "4.0 154\n", - "6.0 89\n", - "5.0 85\n", - "7.0 63\n", - "8.0 24\n", - "9.0 8\n", - "10.0 3\n", - "17.0 1\n", - "Name: taxdelinquencyyear, dtype: int32\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ya7xLHzdGVcs", - "colab_type": "text" - }, - "source": [ - "- values in `rawcensustractandblock` represent multiple fields concatenated together as float values\n", - " - by converting those values to string we can split each and build new columns:\n", - " - `census_tractnumber`\n", - " - `block_number`" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "AWx7lq0xkDV2", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# checkpoint the dataframe before the string conversion below, so that cell\n", - "# can be retried by restoring it with: df_train = pre_string.copy()\n", - "pre_string = df_train.copy()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "Sg0eN-K1QdZy", - "colab_type": "code", - "outputId": "a90de47f-5c88-4834-df44-75a9dedcd07c", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 489 - } - }, - "source": [ - "# copy rawcensustractandblock with values as string instead of float\n", - "string_data = cudf.Series(df_train['rawcensustractandblock'].values_to_string())\n", - "\n", - "# print(type(string_data))\n", - "# print(len(string_data))\n", - "# print(string_data)\n", - "\n", - "\"\"\"\n", - "CURRENT ERROR IN CONVERSION OF VALUES\n", - "\"\"\"\n", - "print(f\"\\nNOTE: THERE APPEARS TO BE AN ERROR WHEN CONVERTING TO STRING\\n\"\n", - " f\" > somewhat random numbers added to end of some values\\n >> e.g. 
004, 006\"\n", - " f\"\\n\\n\\ndf_train['rawcensustractandblock'].head(10).values\\n\"\n", - " f\"{df_train['rawcensustractandblock'].head(10).values}\\n\\n\"\n", - " f\"data.head(10).values\\n{string_data.head(10).values}\\n\\n\\n\"\n", - " f\"THE SAME NUMBERS OCCOUR IN THE FIRST WHEN PUT INTO A LIST\\n\"\n", - " f\" > not sure how to deal with this now\\n\"\n", - " f\" >> difficult to reproduce without data\\n\\n\")\n", - "\"\"\"\n", - "CURRENT ERROR IN CONVERSION OF VALUES\n", - "\"\"\"\n", - "\n", - "# set new tract number \n", - "df_train['census_tractnumber'] = string_data.str.slice(4, 11)\n", - "\n", - "# set/adjust block number\n", - "df_train['block_number'] = string_data.str.slice(11)\n", - "df_train['block_number'] = df_train.block_number.str.slice(0,4).str.cat(df_train.block_number.str.slice(4), '.')\n", - "df_train['block_number'] = df_train.block_number.astype('float').round(0).astype('int')\n", - "df_train['block_number'] = df_train.block_number.astype('str').str.ljust(4, '0')\n", - "\n", - "# drop raw census tract and block column, no longer needed\n", - "df_train=df_train.drop('rawcensustractandblock', axis=1)\n", - "\n", - "\"\"\"\n", - "CORRECT NUMBERS THAT SHOULD BE DISPLAYED BY BELOW PRINT STATEMENT\n", - " > currently not being seen due to prior mentioned error\n", - "\n", - "tractnumber\n", - "0 1066.46\n", - "1 0524.22\n", - "2 4638.00\n", - "3 2963.00\n", - "4 0423.38\n", - "dtype: object\n", - "\n", - "blocknumber\n", - "0 1001\n", - "1 2024\n", - "2 3004\n", - "3 2002\n", - "4 1006\n", - "dtype: object\n", - "\"\"\"\n", - "print(df_train[['census_tractnumber', 'block_number']].head())" - ], - "execution_count": 177, - "outputs": [ - { - "output_type": "stream", - "text": [ - "\n", - "NOTE: THERE APPEARS TO BE AN ERROR WHEN CONVERTING TO STRING\n", - " > somewhat random numbers added to end of some values\n", - " >> e.g. 
004, 006\n", - "\n", - "\n", - "df_train['rawcensustractandblock'].head(10).values\n", - "[60375315.031013 60374625.001017 60590114.012017 60376513.02100401\n", - " 60374087.031018 60375759.011001 60590630.044 60374061.011006\n", - " 60378001.022007 60590524.19100901]\n", - "\n", - "data.head(10).values\n", - "['60375315.031013004', '60374625.001017004', '60590114.012017', '60376513.021004006', '60374087.031018004', '60375759.011001', '60590630.044', '60374061.011006', '60378001.022007', '60590524.19100901']\n", - "\n", - "\n", - "THE SAME NUMBERS OCCOUR IN THE FIRST WHEN PUT INTO A LIST\n", - " > not sure how to deal with this now\n", - " >> difficult to reproduce without data\n", - "\n", - "\n", - " census_tractnumber block_number\n", - "0 5315.03 1013\n", - "1 4625.00 1017\n", - "2 0114.01 2017\n", - "3 6513.02 1004\n", - "4 4087.03 1018\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T71orw51lpTN", - "colab_type": "text" - }, - "source": [ - "## Dealing with Missing Values\n", - "### #1 Setting standards\n", - "- Despite correcting and adjusting the data to this point, there are still some columns holding a large majority of null values\n", - "- For some columns, this majority represents over 95% of values\n", - " - Let's identify those columns\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "xhCosNpXvTVU", - "colab_type": "code", - "outputId": "2d969756-decb-4912-94f6-19836eb0323a", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 86 - } - }, - "source": [ - "# calculate null value % for each column & frame it\n", - "missingvalues_prop = (df_train.isnull().sum()/len(df_train)).reset_index()\n", - "missingvalues_prop.columns = ['field','percentage']\n", - "\n", - "# sort by null values percentage, from highest % to lowest\n", - "missingvalues_prop = missingvalues_prop.sort_values(by='percentage', \n", - " ascending=False)\n", - "# identify columns with > 95% of values null\n", - 
"missingvaluescols = missingvalues_prop.loc[missingvalues_prop['percentage'] > 0.95]\n", - "\n", - "# display columns with highest % null values\n", - "print(missingvaluescols)\n", - "\n", - "# drop columns with more than 95% null values\n", - "df_train = df_train.drop(missingvaluescols['field'], axis=1)" - ], - "execution_count": 178, - "outputs": [ - { - "output_type": "stream", - "text": [ - " field percentage\n", - "7 buildingclasstypeid 0.999823\n", - "3 architecturalstyletypeid 0.997109\n", - "33 typeconstructiontypeid 0.996688\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8eBIDWEUBHwz", - "colab_type": "text" - }, - "source": [ - "- and drop columns with more than 95% null values" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "az6t2ntBCMRe", - "colab_type": "text" - }, - "source": [ - "### #2 Working with Remaining Values\n", - "- the majority of values still missing in unitcnt are rows were `propertylandusetypeid` = 265, \n", - " - which is Cluster Home (i.e. 
group of houses with shared walls)\n", - " - each cluster is anywhere between 5 to 25 units\n", - " - here we will asssume 10 units as reassonable count" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "yB2lzAyopS_S", - "colab_type": "code", - "outputId": "db6c7add-5452-4535-8948-a426654851b7", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 225 - } - }, - "source": [ - "# highly related propertylandusetypeid\n", - "df_train['unitcnt'].loc[df_train['propertylandusetypeid'] == 265] = 10\n", - "# let's see what we've got\n", - "print(df_train['unitcnt'].value_counts())" - ], - "execution_count": 179, - "outputs": [ - { - "output_type": "stream", - "text": [ - "1.0 86035\n", - "2.0 2372\n", - "4.0 884\n", - "3.0 622\n", - "10.0 356\n", - "5.0 1\n", - "6.0 1\n", - "9.0 1\n", - "11.0 1\n", - "70.0 1\n", - "143.0 1\n", - "Name: unitcnt, dtype: int32\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iR1rBlz-dOdH", - "colab_type": "text" - }, - "source": [ - "- a number of pool sizes are null despite there being a pool\n", - " - let's calculate the average pool size\n", - " - and assume those null values are pools of average size" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "-icFDeLSoJwl", - "colab_type": "code", - "outputId": "b1ed39c3-3a14-4dc1-eb48-b3429da5cffe", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - } - }, - "source": [ - "# calculate the average pool square footage for properties with a pool(s)\n", - "poolsizesum_mean = df_train.pool_sqft.loc[df_train['pool_count'] > 0].mean()\n", - "\n", - "# where the property has a pool(s) but pool square feet is 0\n", - "conditions = ((df_train['pool_count'] > 0) \n", - " & (df_train['pool_sqft'].isna()==True))\n", - "\n", - "# set pool square feet to the average pool square footage of pool properties\n", - "df_train['pool_sqft'].loc[conditions] = poolsizesum_mean\n", - "\n", - 
"print(df_train.pool_sqft.isna().sum())" - ], - "execution_count": 180, - "outputs": [ - { - "output_type": "stream", - "text": [ - "0\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AyGeXJfEmJBU", - "colab_type": "text" - }, - "source": [ - "- total parcel tax\n", - "- structure tax\n", - "- land tax" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "3pVABkZTYK9F", - "colab_type": "code", - "outputId": "b5cb7ced-7458-4971-936c-b6e5d33bc126", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 173 - } - }, - "source": [ - "#total_parcel_tax\n", - "#structure_tax\n", - "#land_tax\n", - "#total_property_tax_2016\n", - "#2)recalculate total_parcel_tax = structure_tax + land_tax\n", - "print(df_train.total_property_tax_2016.isnull().sum())\n", - "print(df_train.structure_tax.isnull().sum())\n", - "print(df_train.total_parcel_tax.isnull().sum())\n", - "print(df_train.land_tax.isnull().sum())\n", - "print()\n", - "\n", - "# total_parcel_tax =structure_tax + land_tax\n", - "#->structure_tax=total_parcel_tax -land_tax\n", - "\n", - "# where parcel and land taxes are greater than 0\n", - "parcel_taxes = df_train.total_parcel_tax.loc[(df_train.total_parcel_tax>0) & (df_train.land_tax>0)]\n", - "land_taxes = df_train.land_tax.loc[(df_train.total_parcel_tax>0) & (df_train.land_tax>0)]\n", - "# set structure tax to be their difference\n", - "df_train['structure_tax'].loc[(df_train.total_parcel_tax>0) & (df_train.land_tax>0)] = parcel_taxes - land_taxes\n", - "\n", - "# where structure tax is still 0, there isn't structure tax\n", - "df_train.structure_tax.loc[df_train.structure_tax==0] = np.nan\n", - "\n", - "print(df_train.total_property_tax_2016.isnull().sum())\n", - "print(df_train.structure_tax.isnull().sum())\n", - "print(df_train.total_parcel_tax.isnull().sum())\n", - "print(df_train.land_tax.isnull().sum())" - ], - "execution_count": 181, - "outputs": [ - { - "output_type": "stream", - "text": [ - 
"6\n", - "380\n", - "1\n", - "1\n", - "\n", - "6\n", - "380\n", - "1\n", - "1\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "8SID48LOpYvu", - "colab_type": "code", - "outputId": "6d20a3ba-4360-4554-908d-f6d673aece12", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - } - }, - "source": [ - "# regionidcounty is exact copy of fips code, dropping the dulicate column\n", - "df_train = df_train.drop(['regionidcounty'], axis=1)\n", - "df_train.shape" - ], - "execution_count": 182, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "(90275, 45)" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 182 - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "tWmM2J8_pkg1", - "colab_type": "code", - "outputId": "6362e07f-e363-4884-b0c5-9380b5fee956", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - } - }, - "source": [ - "#*******************************\n", - "#bedroomcnt #1421 zero bed room houses ??, observed it's missing all other room count also missing\n", - "# where there is no bedroom, null is a better representation \n", - "df_train['bedroomcnt'].loc[df_train['bedroomcnt'] == 0] = np.nan\n", - "print(df_train.bedroomcnt.isnull().sum())" - ], - "execution_count": 183, - "outputs": [ - { - "output_type": "stream", - "text": [ - "1421\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "3qnP2L9LpmeJ", - "colab_type": "code", - "outputId": "c0eabce4-3232-4435-8733-779526f18c57", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 208 - } - }, - "source": [ - "# propertylandusetypeid & total living area\n", - "# total_bath 1165\n", - "# full_bath 1182\n", - "# half_bath 1182\n", - "# bedroomcnt 1421\n", - "# roomcnt 1416\n", - "\n", - "print(df_train.total_bath.isna().sum())\n", - "print(df_train.full_bath.isnull().sum())\n", - "print(df_train.half_bath.isnull().sum())\n", - 
"print(df_train.bedroomcnt.isnull().sum())\n", - "print(df_train.roomcnt.isnull().sum())\n", - "print()\n", - "\n", - "# roomcnt = (full_bath + half_bath) + bedroomcnt\n", - "# total_bath = fullbath+ 0.5(half_bath)\n", - "\n", - "#caluculate full bath and half bath again from total bath as, it has few extra columns, (fixes 500 missing values in roomcnt )\n", - "\n", - "# where full & half bath and bedroom count are not null, but room count is null\n", - "conditions = ((df_train['full_bath'].isna() == False) \n", - " & (df_train['half_bath'].isna() == False) \n", - " & (df_train['bedroomcnt'].isna() == False) \n", - " & (df_train['roomcnt'].isna() == True))\n", - "# calculate room count including all full & half baths along with bedroom count\n", - "new_values = df_train.full_bath.loc[conditions] + df_train.half_bath.loc[conditions] + df_train.bedroomcnt.loc[conditions]\n", - "# df_train['roomcnt'] = df_train['roomcnt'].masked_assign(new_values, conditions)\n", - "df_train.roomcnt.loc[conditions] = new_values\n", - "\n", - "\n", - "# most bedroom count and roomcount null are in same place\n", - "# all column null count 1133 all columns are null\n", - "\n", - "print(df_train.total_bath.isna().sum())\n", - "print(df_train.full_bath.isnull().sum())\n", - "print(df_train.half_bath.isnull().sum())\n", - "print(df_train.bedroomcnt.isnull().sum())\n", - "print(df_train.roomcnt.isnull().sum())" - ], - "execution_count": 184, - "outputs": [ - { - "output_type": "stream", - "text": [ - "1165\n", - "1182\n", - "1182\n", - "1421\n", - "69700\n", - "\n", - "1165\n", - "1182\n", - "1182\n", - "1421\n", - "1416\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Mvy51Ckev9CX", - "colab_type": "text" - }, - "source": [ - "- correct number of stories by Zillow's `propertylandusetypeid` indicator\n", - " - where null values are not\n", - " - number of stories can be set to mode\n", - " - where there are null values\n", - " - number of 
stories can be set to the generally accepted number of stories" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "IW4CG2InpolD", - "colab_type": "code", - "outputId": "02375307-54e2-432b-8b87-1397c73d56b2", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 260 - } - }, - "source": [ - "# before (what's it look like?)\n", - "print(f'BEFORE\\n{df_train.numberofstories.value_counts()}\\n'\n", - " f'{df_train.numberofstories.isnull().sum()} remaining null values\\n')\n", - "\n", - "#numberofstories\t69705\n", - "\n", - "# store ids and general number of stories \n", - "zillow_type_ids = [(31,2), (246,2), (247,2), (248,2), (260,2), (261,1), \n", - " (263,1), (266,1), (267,1), (269, 2), (275,1)]\n", - "\n", - "# go through each id pair \n", - "for type_id in zillow_type_ids:\n", - " # split the pair into type id and number of stories\n", - " t_id, n_stories = type_id\n", - "\n", - " # when type id matches and story count is not null\n", - " conditions = ((df_train['propertylandusetypeid'] == t_id) \n", - " & (df_train['numberofstories'].isna() == False))\n", - " \n", - " # calculate the mode story count for matching id properties\n", - " mode_stories = df_train.numberofstories.loc[conditions].value_counts()\n", - " # when there is at least one value in the value_counts of this property type\n", - " if len(mode_stories) > 0:\n", - " # set mode stories to the most popular value\n", - " mode_stories = mode_stories[0]\n", - " # otherwise\n", - " else:\n", - " # set mode stories to the general average for this property type\n", - " mode_stories = n_stories\n", - " \n", - " # and set those non null values to the most common value seen\n", - " df_train['numberofstories'].loc[conditions] = mode_stories\n", - " \n", - " # when type id matches and story count is null\n", - " conditions = ((df_train['propertylandusetypeid'] == t_id) \n", - " & (df_train['numberofstories'].isna() == False))\n", - " # set null values to the common number of stories seen in 
that type id\n", - " df_train['numberofstories'].loc[conditions] = n_stories\n", - "\n", - "# edge cases\n", - "conditions = ((df_train.propertylandusetypeid==264) \n", - " & (df_train.numberofstories.isnull()))\n", - "df_train.numberofstories.loc[conditions] = 2\n", - "\n", - "# what's it looking like? \n", - "print(f'AFTER\\n{df_train.numberofstories.value_counts()}\\n'\n", - " f'{df_train.numberofstories.isnull().sum()} remaining null values')" - ], - "execution_count": 185, - "outputs": [ - { - "output_type": "stream", - "text": [ - "BEFORE\n", - "1.0 12016\n", - "2.0 8044\n", - "3.0 508\n", - "4.0 2\n", - "Name: numberofstories, dtype: int32\n", - "69705 remaining null values\n", - "\n", - "AFTER\n", - "1.0 20154\n", - "2.0 423\n", - "3.0 4\n", - "Name: numberofstories, dtype: int32\n", - "69694 remaining null values\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "AHcMsDCxprd4", - "colab_type": "code", - "outputId": "30481b2c-e035-4478-d62f-63e10a09c17e", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 295 - } - }, - "source": [ - "# before (what's it looking like?) 
\n", - "print(f'BEFORE\\n{df_train.fireplace_count.value_counts()}\\n'\n", - " f'{df_train.fireplace_count.isnull().sum()} remaining null values\\n')\n", - "\n", - "# where there is a fire place, and count is not null\n", - "conditions = ((df_train.fireplaceflag==1) \n", - " & (df_train.fireplace_count.isna() == False))\n", - "# calculate the mode fireplace count \n", - "mode_fire_count = df_train.loc[conditions, 'fireplace_count'].value_counts()[0]\n", - "# and set those non null values to the most common fireplace count\n", - "df_train['fireplace_count'].loc[conditions] = mode_fire_count\n", - "\n", - "# where there is a fire place, and count is null\n", - "conditions = ((df_train.fireplaceflag==1) \n", - " & (df_train.fireplace_count.isna() == True))\n", - "# set null values to the most common fireplace count\n", - "df_train.fireplace_count.loc[conditions] = 1\n", - "\n", - "# df_train.loc[(df_train.fireplaceflag==1) & (df_train.fireplace_count.notnull()),'fireplace_count'].mode()\n", - "# df_train.loc[(df_train.fireplaceflag==1) & (df_train.fireplace_count.isnull()),'fireplace_count']=1\n", - "\n", - "# after\n", - "print(f'AFTER\\n{df_train.fireplace_count.value_counts()}\\n'\n", - " f'{df_train.fireplace_count.isnull().sum()} remaining null values')" - ], - "execution_count": 186, - "outputs": [ - { - "output_type": "stream", - "text": [ - "BEFORE\n", - "0.0 80446\n", - "1.0 8165\n", - "2.0 1106\n", - "3.0 312\n", - "4.0 21\n", - "5.0 3\n", - "Name: fireplace_count, dtype: int32\n", - "222 remaining null values\n", - "\n", - "AFTER\n", - "0.0 80446\n", - "8165.0 9607\n", - "1.0 222\n", - "Name: fireplace_count, dtype: int32\n", - "0 remaining null values\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "FIuSWoJspt3H", - "colab_type": "code", - "outputId": "cb11c3a1-1658-4bce-cbde-a1a47ccdc0a8", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 317 - } - }, - "source": [ - "# set basic sns \n", - "color 
= sns.color_palette()\n", - "sns.set(style=\"darkgrid\")\n", - "# convert dataframe to pandas for ease of use with sns\n", - "pd_train = df_train.to_pandas()\n", - "# set ax plot\n", - "ax = sns.countplot(x=\"buildingqualitytypeid\", data=pd_train)\n", - "# adjust fringe aesthetics\n", - "plt.xticks(rotation='vertical')\n", - "plt.title(\"Frequency of Bathroom count\", fontsize=15)\n", - "# display the graph\n", - "plt.show()" - ], - "execution_count": 187, - "outputs": [ - { - "output_type": "display_data", - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAaAAAAEsCAYAAACFRGf6AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3XlYVGX/P/D3DAiCghMIBFQuGEii\nYhBgYiaKIKGoaeKammtuZe4S9OAWiGEuqaVfzUxNS0UWBZdv+eRjmpUVov6U3NmUAXEDYeb8/uDL\neRwBHVC4B3y/rovrYs59Zs7n3LO859znzDkKSZIkEBER1TKl6AKIiOjZxAAiIiIhGEBERCQEA4iI\niIRgABERkRAMICIiEoIB9AxasWIFXFxcyv2NGDFCdGn1WkpKCgIDA+Hm5gZ/f/8K57l06ZLOc+Lq\n6oo333wT4eHhyMvLq/IyExMTsXv37nLTBw0ahA8//LDKj0dVc/LkSaxcuVJ0GQbLWHQBJIaFhQXW\nrVtXbhrVjJKSEsyaNQt+fn5YsGABGjVq9Mj558yZA3d3d2g0GqSnpyM2NhaZmZn46quvqrTcxMRE\n3L17F3369HmS8qmaTp48iS+++AKTJk0SXYpBYgA9o4yMjODu7q73/IWFhWjYsGENVlS/ZWVl4e7d\nu+jduzc8PT0fO3/Lli3l58fDwwOFhYVYvHix8OdB9PKpfuEQHJVTUlICFxcXfP3111iwYAF8fHx0\nvkGnpKSgX79+aNu2LXx9fRETE4OSkhKdx0hKSkKPHj3Qrl07DBs2DH/99RdcXFwQFxens4ytW7fq\n3C82NhadOnXSmXbt2jV88MEHeO2119C+fXuMHj0aFy9elNvLhq2Sk5MRFhYGDw8PvPHGG1i5ciUe\nPtHHmTNnMHbsWHh4eKBDhw545513cPToURQXF+P111/HF198Ua4/Bg0ahClTpjyyzxITExEcHAw3\nNze8+eab+Pzzz6HRaAAAO3bsQLdu3QAAY8eOhYuLS4XLeZRGjRpBq9VCq9XK03744QeEhobCy8sL\nXl5eePfdd3Hq1Cm5ffr06Th48CCOHj0qD+k9vNzdu3eje/fuePXVVzFmzBhkZ2fLbWX9mpiYiOnT\np8PT0xMTJ04EAGg0GixbtgxdunSBm5sbgoODkZiYWKV+KesbFxcXnD59GkOGDEH79u3Rt29fnD59\nGnfu3MGsWbPw6quvonv37khKSnpsP2k0GqxevRo9evSAm5sb3njjDcydO1dnnk2bNsHf3x9ubm7o\n0aMHNm3apNM+ffp0vPPOOzrTyvri8OHDAP77+t28eTNiYmLg7e2Njh07Yv78+bh//768bosXL4ZG\no+EwdyW4BfQMezg0jIyMoFAo5NtfffUVvL29ER0dLX+Qx8fHY+bMmR
g0aBCmTZuGS5cuYenSpQBK\n37gA8Ndff+Gjjz5CQEAAwsLCcObMGXzwwQfVqlGtVmPQoEFo2rQpIiMjYWpqirVr12LUqFHYt28f\nTExM5HmjoqIQEBCA5cuX4+eff8aKFSvg7OyMHj16AADOnTuHQYMGwcnJCZGRkWjSpAlSU1ORmZmJ\nBg0aICQkBLt378b7778vP+bFixfx+++/48svv6y0xp9++gnTpk1Dv379MHPmTJw5cwbLly/HzZs3\nER4ejm7dusHCwgJTp06Vh9bs7e0fud6SJKGkpARarRbnz5/Hhg0b0KlTJ5ibm8vzZGRkoF+/fnjx\nxRdx//597NmzB0OGDEFiYiIcHR0xZcoUZGVlobCwEGFhYQCgs9zff/8dWVlZmDNnDu7evYtFixYh\nIiICa9as0all8eLFcr8qlaXfWT/77DN8/fXXmDRpEtq0aYO9e/di2rRpUCqV6Nmzp1798qCZM2di\n6NChGDt2LGJiYjB16lS0bt0aLVu2xIoVK7B9+3bMnDkTnp6esLW1rbTf5s2bh4SEBIwZMwaenp7I\nz8/HgQMH5PYtW7Zg0aJFGDlyJDp16oSjR49i0aJFKC4uxnvvvffI56Qi69atw+uvv46YmBicPn0a\nsbGxeOGFFzBy5Eh069YN586dw+bNm7FlyxYAHOYuR6JnzvLlyyVnZ+dyf0eOHJEkSZKKi4slZ2dn\nqV+/fjr302g0UufOnaV58+bpTN+2bZvUrl07KT8/X5IkSZo4caIUHBwsabVaeZ4VK1ZIzs7O0u7d\nu3WWsWXLFp3H+uyzz6TXX39dvh0TEyN5e3tLN2/elKep1WrJ3d1d2rp1qyRJknTx4kXJ2dlZmj17\nts5jvfXWW9JHH30k3548ebL05ptvSoWFhRX2y/nz5yVnZ2fp119/lactXbpU8vX1lUpKSiq8jyRJ\nUt++faURI0boTFu9erXk6uoqZWdn69T4008/Vfo4D8738F9wcLCUlZVV6f00Go1UXFwsde/eXVq9\nerU8fcKECdK7775bbv7Q0FDJ09NTKigokKetW7dOcnFxkYqKinRqmTx5ss59c3NzpbZt20pffPGF\nzvSRI0dKQUFBVeqX7du3S87OzlJcXJw8z4EDByRnZ2cpLCxMnpafny+1bt1a+u677yrtg7Nnz0rO\nzs7S5s2bK2wvLi6WXn/99XKv37CwMMnT01Ne748++kgaMGCAzjwPP39lr99hw4bpzDd27FgpNDRU\nvr1hwwbJ1dW10pqfdRyCe0ZZWFjg+++/1/lr166dzjxvvvmmzu309HRkZ2ejZ8+eKCkpkf98fHxQ\nWFiI8+fPAyjdAvLz89PZmqrsqK/H+c9//gNfX1+Ym5vLy7OwsECbNm2QmpqqM6+vr6/ObScnJ50h\npWPHjiEoKAimpqYVLsvJyQkdOnTAzp07AQBarRZxcXHo06cPjIyMKrxPcXExzpw5g8DAQJ3pQUFB\n0Gg0+PPPP6u8zgAQFhaG77//Hjt27MDKlSvRsGFDjB07Fvfu3ZPnOXfuHN5//328/vrrcHV1RZs2\nbXD58mWd4clHadeunc438latWkGSJOTk5OjM9/Dr4OzZsygqKiq3zj179sT58+eRn59f5X7p2LGj\n/H+zZs0AAD4+PvK0Jk2aQKVS6TyfDzt27BgAoF+/fhW2Z2Zm4saNGxXWVFBQIL9+q+Jxrzl6NA7B\nPaOMjIzQtm3bR85jbW2tc7vsMOBRo0ZVOH9WVhYAIDc3t9x9H76tr7y8PKSmpiI+Pr5c28NB8vDw\nRoMGDVBUVASgdEgrPz8fNjY2j1xe//79sWjRInz88cc4ceIEsrKyKv1AA0qHCDUaTaXrm5+f/8jl\nVaZZs2Y6z0+HDh3g6+uL3bt3Y9CgQbh16xZGjRoFOzs7zJkzB/b29jA1NcXcuXPldX4cS0tLndsN\nGjQAgHL3f3jdrl+/DgBo2rSpzv
Sy2wUFBSgqKqpSvzxYS1kdj3o+K5Kfnw8LCwuYmZlV2F4WrA/X\nXVbTzZs3K33sylS1RtLFAKJKPbgFA5R+CwWARYsWwdnZudz8L774IoDSN3Rubq5O28O3jYyMYGxs\njOLiYp3pBQUFOrdVKhVeeeUVjBs3rtzyGjdurOealK6LSqWSPzwrExQUhEWLFiE5ORmHDx/Gq6++\nihYtWlQ6v5WVFYyMjKBWq3Wml62vSqXSu8ZHadq0KZo0aYL09HQApftvcnJysHnzZnmLASjff0/D\nw6+DshDPzc3V+QC+ceMGgNIwadSoUa30y4NUKhVu3bqFe/fuVRhCZfuOKnttlr2+TU1Ny70uqxNO\n9HgcgiO9tWrVCjY2Nrh27Rratm1b7q/sQ6Vt27Y4dOiQzhFo+/fv13kshUIBOzs7+QMVKD2C6ejR\nozrz+fj44Ny5c3BxcSm3vEcFQ0V8fHyQlJQkH6VUEXNzcwQFBeGbb77BgQMHHrn1A5R+43V1dcW+\nfft0pu/duxdGRkZo3759lWqsTHZ2NvLz8+WDCAoLCwFA5yCMX3/9Vd4KfbC+p/2N3MXFBaampuXW\ned++fWjVqhVUKlWt9cuDyobxKvrhLQA4ODigadOmFdbUpEkTtGrVCgDw/PPP4+rVqzqvkyNHjlSr\npgYNGkCj0ZQ74IdKcQuI9GZkZISZM2di7ty5KCgoQOfOnWFsbIwrV65g//79WL16NUxMTDBmzBiE\nhoZi2rRp6Nu3L86ePSvvV3lQ9+7dsX37drRu3RoODg7YsWOH/MFa5r333kNCQgKGDx+OoUOHwtbW\nFjdu3MDx48fh5eWFoKAgveufMmUK+vfvj6FDh2LEiBFQqVQ4deoUmjZtir59+8rz9e/fHwMHDoS5\nubl8RNfjHnfs2LGYN28eAgMDcebMGaxYsQKhoaGPPGLrUf755x9YWlpCkiRkZWVh3bp1sLS0lNe3\nQ4cOMDMzQ1hYGEaNGoWMjAysWrWq3PJatmyJw4cP48CBA7Czs4OdnV21aypjZWWFYcOGYeXKlVAq\nlXjllVewb98+/Pzzz1i2bJk8X030y6O0atUKb7/9NhYuXIgbN27Aw8MDN2/exIEDB7B06VIYGRlh\n4sSJiIyMhKWlJTp27Ihjx45h+/btmDFjhhzm/v7+WLlyJcLCwtCnTx+kpqZWGmqP07JlSwDAxo0b\n4eXlBQsLiyp/carPGEBUJb1794alpSXWrl2L77//HkqlEi+99BK6du0KY+PSl5O7uzuWLl2K2NhY\nHDhwAO3atUNsbGy531ZMmTIFeXl5iI2NRYMGDTBs2DA4OTnh+++/l+extrbG9u3bERsbi0WLFqGg\noAC2trbw8PCAi4tLlWp3cnLCli1bEBMTg3nz5kGhUODll18ud0oad3d3NG3aFJ07d9ZrmK9Lly5Y\nunQp1qxZg7i4OFhZWWH06NGYPHlylep70OLFi+X/mzZtirZt22LhwoXyFpCtrS0+//xzREVFYfz4\n8WjRogUiIyOxevVqnccZOnQozp49izlz5qCgoABTp07VOcy8uj788EM0aNAAmzdvhlqtRvPmzbF0\n6VKdwK6Jfnmc+fPnw9HRETt37sTatWthbW2Nzp07y+2DBw9GcXExvvnmG3z99dewt7fHnDlz8O67\n78rztG7dGgsWLMDatWuRkpICHx8fLFy4EEOGDKlyPT4+Phg5ciQ2btyImJgY+Pj4YOPGjU9jVesF\nhSTxktxU8woKCvDaa68hOjoaISEhost5pDNnziAkJATffPMNvLy8RJdDVG9xC4jo/6jValy4cAHL\nli1D69atGT5ENYwHIRD9n4MHD2LIkCHIy8vTGQIjoprBITgiIhKCW0BERCQEA4iIiITgQQiVyMu7\nA62Wo5NERPpQKhV47rlHX2jxYQygSmi1EgOIiKgG1doQ3Pvvv4/evXujT58+GDx4ME6fPg0AuHDh
\nAgYOHIiAgAAMHDhQ50y+NdFGRESGodaOgrt165Z84sIDBw5g1apV2LVrF4YPH463334bISEhiIuL\nww8//CBfobAm2vSVm3ubW0BERHpSKhWwttb/BMFALW4BPXjW3Nu3b0OhUCA3NxdpaWkIDg4GAAQH\nByMtLQ1qtbpG2oiIyHDU6j6gefPm4ciRI5AkCevWrUNmZibs7Ozki30ZGRnB1tYWmZmZkCTpqbdZ\nWVnpXWtVk5yIiKqmVgNo4cKFAEpPlx4dHY2pU6fW5uKrhENwRET6M+ghuAf16dMHx44dw/PPP4/s\n7GxoNBoApdeDycnJgb29Pezt7Z96GxERGY5aCaA7d+4gMzNTvn3o0CE0adIE1tbWcHV1RUJCAgAg\nISEBrq6usLKyqpE2IiIyHLVyFNyNGzfw/vvv4969e1AqlWjSpAlmzZqFNm3aID09HbNnz0ZBQQEs\nLS0RFRUlX8SpJtr0xSE4IiL9VWcIjicjrQQD6NmmsjBBg4amostAcWER8m9VfglxIkNRnQDimRCI\nKtCgoSmSho8UXQaCNm0AGEBUT/FkpEREJAQDiIiIhGAAERGREAwgIiISggFERERCMICIiEgIBhAR\nEQnBACIiIiEYQEREJAQDiIiIhGAAERGREAwgIiISggFERERCMICIiEgIBhAREQnBACIiIiEYQERE\nJAQDiIiIhGAAERGREAwgIiISggFERERCMICIiEgIBhAREQnBACIiIiEYQEREJIRxbSwkLy8PM2fO\nxOXLl2FiYoJmzZohMjISVlZWcHFxgbOzM5TK0iyMjo6Gi4sLAODQoUOIjo6GRqNBmzZtsHjxYpiZ\nmT1RGxERGYZa2QJSKBQYPXo0kpOTER8fjxdffBExMTFy+7Zt2xAXF4e4uDg5fO7cuYOPP/4Ya9as\nwf79+9GoUSOsX7/+idqIiMhw1EoAqVQqeHt7y7fd3d2RkZHxyPscPnwYbm5uaN68OQAgNDQUe/fu\nfaI2IiIyHLUyBPcgrVaLrVu3ws/PT542bNgwaDQavPHGG5g8eTJMTEyQmZkJBwcHeR4HBwdkZmYC\nQLXbqsLaunGV70NUE2xsLESXQFQjaj2A5s+fD3NzcwwdOhQA8OOPP8Le3h63b9/GjBkzsGrVKnz4\n4Ye1XVY5ubm3odVKossgQQzpQ//69VuiSyB6LKVSUeUv7rV6FFxUVBQuXbqEZcuWyQcd2NvbAwAa\nN26MAQMG4Pfff5enPzhMl5GRIc9b3TYiIjIctRZAn332GVJTU7Fq1SqYmJgAAG7evInCwkIAQElJ\nCZKTk+Hq6goA6Ny5M/7++29cvHgRQOmBCj179nyiNiIiMhy1MgR37tw5rF27Fs2bN0doaCgA4IUX\nXsDo0aMRHh4OhUKBkpISdOjQAVOnTgVQukUUGRmJcePGQavVwtXVFfPmzXuiNiIiMhwKSZK4o6MC\n3Af0bLOxsUDS8JGiy0DQpg3cB0R1gsHvAyIiIirDACIiIiFq/TBsqjnPNTGBsYmp0BpK7hch7+Z9\noTUQUd3AAKpHjE1M8Vv0aKE1eMxcB4ABRESPxyE4IiISggFERERCMICIiEgIBhAREQnBACIiIiEY\nQEREJAQDiIiIhGAAERGREAwgIiISggFERERCMICIiEgIBhAREQnBACIiIiEYQEREJAQDiIiIhGAA\nERGREAwgIiISggFERERCMICIiEgIBhAREQnBACIiIiEYQEREJEStBFBeXh7GjBmDgIAA9OrVC5Mm\nTYJarQYAnDx5Er1790ZAQABGjRqF3Nxc+X410UZERIahVgJIoVBg9OjRSE5ORnx8PF588UXExMRA\nq9VixowZCA8PR3JyMjw9PRETEwMANdJGRESGo1YCSKVSwdvbW77t7u6OjIwMpKamwtTUFJ6engCA\n0NBQ7Nu3DwBqpI2IiAxHre8D0mq12Lp1K/z8/JCZmQkHBwe5
zcrKClqtFvn5+TXSRkREhsO4thc4\nf/58mJubY+jQodi/f39tL15v1taNRZdQZ9nYWIguoV5hf1J9VasBFBUVhUuXLmHNmjVQKpWwt7dH\nRkaG3K5Wq6FUKqFSqWqkrSpyc29Dq5WeYG1rn6F8UF2/fkt0CU/MUPoSqB/9SfWfUqmo8hf3WhuC\n++yzz5CamopVq1bBxMQEAODm5obCwkKcOHECALBt2zYEBgbWWBsRERmOWtkCOnfuHNauXYvmzZsj\nNDQUAPDCCy9g1apViI6ORkREBIqKiuDo6IglS5YAAJRK5VNvIyIiw6GQJKlujTPVkro6BPdb9Gih\nNXjMXFcvhoxsbCyQNHyk6DIQtGlDvehPqv8MegiOiIjoQQwgIiISggFERERCMICIiEgIBhAREQnB\nACIiIiEYQEREJAQDiIiIhGAAERGREAwgIiISggFERERCMICIiEgIBhAREQmhdwCtX7++wukbNmx4\nasUQEdGzQ+8AWrVqVYXTV69e/dSKISKiZ8djL0h39OhRAIBWq8Uvv/yCBy8fdPXqVTRq1KjmqiMi\nonrrsQE0b948AEBRURHmzp0rT1coFLCxsUFYWFjNVUdERPXWYwPo0KFDAICZM2ciOjq6xgsiIqJn\nw2MDqMyD4aPVanXalEoeTEdERFWjdwCdOnUKkZGROHv2LIqKigAAkiRBoVDg9OnTNVYgERHVT3oH\n0OzZs9G1a1csWrQIDRs2rMmaiIjoGaB3AF27dg0ffvghFApFTdZDRETPCL133vj7++Pnn3+uyVqI\niOgZovcWUFFRESZNmgQPDw80bdpUp41HxxERUVXpHUCtWrVCq1atarIWIiJ6hugdQJMmTarJOoiI\n6BmjdwCVnZKnIh07dnwqxRAR0bND7wAqOyVPmby8PBQXF8POzg4HDx587P2joqKQnJyMa9euIT4+\nHs7OzgAAPz8/mJiYwNTUFAAwffp0dO7cGQBw8uRJhIeHo6ioCI6OjliyZAmsra2fqI2IiAyD3kfB\nHTp0SOfvxIkTGD9+PIYOHarX/bt164Zvv/0Wjo6O5dqWL1+OuLg4xMXFyeGj1WoxY8YMhIeHIzk5\nGZ6enoiJiXmiNiIiMhzVPoeOkZERxo8fj3Xr1uk1v6enJ+zt7fV+/NTUVJiamsLT0xMAEBoain37\n9j1RGxERGQ69h+AqcuTIkafyw9Tp06dDkiR4eHhg2rRpsLS0RGZmJhwcHOR5rKysoNVqkZ+fX+02\nlUqld03W1o2feL2eVTY2FqJLqFfYn1Rf6R1AXbp00Qmbe/fu4f79+4iIiHiiAr799lvY29vj/v37\nWLhwISIjIw1iyCw39za0WunxMxoQQ/mgun79lugSnpih9CVQP/qT6j+lUlHlL+56B9CSJUt0bpuZ\nmaFFixZo3PjJthTKhuVMTEwwePBgTJgwQZ6ekZEhz6dWq6FUKqFSqardRkREhkPvfUBeXl7w8vKC\np6cnmjdvjjZt2jxx+Ny9exe3bpV+u5MkCUlJSXB1dQUAuLm5obCwECdOnAAAbNu2DYGBgU/URkRE\nhkPvLaDbt28jMjISSUlJKCkpgbGxMd566y2EhYXBwuLxwxULFixASkoKbty4gZEjR0KlUmHNmjWY\nPHkyNBoNtFotnJyc5CE9pVKJ6OhoRERE6BxO/SRtRERkOBSSJOm1o2P27Nm4c+cOpk2bBkdHR1y7\ndg2xsbEwMzNDVFRUTddZ6+rqPqDfokcLrcFj5rp6sc/CxsYCScNHii4DQZs21Iv+pPqvRvcB/fvf\n/8aBAwdgZmYGAGjRogUWL14Mf3//qlVJRESEKuwDMjU1hVqt1pmWl5cHExOTp14UERHVf3pvAfXv\n3x+jRo3CiBEj4ODggIyMDGzcuBEDBgyoyfqIiKie0juAJkyYADs7O8THxyMnJwe2trYYPXo0A4iI\niKpF7yG4hQsXokWLFti4
cSOSkpKwceNGODk5YeHChTVZHxER1VN6B1BCQgLc3Nx0prm5uSEhIeGp\nF0VERPWf3gGkUCig1Wp1ppX9foeIiKiq9A4gT09PfP7553LgaLVarFixQj7rNBERUVVU6YJ048aN\ng6+vLxwcHJCZmQkbGxusWbOmJusjIqJ6Su8Aev7557Fr1y789ddfyMzMhL29Pdq1awelstqXFCIi\nomdYla4HpFQq4e7uDnd395qqh4iInhHcfCEiIiEYQEREJAQDiIiIhGAAERGREAwgIiISggFERERC\nMICIiEgIBhAREQnBACIiIiEYQEREJAQDiIiIhGAAERGREAwgIiISggFERERCMICIiEiIWgmgqKgo\n+Pn5wcXFBf/v//0/efqFCxcwcOBABAQEYODAgbh48WKNthERkeGolQDq1q0bvv32Wzg6OupMj4iI\nwODBg5GcnIzBgwcjPDy8RtuIiMhw1EoAeXp6wt7eXmdabm4u0tLSEBwcDAAIDg5GWloa1Gp1jbQR\nEZFhqdIluZ+mzMxM2NnZwcjICABgZGQEW1tbZGZmQpKkp95mZWVVpfqsrRs/xbV9ttjYWIguoV5h\nf1J9JSyADF1u7m1otZLoMqrEUD6orl+/JbqEJ2YofQnUj/6k+k+pVFT5i7uwALK3t0d2djY0Gg2M\njIyg0WiQk5MDe3t7SJL01NuIiMiwCDsM29raGq6urkhISAAAJCQkwNXVFVZWVjXSRkREhkUhSVKN\njzMtWLAAKSkpuHHjBp577jmoVCokJiYiPT0ds2fPRkFBASwtLREVFYWWLVsCQI20VUVdHYL7LXq0\n0Bo8Zq6rF0NGNjYWSBo+UnQZCNq0oV70J9V/1RmCq5UAqosYQNXDAHq6GEBUV1QngHgmBCIiEoIB\nREREQjCAiIhICAYQEREJwQAiIiIhGEBERCQEA4iIiIRgABERkRAMICIiEoIBREREQjCAiIhICAYQ\nEREJwQAiIiIhGEBERCQEA4iIiIRgABERkRAMICIiEoIBREREQjCAiIhICAYQEREJwQAiIiIhGEBE\nRCQEA4iIiIRgABERkRAMICIiEoIBREREQhiLLgAA/Pz8YGJiAlNTUwDA9OnT0blzZ5w8eRLh4eEo\nKiqCo6MjlixZAmtrawCodhsRERkGg9kCWr58OeLi4hAXF4fOnTtDq9VixowZCA8PR3JyMjw9PRET\nEwMA1W4jIiLDYTAB9LDU1FSYmprC09MTABAaGop9+/Y9URsRERkOgxiCA0qH3SRJgoeHB6ZNm4bM\nzEw4ODjI7VZWVtBqtcjPz692m0qlqtV1IiKiyhlEAH377bewt7fH/fv3sXDhQkRGRsLf319oTdbW\njYUuvy6zsbEQXUK9wv6k+sogAsje3h4AYGJigsGDB2PChAkYPnw4MjIy5HnUajWUSiVUKhXs7e2r\n1VYVubm3odVKT7hmtctQPqiuX78luoQnZih9CdSP/qT6T6lUVPmLu/B9QHfv3sWtW6VvMEmSkJSU\nBFdXV7i5uaGwsBAnTpwAAGzbtg2BgYEAUO02IiIyHMK3gHJzczF58mRoNBpotVo4OTkhIiICSqUS\n0dHRiIiI0DmcGkC124iIyHAoJEmqW+NMtaSuDsH9Fj1aaA0eM9fViyEjGxsLJA0fKboMBG3aUC/6\nk+q/OjkER0REzyYGEBERCcEAIiIiIRhAREQkBAOIiIiEYAAREZEQDCAiIhKCAUREREIwgIiISAgG\nEBERCcEAIiIiIYSfjLQusLBsiIamDYTWUFhUjFsFhUJrICJ6mhhAemho2gCDZ34rtIYt0UNwCwwg\nIqo/OARHRERCMICIiEgIBhAREQnBACIiIiEYQEREJAQDiIiIhGAAERGREPwdEBHVuCaWJjAxNRVd\nBu4XFeFmwX3RZdD/YQARUY0oWKqDAAAQ7ElEQVQzMTXFZ3PGiS4D0xavBcAAMhQcgiMiIi
EYQERE\nJAQDiIiIhGAAERGREAwgIiISot4G0IULFzBw4EAEBARg4MCBuHjxouiSiIjoAfU2gCIiIjB48GAk\nJydj8ODBCA8PF10SERE9oF7+Dig3NxdpaWnYsGEDACA4OBjz58+HWq2GlZWVXo+hVCp0bjd9rtFT\nr7OqHq6pIiaW1rVQyaPpU2ddYNZUfF8C9ac/LVXsz6fFwsIUJiZir9J8/34xbt0qkm9Xp18VkiRJ\nT7MoQ5CamopZs2YhMTFRnhYUFIQlS5agTZs2AisjIqIy9XYIjoiIDFu9DCB7e3tkZ2dDo9EAADQa\nDXJycmBvby+4MiIiKlMvA8ja2hqurq5ISEgAACQkJMDV1VXv/T9ERFTz6uU+IABIT0/H7NmzUVBQ\nAEtLS0RFRaFly5aiyyIiov9TbwOIiIgMW70cgiMiIsPHACIiIiEYQEREJAQDiIiIhGAAERGREAwg\nIiISol6ejJT0c+/ePRw+fBiZmZkASs8g0blzZ5ibmwuujIieBfwd0DPqxx9/RFhYGNzc3ORTFGVm\nZiI1NRXz589H165dBVf4XxkZGdi3b59OUAYEBMDR0VFwZbpYJxkyQ/zCyQCqIYb+Jg8KCsLq1avR\nrFkznekXL17EhAkTsHfvXkGV6dqxYwdWrlyJ7t276wTlwYMHMXHiRAwYMEBwhaVY59Nn6O+hMnWh\nTkP9wskAqgF14U3eo0cPpKSkVLmttgUEBGDr1q3lzuOnVqsRGhrKOquortRZF95DQN2p01C/cHIf\nUA1Yt24ddu3aVe5NPnHiRISGhhrEi7JNmzYIDw/HwIED4eDgAKD0m9x3330HV1dXwdX9l1arrfAk\nss899xwM6bsT63y66sJ7CKg7dZaUlJQLHwBo3ry5fNUAERhANaAuvMkXL16M9evXY9asWcjIyIBC\noYCDgwMCAgIwZ84c0eXJfH19MXr0aLzzzjs6Qbl9+3Z06tRJcHX/xTqfrrrwHgLqTp2G+oWTQ3A1\n4F//+heuXLlS4Zv8hRdewCeffCK2wDpEq9Viz5492Lt3LzIyMgAADg4OCAwMREhICJRKw/glAet8\nuurKe6iu1FlYWIj169fLz/uDXzjfe+89mJmZCamLAVQD6sqbvDLXr1+HjY2N6DLoGVZX3kN1pU5D\nxQCicvr06YPdu3eLLuOxTp06hTZt2ogu47FYJxkykV84Gc+17NSpU6JLqNR//vMfAKgT4QMAn3/+\nuegS9MI6ny5Dfg89qK7UOWbMGGHL5hZQLRs7diy+/PJL0WXg/Pnz5aa99957+J//+R9IkoRWrVoJ\nqOrx7ty5g4sXL6JZs2Zo3Lix6HLqvHv37iE9PR0vvfQSLC0tRZejF0N5Dz1OXalTJAbQM6p169Zw\ndHTUOVInOzsbdnZ2UCgUOHjwoMDq/is8PBwffPABrKys8Ntvv2Hy5Ml47rnnoFarsWTJEvj6+oou\nEQDg7e2NXr164e233zaow9gftn//fsyaNQu2traIjo7G1KlTYWZmhtzcXCxevBh+fn6iS6RniUS1\nKjg4WHQJkiRJ0ooVK6TRo0dL165dk6d17dpVYEUV69Wrl/z/sGHDpD///FOSJEn6559/pL59+4oq\nq5yuXbtKCxculHx8fKQ+ffpI33zzjZSfny+6rHJCQkKkM2fOSMePH5e8vLyk3377TZIkSTp//rwU\nEhIiuLry1Gq1lJaWJqWlpUlqtVp0OXWWWq2W5s6dK40cOVLavHmzTtukSZMEVSVJ/B1QDahoeKtM\nXl5eLVZSuUmTJiEtLQ3Tpk1DSEgIBg0aBIVCIbqscoqKiuT/79y5g3bt2gEAWrRogeLiYlFlldOk\nSRPMnTsXM2bMwMGDB7Fz504sXboUb775Jvr3729Qv7FxcXEBADRq1AivvvoqAMDJyUlkSeVcvnwZ\nH3/8MdLS0mBrawsAyMnJwSuvvIJ//etfaN68udgC9d
CrVy/Ex8eLLgMAEBERgRdeeAFdunTB1q1b\ncfToUSxbtgzGxsa4cuWKsLoYQDUgODi43PBWmfz8fAEVVeyVV17Bpk2bsHz5cowYMcKgPtDLdOzY\nEZ9++immTp0Kb29vJCUlISgoCEeOHIFKpRJdXjkNGjRAYGAgAgMDkZ2djV27dmH+/PnYt2+f6NIA\nAAqFAunp6SgoKMDdu3dx8uRJuLu748KFC0J/Ef+wmTNnYvDgwdiwYYN8KLNWq0V8fDxmzZqF7777\nTnCFperCl02g9JQ7y5cvBwD4+/sjMjIS48aNwxdffCG0Lu4DqgHdunXDli1bYGdnV66tS5cu+Omn\nnwRU9WgnT57E8ePHMXbsWNGl6Lh//z6io6MRFxcHlUqFK1euwNjYGN7e3vjkk0/w4osvii4RQN05\ndP1///d/MWvWLCiVSsTGxuLLL7/E9evXkZWVhU8++QTBwcGiSwQABAYGVhraj2qrbRXtSy2Tk5OD\n1NRUAVWV17Nnz3Lne4uKikJaWhpycnLEnXxY2OBfPfbpp5/KY+sPmz9/fi1XUz/cuXNHOn36tHTq\n1CmD3Bdw9epV0SVUS0lJifT3339L169fF12KjoEDB0rx8fGSVquVp2m1WikuLk4aMGCAwMp0+fn5\nSVlZWRW2vfHGG7VcTeXGjBkjHT9+vNz0pUuXSi4uLgIqKsUtICIyOBcvXkRERAROnz4tjyRkZ2ej\ndevW+OSTT9CyZUvBFZaKioqCv7+/vC/tQQsWLEBYWJiAqsrLz8+HQqFAkyZNyrWdP39e2M8uGEBE\nZLDUarXOdXYqOvEnPRmRB0vwIAQiMlhWVlblQseQji57FEOq01APlmAAEZHBqewDU5Ikgzq6zFA/\n2B9mqEfmMoCIyOAY6gfmw+pKnY6Ojo88MlcUBhARGRxD/cB8WF2ps0ePHrh27VqFdfr7+wuoqBQD\niIgMjqF+YD6srtQ5a9asSttEHqnHo+CIiEgIXg+IiIiEYAAREZEQDCCqc/z8/OSrt1ZFeHg4Vq1a\nBQA4duwY3njjjUrnnT17NmJjYwEAJ06cQEBAQPWKrUUP9suaNWswb948wRU93p49ezBq1KhK24cN\nG4YdO3bUYkVUm3gQAj0zIiMjq3U/T09PJCcnP+Vqatb48ePl/69evYpu3brh1KlTMDZ+/Fv+2LFj\nmDFjBg4fPlyTJQIAevfujd69e9f4csgwcQuIiIiEYABRnfT3338jKCgIr732GubMmYOioiLs3LkT\ngwYN0pnPxcUFly5dAqA7rPawtLQ09O3bFx06dMAHH3ygcyG8h4fr/Pz8sH79evTq1QseHh7l5v/q\nq6/g6+sLX19f7NixQ6eGvLw8jB8/Hq+++ir69++PZcuWyTVfvXoVLi4uKCkpkR/rwSGoy5cvY/jw\n4fD29oa3tzc++ugjFBQUVLg+K1aswPTp0wEAQ4cOBQC89tpr6NChA44fPw4vLy+cPXtWnj83Nxft\n27fHtWvXMGbMGOTk5KBDhw7o0KEDsrOz0b59e51f9p86dQo+Pj4oLi7Gzp07ERoaisjISHh4eCAw\nMBBHjx6V57116xbmzp0LX19fdO7cGbGxsfK1hx5+zo4cOYLAwEB4eHggMjKywh94Uv3BAKI6KT4+\nHuvXr8f+/ftx4cKFJ7qw1v379zFx4kSEhITg+PHjCAwMREpKyiPvs3fvXqxbtw4HDx7E2bNnsXPn\nTgDA4cOHsXHjRmzYsAH79+/HsWPHdO4XGRkJU1NT/Pzzz1i0aBF++OEHveuUJAnjxo3Dv//9b+zd\nuxdZWVlYsWLFY++3efNmAMCvv/6KP/74A15eXggKCsKePXvkeRISEtCxY0c4Ojriq6++gq2tLf74\n4w/88ccfsLOzg5eXl841Y+Li4vDWW2+hQYMGAIC//voLL730En755RdMmTIFkyZNks8EMHv2bBgb\nGyMlJQW7d+/GkS
NHKtyvo1arMWnSJHzwwQf45Zdf8NJLL+H333/Xu3+o7mEAUZ00ZMgQ2NvbQ6VS\nYcKECUhMTKz2Y/35558oLi7Gu+++K1/RtG3bto+8z7Bhw2BnZweVSoWuXbvi9OnTAEqDqV+/fnj5\n5ZdhZmaGyZMny/fRaDRISUnBlClTYG5uDmdnZ/Tt21fvOps1a4ZOnTrBxMQEVlZWGDlyJH799ddq\nrXPfvn2RmJgob2HExcU9cl9M37595cDSaDRITExESEiI3G5lZSX3X1BQEFq0aIEff/wRN27cwE8/\n/YS5c+fC3Nwc1tbWGDFiRIXP1+HDh/Hyyy8jMDAQDRo0wLvvvoumTZtWa/2obuBBCFQn2dvby/87\nODggJyen2o+Vk5MDOzs7KBQKncd8FBsbG/l/MzMzefk5OTlwc3OrsE61Wo2SkpJytevrxo0bWLhw\nIU6cOIE7d+5AkiRYWlrqff8HtW/fHg0bNsSxY8dgY2ODy5cvo1u3bpXO361bN0RERODKlSu4cOEC\nGjdujHbt2sntFfVfTk4OMjIyUFJSAl9fX7lNq9Xq9EGZnJwcPP/88/JthUJR4XxUfzCAqE4qu0YM\nAGRkZMDW1hZmZmYoLCyUp1+/fl2vx7KxsUF2djYkSZI/RDMyMqp1uW9bW1tkZ2dXWKeVlRWMjY2R\nmZkJJyencu3m5uYAgMLCQjRu3LjcOnz22WdQKBSIj4+HSqXCgQMH9Dqy78FgeFDZVo2NjQ0CAgJg\nampa6fympqbo2bMn9uzZg3/++Udn6wdAuf7LzMyEn58fnn/+eZiYmOCXX3557BF4NjY2yMrKkm9L\nkqTTP1T/cAiO6qQtW7YgKysL+fn5WLNmDYKCgtC6dWucO3cOp0+fRlFRkV77RwDA3d0dxsbG2LRp\nE4qLi5GSkoK///67WnUFBgZi586dSE9Px71793T2TRkZGcHf3x8rV67EvXv3cP78eezatUtut7Ky\ngp2dHeLi4qDRaPD999/jypUrcvudO3dgbm4OCwsLZGdnY926dXrVZGVlBaVSqfNYQOkh0AcOHMCe\nPXvQp08febq1tTXy8/Nx69YtnflDQkKwa9cuHDp0qFwAqdVquf/27t2L9PR0dOnSBba2tujUqRM+\n/fRT3L59G1qtFpcvX8bx48fL1dmlSxecO3cOKSkpKCkpwaZNm3Djxg291pHqJgYQ1UnBwcEYNWoU\nunfvjpdeegkTJkxAixYtMHHiRIwYMQI9evSAh4eHXo9lYmKCFStWYNeuXfDy8kJSUlK1TyTZpUsX\nDBs2DMOHD4e/vz/at28vLwMo/THs3bt30alTJ8yePRv9+vXTuf/8+fOxfv16eHt74/z58+jQoYPc\nNmnSJKSlpcHT0xNjx45Fjx499KrJzMwM48ePx6BBg+Dp6YmTJ08CKB0efOWVV6BQKODp6SnP7+Tk\nhLfeegvdu3eHp6envEXn4eEBpVKJNm3awNHRUWcZ7dq1w6VLl+Dj44Nly5Zh+fLleO655wAA0dHR\nKC4ulo9anDJlSoVbp1ZWVvj888+xdOlSeHt749KlSxVe6prqD56MlKgGpaenIzg4GH///XeFQ1A7\nd+7Ejh07sHXrVgHVAXPmzIGtrS0+/PBDveYfPnw4evXqhQEDBsjTRK8D1V3cAiJ6yvbv34/79+/j\n5s2bWLJkCbp27arXGQhq29WrV7F//370799fr/n/+usvpKWloWfPnjVcGT0rGEBET9m2bdvQsWNH\n+Pv7w8jICJ988onokspZtmwZevXqhffee0+vgy1mzZqFkSNHYu7cufIBEkRPikNwREQkBLeAiIhI\nCAYQEREJwQAiIiIhGEBERCQEA4iIiIT4/9XIitKxsMjJAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [] - } - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "KOHPCFRSp5y9", - "colab_type": "code", - "outputId": "e0f3fe2e-a82a-49e8-a798-a3f79a30bcee", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 274 - } - }, - "source": [ - "# let's look more into year built vs type \n", - "plt.plot(pd_train.yearbuilt, pd_train.buildingqualitytypeid, 'ro')\n", - "# display the graph\n", - "plt.show()" - ], - "execution_count": 188, - "outputs": [ - { - "output_type": "display_data", - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX8AAAEBCAYAAACQbKXWAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAGxtJREFUeJzt3WtwE+ehBuB3JUXGxqi2ZAPmmmYa\nU2caQusMZtwUgiGYNkBaM1NSLk4GEtqmpNCGSQlNSxoIRCWhYQKJSQ8tIcnAH3toAz2FdLikQKHQ\nXKgzFFNjwDPgiyRzjIkvSPrOD5DiiyTrstLu+nufX7Cr/fbd1eqVWC1aRQghQEREUjFpHYCIiFKP\n5U9EJCGWPxGRhFj+REQSYvkTEUmI5U9EJCGWPxGRhFj+REQSYvkTEUmI5U9EJCGWPxGRhFj+REQS\nYvkTEUnIonWAlpYb8PsFHI5MuN1tWseJC7Nrg9lTz6i5gYGT3WRSkJ09OOExNS9/v1/A7xfBPxsV\ns2uD2VPPqLkBZu+Op32IiCTE8icikhDLn4hIQlGVv9PpRElJCcaNG4eamhoAQEtLC5588kmUlpZi\n9uzZWLZsGTweT1LDEhGROqL6wnfatGkoLy/HggULgtMURcETTzyBoqIiALfeIF555RWsX78+OUkp\n6VpPHIerqhJejxswmQC/Hxa7Azllc2GbVNz3MQEmE2yTp2D4wscijtl7rHizBcbJnV0a97YmItpt\nCrmvbqsJMW5grGvH/o6Os2dDzutvPRa7Axnjx+PzM2d65AMQ9rntPi8w7fP/nkfrh0cAv7/P89vw\n7ttfzOtNUYDetwW/vT5TZiaEEBA3bgSnxcxkwqBx4+BtbILX44aSlgbR2RnVfmwYfy+6bnr77FtV\nmM2Az9cjZ7jXhF4osdzAvaSkBBUVFcjPz+8zb//+/di1axd27NgRUwC3uw1+v0Bu7hA0N1+PaVm9\nGAjZW08cR+POHRBdXX0eo1itGFb+OACEfQwA2B6c2uNgDzVmYKxY3gDCjfOVZT+Gcs/Xox5HDdFu\nU6T9GVGo8kx0PWYzAAXweaObFyaD7cGpSE+3ovF/90e5MRqKsB9TqfdrIl7dO8ZkUuBwZCY8pirn\n/P1+P3bt2oWSkhI1hiMNuKoqwxaI6OqCq6oy4mMA3Po02M+YgbESzSa6unD5nfdiGkcN0W5Tf/sq\nrAiFFfd6fL7QxR9uXpgMrR8eQeP+DyKvSy90UPxA39eEnqhynf/atWuRkZGBhQsXxrxs93ew3Nwh\nasTRhNGz17RE/r7G2898AIDf32M/hBvT2+KJaX+FG6fT5U75fo92m/rbn/FK1XpCiuc0jex6vSYS\nofaxnnD5O51OXLp0CRUVFTCZ
Yv+HBE/7aCuQ3ZJtD3luOsCSbQeAiI+BydRjP4Qb05Jtj2l/hRsn\nLceR8v0e7Tb1tz8TWX8q1hNS4PXNN4Ho9XpNxEt3p302bdqE6upqbN26FVarNeEwpJ2csrlQwjyH\nitWKnLK5ER8DALbJU/odMzBWotkUqxVjFi0Is0TyRLtN/e2rsBQl/Kx412M2A+Ywn/NCzQuTwTZ5\nCoaVPhR5XXoRYT+mUu/XhJ6YX3jhhRf6e9C6devw/PPPo6mpCfv370dVVRUmTpyIZ555BnfccQf2\n7NmD3bt34+jRo3j44YdjCtDe3gUhgMGD0/D553GcI9WBgZA9bdRo3OFwoOPiRfjb2299yhMCFrsD\nQx+dD9uk4r6PCTCZYJvyYJ8vtno/vvtYsQg3zugZ01K+36PdprD7KgKL3YGhCxbC2/45vC5X33lR\nrMdidyCzaBJ8rde/yDd/ATK//vXQz22veYEMJpsNnZcv3zp33u35HT2lGP/X0PzFvN5Cle7t9Zky\nM4E77gBu3gxOi5nJhEFf/SrgF/C3t0NJS+t5lU2E/Wgbfy8Uu6PPvlWF2dxze8K8JuLVvWMURUFG\nRuIftmO62icZeNpHW8yuDaNmN2puYOBk18VpHyIiMiaWPxGRhFj+REQSYvkTEUmI5U9EJCGWPxGR\nhFj+REQSYvkTEUmI5U9EJCGWPxGRhFT5SWciolipdZc3ig/Ln4hSrvcdyLweNxp37gAAvgGkCE/7\nEFHKqXWXN4ofy5+IUi7cDWhSdmMaYvkTUepZ7I6YppP6WP5ElHJq3eWN4scvfIko5QJf6vJqH+2w\n/IlIE7ZJxSx7DfG0DxGRhFj+REQSYvkTEUmI5U9EJCGWPxGRhFj+REQSYvkTEUmI5U9EJCGWPxGR\nhPotf6fTiZKSEowbNw41NTXB6XV1dZg3bx5KS0sxb948XLx4MZk5iYhIRf3+vMO0adNQXl6OBQsW\n9Ji+Zs0azJ8/H4888gj+9Kc/4de//jV27tyZtKBaa3j3bbR+eATw+wGTCbbJU5DxlbvhqqpEjccN\nmEyA35/U3yjpfuejIEUBrFagszOqDL3HqOnziNQJ5Gx4+4/AzZsxLx8u+6CCAox55hd9pl9+1YmO\ns2e/mJCeDrS3x7xeNWi53xNh1NyAzrKnpyP/9Tc1jaAIIUQ0DywpKUFFRQXy8/PhdrtRWlqKkydP\nwmw2w+fzoaioCAcOHIDdbo8pgNvdBr9fIDd3CJqbr8e1EcnW8O7baD18qO8MRQFC7D7FasWw8sdV\nfQPofeej/oTKEOsYRtb7DaBP8RNpLYY3gO79aDIpcDgyE159XOf8r169imHDhsFsNgMAzGYzhg4d\niqtXryYcSI9aPzwSekaY981k3JEo1J2PIgmVIdYxjKx30bP4SXc0+ldngOa/6tn9HSw3d4iGScKr\n8ftjXsbb4lF1e2paPAlniGcMI+ux7RrmIAonlo5Qux/jKv+8vDw0NjbC5/MFT/s0NTUhLy8v5rGM\ncNoncC49FpZsu6rbY8m2x3yLu94Z4hnDyHR7PBHdFu0xqpvTPg6HAwUFBdi7dy8AYO/evSgoKIj5\nfL9R2CZPCT1DUUJPTsIdiULd+SiSUBliHcPIBhUURPw7kebS0zVdfb9f+K5btw4HDhyAy+VCdnY2\nsrKysG/fPtTW1mLVqlVobW2FzWaD0+nEXXfdFXMAQ3zyR+SrfbwGvtpHS4le7ROOEa72IcnFeLVP\nMj75R321T7IYpfwjYXZtMHvqGTU3MHCya3rah4iIjI3lT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0Qk\nIZY/EZGEWP5ERBJi+RMRSYjlT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0QkIZY/EZGEWP5ERBJi+RMR\nSYjlT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0QkIZY/EZGELI
kO\ncOjQIWzevBlCCAghsGzZMsyYMUONbERElCQJlb8QAs8++yzee+895Ofn4z//+Q9+8IMfYPr06TCZ\n+I8KWbSeOA5XVSW8HjcsdgdyyubCNqlY61hxGUjbYiQDab+H2hYAutu+hD/5m0wmXL9+HQBw/fp1\nDB06lMUvkdYTx9G4cwdEVxcAwOtxo3HnDgDQ/OCO1UDaFiMZSPs91LY0/PEPAATg8wWn6WH7Empp\nRVHw2muv4amnnsLUqVPxk5/8BE6nU61sZACuqsrggR4gurrgqqrUKFH8BtK2GMlA2u+htgU+b7D4\nA/SwfQl98vd6vdi2bRveeOMNFBYW4l//+hdWrFiBffv2YfDgwVGN4XBkBv+cmzskkTiakjV7TYsn\n5HRviycl+0TNdaR6W4x6zKidO5X7Pdn7PNy2hBLr9qmdPaHyP3v2LJqamlBYWAgAKCwsRHp6Ompr\nazF+/PioxnC72+D3C+TmDkFz8/VE4mhG5uyWbDu8HnfI6cneJ2rv91Rui1GPmWTkTtV+T8U+D7ct\n4R4bbZ7u2U0mpceH5ngldNpn+PDhaGhowIULFwAAtbW1cLvdGDNmTMLByBhyyuZCsVp7TFOs1uCX\nXEYykLbFSAbSfg+1LTBbALO5xyQ9bF9Cn/xzc3PxwgsvYPny5VAUBQCwfv16ZGVlqRKO9C/whZXe\nrmSIx0DaFiMZSPs93LaEmqb19ilCCKFlAJ720Raza8Oo2Y2aGxg42XVx2oeIiIyJ5U9EJCGWPxGR\nhFj+REQSYvkTEUmI5U9EJCGWPxGRhFj+REQSYvkTEUmI5U9EJKGEb+aSajVP/xhob+8xLfBbGZ//\n9zxaPzwC+P2AyQTb5CloPXwo+ZmSvobkYXZtGDW7UXMDGmZXFEAIwGQC/H7+tk9ALL/tE6r4gwI7\nmIhI5xSrFcPKH4/6DYC/7ROu+AEWPxEZhh7u5GWs8iciGiCivelLsrD8iYg0YLE7NF2/sco/PT38\nvNs3kyEi0js93MnLUOWf//qbId8ALHYHhi95ErYHp976Rh24dbXPg1NTnJCIqJfAB9Pb3WSxO2L6\nsjdZDHW1j14xuzaYPfWMmhsYONnlvNqHiIhUwfInIpIQy5+ISEIsfyIiCbH8iYgkxPInIpIQy5+I\nSEIsfyIiCbH8iYgklPDNXDo7O7F+/Xr84x//QFpaGiZMmIC1a9eqkY2IiJIk4fLfuHEj0tLSsH//\nfiiKApfLpUYuIiLDaz1xHK6qSng9bt3cwSsgofK/ceMG9uzZgyNHjkC5/eNFOTk5qgQjIjKy1hPH\n0bhzB0RXF4Bbv9/fuHMHAOjiDSChc/719fXIysrCli1bUFZWhkWLFuH06dNqZSMiMixXVWWw+AP0\ncAevgIQ++ft8PtTX1+Oee+7BL37xC3z66af40Y9+hA8++ACZmdH96lz3X6fLzR2SSBxNMbs2mD31\njJobSG32mhZPyOneFk9cOdTOnlD55+XlwWKxYNasWQCA++67D9nZ2airq8O9994b1Rj8SWdtMbs2\njJrdqLmB1Ge3ZNtD3qrRkm2POYfuftLZbrejqKgIx44dAwDU1dXB7XZj7NixCQcjIjKynLK5UKzW\nHtP0cAevgISv9vnNb36D1atXw+l0wmKx4Le//S1sNpsa2YiIDCvwpe6AvNoHAEaPHo133nlHjSxE\nRAOKbVKxbsq+N/4PXyIiCbH8iYgkxPInIpIQy5+ISEIsfyIiCbH8iYgkxPInIpIQy5+ISEIsfyIi\nCbH8iYgkxPInIpIQy5+ISEIsfyIiCbH8iYgkxPInIpIQy5+ISEIsfyIiCbH8iYgkxPInIpIQy5+I\nSEIsfyIiCbH8iYgkxPInIpIQy5+ISEIsfyIiCbH8iYgkxPInIpKQauW/ZcsWjBs3DjU1NWoNSURE\nSWJRY5DPPvsMn3zyCUaOHKnGcDFrPXEcrqpKeD1uWOwO5J
TNhW1Scb/z4h3z8qtOdJw9G3xsj7c7\nRQGsVqCzEzCZAL8/uDyA4JiBeVoz8ls1s6eeUXMD+s2e/z87NFlvwuXf1dWFF198Ea+++irKy8vV\nyBST1hPH0bhzB0RXFwDA63GjceeO4Pxw8yK9AUQa89qxv/co/j6EuFX8QLDcvR43Gv74BwAC8Pl6\nzCMiudU88bgmbwAJl//mzZsxZ84cjBo1So08MXNVVQZLOkB0dcFVVRn8c6h5kco/0phejzu+oD5v\nfMsRESVBQuX/8ccfo7q6GitXrox7DIcjM/jn3NwhMS9f0+IJOd0bZnpgXqR1xTMmEVG8oum+ePox\nkoTK/9SpU6itrcW0adMAAA0NDViyZAk2bNiABx54IKox3O42+P0CublD0Nx8PeYMlmx7yE/jlmw7\nAISdF2ldkcaM+5M/EVEY/XVf9340mZQeH5rjldDVPkuXLsXRo0dx8OBBHDx4EMOHD8f27dujLn41\n5JTNhWK19pimWK3IKZsbcV68Yw4qKIgvqNkCmM3xLUtEpDJVrvbRUuDcfaQremK92ifSmLZJxX2u\n9unBYFf7EJG2tLraRxFCCE3WfFuip330gNm1weypZ9TcwMDJrovTPkREZEwsfyIiCbH8iYgkxPIn\nIpIQy5+ISEIsfyIiCbH8iYgkxPInIpIQy5+ISEIsfyIiCbH8iYgkxPInIpIQy5+ISEIsfyIiCbH8\niYgkxPInIpIQy5+ISEIsfyIiCbH8iYgkxPInIpIQy5+ISEIsfyIiCbH8iYgkxPInIpIQy5+ISEIs\nfyIiCbH8iYgkZElk4ZaWFjz77LO4fPkyrFYrxo4dixdffBF2u12tfERElAQJffJXFAVPPPEE9u/f\nj/fffx+jR4/GK6+8olY2IiJKkoTKPysrC0VFRcG/T5gwAVeuXEk4FBERJZcihBBqDOT3+7F48WKU\nlJSgvLxcjSGJiChJEjrn393atWuRkZGBhQsXxrSc290Gv18gN3cImpuvqxUnpZhdG8yeekbNDQyc\n7CaTAocjM+ExVSl/p9OJS5cuoaKiAiYTLyAiItK7hMt/06ZNqK6uxltvvQWr1apGJiIiSrKEyv/8\n+fPYtm0b7rzzTjz66KMAgFGjRmHr1q2qhCMiouRIqPzvvvtunDt3Tq0sRESUIjxBT0QkIZY/EZGE\nWP5ERBJi+RMRSYjlT0QkIZY/EZGEWP5ERBJi+RMRSYjlT0QkIdV+1ZNuaT1xHK6qSng9bljsDuSU\nzYVtUnHMy2WMH4/Pz5wJOU73x5oyMyGEgLhxo9/lQq2vxuMGTCbA71d9XyRbTZjpSloaRGcnLHYH\nLMOGouPcOd1tX7jsemfU3IAOs5tMsE2eguELH9Nk9ar9nn+8BtJPOreeOI7GnTsgurqC8xSrFcPK\nH4/4BhBqud4C4wDo97Ghluu+/mjWR0SpYXtwar9vAMn4SWee9lGRq6qyT6GKri64qipjXq63wDjR\nPLa/9cc6BhElT+uHRzRZL0/7qMjrccc0Pdr5sT6uv+XiHYeIkkCjU5L85K8ii90R0/Ro53d/XLSP\njTR+PGMQUZJodAMslr+KcsrmQul1QxvFakVO2dyYl+stME40j+1v/bGOQUTJY5s8RZP18rSPigJf\nqsZ6tU+o5fq7aieRq316r8+oV/uEY4SrfYh4tc8AutrHiJhdG0bNbtTcwMDJzqt9iIgobix/IiIJ\nsfyJiCTE8icikpDmV/uYTErIPxsNs2uD2VPPqLmBgZFdrW3Q/GofIiJKPZ72ISKSEMufiEhCLH8i\nIgmx/ImIJMTyJyKSEMufiEhCLH8iIgmx/ImIJMTyJyKSUFLK3+l0oqSkBOPGjUNNTU1w+qFDh/Dd\n734XjzzyCObMmYMDBw5ENa+urg7z5s1DaWkp5s2bh4sXLyYjdsTshw8fxve+9z3Mnj0bCxcuRH19\nfVT59Jy9paUFTz75JE
pLSzF79mwsW7YMHo8nuNwnn3yCOXPmoLS0FIsXL4bbnbx7/8az3wO2bNnS\nZzm9Z+/s7MSaNWswY8YMzJ49G7/61a+C8/R8zAD6eK1GOnYjPffxztM6e11dHRYtWoSZM2di1qxZ\neO6559DR0REc8+DBg5g5cyYeeughrFixAu3t7f0HEUlw6tQpceXKFTF16lRx7tw5IYQQfr9f3H//\n/cG/nz17VkyYMEH4fL6I84QQYtGiRWLPnj1CCCH27NkjFi1alIzYYbNfu3ZNTJw4UVy4cCGYYfHi\nxcFlIuXTc/aWlhZx4sSJ4PIvv/yyeO6554QQQvh8PjF9+nRx6tQpIYQQW7duFatWrdJN9oDq6mqx\nZMmSHssZIfvatWvFSy+9JPx+vxBCiObm5uA8PR8zenmthjt2Iz338c7TQ/b6+nrx2WefBbMuX75c\nbNmyRQghRFtbmyguLhZ1dXVCCCFWr14tXn/99X5zJKX8A3qX/8SJE8Xp06eFEEL885//FDNmzOh3\nnsvlEoWFhcLr9QohhPB6vaKwsFC43e5kRu+R/dNPPxXf+c53gvNaWlpEfn6+cLvdEfPpPXtvf/3r\nX8Vjjz0WXO7hhx8OznO73WLChAlJzS1EbNk7OzvF97//fVFfX99nOT1nb2trE4WFhaKtra3PGHo/\nZvT4WhXii2M30nMf7zw9ZO9t+/btYvXq1UIIIf7yl7+IpUuXBuedOXOmx/MXTsp+1VNRFLz22mt4\n6qmnkJGRgRs3buCtt97qd97Vq1cxbNgwmM1mAIDZbMbQoUNx9epV2O32lGT/8pe/DJfLhTNnzmD8\n+PF4//33g9mEEGHzRZqnh+zdM/j9fuzatQslJSXB+SNGjAjOt9vt8Pv9uHbtGrKysnSRffPmzZgz\nZw5GjRrVYzm9ZzebzcjKysKWLVtw8uRJDB48GMuXL8f999+v++Pdbrfr7rXa/diN9NzHOy+Zx0y0\n2btn6OjoQGVlJX7+858D6Hu8jxgxAlevXu133Sn7wtfr9WLbtm144403cOjQIbz55ptYsWIFbty4\nEXGeHgwZMgS/+93vsGHDBpSVlcHtdsNmswUPcj2LNvvatWuRkZGBhQsXapS0r0jZP/74Y1RXV2P+\n/PlaxwwpUnafz4f6+nrcc889qKqqwsqVK/H000+jra1N69gAImfX42tVj8dutGLN7vV68bOf/QyT\nJk3CtGnTElp3yj75nz17Fk1NTSgsLAQAFBYWIj09HbW1tVAUJey8kSNHorGxET6fL/jCaWpqQl5e\nXqqiAwCKi4tRXFwMAHC5XNi+fTvGjBmD9vb2sPmEELrOHuB0OnHp0iVUVFTAZLr1eSAvLw9XrlwJ\nPsbj8cBkMqXsk3N/2d99913U1tYGXwANDQ1YsmQJNmzYoPvsHR0dsFgsmDVrFgDgvvvuQ3Z2Nurq\n6jBixAhdHzORXsdavFZ7H7uRnvt45+khOwD4fD6sXLkSX/rSl/D8888HH5eXl4eTJ08G/37lypWo\n9nnKPvkPHz4cDQ0NuHDhAgCgtrYWbrcbY8aMiTjP4XCgoKAAe/fuBQDs3bsXBQUFKfsncEBzczOA\nW/9M27RpEx599FFkZGREzKf37ACwadMmVFdXY+vWrbBarcFlvva1r6GjowOnT58GAOzevRszZ85M\nae5I2ZcuXYqjR4/i4MGDOHjwIIYPH47t27fjgQce0H12u92OoqIiHDt2DMCtKzncbjfGjh2r+2NG\nT6/VUMdupOc+3nl6yO73+7Fq1SqYzWa89NJLUJQvbujyrW99C//+97+DV1bt3r0b3/72t/vNkJSb\nuaxbtw4HDhyAy+VCdnY2srKysG/fPvz5z3/G73//+2Dwn/70p5g+fToARJxXW1uLVatWobW1FTab\nDU6nE3fddZfasSNm/+Uvf4mPPvoIN2/exDe/+U2sXr0aaWlp/ebTc/bz589j1qxZuPPO
OzFo0CAA\nwKhRo7B161YAwEcffYQ1a9ags7MTI0eOxMaNG5GTk6OL7L2VlJSgoqIC+fn5hsheX1+P1atX49q1\na7BYLFixYgWmTJkCQN/HDKCP12qkYzfScx/vPK2zHz58GD/84Q+Rn58f/Nf5N77xDaxZswYA8Le/\n/Q0bN26E3+9HQUEBXn755eAHvHB4Jy8iIgnxf/gSEUmI5U9EJCGWPxGRhFj+REQSYvkTEUmI5U9E\nJCGWPxGRhFj+REQS+n9YnE5sVgm99QAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [] - } - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_647tI5Lp94v", - "colab_type": "text" - }, - "source": [ - "### Final adjustments\n", - "- filling nans" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ofZIC0EdKJ0Y", - "colab_type": "text" - }, - "source": [ - "# -----current: test ready-----\n", - "- converting to pandas \n", - " - to see what's going on\n", - " - figuring out what can and what can't be replicated in cuML" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "-4A3-sjRp8AE", - "colab_type": "code", - "colab": {} - }, - "source": [ - "from sklearn import neighbors\n", - "# from cuml.preprocessing.model_selection import train_test_split\n", - "from sklearn.model_selection import StratifiedKFold,GridSearchCV,train_test_split\n", - "#location seems to be related to building quality, (knnclassifier)\n", - "\n", - "def fillna_knn(df, base, target):\n", - " data_colnames = [target] + base\n", - " #print(\"data_colnames\",data_colnames)\n", - " missing_values_boolflag = df[target].isnull() #true for missing rows, false for columns with values\n", - " #print(\"miss\",missing_values_boolflag.head())\n", - " not_missing_boolflag = ~missing_values_boolflag \n", - " #print(\"not miss\",not_missing_boolflag.head())\n", - " number_of_missing_val = missing_values_boolflag.sum()\n", - " print(\"# of miss\",number_of_missing_val)\n", - " not_missing_rows = df.loc[not_missing_boolflag, data_colnames]\n", - " #print(not_missing_rows.head())\n", - " Y = not_missing_rows[target]\n", - " X = not_missing_rows[base]\n", - " X_train, X_test, Y_train, Y_test = train_test_split(X, Y, \n", - " test_size=0.20,\n", - " random_state=3192,\n", - " stratify=Y)\n", - " metrics = ['euclidean'] \n", - " weights = ['distance'] \n", - " numNeighbors = [5,10,15,20,25]\n", - " param_grid = dict(metric=metrics,weights=weights,n_neighbors=numNeighbors)\n", - " cv = 
StratifiedKFold(n_splits=3,random_state=3192,shuffle=False)\n", - " grid = GridSearchCV(neighbors.KNeighborsClassifier(n_jobs=-1),param_grid=param_grid,cv=cv,scoring='f1_weighted',refit=True,return_train_score=True,verbose=1,n_jobs=-1,pre_dispatch='n_jobs')\n", - " grid.fit(X_train ,Y_train)\n", - " #print(\"grid.cv_results_\",grid.cv_results_)\n", - " print(\"grid.best_estimator_\",grid.best_estimator_)\n", - " print(\"grid.best_params_\",grid.best_params_)\n", - " print(\"grid.scorer_\",grid.scorer_)\n", - " #print(\"grid.n_splits_\",grid.n_splits_)\n", - " y_true, y_pred = Y_test, grid.predict(X_test)\n", - " \n", - " Z = grid.predict(df.loc[missing_values_boolflag, base])\n", - " #df.loc[ missing_values_boolflag, target ] = Z\n", - " return Z" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "6eES-hq--NKZ", - "colab_type": "code", - "outputId": "2bc86856-507d-47bf-cfab-d29649cba819", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 903 - } - }, - "source": [ - "# make safe copy\n", - "# test = df_train.copy()\n", - "df_train = test.copy()\n", - "# switch to pandas (figuring out what's going on)\n", - "df_train = df_train.to_pandas()\n", - "\n", - "print(df_train.info())" - ], - "execution_count": 191, - "outputs": [ - { - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 90275 entries, 0 to 90274\n", - "Data columns (total 45 columns):\n", - "parcelid 90275 non-null int64\n", - "logerror 90275 non-null float64\n", - "ac_id 28781 non-null float64\n", - "basement_sqft 90275 non-null float64\n", - "total_bath 89110 non-null float64\n", - "bedroomcnt 88854 non-null float64\n", - "buildingqualitytypeid 57364 non-null float64\n", - "deck_flag 90275 non-null float64\n", - "finished_living_area_entryfloor_sqft2 6856 non-null float64\n", - "total_finished_living_area_sqft 89614 non-null float64\n", - "finished_living_area_entryfloor_sqft1 6856 non-null float64\n", - "fips 90275 non-null 
float64\n", - "fireplace_count 90275 non-null float64\n", - "full_bath 89093 non-null float64\n", - "garagecarcnt 29937 non-null float64\n", - "garage_sqft 21017 non-null float64\n", - "has_hottub_or_spa 90275 non-null int64\n", - "heating_system_id 56080 non-null float64\n", - "latitude 90275 non-null float64\n", - "longitude 90275 non-null float64\n", - "lot_area_sqft 80125 non-null float64\n", - "pool_count 90275 non-null float64\n", - "pool_sqft 90275 non-null float64\n", - "just_hottub_or_spa 90275 non-null float64\n", - "pool_with_spa_tub_yes 90275 non-null float64\n", - "pool_with_spa_tub_no 90275 non-null float64\n", - "propertylandusetypeid 90275 non-null float64\n", - "roomcnt 88859 non-null float64\n", - "basement_flag 90275 non-null float64\n", - "half_bath 89093 non-null float64\n", - "unitcnt 90275 non-null float64\n", - "patio_sqft 90275 non-null float64\n", - "storage_sqft 90275 non-null float64\n", - "yearbuilt 89519 non-null float64\n", - "numberofstories 20581 non-null float64\n", - "fireplaceflag 90275 non-null bool\n", - "structure_tax 89895 non-null float64\n", - "total_parcel_tax 90274 non-null float64\n", - "land_tax 90274 non-null float64\n", - "total_property_tax_2016 90269 non-null float64\n", - "taxdelinquencyflag 90275 non-null int64\n", - "taxdelinquencyyear 90275 non-null float64\n", - "transaction_month 90275 non-null int16\n", - "census_tractnumber 90275 non-null object\n", - "block_number 90275 non-null object\n", - "dtypes: bool(1), float64(38), int16(1), int64(3), object(2)\n", - "memory usage: 29.9+ MB\n", - "None\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "AT8Osn51lD9v", - "colab_type": "code", - "outputId": "8ab0690a-2e06-468e-b7ce-f4d051a3ce83", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 573 - } - }, - "source": [ - "print('CURRENT DF SITUATION\\n')\n", - "print(f'SHAPE = {df_train.shape}')\n", - "print(f'NULL COUNT = 
{df_train.buildingqualitytypeid.isnull().sum()}\\nVALUE COUNTS\\n{df_train.buildingqualitytypeid.value_counts()}\\n')\n", - "print(f'BUILDINGTYPEID HEAD\\n{df_train.buildingqualitytypeid.head()}\\n')\n", - "print(f'DF TRAIN HEAD\\n{df_train.head()}')" - ], - "execution_count": 192, - "outputs": [ - { - "output_type": "stream", - "text": [ - "CURRENT DF SITUATION\n", - "\n", - "SHAPE = (90275, 45)\n", - "NULL COUNT = 32911\n", - "VALUE COUNTS\n", - "7.0 29310\n", - "4.0 23839\n", - "1.0 2627\n", - "10.0 1461\n", - "12.0 119\n", - "8.0 5\n", - "6.0 2\n", - "11.0 1\n", - "Name: buildingqualitytypeid, dtype: int64\n", - "\n", - "BUILDINGTYPEID HEAD\n", - "0 7.0\n", - "1 NaN\n", - "2 NaN\n", - "3 7.0\n", - "4 4.0\n", - "Name: buildingqualitytypeid, dtype: float64\n", - "\n", - "DF TRAIN HEAD\n", - " parcelid logerror ac_id ... transaction_month census_tractnumber block_number\n", - "0 11827818 0.0402 NaN ... 3 5315.03 1013\n", - "1 12123024 0.0296 NaN ... 3 4625.00 1017\n", - "2 13867327 0.0344 NaN ... 3 0114.01 2017\n", - "3 12681894 0.0060 NaN ... 3 6513.02 1004\n", - "4 12848541 0.0695 1.0 ... 
3 4087.03 1018\n", - "\n", - "[5 rows x 45 columns]\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "79bB7JKdAEtX", - "colab_type": "code", - "outputId": "32b79160-fd19-4d39-988a-fc5fcd7c3284", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 225 - } - }, - "source": [ - "df_train['buildingqualitytypeid'] = df_train['buildingqualitytypeid'].fillna(-1)\n", - "print(f'NULL COUNT = {df_train.buildingqualitytypeid.isnull().sum()}\\nVALUE COUNTS\\n{df_train.buildingqualitytypeid.value_counts()}')" - ], - "execution_count": 193, - "outputs": [ - { - "output_type": "stream", - "text": [ - "NULL COUNT = 0\n", - "VALUE COUNTS\n", - "-1.0 32911\n", - " 7.0 29310\n", - " 4.0 23839\n", - " 1.0 2627\n", - " 10.0 1461\n", - " 12.0 119\n", - " 8.0 5\n", - " 6.0 2\n", - " 11.0 1\n", - "Name: buildingqualitytypeid, dtype: int64\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DVgF1c_p_bN1", - "colab_type": "text" - }, - "source": [ - "# -----current: break-----\n", - "- break 1 of 2" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "mAB9bsrPAGzQ", - "colab_type": "code", - "outputId": "d847758e-212e-4de8-85c4-89b469b71c48", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 762 - } - }, - "source": [ - "# say we run this whole thing by buildingqualitytypeid\n", - "# drop building types that aren't seen at least 3 times in the data\n", - "# df_train = df_train.groupby(\"buildingqualitytypeid\").filter(lambda x: x.buildingqualitytypeid.size > 3)\n", - "\n", - "# BACK TO cuDF\n", - "df_train = cudf.from_pandas(df_train)\n", - "\n", - "print(df_train.buildingqualitytypeid.value_counts())\n", - "print()\n", - "print(df_train.buildingqualitytypeid.isnull().sum())\n", - "print(df_train.shape)\n", - "print()\n", - "\n", - "type_ids = list(set(df_train.buildingqualitytypeid.values))\n", - "from time import sleep\n", - "safe = []\n", - "for tid in type_ids:\n", 
- " print(tid)\n", - " sleep(5)\n", - " t = len(df_train.loc[df_train.buildingqualitytypeid == tid])\n", - " if t > 3:\n", - " safe.append(tid)\n", - " else:\n", - " print(f'{tid} count too low @ {t}')\n", - "for tid in type_ids:\n", - " if tid not in safe:\n", - " df_train = df_train.loc[df_train.buildingqualitytypeid != tid]\n", - "\n", - "print()\n", - "print(df_train.buildingqualitytypeid.value_counts())\n", - "print()\n", - "\n", - "df_train['buildingqualitytypeid'] = df_train['buildingqualitytypeid'].replace(-1,np.nan)\n", - "print(df_train.buildingqualitytypeid.isnull().sum())\n", - "print(df_train.shape)\n", - "\n", - "# BACK TO PANDAS\n", - "df_train = df_train.to_pandas()" - ], - "execution_count": 194, - "outputs": [ - { - "output_type": "stream", - "text": [ - "-1.0 32911\n", - " 7.0 29310\n", - " 4.0 23839\n", - " 1.0 2627\n", - " 10.0 1461\n", - " 12.0 119\n", - " 8.0 5\n", - " 6.0 2\n", - " 11.0 1\n", - "Name: buildingqualitytypeid, dtype: int32\n", - "\n", - "0\n", - "(90275, 45)\n", - "\n", - "1.0\n", - "4.0\n", - "6.0\n", - "6.0 count too low @ 2\n", - "7.0\n", - "8.0\n", - "10.0\n", - "11.0\n" - ], - "name": "stdout" - }, - { - "output_type": "error", - "ename": "ValueError", - "evalue": "ignored", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mt\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuildingqualitytypeid\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mtid\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mt\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0msafe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/core/indexing.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, arg)\u001b[0m\n\u001b[1;32m 107\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtuple\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 108\u001b[0m \u001b[0marg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mslice\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 109\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_tuple_arg\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 110\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 111\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/core/indexing.py\u001b[0m in \u001b[0;36m_getitem_tuple_arg\u001b[0;34m(self, arg)\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mas_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 219\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 220\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mas_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 221\u001b[0m \u001b[0;31m# Step 4: Downcast\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 222\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_can_downcast_to_series\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/site-packages/cudf/core/dataframe.py\u001b[0m in \u001b[0;36mindex\u001b[0;34m(self, _index)\u001b[0m\n\u001b[1;32m 1058\u001b[0m \u001b[0;34m\"have %d elements\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mold_length\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnew_length\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1059\u001b[0m )\n\u001b[0;32m-> 1060\u001b[0;31m 
\u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1061\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1062\u001b[0m \u001b[0;31m# try to build an index from generic _index\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mValueError\u001b[0m: Length mismatch: Expected axis has 1 elements, new values have 90275 elements" - ] - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Zl7eXGt_g1uU", - "colab_type": "text" - }, - "source": [ - "# -----current: break-----\n", - "- break 2 of 2\n", - " - below is last cell run" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Q3ZBSOHm-79A", - "colab_type": "code", - "outputId": "e9ddb9b3-0bb0-4cf7-fa8e-ca35b9ea7f46", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 557 - } - }, - "source": [ - "# run cell above (currently broken) as would be in pandas\n", - "not_df_train = df_train.to_pandas()\n", - "not_df_train = not_df_train.groupby(\"buildingqualitytypeid\").filter(lambda x: x.buildingqualitytypeid.size > 3)\n", - "\n", - "missing_values = fillna_knn(not_df_train, \n", - " base = ['latitude', 'longitude'], \n", - " target = 'buildingqualitytypeid')\n", - "\n", - "print(\"predicted output shape\",missing_values.shape)\n", - "missing_values_boolflag = not_df_train['buildingqualitytypeid'].isnull()\n", - "not_df_train.loc[missing_values_boolflag, 'buildingqualitytypeid'] = missing_values\n", - "\n", - "print(not_df_train.buildingqualitytypeid.isnull().sum())" - ], - "execution_count": 195, - "outputs": [ - { - "output_type": "stream", - "text": [ - "# of miss 0\n", - "Fitting 3 folds for each of 5 candidates, totalling 15 fits\n" - ], - "name": "stdout" - }, - { - "output_type": "stream", - "text": [ - "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.\n" - 
], - "name": "stderr" - }, - { - "output_type": "stream", - "text": [ - "grid.best_estimator_ KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',\n", - " metric_params=None, n_jobs=-1, n_neighbors=15, p=2,\n", - " weights='distance')\n", - "grid.best_params_ {'metric': 'euclidean', 'n_neighbors': 15, 'weights': 'distance'}\n", - "grid.scorer_ make_scorer(f1_score, pos_label=None, average=weighted)\n" - ], - "name": "stdout" - }, - { - "output_type": "stream", - "text": [ - "[Parallel(n_jobs=-1)]: Done 15 out of 15 | elapsed: 7.1s finished\n" - ], - "name": "stderr" - }, - { - "output_type": "error", - "ename": "ValueError", - "evalue": "ignored", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 4\u001b[0m missing_values = fillna_knn(not_df_train, \n\u001b[1;32m 5\u001b[0m \u001b[0mbase\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'latitude'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'longitude'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m target = 'buildingqualitytypeid')\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"predicted output shape\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mmissing_values\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m\u001b[0m in \u001b[0;36mfillna_knn\u001b[0;34m(df, base, target)\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[0my_true\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mY_test\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mgrid\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 37\u001b[0;31m \u001b[0mZ\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgrid\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mmissing_values_boolflag\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbase\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 38\u001b[0m \u001b[0;31m#df.loc[ missing_values_boolflag, target ] = Z\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mZ\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/sklearn/utils/metaestimators.py\u001b[0m in \u001b[0;36m\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 114\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[0;31m# lambda, but not partial, allows help() to work with update_wrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 116\u001b[0;31m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 117\u001b[0m \u001b[0;31m# update 
the docstring of the returned function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[0mupdate_wrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 455\u001b[0m \"\"\"\n\u001b[1;32m 456\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_is_fitted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'predict'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 457\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbest_estimator_\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 458\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 459\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mif_delegate_has_method\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdelegate\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'best_estimator_'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'estimator'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/sklearn/neighbors/classification.py\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 145\u001b[0m \u001b[0mClass\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0meach\u001b[0m \u001b[0mdata\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 146\u001b[0m \"\"\"\n\u001b[0;32m--> 
147\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'csr'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 148\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 149\u001b[0m \u001b[0mneigh_dist\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mneigh_ind\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkneighbors\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)\u001b[0m\n\u001b[1;32m 548\u001b[0m \u001b[0;34m\" minimum of %d is required%s.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 549\u001b[0m % (n_samples, array.shape, ensure_min_samples,\n\u001b[0;32m--> 550\u001b[0;31m context))\n\u001b[0m\u001b[1;32m 551\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 552\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mensure_min_features\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0marray\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndim\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mValueError\u001b[0m: Found array with 0 sample(s) (shape=(0, 2)) while a minimum of 1 is required." 
- ] - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bgXh5OATEacY", - "colab_type": "text" - }, - "source": [ - "# BELOW NOT (really) RUN\n", - "- if run, was in pandas" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "oTh_XPErqkHf", - "colab_type": "code", - "outputId": "3e667bca-70c5-4b66-c7d2-12d171cb140b", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 278 - } - }, - "source": [ - "print(df_train.heating_system_id.isnull().sum())\n", - "print(df_train.shape)\n", - "temp=df_train.copy()\n", - "temp['heating_system_id']=temp['heating_system_id'].fillna(-1)\n", - "temp=temp.groupby(\"heating_system_id\").filter(lambda x: x.heating_system_id.size > 3)\n", - "temp['heating_system_id'] = temp['heating_system_id'].replace(-1,np.nan)\n", - "print(temp.heating_system_id.isnull().sum())\n", - "print(temp.shape)\n", - "\n", - "missing_values=fillna_knn(temp,\n", - " base = [ 'latitude', 'longitude' ] ,\n", - " target = 'heating_system_id')\n", - "\n", - "print(\"predicted output shape\",missing_values.shape)\n", - "missing_values_boolflag = df_train['heating_system_id'].isnull()\n", - "df_train.loc[ missing_values_boolflag, 'heating_system_id' ] = missing_values\n", - "\n", - "\n", - "print(df_train.heating_system_id.isnull().sum())" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - "34194\n", - "(90272, 45)\n", - "34194\n", - "(90266, 45)\n", - "# of miss 34194\n", - "Fitting 3 folds for each of 5 candidates, totalling 15 fits\n" - ], - "name": "stdout" - }, - { - "output_type": "stream", - "text": [ - "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.\n", - "[Parallel(n_jobs=-1)]: Done 15 out of 15 | elapsed: 3.3s finished\n" - ], - "name": "stderr" - }, - { - "output_type": "stream", - "text": [ - "grid.best_estimator_ KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',\n", - " metric_params=None, n_jobs=-1, n_neighbors=15, p=2,\n", - 
" weights='distance')\n", - "grid.best_params_ {'metric': 'euclidean', 'n_neighbors': 15, 'weights': 'distance'}\n", - "grid.scorer_ make_scorer(f1_score, pos_label=None, average=weighted)\n", - "predicted output shape (34194,)\n", - "0\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "oVjNSkUYqnCt", - "colab_type": "code", - "outputId": "80fc7e87-36cd-44b7-96e9-ef0631c7d10c", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 278 - } - }, - "source": [ - "print(df_train.ac_id.isnull().sum())\n", - "print(df_train.shape)\n", - "temp=df_train.copy()\n", - "temp['ac_id']=temp['ac_id'].fillna(-1)\n", - "temp=temp.groupby(\"ac_id\").filter(lambda x: x.ac_id.size > 3)\n", - "temp['ac_id'] = temp['ac_id'].replace(-1,np.nan)\n", - "print(temp.ac_id.isnull().sum())\n", - "print(temp.shape)\n", - "\n", - "missing_values=fillna_knn(temp,\n", - " base = [ 'latitude', 'longitude' ] ,\n", - " target = 'ac_id')\n", - "\n", - "print(\"predicted output shape\",missing_values.shape)\n", - "missing_values_boolflag = df_train['ac_id'].isnull()\n", - "df_train.loc[ missing_values_boolflag, 'ac_id' ] = missing_values\n", - "\n", - "print(df_train.ac_id.isnull().sum())" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - "61492\n", - "(90272, 45)\n", - "61492\n", - "(90270, 45)\n", - "# of miss 61492\n", - "Fitting 3 folds for each of 5 candidates, totalling 15 fits\n" - ], - "name": "stdout" - }, - { - "output_type": "stream", - "text": [ - "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.\n", - "[Parallel(n_jobs=-1)]: Done 15 out of 15 | elapsed: 2.0s finished\n" - ], - "name": "stderr" - }, - { - "output_type": "stream", - "text": [ - "grid.best_estimator_ KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',\n", - " metric_params=None, n_jobs=-1, n_neighbors=25, p=2,\n", - " weights='distance')\n", - "grid.best_params_ {'metric': 'euclidean', 
'n_neighbors': 25, 'weights': 'distance'}\n", - "grid.scorer_ make_scorer(f1_score, pos_label=None, average=weighted)\n", - "predicted output shape (61492,)\n", - "0\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "qTbcYbexqr0Y", - "colab_type": "code", - "outputId": "3459affa-a41a-4241-ab62-f0dfcadda039", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 278 - } - }, - "source": [ - "#yearbuilt\n", - "print(df_train.yearbuilt.isnull().sum())\n", - "print(df_train.shape)\n", - "temp=df_train.copy()\n", - "temp['yearbuilt']=temp['yearbuilt'].fillna(-1)\n", - "temp=temp.groupby(\"yearbuilt\").filter(lambda x: x.yearbuilt.size > 3)\n", - "temp['yearbuilt'] = temp['yearbuilt'].replace(-1,np.nan)\n", - "print(temp.yearbuilt.isnull().sum())\n", - "print(temp.shape)\n", - "\n", - "missing_values=fillna_knn(temp,\n", - " base = [ 'latitude', 'longitude','buildingqualitytypeid','propertylandusetypeid' ] ,\n", - " target = 'yearbuilt')\n", - "\n", - "print(\"predicted output shape\",missing_values.shape)\n", - "missing_values_boolflag = df_train['yearbuilt'].isnull()\n", - "df_train.loc[ missing_values_boolflag, 'yearbuilt' ] = missing_values\n", - "print(df_train.yearbuilt.isnull().sum())" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - "755\n", - "(90272, 45)\n", - "755\n", - "(90258, 45)\n", - "# of miss 755\n", - "Fitting 3 folds for each of 5 candidates, totalling 15 fits\n" - ], - "name": "stdout" - }, - { - "output_type": "stream", - "text": [ - "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.\n", - "[Parallel(n_jobs=-1)]: Done 15 out of 15 | elapsed: 44.3s finished\n" - ], - "name": "stderr" - }, - { - "output_type": "stream", - "text": [ - "grid.best_estimator_ KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',\n", - " metric_params=None, n_jobs=-1, n_neighbors=5, p=2,\n", - " weights='distance')\n", - 
"grid.best_params_ {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}\n", - "grid.scorer_ make_scorer(f1_score, pos_label=None, average=weighted)\n", - "predicted output shape (755,)\n", - "0\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Gx1LYGmfqxLk", - "colab_type": "code", - "colab": {} - }, - "source": [ - "#location seems to be related to building quality, (knnregressor)\n", - "from sklearn.model_selection import KFold\n", - "\n", - "def fillna_knnr( df, base, target):\n", - " data_colnames = [ target ] + base\n", - " #print(\"data_colnames\",data_colnames)\n", - " missing_values_boolflag = df[target].isnull() #true for missing rows, false for columns with values\n", - " #print(\"miss\",missing_values_boolflag.head())\n", - " not_missing_boolflag = ~missing_values_boolflag \n", - " #print(\"not miss\",not_missing_boolflag.head())\n", - " number_of_missing_val = missing_values_boolflag.sum()\n", - " print(\"# of miss\",number_of_missing_val)\n", - " not_missing_rows = df.loc[ not_missing_boolflag, data_colnames]\n", - " #print(not_missing_rows.head())\n", - " Y = not_missing_rows[target]\n", - " X = not_missing_rows[base]\n", - " X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=3192)\n", - " metrics = ['euclidean'] \n", - " weights = ['distance'] \n", - " numNeighbors = [5,10,15,20,25]\n", - " param_grid = dict(metric=metrics,weights=weights,n_neighbors=numNeighbors)\n", - " cv = KFold(n_splits=3,random_state=3192,shuffle=False) \n", - " grid = GridSearchCV(neighbors.KNeighborsRegressor(n_jobs=-1),param_grid=param_grid,cv=cv,scoring='neg_mean_absolute_error',refit=True,return_train_score=True,verbose=1,n_jobs=-1,pre_dispatch='n_jobs')\n", - " grid.fit(X_train ,Y_train)\n", - " #print(\"grid.cv_results_\",grid.cv_results_)\n", - " print(\"grid.best_estimator_\",grid.best_estimator_)\n", - " print(\"grid.best_params_\",grid.best_params_)\n", - " 
print(\"grid.scorer_\",grid.scorer_)\n", - " #print(\"grid.n_splits_\",grid.n_splits_)\n", - " y_true, y_pred = Y_test, grid.predict(X_test) \n", - " Z = grid.predict(df.loc[missing_values_boolflag, base])\n", - " #df.loc[ missing_values_boolflag, target ] = Z\n", - " return Z" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "pj5PXm7ozg5l", - "colab_type": "code", - "outputId": "3d42279f-221c-444c-8795-05a0832f97cd", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 606 - } - }, - "source": [ - "#garage_sqft\n", - "print(df_train.garage_sqft.isnull().sum())\n", - "print(df_train.shape)\n", - "temp=df_train.loc[df_train.garagecarcnt>0,df_train.columns].copy()\n", - "\n", - "print(temp.garage_sqft.isnull().sum())\n", - "print(temp.shape)\n", - "\n", - "missing_values=fillna_knnr(temp,\n", - " base = [ 'latitude', 'longitude','garagecarcnt'] ,\n", - " target = 'garage_sqft')\n", - "\n", - "print(\"predicted output shape\",missing_values.shape)\n", - "missing_values_boolflag = df_train['garage_sqft'].isnull()\n", - "df_train.loc[missing_values_boolflag, 'garage_sqft'] = missing_values\n", - "print(df_train.garage_sqft.isnull().sum())" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - "69255\n", - "(90272, 45)\n", - "8920\n", - "(29647, 45)\n", - "# of miss 8920\n", - "Fitting 3 folds for each of 5 candidates, totalling 15 fits\n" - ], - "name": "stdout" - }, - { - "output_type": "stream", - "text": [ - "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.\n", - "[Parallel(n_jobs=-1)]: Done 15 out of 15 | elapsed: 2.7s finished\n" - ], - "name": "stderr" - }, - { - "output_type": "stream", - "text": [ - "grid.best_estimator_ KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='euclidean',\n", - " metric_params=None, n_jobs=-1, n_neighbors=5, p=2,\n", - " weights='distance')\n", - "grid.best_params_ {'metric': 'euclidean', 
'n_neighbors': 5, 'weights': 'distance'}\n", - "grid.scorer_ make_scorer(mean_absolute_error, greater_is_better=False)\n", - "predicted output shape (8920,)\n" - ], - "name": "stdout" - }, - { - "output_type": "error", - "ename": "ValueError", - "evalue": "ignored", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"predicted output shape\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mmissing_values\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0mmissing_values_boolflag\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'garage_sqft'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misnull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 14\u001b[0;31m \u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mmissing_values_boolflag\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'garage_sqft'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmissing_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 15\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgarage_sqft\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misnull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/pandas/core/indexing.py\u001b[0m 
in \u001b[0;36m__setitem__\u001b[0;34m(self, key, value)\u001b[0m\n\u001b[1;32m 188\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_if_callable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_setitem_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 190\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_setitem_with_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 191\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 192\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_validate_key\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_setitem_with_indexer\u001b[0;34m(self, indexer, value)\u001b[0m\n\u001b[1;32m 609\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 610\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 611\u001b[0;31m raise ValueError('Must have equal len keys and value 
'\n\u001b[0m\u001b[1;32m 612\u001b[0m 'when setting with an iterable')\n\u001b[1;32m 613\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mValueError\u001b[0m: Must have equal len keys and value when setting with an iterable" - ] - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "b7e5CFTyzg_M", - "colab_type": "code", - "colab": {} - }, - "source": [ - "df_train = df_train.drop('parcelid', axis=1)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "YxGquCOOzhD7", - "colab_type": "code", - "colab": {} - }, - "source": [ - "#All the other columns with missing values seems to be integer, will need regression to be imputed,\n", - "#time to get categorical variables hot encoded\n", - "\n", - "#Identify numerical columns to produce a heatmap\n", - "catcols = ['ac_id','buildingqualitytypeid','deck_flag','fips', 'heating_system_id','has_hottub_or_spa',\n", - " 'just_hottub_or_spa', 'pool_with_spa_tub_yes','pool_with_spa_tub_no','propertylandusetypeid','basement_flag'\n", - " ,'fireplaceflag','taxdelinquencyflag']\n", - "numcols = [x for x in df_train.columns if x not in catcols]" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "uVZkszJEzhHj", - "colab_type": "code", - "colab": {} - }, - "source": [ - "#total_finished_living_area_sqft\n", - "\n", - "print(df_train.total_finished_living_area_sqft.isnull().sum())\n", - "print(df_train.shape)\n", - "temp=df_train.copy()\n", - "print(temp.total_finished_living_area_sqft.isnull().sum())\n", - "print(temp.shape)\n", - "missing_values=fillna_knnr(temp,\n", - " base = [ 'latitude', 'longitude','basementsqft','numberofstories','poolcnt','garagecarcnt','garage_sqft','propertylandusetypeid'] ,\n", - " target = 'total_finished_living_area_sqft')\n", - "\n", - "print(\"predicted output shape\",missing_values.shape)\n", - "missing_values_boolflag = df_train['total_finished_living_area_sqft'].isnull()\n", - 
"df_train.loc[ missing_values_boolflag, 'total_finished_living_area_sqft' ] = missing_values\n", - "print(df_train.total_finished_living_area_sqft.isnull().sum())" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "CVrTMb92zhLX", - "colab_type": "code", - "colab": {} - }, - "source": [ - "#total_bath\t1165\n", - "#full_bath\t1182\n", - "#half_bath\t1182\n", - "#roomcnt\t1416\n", - "#bedroomcnt\t1421\n", - "\n", - "#total_finished_living_area_sqft\n", - "\n", - "print(df_train.total_bath.isnull().sum())\n", - "print(df_train.shape)\n", - "temp=df_train.copy()\n", - "print(temp.total_bath.isnull().sum())\n", - "print(temp.shape)\n", - "missing_values=fillna_knnr(temp,\n", - " base = ['propertylandusetypeid','total_finished_living_area_sqft' ] ,\n", - " target = 'total_bath')\n", - "\n", - "print(\"predicted output shape\",missing_values.shape)\n", - "missing_values_boolflag = df_train['total_bath'].isnull()\n", - "df_train.loc[ missing_values_boolflag, 'total_bath' ] = missing_values\n", - "print(df_train.total_bath.isnull().sum())#total_bath\t1165\n", - "#full_bath\t1182\n", - "#half_bath\t1182\n", - "#roomcnt\t1416\n", - "#bedroomcnt\t1421\n", - "\n", - "#total_finished_living_area_sqft\n", - "\n", - "print(df_train.total_bath.isnull().sum())\n", - "print(df_train.shape)\n", - "temp=df_train.copy()\n", - "print(temp.total_bath.isnull().sum())\n", - "print(temp.shape)\n", - "missing_values=fillna_knnr(temp,\n", - " base = ['propertylandusetypeid','total_finished_living_area_sqft' ] ,\n", - " target = 'total_bath')\n", - "\n", - "print(\"predicted output shape\",missing_values.shape)\n", - "missing_values_boolflag = df_train['total_bath'].isnull()\n", - "df_train.loc[ missing_values_boolflag, 'total_bath' ] = missing_values\n", - "print(df_train.total_bath.isnull().sum())" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "BjIKlu-tzhPI", - "colab_type": "code", - 
"colab": {} - }, - "source": [ - "# rop half_bath and full bath, as there are only redundant values of total_bath\n", - "df_train = df_train.drop(['full_bath','half_bath'], axis=1)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "02X1y6EBzhT9", - "colab_type": "code", - "colab": {} - }, - "source": [ - "#bedroomcnt\t1421\n", - "\n", - "print(df_train.bedroomcnt.isnull().sum())\n", - "print(df_train.shape)\n", - "temp=df_train.copy()\n", - "print(temp.bedroomcnt.isnull().sum())\n", - "print(temp.shape)\n", - "missing_values=fillna_knnr(temp,\n", - " base = ['propertylandusetypeid','total_finished_living_area_sqft','total_bath' ] ,\n", - " target = 'bedroomcnt')\n", - "\n", - "print(\"predicted output shape\",missing_values.shape)\n", - "missing_values_boolflag = df_train['bedroomcnt'].isnull()\n", - "df_train.loc[ missing_values_boolflag, 'bedroomcnt' ] = missing_values\n", - "print(df_train.bedroomcnt.isnull().sum())" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "WzkZ_qeHzhXP", - "colab_type": "code", - "colab": {} - }, - "source": [ - "df_train['total_bath']=df_train.total_bath.round(1)\n", - "df_train['bedroomcnt']=df_train.bedroomcnt.round(1)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "QF9DtDAczhaW", - "colab_type": "code", - "colab": {} - }, - "source": [ - "#recalculate roomcnt\t1416 as we have used imputation for total_bath and bedroomcnt\n", - "\n", - "df_train.loc[(df_train.roomcnt.isnull()),['roomcnt']]=df_train.total_bath + df_train.bedroomcnt" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "U5N41TBlz60W", - "colab_type": "code", - "colab": {} - }, - "source": [ - "print(df_train.shape)\n", - "df_train =df_train.loc[(df_train.total_parcel_tax.notnull()) & (df_train.land_tax.notnull()),df_train.columns]\n", - "\n", - 
"print(df_train.shape)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "kv9h5yL3z64Q", - "colab_type": "code", - "colab": {} - }, - "source": [ - "#lot_area_sqft\n", - "print(df_train.lot_area_sqft.isnull().sum())\n", - "print(df_train.shape)\n", - "temp=df_train.copy()\n", - "print(temp.lot_area_sqft.isnull().sum())\n", - "print(temp.shape)\n", - "missing_values=fillna_knnr(temp,\n", - " base = ['latitude','longitude','propertylandusetypeid','total_finished_living_area_sqft','roomcnt','numberofstories' ] ,\n", - " target = 'lot_area_sqft')\n", - "\n", - "print(\"predicted output shape\",missing_values.shape)\n", - "missing_values_boolflag = df_train['lot_area_sqft'].isnull()\n", - "df_train.loc[ missing_values_boolflag, 'lot_area_sqft' ] = missing_values.round(2)\n", - "print(df_train.lot_area_sqft.isnull().sum())" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "GYJLHrR4z68f", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# predict structure_tax and recalculate total_parcel_tax = land_tax + structure_tax\n", - "\n", - "\n", - "print(df_train.structure_tax.isnull().sum())\n", - "print(df_train.shape)\n", - "temp=df_train.copy()\n", - "print(temp.structure_tax.isnull().sum())\n", - "print(temp.shape)\n", - "missing_values=fillna_knnr(temp,\n", - " base = ['latitude','longitude','lot_area_sqft','propertylandusetypeid','total_finished_living_area_sqft','roomcnt','numberofstories' ] ,\n", - " target = 'structure_tax')\n", - "\n", - "print(\"predicted output shape\",missing_values.shape)\n", - "missing_values_boolflag = df_train['structure_tax'].isnull()\n", - "df_train.loc[ missing_values_boolflag, 'structure_tax' ] = missing_values.round(2)\n", - "print(df_train.structure_tax.isnull().sum())" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "Ya-3K06Zz6_y", - "colab_type": "code", - "colab": {} - 
}, - "source": [ - "#36 total_property_tax_2016 \n", - "\n", - "#total_parcel_tax = land_tax + structure_tax\n", - " \n", - "df_train['total_parcel_tax']=df_train['structure_tax']+df_train['land_tax']" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "8Fvr7voVz7DX", - "colab_type": "code", - "colab": {} - }, - "source": [ - "#age of the property\n", - "df_train['age'] = 2016 - df_train['yearbuilt']\n", - "df_train=df_train.drop(['yearbuilt'],axis=1)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "xl0EOIT-z7Gl", - "colab_type": "code", - "colab": {} - }, - "source": [ - "#total_property_tax_2016\n", - "\n", - "\n", - "print(df_train.total_property_tax_2016.isnull().sum())\n", - "print(df_train.shape)\n", - "temp=df_train.copy()\n", - "print(temp.total_property_tax_2016.isnull().sum())\n", - "print(temp.shape)\n", - "missing_values=fillna_knnr(temp,\n", - " base = ['latitude','longitude','lot_area_sqft','propertylandusetypeid','total_finished_living_area_sqft','roomcnt','numberofstories' ] ,\n", - " target = 'total_property_tax_2016')\n", - "\n", - "print(\"predicted output shape\",missing_values.shape)\n", - "missing_values_boolflag = df_train['total_property_tax_2016'].isnull()\n", - "df_train.loc[ missing_values_boolflag, 'total_property_tax_2016' ] = missing_values.round(2)\n", - "print(df_train.total_property_tax_2016.isnull().sum())" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "YlaxWegqz7I-", - "colab_type": "code", - "colab": {} - }, - "source": [ - "#check missing values\n", - "\n", - "missing_df = df_train.isnull().sum(axis=0).reset_index()\n", - "missing_df.columns = ['column_name', 'missing_count']\n", - "missing_df = missing_df.loc[missing_df['missing_count']>0]\n", - "missing_df = missing_df.sort_values(by='missing_count')\n", - "print(missing_df)\n", - "print(missing_df.shape)" - ], - 
"execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "dIl_nqKVz7NQ", - "colab_type": "code", - "colab": {} - }, - "source": [ - "#both the columns above miss 92% of the data, there is no related varibale to impute it, hence dropping them at this point\n", - "\n", - "df_train = df_train.drop(['finished_living_area_entryfloor_sqft2','finished_living_area_entryfloor_sqft1'], axis=1)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "HQJd7rgKz7Qq", - "colab_type": "code", - "colab": {} - }, - "source": [ - "#Identify numerical columns to produce a heatmap\n", - "catcols = ['ac_id','buildingqualitytypeid','deck_flag','fips','pool_with_spa_tub_no','pool_with_spa_tub_yes','has_hottub_or_spa',\n", - " 'just_hottub_or_spa','heating_system_id','propertylandusetypeid','basement_flag','fireplaceflag','taxdelinquencyflag']\n", - "numcols = [x for x in df_train.columns if x not in catcols]" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "VUN3a6uJz7Ut", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# 2 variables are in object datatype, coverting into numeric\n", - "df_train[['census_tractnumber','block_number']] = df_train[['census_tractnumber','block_number']].apply(pd.to_numeric)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "zGx77rRAz7ZZ", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# dropping categorical columns as xgboost feature selection cannot hadle it\n", - "\n", - "train_x = df_train.drop(catcols+['logerror'], axis=1)\n", - "\n", - "train_y=df_train['logerror']\n", - "\n", - "train_x = train_x.astype(float) \n", - "train_y = train_y.astype(float)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "es_Ew2YJz7dT", - "colab_type": "code", - "colab": {} - }, - "source": [ - 
"pd.options.display.max_rows = 65\n", - "\n", - "dtype_df = train_x.dtypes.reset_index()\n", - "dtype_df.columns = [\"Count\", \"Column Type\"]\n", - "#dtype_df" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "bvWIhR38z7fW", - "colab_type": "code", - "colab": {} - }, - "source": [ - "df_train.loc[df_train.has_hottub_or_spa==True,'has_hottub_or_spa']=\"Yes\"\n", - "df_train.loc[df_train.has_hottub_or_spa==0,'has_hottub_or_spa']=\"No\"\n", - "\n", - "df_train.loc[df_train.just_hottub_or_spa==0,'just_hottub_or_spa']=\"No\"\n", - "df_train.loc[df_train.just_hottub_or_spa==1,'just_hottub_or_spa']=\"Yes\"\n", - "\n", - "df_train.loc[df_train.deck_flag==0,'deck_flag']=\"No\"\n", - "df_train.loc[df_train.deck_flag==1,'deck_flag']=\"Yes\"\n", - "\n", - "df_train.loc[df_train.basement_flag==0,'basement_flag']=\"No\"\n", - "df_train.loc[df_train.basement_flag==1,'basement_flag']=\"Yes\"\n", - "\n", - "df_train.loc[df_train.fireplaceflag==False,'fireplaceflag']=\"No\"\n", - "df_train.loc[df_train.fireplaceflag==True,'fireplaceflag']=\"Yes\"\n", - "#" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "Ef9JjrmMz7jw", - "colab_type": "code", - "colab": {} - }, - "source": [ - "#ac_id,heating_system_id,propertylandusetypeid\n", - "dummieslist=['has_hottub_or_spa','just_hottub_or_spa',\n", - " 'deck_flag','fips','basement_flag','fireplaceflag','taxdelinquencyflag']" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "Z51Zrt2Uz7oD", - "colab_type": "code", - "colab": {} - }, - "source": [ - "df_train[dummieslist] = df_train[dummieslist].astype(object)\n", - "dummies = pd.get_dummies(df_train[dummieslist], prefix= dummieslist)\n", - "dummies.shape" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "VHBi5Gg6z7tu", - "colab_type": "code", - "colab": {} - }, - "source": [ - 
"dummies2=['pool_with_spa_tub_no','pool_with_spa_tub_yes']\n", - "df_train[dummies2] = df_train[dummies2].astype(int)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "oocTPKI9z7rk", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import MySQLdb\n", - "from sqlalchemy import create_engine\n", - "engineString = 'mysql+mysqldb://root:MyNewPass@localhost/sakila'\n", - "engine = create_engine(engineString)\n", - "con=engine.connect()\n", - "\n", - "with engine.connect() as con, con.begin():\n", - " df_train.to_sql('df_train_f1', engine, chunksize=10000, index =False,if_exists ='replace')" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "zj5ZLSPlz7XC", - "colab_type": "code", - "colab": {} - }, - "source": [ - "numcols2=['basementsqft','total_bath','bedroomcnt','total_finished_living_area_sqft','fireplace_count','garagecarcnt',\n", - " 'garage_sqft','latitude','longitude','lot_area_sqft','poolcnt','pool_sqft','roomcnt','unitcnt','patio_sqft','storage_sqft',\n", - " 'numberofstories','structure_tax','total_parcel_tax','land_tax','total_property_tax_2016','taxdelinquencyyear','transaction_month',\n", - " 'census_tractnumber','block_number','age']" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "fp53dotszhgA", - "colab_type": "code", - "colab": {} - }, - "source": [ - "Y=df_train['logerror']" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "O0Uaei4rzhj6", - "colab_type": "code", - "colab": {} - }, - "source": [ - "#buildingqualitytypeid ->has order\n", - "le = LabelEncoder()\n", - "df_train['buildingqualitytypeid']=le.fit_transform(df_train.buildingqualitytypeid)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "g4-g-uvtzhds", - "colab_type": "code", - "colab": {} - }, - "source": [ - 
"#df_train.ac_id.value_counts()\n", - "#df_train.propertylandusetypeid.value_counts()\n", - "#'buildingqualitytypeid','ac_id','heating_system_id','propertylandusetypeid'" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "SzliXafdzhRd", - "colab_type": "code", - "colab": {} - }, - "source": [ - "X=pd.concat([dummies,df_train[dummies2],df_train[numcols2],df_train[['buildingqualitytypeid','ac_id','heating_system_id','propertylandusetypeid']]],axis=1)\n", - "X.shape" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "DBsZjyQd0W1N", - "colab_type": "code", - "colab": {} - }, - "source": [ - "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.10, random_state=3192)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "ihXFZWcn0W5D", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# top features\n", - "import xgboost as xgb\n", - "xgb_params = {\n", - " 'eta': 0.05,\n", - " 'max_depth': 8,\n", - " 'subsample': 0.7,\n", - " 'colsample_bytree': 0.7,\n", - " 'objective': 'reg:linear',\n", - " 'silent': 1,\n", - " 'seed' : 0\n", - "}\n", - "dtrain = xgb.DMatrix(X_train, Y_train, feature_names=X_train.columns.values)\n", - "model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=50)\n", - "# plot the important features #\n", - "fig, ax = plt.subplots(figsize=(12,18))\n", - "#max_num_features=50, error for no reason \n", - "xgb.plot_importance(model, height=0.8, ax=ax)\n", - "plt.show()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "TQEEzNkX0W9w", - "colab_type": "code", - "colab": {} - }, - "source": [ - "#top features\n", - "xgboost_selection=['total_finished_living_area_sqft','latitude','structure_tax','total_property_tax_2016',\n", - 
"'total_parcel_tax','land_tax','longitude','lot_area_sqft','census_tractnumber','age','total_bath','bedroomcnt',\n", - "'block_number','transaction_month','roomcnt','taxdelinquencyyear','unitcnt','taxdelinquencyflag_No',\n", - "'fips_LA','garage_sqft','pool_with_spa_tub_no','has_hottub_or_spa_No','garagecarcnt','deck_flag_No',\n", - "'poolcnt','pool_sqft'\n", - "]" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "Rr_6EO4G0XEj", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# feature selection\n", - "#c_id,heating_system_id,propertylandusetypeid\n", - "from sklearn.ensemble import ExtraTreesRegressor\n", - "from sklearn.feature_selection import SelectFromModel\n", - "reg = ExtraTreesRegressor(n_estimators=500, max_depth=8, max_features='sqrt',\n", - " min_samples_split=100 ,min_samples_leaf=10, bootstrap=True,n_jobs=-1, random_state=3192)\n", - "reg = reg.fit(X_train, Y_train)\n", - "#print(\"importance\",reg.feature_importances_) \n", - "model = SelectFromModel(reg, prefit=True)\n", - "X_new = model.transform(X_train)\n", - "print(X_train.shape)\n", - "print(X_new.shape) \n", - "\n", - "feat_names = X.columns.values\n", - "importances = reg.feature_importances_\n", - "std = np.std([tree.feature_importances_ for tree in reg.estimators_], axis=0)\n", - "indices = np.argsort(importances)[::-1][:26]\n", - "plt.figure(figsize=(12,12))\n", - "plt.title(\"Feature importances\")\n", - "plt.bar(range(len(indices)), importances[indices], color=\"r\", yerr=std[indices], align=\"center\")\n", - "plt.xticks(range(len(indices)), feat_names[indices], rotation='vertical')\n", - "plt.xlim([-1, len(indices)])\n", - "plt.show()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "i4FCNOG70XIU", - "colab_type": "code", - "colab": {} - }, - "source": [ - "tree_selection=[\n", - " 
'total_finished_living_area_sqft','structure_tax','total_property_tax_2016','total_bath','total_parcel_tax',\n", - " 'age','latitude','census_tractnumber','bedroomcnt','longitude','land_tax','propertylandusetypeid','block_number',\n", - " 'buildingqualitytypeid','numberofstories','heating_system_id','unitcnt','transaction_month','lot_area_sqft','roomcnt',\n", - " 'garage_sqft','garagecarcnt','pool_with_spa_tub_no','poolcnt','fips_LA','taxdelinquencyyear','patio_sqft',\n", - " 'taxdelinquencyflag_No','taxdelinquencyflag_Yes'\n", - "]" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "TmIS1WAS0XMW", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import matplotlib.pyplot as plt\n", - "from sklearn.model_selection import KFold\n", - "from sklearn.linear_model import Ridge,Lasso\n", - "from sklearn.feature_selection import RFECV\n", - "from sklearn.linear_model import LinearRegression\n", - "from sklearn.metrics import r2_score,mean_absolute_error,make_scorer\n", - "\n", - "#model=Lasso(alpha=0.2, fit_intercept=True, normalize=True, precompute=False, copy_X=True,\n", - " # max_iter=1000, \n", - " # tol=0.0001, warm_start=False, positive=False, random_state=3192, selection='cyclic')\n", - "\n", - "#Ridge(random_state=3192,solver='auto',fit_intercept=True,normalize=True,alpha=0.1)\n", - "#LinearRegression(n_jobs=-1,fit_intercept=True, normalize=True, copy_X=True)\n", - "\n", - "\n", - "rfecv = RFECV(estimator=LinearRegression(n_jobs=-1,fit_intercept=True, normalize=True, copy_X=True), step=2, cv=KFold(4),scoring='neg_mean_absolute_error')\n", - "rfecv.fit(X_train, Y_train)\n", - "\n", - "print(\"Optimal number of features : %d\" % rfecv.n_features_)\n", - "\n", - "# Plot number of features VS. 
cross-validation scores\n", - "plt.figure()\n", - "plt.xlabel(\"Number of features selected\")\n", - "\n", - "plt.ylabel(\"Cross validation score (nb of correct classifications)\")\n", - "plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)\n", - "plt.show()\n" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "DIw8O00U0XPR", - "colab_type": "code", - "colab": {} - }, - "source": [ - "rfe_selection = [i for indx,i in enumerate(X.columns) if rfecv.support_[indx] == True]" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "gHA0x5_80XWy", - "colab_type": "code", - "colab": {} - }, - "source": [ - "#Linear regression with rfe_selection selection\n", - "#rfe_selection, tree_selection, xgboost_selection\n", - "from sklearn.linear_model import LinearRegression\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import r2_score,mean_absolute_error,make_scorer,mean_squared_error\n", - "\n", - "# just to check whether normalized /not normalized data gives better results\n", - "parameters = {'fit_intercept':[True], 'normalize':[True,False], 'copy_X':[True]}\n", - "scoring = {'MAE':'neg_mean_absolute_error','MSE': make_scorer(mean_squared_error,greater_is_better=False)}\n", - "\n", - "grid1 = GridSearchCV(LinearRegression(n_jobs=-1),param_grid=parameters, scoring=scoring,cv=5,refit='MAE',\n", - " return_train_score=True,\n", - " verbose=0,n_jobs=-1,pre_dispatch='n_jobs')\n", - "\n", - "grid1.fit(X_train[rfe_selection], Y_train)\n", - "#print(\"5. 
grid best_score_\",abs(grid.best_score_))\n", - "Y_pred = grid1.predict(X_test[rfe_selection])\n", - "print(\"MAE on test data\",mean_absolute_error(Y_test,Y_pred))\n", - "print(\"MSE on test data\",mean_squared_error(Y_test,Y_pred))\n", - "print(\"R Squared data \",r2_score(Y_test,Y_pred))" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "ekn4pBs60XcT", - "colab_type": "code", - "colab": {} - }, - "source": [ - "#pca selection\n", - "from sklearn.decomposition import PCA\n", - "from sklearn.preprocessing import scale\n", - "import matplotlib.pyplot as plt\n", - "from sklearn.preprocessing import scale\n", - "%matplotlib inline\n", - "scaled_x = scale(X)\n", - "pca = PCA(n_components=None, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None)\n", - "pca.fit(scaled_x)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "yFuT-wUN0XfV", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# The amount of variance that each PC explains\n", - "var= pca.explained_variance_ratio_\n", - "#Cumulative Variance explains\n", - "var1=np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)\n", - "print(var1)\n", - "plt.plot(var1)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "iPN4OBUe0XlD", - "colab_type": "code", - "colab": {} - }, - "source": [ - "#Looking at above plot I'm taking 28 variables\n", - "\n", - "pca = PCA(n_components=28, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None)\n", - "pca.fit(scaled_x)\n", - "\n", - "pca1=pca.fit_transform(scaled_x)\n", - "\n", - "pca = PCA(n_components=28, copy=True, whiten=True, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None)\n", - "pca.fit(scaled_x)\n", - "pca2=pca.fit_transform(scaled_x)" - ], - "execution_count": 0, - "outputs": [] - }, - { - 
"cell_type": "code", - "metadata": { - "id": "EE4ednPC0XjX", - "colab_type": "code", - "colab": {} - }, - "source": [ - "pcaX_train, pcaX_test, pcaY_train, pcaY_test = train_test_split(pca1, Y, test_size=0.10, random_state=3192)\n", - "pca2X_train, pca2X_test, pca2Y_train, pca2Y_test = train_test_split(pca2, Y, test_size=0.10, random_state=3192)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "erYMXvTG0XaK", - "colab_type": "code", - "colab": {} - }, - "source": [ - "from sklearn.ensemble import GradientBoostingRegressor\n", - "from sklearn.metrics import mean_absolute_error,make_scorer\n", - "from sklearn.model_selection import GridSearchCV\n", - "\n", - "# just to check whether normalized /not normalized data gives better results\n", - "\n", - " # 0.005 for 1200 trees.\n", - "param_grid={'n_estimators':[1200],'max_features':[22]}\n", - "\n", - " \n", - "grid13 = GridSearchCV(GradientBoostingRegressor(subsample=0.8,min_samples_leaf=50,min_samples_split=50,max_depth=9,loss='ls',criterion='friedman_mse',learning_rate=0.005,random_state=3192),\n", - " param_grid=param_grid, cv=5,refit='MAE',\n", - " return_train_score=True,\n", - " verbose=2,n_jobs=-1,pre_dispatch='n_jobs')\n", - "\n", - "grid13.fit(pcaX_train, pcaY_train)\n", - "print(\"5. 
grid best_score_\",abs(grid13.best_score_))\n", - "print(\"best params\",grid13.best_params_)\n", - "print(\"best score\",grid13.best_score_)\n", - "Y_pred = grid13.predict(pcaX_test)\n", - "print(\"MAE on test data\",mean_absolute_error(pcaY_test,Y_pred))\n", - "print(\"MSE on test data\",mean_squared_error(pcaY_test,Y_pred))" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "BgtbLCcR0XUx", - "colab_type": "code", - "colab": {} - }, - "source": [ - "" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "FjdSCEFP0XCM", - "colab_type": "code", - "colab": {} - }, - "source": [ - "" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WzATgLxmam5w", - "colab_type": "text" - }, - "source": [ - "In this competition, Zillow is asking you to predict the log-error between their Zestimate and the actual sale price, given all the features of a home. The log error is defined as\n", - "\n", - "logerror=log(Zestimate)−log(SalePrice)\n", - "and it is recorded in the transactions file train.csv. In this competition, you are going to predict the logerror for the months in Fall 2017. Since all the real estate transactions in the U.S. are publicly available, we will close the competition (no longer accepting submissions) before the evaluation period begins.\n", - "\n", - "Train/Test split\n", - "You are provided with a full list of real estate properties in three counties (Los Angeles, Orange and Ventura, California) data in 2016.\n", - "The train data has all the transactions before October 15, 2016, plus some of the transactions after October 15, 2016.\n", - "The test data in the public leaderboard has the rest of the transactions between October 15 and December 31, 2016.\n", - "The rest of the test data, which is used for calculating the private leaderboard, is all the properties in October 15, 2017, to December 15, 2017. 
This period is called the \"sales tracking period\", during which we will not be taking any submissions.\n", - "You are asked to predict 6 time points for all properties: October 2016 (201610), November 2016 (201611), December 2016 (201612), October 2017 (201710), November 2017 (201711), and December 2017 (201712).\n", - "Not all the properties are sold in each time period. If a property was not sold in a certain time period, that particular row will be ignored when calculating your score.\n", - "If a property is sold multiple times within 31 days, we take the first reasonable value as the ground truth. By \"reasonable\", we mean if the data seems wrong, we will take the transaction that has a value that makes more sense.\n", - "File descriptions\n", - "properties_2016.csv - all the properties with their home features for 2016. Note: Some 2017 new properties don't have any data yet except for their parcelid's. Those data points should be populated when properties_2017.csv is available.\n", - "properties_2017.csv - all the properties with their home features for 2017 (released on 10/2/2017)\n", - "train_2016.csv - the training set with transactions from 1/1/2016 to 12/31/2016\n", - "train_2017.csv - the training set with transactions from 1/1/2017 to 9/15/2017 (released on 10/2/2017)\n", - "sample_submission.csv - a sample submission file in the correct format" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "R0yrYUf7anN0", - "colab_type": "code", - "colab": {} - }, - "source": [ - "" - ], - "execution_count": 0, - "outputs": [] + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "kkEdr1VmigyU" + }, + "source": [ + "### Install RAPIDS AI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "p129YxxnihcV" + }, + "outputs": [], + "source": [ + "!wget -nc https://raw.githubusercontent.com/rapidsai/notebooks-contrib/master/utils/rapids-colab.sh\n", + "# 
RAPIDS 0.10 nightly\n", + "!bash rapids-colab.sh \n", + "\n", + "import sys, os\n", + "\n", + "sys.path.append('/usr/local/lib/python3.6/site-packages/')\n", + "os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'\n", + "os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "1CsdVW7SU9Li" + }, + "source": [ + "# Zillow Kaggle Competition on RAPIDS AI\n", + "- initially based off eswar3's [Zillow prediction models]( https://github.com/eswar3/Zillow-prediction-models) repo\n", + "## Download Data\n", + "- to download the data, please plug in your kaggle api username & key\n", + " - you can set up your kaggle api at `https://www.kaggle.com/YOUR USERNAME HERE/account`\n", + " - learn more: https://github.com/Kaggle/kaggle-api#api-credentials" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "x1dLRTm168Tk" + }, + "outputs": [], + "source": [ + "!pip install kaggle\n", + "!mkdir /root/.kaggle\n", + "\n", + "# plug api -- get your own API key\n", + "!echo '{\"username\":\"warobson\",\"key\":\"\"}' > /root/.kaggle/kaggle.json\n", + "!chmod 600 /root/.kaggle/kaggle.json\n", + "\n", + "# !kaggle datasets download\n", + "!kaggle competitions download -c zillow-prize-1\n", + "\n", + "# unzip kaggle data\n", + "!unzip -q \"/content/sample_submission.csv.zip\"\n", + "!unzip -q \"/content/train_2016_v2.csv.zip\"\n", + "!unzip -q \"/content/properties_2016.csv.zip\"\n", + "!unzip -q \"/content/train_2017.csv.zip\"\n", + "!unzip -q \"/content/properties_2017.csv.zip\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "LICr9uz8do9K" + }, + "source": [ + "#### How is the data saved?\n", + "- inside content directory " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + 
"height": 173 + }, + "colab_type": "code", + "id": "6n75DyJ-dm4B", + "outputId": "64ac687e-39d6-4bb1-f4b7-5476c9de3b84" + }, + "outputs": [], + "source": [ + "# display content folder contents\n", + "!ls \"/content/\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Lpa1b4edIXuT" + }, + "source": [ + "# Imports\n", + "### RAPIDS\n", + "* `cuDF`\n", + " - GPU DataFrame library with a pandas-like API\n", + "* `cuML`\n", + " - GPU-accelerated machine learning library with a scikit-learn-like API\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ZKN5zuROroJD" + }, + "outputs": [], + "source": [ + "# rapids \n", + "import cudf, cuml \n", + "# switch to cupy next update (once docker has it)\n", + "import numpy as np\n", + "# general \n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "YJeywzd2efw7" + }, + "source": [ + "## Data\n", + "* `properties_2016`\n", + " - approx. 27,000,000 residential properties \n", + " - 58 attributes each\n", + "* `train_2016_v2`\n", + " - 90,000 transaction records for closings in the year 2016\n", + " * Merge datasets on `parcelid`" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 156 + }, + "colab_type": "code", + "id": "2EfApIzCfEtr", + "outputId": "bc1e37d1-9ab8-4561-fa39-5af420480a72" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
parcelidairconditioningtypeidarchitecturalstyletypeidbasementsqftbathroomcntbedroomcntbuildingclasstypeidbuildingqualitytypeidcalculatedbathnbrdecktypeid...numberofstoriesfireplaceflagstructuretaxvaluedollarcnttaxvaluedollarcntassessmentyearlandtaxvaluedollarcnttaxamounttaxdelinquencyflagtaxdelinquencyyearcensustractandblock
010754147nullnullnull0.00.0nullnullnullnull...nullnullnull9.02015.09.0nullNonenullnull
110759547nullnullnull0.00.0nullnullnullnull...nullnullnull27516.02015.027516.0nullNonenullnull
210843547nullnullnull0.00.0nullnullnullnull...nullnull650756.01413387.02015.0762631.020800.37Nonenullnull
310859147nullnullnull0.00.03.07.0nullnull...1.0null571346.01156834.02015.0585488.014557.57Nonenullnull
410879947nullnullnull0.00.04.0nullnullnull...nullnull193796.0433491.02015.0239695.05725.17Nonenullnull
\n", + "

5 rows × 58 columns

\n", + "
" + ], + "text/plain": [ + " parcelid airconditioningtypeid architecturalstyletypeid basementsqft \\\n", + "0 10754147 null null null \n", + "1 10759547 null null null \n", + "2 10843547 null null null \n", + "3 10859147 null null null \n", + "4 10879947 null null null \n", + "\n", + " bathroomcnt bedroomcnt buildingclasstypeid buildingqualitytypeid \\\n", + "0 0.0 0.0 null null \n", + "1 0.0 0.0 null null \n", + "2 0.0 0.0 null null \n", + "3 0.0 0.0 3.0 7.0 \n", + "4 0.0 0.0 4.0 null \n", + "\n", + " calculatedbathnbr decktypeid ... numberofstories fireplaceflag \\\n", + "0 null null ... null null \n", + "1 null null ... null null \n", + "2 null null ... null null \n", + "3 null null ... 1.0 null \n", + "4 null null ... null null \n", + "\n", + " structuretaxvaluedollarcnt taxvaluedollarcnt assessmentyear \\\n", + "0 null 9.0 2015.0 \n", + "1 null 27516.0 2015.0 \n", + "2 650756.0 1413387.0 2015.0 \n", + "3 571346.0 1156834.0 2015.0 \n", + "4 193796.0 433491.0 2015.0 \n", + "\n", + " landtaxvaluedollarcnt taxamount taxdelinquencyflag taxdelinquencyyear \\\n", + "0 9.0 null None null \n", + "1 27516.0 null None null \n", + "2 762631.0 20800.37 None null \n", + "3 585488.0 14557.57 None null \n", + "4 239695.0 5725.17 None null \n", + "\n", + " censustractandblock \n", + "0 null \n", + "1 null \n", + "2 null \n", + "3 null \n", + "4 null \n", + "\n", + "[5 rows x 58 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# import 2016 properties\n", + "prop2016 = cudf.read_csv('zillow/properties_2016.csv')\n", + "\n", + "# peek display 2016 properties\n", + "prop2016.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 121 + }, + "colab_type": "code", + "id": "uynoUxpx8Xsn", + "outputId": "b64b7b32-c1f9-4cf3-c50d-36e90dc51a64" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
parcelidlogerrortransactiondate
0110165940.02762016-01-01
114366692-0.16842016-01-01
212098116-0.00402016-01-01
3126434130.02182016-01-02
414432541-0.00502016-01-02
\n", + "
" + ], + "text/plain": [ + " parcelid logerror transactiondate\n", + "0 11016594 0.0276 2016-01-01\n", + "1 14366692 -0.1684 2016-01-01\n", + "2 12098116 -0.0040 2016-01-01\n", + "3 12643413 0.0218 2016-01-02\n", + "4 14432541 -0.0050 2016-01-02" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# import train 2016 data\n", + "train2016 = cudf.read_csv('zillow/train_2016_v2.csv',\n", + " parse_dates=[\"transactiondate\"])\n", + "\n", + "# peek display 2016 train\n", + "train2016.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "gGiscxESJDrl" + }, + "source": [ + "## [Zillow Prediction Model](https://colab.research.google.com/github/eswar3/Zillow-prediction-models/blob/master/Step%202a-Approach1.ipynb)\n", + "\n", + " In this approach the properties data and transaction data are merged together before adressing any missing values\n", + "\n", + "\n", + "#### Merging Data \n", + " - we will start by merging the two dataframes\n", + " - then rename the new dataframe's attributes to be meaningful \n", + " - e.g. from `pooltypeid7` to `pool_with_spa_tub_no` and `structuretaxvaluedollarcnt` to `structure_tax`" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 156 + }, + "colab_type": "code", + "id": "o4CvSIcwm4B2", + "outputId": "4e59a51a-ebd6-4fe5-b037-3165e57e3b85" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
parcelidlogerrortransactiondateac_idarchitecturalstyletypeidbasement_sqfttotal_bathbedroomcntbuildingclasstypeidbuildingqualitytypeid...fireplaceflagstructure_taxtotal_parcel_taxassessmentyearland_taxtotal_property_tax_2016taxdelinquencyflagtaxdelinquencyyearcensustractandblocktransaction_month
0171299710.04212016-01-25nullnullnull3.04.0nullnull...null266718.0444528.02015.0177810.05108.38Nonenull6.111005e+131
1129219490.02662016-01-251.0nullnull3.04.0null4.0...null361522.0506127.02015.0144605.06150.23Nonenull6.037404e+131
214502581-0.00602016-01-25nullnullnull2.53.0nullnull...null170960.0339273.02015.0168313.05487.92Nonenull6.059032e+131
310946127-0.10202016-01-251.0nullnull3.02.0null4.0...null144440.0389200.02015.0244760.04326.54Nonenull6.037311e+131
411835451-0.00302016-01-25nullnullnull3.05.0null7.0...null144020.0235739.02015.091719.03698.87Nonenull6.037530e+131
\n", + "

5 rows × 61 columns

\n", + "
" + ], + "text/plain": [ + " parcelid logerror transactiondate ac_id architecturalstyletypeid \\\n", + "0 17129971 0.0421 2016-01-25 null null \n", + "1 12921949 0.0266 2016-01-25 1.0 null \n", + "2 14502581 -0.0060 2016-01-25 null null \n", + "3 10946127 -0.1020 2016-01-25 1.0 null \n", + "4 11835451 -0.0030 2016-01-25 null null \n", + "\n", + " basement_sqft total_bath bedroomcnt buildingclasstypeid \\\n", + "0 null 3.0 4.0 null \n", + "1 null 3.0 4.0 null \n", + "2 null 2.5 3.0 null \n", + "3 null 3.0 2.0 null \n", + "4 null 3.0 5.0 null \n", + "\n", + " buildingqualitytypeid ... fireplaceflag structure_tax total_parcel_tax \\\n", + "0 null ... null 266718.0 444528.0 \n", + "1 4.0 ... null 361522.0 506127.0 \n", + "2 null ... null 170960.0 339273.0 \n", + "3 4.0 ... null 144440.0 389200.0 \n", + "4 7.0 ... null 144020.0 235739.0 \n", + "\n", + " assessmentyear land_tax total_property_tax_2016 taxdelinquencyflag \\\n", + "0 2015.0 177810.0 5108.38 None \n", + "1 2015.0 144605.0 6150.23 None \n", + "2 2015.0 168313.0 5487.92 None \n", + "3 2015.0 244760.0 4326.54 None \n", + "4 2015.0 91719.0 3698.87 None \n", + "\n", + " taxdelinquencyyear censustractandblock transaction_month \n", + "0 null 6.111005e+13 1 \n", + "1 null 6.037404e+13 1 \n", + "2 null 6.059032e+13 1 \n", + "3 null 6.037311e+13 1 \n", + "4 null 6.037530e+13 1 \n", + "\n", + "[5 rows x 61 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# merge 2016 train and property dataframes by parcel id\n", + "df_train=''\n", + "df_train = train2016.merge(prop2016, how='left', on='parcelid')\n", + "\n", + "# add column inidcaticating month of transaction\n", + "df_train['transaction_month'] = df_train['transactiondate'].dt.month\n", + "\n", + "# set colums to be renamed for general english understandability \n", + "rename_these = {\"bathroomcnt\": \"total_bath\",\n", + " \"fullbathcnt\": \"full_bath\",\n", + " \"threequarterbathnbr\": 
\"half_bath\",\n", + " \"yardbuildingsqft17\": \"patio_sqft\",\n", + " \"yardbuildingsqft26\":\"storage_sqft\",\n", + " \"decktypeid\": \"deck_flag\",\n", + " \"pooltypeid7\": \"pool_with_spa_tub_no\", \n", + " \"pooltypeid2\": \"pool_with_spa_tub_yes\",\n", + " \"hashottuborspa\": \"has_hottub_or_spa\", \n", + " \"pooltypeid10\": \"just_hottub_or_spa\",\n", + " \"calculatedfinishedsquarefeet\":\"total_finished_living_area_sqft\", \n", + " \"finishedsquarefeet12\": \"finished_living_area_sqft\",\n", + " \"lotsizesquarefeet\": \"lot_area_sqft\",\n", + " \"finishedsquarefeet50\":\"finished_living_area_entryfloor_sqft1\",\n", + " \"finishedfloor1squarefeet\":\"finished_living_area_entryfloor_sqft2\",\n", + " \"finishedsquarefeet6\": \"base_unfinished_and_finished_area_sqft\",\n", + " \"finishedsquarefeet15\": \"total_area_sqft\",\n", + " \"finishedsquarefeet13\": \"preimeter_living_area_sqft\",\n", + " \"taxvaluedollarcnt\":\"total_parcel_tax\",\n", + " \"landtaxvaluedollarcnt\":\"land_tax\",\n", + " \"taxamount\":\"total_property_tax_2016\",\n", + " \"structuretaxvaluedollarcnt\":\"structure_tax\",\n", + " \"garagetotalsqft\":\"garage_sqft\",\n", + " \"fireplacecnt\":\"fireplace_count\",\n", + " \"buildingqualitytypeid \":\"building_quality_id\",\n", + " \"heatingorsystemtypeid\":\"heating_system_id\",\n", + " \"airconditioningtypeid\":\"ac_id\",\n", + " \"storytypeid\": \"basement_flag\",\n", + " \"basementsqft\": \"basement_sqft\",\n", + " \"poolsizesum\": \"pool_sqft\",\n", + " \"poolcnt\": \"pool_count\"}\n", + "# rename columns \n", + "df_train = df_train.rename(columns = rename_these)\n", + "\n", + "# what's the data frame look like?\n", + "df_train.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "YdtyBI2jFnJv" + }, + "source": [ + "## Conforming Attribute Values\n", + "### #0 boolean columns & null = 0s cases \n", + "* `pool_count`, `pool_with_spa_tub_no` and `pool_with_spa_tub_yes` are all binary variables, 
replace all NULL values with zero\n", + "* `basement_flag` has values 7 & `Null` but is supposed to be bool, convert the `7`s to `1`s and the `Null`s to `0`s \n", + "* patio and shed variables with null values are assumed to have none\n", + "* `deck_flag` has only 2 values, `66` and `null`\n", + " - convert it into binary flag\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "z3bPdNONHTYI" + }, + "outputs": [], + "source": [ + "# fill nulls with 0 in the flag / zero-default columns listed below\n", + "the_bool_club = ['pool_count','pool_with_spa_tub_no','pool_with_spa_tub_yes',\n", + " 'basement_flag','patio_sqft','storage_sqft', 'deck_flag']\n", + "\n", + "for col in the_bool_club:\n", + " # convert null values to 0\n", + " df_train[col]=df_train[col].fillna(0)\n", + "\n", + "# convert 7s and 66s to 1s\n", + "df_train['basement_flag'] = df_train['basement_flag'].replace(7, 1)\n", + "df_train['deck_flag'] = df_train['deck_flag'].replace(66, 1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "5MbGy6r7JLLD" + }, + "source": [ + "### #1 The pool\n", + "* When pool is present and if it has tub/spa then `just_hottub_or_spa` = 0" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 156 + }, + "colab_type": "code", + "id": "B3-1V93smA9A", + "outputId": "52e1a5d7-869a-443f-ac2d-40504992dc14" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "before\n", + "1.0 1161\n", + "Name: just_hottub_or_spa, dtype: int32\n", + "\n", + "after\n", + "0.0 1204\n", + "1.0 1161\n", + "Name: just_hottub_or_spa, dtype: int32\n" + ] + } + ], + "source": [ + "print(f'before\\n{df_train.just_hottub_or_spa.value_counts()}\\n')\n", + "\n", + "# if pool_count=1 and has_hottub_or_spa=1 and just_hottub_or_spa is null\n", + "conditions = ((df_train['pool_count'] == 1) \n", + " &
(df_train['has_hottub_or_spa'] == 1) \n", + " & (df_train['just_hottub_or_spa'].isna() == True))\n", + "# then just_hottub_or_spa = 0\n", + "df_train.just_hottub_or_spa.loc[conditions] = 0\n", + "\n", + "print(f'after\\n{df_train.just_hottub_or_spa.value_counts()}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "v6E3-_XlSGBs" + }, + "source": [ + "\n", + "- when `has_hottub_or_spa` is null and `just_hottub_or_spa` is null\n", + " - both should be zero\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Xa12WFccSGM6" + }, + "outputs": [], + "source": [ + "# if both has hottub and just hottub are null\n", + "conditions = ((df_train['has_hottub_or_spa'].isna() == True) \n", + " & (df_train['just_hottub_or_spa'].isna() == True))\n", + "# just hottub or spa = 0 \n", + "df_train.just_hottub_or_spa.loc[conditions] = 0\n", + "\n", + "# now, if has hottub is null and just hottub is 0 \n", + "conditions = ((df_train['has_hottub_or_spa'].isna() == True) \n", + " & (df_train['just_hottub_or_spa'] == 0))\n", + "# has hottub or spa = 0 \n", + "df_train.has_hottub_or_spa.loc[conditions] = 0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "5umCCWN73qxw" + }, + "source": [ + "- when there is no pool\n", + " - if there is tub/spa \n", + " - then `just_hottub_or_spa` = 1" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 69 + }, + "colab_type": "code", + "id": "FBgs7zJm3qk-", + "outputId": "78c76ac5-2b7f-4f98-9615-8a335bc3214e" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0 89114\n", + "1.0 1161\n", + "Name: just_hottub_or_spa, dtype: int32" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# when poolcnt=0, has_hottub_or_spa=1\n", + "conditions = 
((df_train['pool_count'] == 0) \n", + " & (df_train['has_hottub_or_spa'] == 1))\n", + "# just_hottub_or_spa=1\n", + "df_train.just_hottub_or_spa.loc[conditions] = 1\n", + "\n", + "# let's check the values\n", + "df_train.just_hottub_or_spa.value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "3LsRr1aoSCVx" + }, + "source": [ + "* When there is no pool, set pool size to zero instead of na" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "NtdyXCbx0TKx" + }, + "outputs": [], + "source": [ + "# where there is no pool\n", + "conditions = df_train['pool_count']==0\n", + "# square footage of non existant pool is 0 \n", + "df_train.pool_sqft.loc[conditions] = 0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "3hQFkXmAgQPY" + }, + "source": [ + "### #2 The basement\n", + "* Where `basement_flag` is zero, `basement_sqft` should also be zero\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "kMuCOqAmLTmY" + }, + "outputs": [], + "source": [ + "# where there is no basement\n", + "conditions = df_train['basement_flag'] == 0\n", + "# fun fact: we just did this with the pool\n", + "df_train.basement_sqft.loc[conditions] = 0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "wU6Uohb-PDYB" + }, + "source": [ + "### #3 The fireplace\n", + "There seems to be an inconsistency between `fireplaceflag` and `fireplace_count`\n", + "- 90,053 `fireplaceflag` values are null\n", + "- 80,668 `fireplace_count` values are null\n", + " * 9,385 (-11.5%) difference, but a boatload either way" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 52 + }, + "colab_type": "code", + "id": "OZM6lXmmpj5k", + "outputId": 
"ecf62d1d-b036-41ad-8052-a3090ae590ef" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "there are 80668 fireplace_count nulls\n", + "there are 90053 fireplaceflag nulls\n" + ] + } + ], + "source": [ + "print(f\"there are {df_train['fireplace_count'].isna().sum()} fireplace_count \\\n", + "nulls\\nthere are {df_train['fireplaceflag'].isna().sum()} fireplaceflag nulls\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "v9ZAzFoIpkSF" + }, + "source": [ + "* context driven solutions\n", + " * where neither flag nor count exists, `fireplaceflag == False`\n", + " * when `fireplace_count` is more than zero `fireplaceflag` should be `True`\n", + " * if `fireplaceflag == False`, the `fireplace_count` is logically `0`" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 52 + }, + "colab_type": "code", + "id": "i3YRZgU_qZhA", + "outputId": "e45a7a96-2e1d-47d2-a0bd-48ece42cbb6e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "there are 222 fireplace_count nulls\n", + "there are 0 fireplaceflag nulls\n" + ] + } + ], + "source": [ + "# null flags with null counts are zero\n", + "conditions = ((df_train['fireplace_count'].isna()==True) \n", + " & (df_train['fireplaceflag'].isna()==True))\n", + "df_train.fireplaceflag.loc[conditions] = False\n", + "\n", + "# true flags for positive fireplace counts\n", + "conditions = df_train['fireplace_count'] > 0\n", + "df_train.fireplaceflag.loc[conditions] = True\n", + "\n", + "# set fireplace count nulls to 0 where false flags are\n", + "conditions = ((df_train['fireplace_count'].isna()==True) \n", + " & (df_train['fireplaceflag']==False))\n", + "df_train.fireplace_count.loc[conditions] = 0\n", + "\n", + "print(f\"there are {df_train['fireplace_count'].isna().sum()} fireplace_count \\\n", + "nulls\\nthere are 
{df_train['fireplaceflag'].isna().sum()} fireplaceflag nulls\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "pYntUejosOn3" + }, + "source": [ + "### #4 The garage\n", + "* Properties with no garages would have NA values for both " + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "L9mGs-mK9E0Q" + }, + "outputs": [], + "source": [ + "garage = ['garagecarcnt', 'garage_sqft']\n", + "# where garage car count and garage square feet are null\n", + "conditions = ((df_train['garagecarcnt'].isna()==True) \n", + " & (df_train['garage_sqft'].isna()==True))\n", + "# set both to 0\n", + "df_train[garage].loc[conditions] = 0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "0uV115W6-ohW" + }, + "source": [ + "Exploring the data farther, we see\n", + "- `garage_sqft` holds over 8,900 measurements of 0 despite the garage's car count being 1 or more \n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 121 + }, + "colab_type": "code", + "id": "gbbUIbwJ-ouS", + "outputId": "310a4cdf-01a0-4fc3-ed1b-0e2f5e668518" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
garagecarcntgarage_sqft
182.00.0
201.00.0
321.00.0
362.00.0
421.00.0
\n", + "
" + ], + "text/plain": [ + " garagecarcnt garage_sqft\n", + "18 2.0 0.0\n", + "20 1.0 0.0\n", + "32 1.0 0.0\n", + "36 2.0 0.0\n", + "42 1.0 0.0" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# show rows where garage count and square feet don't add up\n", + "conditions = (df_train.garagecarcnt > 0) & (df_train.garage_sqft == 0)\n", + "\n", + "# give a display\n", + "df_train.loc[conditions][garage].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "5I1O76QKA8Cb" + }, + "source": [ + "- these 0 values need to be null\n", + " - because no garage holding 1 or more cars in 2016 measured 0sqft" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "eWVtoty0A9Jt" + }, + "outputs": [], + "source": [ + "# where garage count and square feet don't add up\n", + "conditions = (df_train.garagecarcnt>0) & (df_train.garage_sqft==0)\n", + "# insert a NaN value\n", + "df_train.garage_sqft.loc[conditions] = np.nan" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "seb6r5wx5Bbz" + }, + "source": [ + "### #5 The bath\n", + "* `total_bath` & `calculatedbathnbr` are near-duplicates w/ `calculated` having more nulls\n", + " - let's drop it\n", + "* if `full_bath` is null and `half_bath` is also null\n", + " - let's make `total_bath` = 0 \n", + " - because we can't truthfully assume it's any more " + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "EgMNToed5BMu" + }, + "outputs": [], + "source": [ + "# drop calculated bath column\n", + "df_train = df_train.drop('calculatedbathnbr', axis=1)\n", + "\n", + "# if full_bath is null & half_bath is null\n", + "conditions = ((df_train['full_bath'].isnull()==True) \n", + " & (df_train['half_bath'].isnull()==True) \n", + " & (df_train['total_bath']==0))\n", + 
"# total_bath=0\n", + "df_train.total_bath.loc[conditions] = np.nan\n", + "\n", + "# when full_bath==total_bath, half_bath=0 \n", + "df_train.half_bath.loc[df_train.full_bath == df_train.total_bath] = 0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Sh8cG0pr4_hl" + }, + "source": [ + "### #6 Mode Imputation \n", + "* scaling down the latitude and longitide\n", + " - knn imput takes more time due to the larger numbers\n", + " - standardizing gives better results on most algorithms\n", + " - this is a competition, we came to win" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "kitrNxKgLWUd" + }, + "outputs": [], + "source": [ + "df_train['latitude'] = df_train.latitude / 100000\n", + "df_train['longitude'] = df_train.longitude / 100000" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "y6bhRhu5YZ1d" + }, + "source": [ + "### #7 numberofstories & unitcnt & roomcnt\n", + "* we can devise unit count based on property land type\n", + " - so we can now go ahead and correct the unit counts for each given property" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 208 + }, + "colab_type": "code", + "id": "yHZH4rMNLfBA", + "outputId": "97106bb4-10f2-49a9-f821-03a3972db136" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0 86035\n", + "2.0 2372\n", + "4.0 884\n", + "3.0 622\n", + "5.0 1\n", + "6.0 1\n", + "9.0 1\n", + "11.0 1\n", + "70.0 1\n", + "143.0 1\n", + "Name: unitcnt, dtype: int32" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# where room count is 0, go ahead and NaN it\n", + "df_train.roomcnt.loc[df_train['roomcnt'] == 0] = np.nan\n", + "\n", + "\"\"\"\n", + "propertylandusetypeid & unitcnt are related \n", + " these are the 
propertylandusetypeid codes & their definitions\n", + " \n", + "#246 -Duplex (2 Units, Any Combination)\n", + "#247 -Triplex (3 Units, Any Combination)\n", + "#248 -Quadruplex (4 Units, Any Combination)\n", + "#260 -Residential General\n", + "#261 -Single Family Residential\n", + "#263 -Mobile Home\n", + "#264 -Townhouse\n", + "#266 -Condominium\n", + "#267 -Cooperative\n", + "#269 -Planned Unit Development\n", + "#275 -Residential Common Area \n", + "#31 - Commercial/Office/Residential Mixed Used\n", + "#47 -Store/Office (Mixed Use)\n", + "#265 -Cluster Home\n", + "\"\"\"\n", + "\n", + "# one unit \n", + "ones = [260,261,263,264,266,267,269,275]\n", + "for one in ones:\n", + " # adjust conditions to one unit indicator\n", + " conditions = ((df_train['propertylandusetypeid'] == one) \n", + " & (df_train['unitcnt'].isna()))\n", + " df_train.unitcnt.loc[conditions] = 1\n", + "\n", + "# two units \n", + "twos = [31,47,246]\n", + "for two in twos:\n", + " # adjust conditions to two unit indicator\n", + " conditions = ((df_train['propertylandusetypeid'] == two) \n", + " & (df_train['unitcnt'].isna()))\n", + " df_train.unitcnt.loc[conditions] = 2\n", + "\n", + "# three units\n", + "conditions = ((df_train['propertylandusetypeid'] == 247) \n", + " & (df_train['unitcnt'].isna()))\n", + "df_train.unitcnt.loc[conditions] = 3\n", + "\n", + "# four units\n", + "conditions = ((df_train['propertylandusetypeid'] == 248) \n", + " & (df_train['unitcnt'].isna()))\n", + "df_train.unitcnt.loc[conditions] = 4\n", + "\n", + "# let's see how out unit counts look\n", + "df_train.unitcnt.value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "02yLicmxLs3C" + }, + "source": [ + "### #8 Time to Cut\n", + "**Because of the adjustments made so far a number of columns are no longer needed**\n", + "* transaction date column is no longer of use\n", + " - and can be dropped \n", + "* `preimeter_living_area_sqft` and `total_finished_living_area_sqft` 
have the same values \n", + " - except that `preimeter_living_area_sqft` has more duplicates\n", + "* `total_area_sqft` and `total_finished_living_area_sqft` have the same values \n", + " - except that `total_area_sqft` has more duplicates\n", + "* `total_finished_living_area_sqft` and `finished_living_area_sqft` have the same values \n", + " - except that `finished_living_area_sqft` has more duplicates\n", + "* `base_unfinished_and_finished_area_sqft` and `total_finished_living_area_sqft` have the same values \n", + " - except that `base_unfinished_and_finished_area_sqft` has more duplicates\n", + "* different counties follow different land use codes\n", + " - to compare different counties, Zillow has created its own `propertylandusetypeid`\n", + " - hence we can drop `propertycountylandusecode`\n", + " - the same applies to `propertyzoningdesc`\n", + "* Most zip IDs are either invalid or out of city\n", + " - since enough information about location is given in latitude and longitude \n", + " - let's drop other location related fields\n", + " - `regionidcity`\n", + " - `regionidzip`\n", + " - `regionidneighborhood`\n", + "* `assessmentyear` has a constant value for all rows\n", + " - let's drop it" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "OtOgzOqHLyid" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BEFORE: (90275, 60)\n", + "AFTER: (90275, 48)\n" + ] + } + ], + "source": [ + "print(f\"BEFORE: {df_train.shape}\")\n", + "\n", + "# collect columns to drop\n", + "cut = ['propertyzoningdesc','propertycountylandusecode',\n", + " 'base_unfinished_and_finished_area_sqft','finished_living_area_sqft',\n", + " 'total_area_sqft','preimeter_living_area_sqft','regionidzip',\n", + " 'regionidcity','regionidneighborhood','assessmentyear','transactiondate',\n", + " 'censustractandblock']\n", + "# cut columns form dataframe\n", + "df_train = df_train.drop(cut, 
axis=1)\n", + "\n", + "print(f\"AFTER: {df_train.shape}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "icDvpvSD6BSb" + }, + "source": [ + "### #9 Tax, Year, & Census\n", + "- if tax deliquency flag is null, assume there is no unpaid tax on the property\n", + " - an issue arrises here because `taxdelinquencyflag` is a `StringColumn`\n", + " - i.e. null values indicate no tax delinquency, all other values are `Y` for yes\n", + " - because of this, the normal method of.." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 311 + }, + "colab_type": "code", + "id": "8lYcO_T5XKNN", + "outputId": "596cfad3-890d-4241-b8b8-347673082a7f" + }, + "outputs": [ + { + "ename": "TypeError", + "evalue": "fill_value must be a string or a string series", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# how we'd normally take care of this\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'taxdelinquencyflag'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/anaconda3/envs/rapidsenv/lib/python3.7/site-packages/cudf/core/series.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, value, method, axis, inplace, limit)\u001b[0m\n\u001b[1;32m 1186\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mNotImplementedError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"The axis keyword is not 
supported\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1187\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1188\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minplace\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1189\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1190\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/envs/rapidsenv/lib/python3.7/site-packages/cudf/core/column/string.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, fill_value, inplace)\u001b[0m\n\u001b[1;32m 719\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfill_value\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_column\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStringColumn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 720\u001b[0m ):\n\u001b[0;32m--> 721\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"fill_value must be a string or a string series\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 722\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 723\u001b[0m \u001b[0;31m# replace fill_value with nvstrings\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mTypeError\u001b[0m: fill_value must be a string or a string series" + ] + } + ], + "source": [ + "# how we'd normally take care of this\n", + "df_train['taxdelinquencyflag'].fillna(0)" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "tA6xG6h59rLi" + }, + "source": [ + "- ...comes with an error. \n", + " - Why?\n", + " - the series we are trying to fill the null values of is a string series\n", + " - because of this `.fillna()` requires a string value (e.g. '0') instead of an int value (e.g. 0)\n", + " - So, what now?\n", + " - there is an easy and straightforward solution with masked assigning!! \n", + " - First\n", + " - switch 1 (current True, actual False) to -1\n", + " - Then\n", + " - switch 0 (current False, actual True) to 1 to reflect True status\n", + " - Finally\n", + " - switch -1 (old True, actual False) to 0 to reflect False status" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 69 + }, + "colab_type": "code", + "id": "Svp6J0cJ5dL0", + "outputId": "03862711-e104-4954-bf9c-61bd51b3a9e3" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 88492\n", + "1 1783\n", + "Name: taxdelinquencyflag, dtype: int32" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# if bool 'Y'/None is already set, change string to int bool column via .isna()\n", + "df_train['taxdelinquencyflag'] = df_train['taxdelinquencyflag'].isna()\n", + "\n", + "# next we must correct the values, with 1 (True) for 'Y' and 0 for no\n", + "switcharoo = [(1,-1),(0,1),(-1,0)]\n", + "# switch values in order\n", + "for pair in switcharoo:\n", + " # tag old value and new value it will be replaced with\n", + " old, new = pair\n", + " # replace old value with new value\n", + " df_train['taxdelinquencyflag'] = df_train['taxdelinquencyflag'].replace(old, new)\n", + " \n", + "# display values in tax delinquency flag column\n", + "df_train['taxdelinquencyflag'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "w5EAdWXaCTRU" + }, + 
"source": [ + "- Convert years\n", + " - from yy\n", + " - to 2016 - yyyy \n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 243 + }, + "colab_type": "code", + "id": "6Bic66I9LfGC", + "outputId": "baaa5387-bbd7-4242-a336-0b6b90606935" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0 88492\n", + "2.0 628\n", + "1.0 518\n", + "3.0 210\n", + "4.0 154\n", + "6.0 89\n", + "5.0 85\n", + "7.0 63\n", + "8.0 24\n", + "9.0 8\n", + "10.0 3\n", + "17.0 1\n", + "Name: taxdelinquencyyear, dtype: int32" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# no delinquency? set year to 0\n", + "df_train.taxdelinquencyyear.loc[df_train.taxdelinquencyflag == 0] = 0\n", + "\n", + "# collect x and xx formatted delinquency years w/ matching xxxx year format pair\n", + "year_pairs = [(99,1999), (6,2006), (7,2007), (8,2008), (9,2009), (10,2010),\n", + " (11,2011), (12,2012), (13,2013), (14,2014), (15,2015)]\n", + "# go through the pairs individually \n", + "for year in year_pairs:\n", + " # split the pair in question \n", + " old, new = year\n", + " # replace old year (e.g. 99) with new year (e.g. 1999)\n", + " df_train.taxdelinquencyyear.loc[df_train.taxdelinquencyyear == old] = new\n", + "\n", + "# adjust delinquency year relative to training year (2016) \n", + "df_train.taxdelinquencyyear.loc[df_train.taxdelinquencyyear>0] = 2016 - df_train.taxdelinquencyyear.loc[df_train.taxdelinquencyyear>0]\n", + "\n", + "# what've we got? 
\n", + "df_train.taxdelinquencyyear.value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ya7xLHzdGVcs" + }, + "source": [ + "- values in `rawcensustractandblock` represent multiple fields concatened together as float values\n", + " - by converting those values to string we can split each and build new columns:\n", + " - `census_tractnumber`\n", + " - `block_number`" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "# ttt=df_train.copy()\n", + "df_train=ttt.copy()\n", + "\n", + "# origional column\n", + "\"\"\"\n", + "\n", + "# both are float columns now\n", + "#rawcensustractandblock\n", + "s_rawcensustractandblock=df_train.rawcensustractandblock.apply(lambda x: str(x))\n", + "\n", + "df_train['census_tractnumber']=s_rawcensustractandblock.str.slice(4,11)\n", + "df_train['block_number']=s_rawcensustractandblock.str.slice(start=11)\n", + "df_train['block_number']=df_train['block_number'].apply(lambda x: x[:4]+'.'+x[4:]+'0' )\n", + "df_train['block_number']=df_train['block_number'].apply(lambda x: int(round(float(x),0)) )\n", + "df_train['block_number']=df_train['block_number'].apply(lambda x: str(x).ljust(4,'0') )\n", + "\n", + "#droping censustractandblock since this is just a duplicate of rawcensustractandblock\n", + "df_train=df_train.drop('censustractandblock', axis=1)\n", + "\n", + "# drooping rawcensustractandblock, since it's already stored as substrings in different column names\n", + "df_train=df_train.drop('rawcensustractandblock', axis=1)\n", + "\n", + "\"\"\"\n", + "pass" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 489 + }, + "colab_type": "code", + "id": "Sg0eN-K1QdZy", + "outputId": "a90de47f-5c88-4834-df44-75a9dedcd07c" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
census_tractnumberblock_number
00053.032043
14037.032000
20320.483000
33107.021000
45303.012001
\n", + "
" + ], + "text/plain": [ + " census_tractnumber block_number\n", + "0 0053.03 2043\n", + "1 4037.03 2000\n", + "2 0320.48 3000\n", + "3 3107.02 1000\n", + "4 5303.01 2001" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# copy rawcensustractandblock with values as string instead of float\n", + "string_data = cudf.Series(df_train['rawcensustractandblock'].values_to_string())\n", + "\n", + "# print(type(string_data))\n", + "# print(len(string_data))\n", + "# print(string_data)\n", + "\n", + "# \"\"\"\n", + "# CURRENT ERROR IN CONVERSION OF VALUES\n", + "# \"\"\"\n", + "# print(f\"\\nNOTE: THERE APPEARS TO BE AN ERROR WHEN CONVERTING TO STRING\\n\"\n", + "# f\" > somewhat random numbers added to end of some values\\n >> e.g. 004, 006\"\n", + "# f\"\\n\\n\\ndf_train['rawcensustractandblock'].head(10).values\\n\"\n", + "# f\"{df_train['rawcensustractandblock'].head(10).values}\\n\\n\"\n", + "# f\"data.head(10).values\\n{string_data.head(10).values}\\n\\n\\n\"\n", + "# f\"THE SAME NUMBERS OCCOUR IN THE FIRST WHEN PUT INTO A LIST\\n\"\n", + "# f\" > not sure how to deal with this now\\n\"\n", + "# f\" >> difficult to reproduce without data\\n\\n\")\n", + "# \"\"\"\n", + "# CURRENT ERROR IN CONVERSION OF VALUES\n", + "# \"\"\"\n", + "\n", + "# set new tract number \n", + "df_train['census_tractnumber'] = string_data.str.slice(4, 11)\n", + "\n", + "# set/adjust block number\n", + "df_train['block_number'] = string_data.str.slice(11)\n", + "df_train['block_number'] = df_train.block_number.str.slice(0,4).str.cat(df_train.block_number.str.slice(4), '.')\n", + "df_train['block_number'] = df_train.block_number.astype('float').round(0).astype('int')\n", + "df_train['block_number'] = df_train.block_number.astype('str').str.ljust(4, '0')\n", + "\n", + "# drop raw census tract and block column, no longer needed\n", + "df_train = df_train.drop('rawcensustractandblock', axis=1)\n", + "\n", + "\"\"\"\n", + "CORRECT NUMBERS 
THAT SHOULD BE DISPLAYED BY BELOW PRINT STATEMENT\n", + " > currently not being seen due to prior mentioned error\n", + "\n", + "tractnumber\n", + "0 1066.46\n", + "1 0524.22\n", + "2 4638.00\n", + "3 2963.00\n", + "4 0423.38\n", + "dtype: object\n", + "\n", + "blocknumber\n", + "0 1001\n", + "1 2024\n", + "2 3004\n", + "3 2002\n", + "4 1006\n", + "dtype: object\n", + "\"\"\"\n", + "df_train[['census_tractnumber', 'block_number']].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "T71orw51lpTN" + }, + "source": [ + "## Dealing with Missing Values\n", + "### #1 Setting standards\n", + "- Despite corecting and adjusting the data to this point, there are still some columns holding a large majority of null values\n", + "- For some columns, this majority represents over 95% of values\n", + " - Let's identify those columns\n", + " - And drop columns with more than 95% null values \n" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 86 + }, + "colab_type": "code", + "id": "xhCosNpXvTVU", + "outputId": "2d969756-decb-4912-94f6-19836eb0323a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " field percentage\n", + "7 buildingclasstypeid 0.999823\n", + "3 architecturalstyletypeid 0.997109\n", + "33 typeconstructiontypeid 0.996688\n" + ] + } + ], + "source": [ + "# calculate null value % for each column & frame it\n", + "missingvalues_prop = (df_train.isnull().sum()/len(df_train)).reset_index()\n", + "missingvalues_prop.columns = ['field','percentage']\n", + "\n", + "# sort by null values percentage, from highest % to lowest\n", + "missingvalues_prop = missingvalues_prop.sort_values(by='percentage', \n", + " ascending=False)\n", + "# identify columns with > 95% of values null\n", + "missingvaluescols = missingvalues_prop.loc[missingvalues_prop['percentage'] > 0.95]\n", + "\n", + "# display columns with 
highest % null values\n", + "print(missingvaluescols)\n", + "\n", + "# drop columns with more than 95% null values\n", + "df_train = df_train.drop(missingvaluescols['field'], axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "az6t2ntBCMRe" + }, + "source": [ + "### #2 Working with Remaining Values\n", + "- the majority of values still missing in unitcnt are rows were `propertylandusetypeid` = 265, \n", + " - which is Cluster Home (i.e. group of houses with shared walls)\n", + " - each cluster is anywhere between 5 to 25 units\n", + " - here we will asssume 10 units as reassonable count" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 225 + }, + "colab_type": "code", + "id": "yB2lzAyopS_S", + "outputId": "db6c7add-5452-4535-8948-a426654851b7" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0 86035\n", + "2.0 2372\n", + "4.0 884\n", + "3.0 622\n", + "10.0 356\n", + "5.0 1\n", + "6.0 1\n", + "9.0 1\n", + "11.0 1\n", + "70.0 1\n", + "143.0 1\n", + "Name: unitcnt, dtype: int32" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# highly related propertylandusetypeid\n", + "df_train['unitcnt'].loc[df_train['propertylandusetypeid'] == 265] = 10\n", + "\n", + "# let's see what we've got\n", + "df_train['unitcnt'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "iR1rBlz-dOdH" + }, + "source": [ + "- a number of pool sizes are null despite there being a pool\n", + " - let's calculate the average pool size\n", + " - and assume those null values are pools of average size" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "id": "-icFDeLSoJwl", + "outputId": 
"b1ed39c3-3a14-4dc1-eb48-b3429da5cffe" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "16932\n" + ] + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# how's it look before?\n", + "print(df_train.pool_sqft.isna().sum())\n", + "\n", + "# calculate the average pool square footage for properties with a pool(s)\n", + "poolsizesum_mean = df_train.pool_sqft.loc[df_train['pool_count'] > 0].mean()\n", + "\n", + "# where the property has a pool(s) but pool square feet is 0\n", + "conditions = ((df_train['pool_count'] > 0) \n", + " & (df_train['pool_sqft'].isna()==True))\n", + "\n", + "# set pool square feet to the average pool square footage of pool properties\n", + "df_train['pool_sqft'].loc[conditions] = poolsizesum_mean\n", + "\n", + "# display new null count\n", + "df_train.pool_sqft.isna().sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "AyGeXJfEmJBU" + }, + "source": [ + "- total parcel tax\n", + "- structure tax\n", + "- land tax" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "393" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# how many rows have values in total parcel tax that do not add up given land tax and structure tax\n", + "len(df_train.loc[df_train['total_parcel_tax'] != df_train['land_tax'] + df_train['structure_tax']])" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "6\n", + "380\n", + "1\n", + "1\n", + "\n", + "6\n", + "380\n", + "1\n", + "1\n" + ] + } + ], + "source": [ + "print(df_train.total_property_tax_2016.isnull().sum())\n", + "print(df_train.structure_tax.isnull().sum())\n", + 
"print(df_train.total_parcel_tax.isnull().sum())\n", + "print(df_train.land_tax.isnull().sum())\n", + "print()\n", + "\n", + "# where land tax is not a null value\n", + "condition_1 = df_train.land_tax.isnull() == False\n", + "# where total parceltax is not a null value\n", + "condition_2 = df_train.total_parcel_tax.isnull()==False\n", + "\n", + "# pull the total parcel tax column\n", + "total_parcel_tax_not_null = df_train.loc[condition_1 & condition_2, 'total_parcel_tax']\n", + "# pull the land tax column\n", + "land_tax_not_null = df_train.loc[condition_1 & condition_2, 'land_tax']\n", + "\n", + "# total_parcel_tax = structure_tax + land_tax\n", + "# -> structure_tax = total_parcel_tax - land_tax\n", + "correct_structure_tax = total_parcel_tax_not_null - land_tax_not_null\n", + "\n", + "# set the structure_tax values in rows where total and land taxes are not null to these correct values \n", + "df_train['structure_tax'].loc[condition_1 & condition_2] = correct_structure_tax\n", + "\n", + "# where structure tax is still 0, there isn't structure tax\n", + "df_train['structure_tax'].loc[df_train['structure_tax'] == 0] = np.nan\n", + "\n", + "print(df_train.total_property_tax_2016.isnull().sum())\n", + "print(df_train.structure_tax.isnull().sum())\n", + "print(df_train.total_parcel_tax.isnull().sum())\n", + "print(df_train.land_tax.isnull().sum())" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "380" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# how many rows have values in total parcel tax that do not add up given land tax and structure tax\n", + "len(df_train.loc[df_train['total_parcel_tax'] != df_train['land_tax'] + df_train['structure_tax']])" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": 
"code", + "id": "8SID48LOpYvu", + "outputId": "6d20a3ba-4360-4554-908d-f6d673aece12" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(90275, 45)" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" } - ] -} \ No newline at end of file + ], + "source": [ + "# regionidcounty is exact copy of fips code, dropping the dulicate column\n", + "df_train = df_train.drop(['regionidcounty'], axis=1)\n", + "df_train.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "id": "tWmM2J8_pkg1", + "outputId": "6362e07f-e363-4884-b0c5-9380b5fee956" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1421\n", + "0\n", + "0\n", + "1421\n" + ] + } + ], + "source": [ + "#*******************************\n", + "#bedroomcnt #1421 zero bed room houses ??, observed it's missing all other room count also missing\n", + "# where there is no bedroom, null is a better representation \n", + "\n", + "# before\n", + "print(len(df_train['bedroomcnt'].loc[df_train['bedroomcnt'] == 0]))\n", + "print(df_train.bedroomcnt.isnull().sum())\n", + "\n", + "df_train['bedroomcnt'].loc[df_train['bedroomcnt'] == 0] = np.nan\n", + "\n", + "# after\n", + "print(len(df_train['bedroomcnt'].loc[df_train['bedroomcnt'] == 0]))\n", + "print(df_train.bedroomcnt.isnull().sum())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Room Count\n", + "caluculate full bath and half bath again from total bath as it has few extra columns (fixes 500 missing values in roomcnt)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 208 + }, + "colab_type": "code", + "id": "3qnP2L9LpmeJ", + "outputId": "c0eabce4-3232-4435-8733-779526f18c57" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "1165\n", + "1182\n", + "1182\n", + "1421\n", + "69700\n", + "\n", + "1165\n", + "1182\n", + "1182\n", + "1421\n", + "1416\n" + ] + } + ], + "source": [ + "# propertylandusetypeid & total living area\n", + "# total_bath 1165\n", + "# full_bath 1182\n", + "# half_bath 1182\n", + "# bedroomcnt 1421\n", + "# roomcnt 1416\n", + "\n", + "print(df_train.total_bath.isna().sum())\n", + "print(df_train.full_bath.isnull().sum())\n", + "print(df_train.half_bath.isnull().sum())\n", + "print(df_train.bedroomcnt.isnull().sum())\n", + "print(df_train.roomcnt.isnull().sum())\n", + "print()\n", + "\n", + "# roomcnt = (full_bath + half_bath) + bedroomcnt\n", + "# total_bath = fullbath+ 0.5(half_bath)\n", + "\n", + "# where full & half bath and bedroom count are not null, but room count is null\n", + "conditions = ((df_train['full_bath'].isna() == False) \n", + " & (df_train['half_bath'].isna() == False) \n", + " & (df_train['bedroomcnt'].isna() == False) \n", + " & (df_train['roomcnt'].isna() == True))\n", + "\n", + "# calculate room count including all full & half baths along with bedroom count\n", + "new_values = df_train.full_bath.loc[conditions] + df_train.half_bath.loc[conditions] + df_train.bedroomcnt.loc[conditions]\n", + "\n", + "# df_train['roomcnt'] = df_train['roomcnt'].masked_assign(new_values, conditions)\n", + "df_train.roomcnt.loc[conditions] = new_values\n", + "\n", + "\n", + "# most bedroom count and roomcount null are in same place\n", + "# all column null count 1133 all columns are null\n", + "\n", + "print(df_train.total_bath.isna().sum())\n", + "print(df_train.full_bath.isnull().sum())\n", + "print(df_train.half_bath.isnull().sum())\n", + "print(df_train.bedroomcnt.isnull().sum())\n", + "print(df_train.roomcnt.isnull().sum())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Mvy51Ckev9CX" + }, + "source": [ + "- correct number of stories by Zillow's `propertylandusetypeid` indicator\n", + " - where null values 
# %%
# Impute `numberofstories` per Zillow `propertylandusetypeid`.
#   - rows of a listed type that already have a story count are set to the
#     most common story count observed for that type
#   - rows of a listed type with no story count get the generally accepted
#     number of stories for that type
# df_train is still a cuDF frame in this cell (it is converted with
# `.to_pandas()` only in later cells), so the chained `.loc` assignment style
# used by the surrounding cells is kept.
# NOTE(review): overwriting observed story counts with the per-type mode
# discards real data; confirm this is intended and not meant to touch nulls only.

# before (what's it look like?)
print(f'BEFORE\n{df_train.numberofstories.value_counts()}\n'
      f'{df_train.numberofstories.isnull().sum()} remaining null values\n')

# (propertylandusetypeid, generally accepted number of stories for that type)
zillow_type_ids = [(31, 2), (246, 2), (247, 2), (248, 2), (260, 2), (261, 1),
                   (263, 1), (266, 1), (267, 1), (269, 2), (275, 1)]

for t_id, n_stories in zillow_type_ids:
    is_type = df_train['propertylandusetypeid'] == t_id

    # Build both masks before anything is overwritten. The null mask must
    # really select the nulls; otherwise the default below clobbers the known
    # values and the nulls are never filled.
    known = is_type & (df_train['numberofstories'].isna() == False)
    missing = is_type & (df_train['numberofstories'].isna() == True)

    # value_counts(): index = story value, data = number of occurrences.
    # The mode is therefore the index label of the largest count.
    story_counts = df_train.numberofstories.loc[known].value_counts()
    if len(story_counts) > 0:
        mode_stories = story_counts.to_pandas().idxmax()
    else:
        # nothing observed for this type: fall back to the general default
        mode_stories = n_stories

    # known values -> most common value seen for this property type
    df_train['numberofstories'].loc[known] = mode_stories
    # null values -> generally accepted number of stories for this type
    df_train['numberofstories'].loc[missing] = n_stories

# edge case: type 264 only has its null story counts filled (with 2)
conditions = ((df_train.propertylandusetypeid == 264)
              & (df_train.numberofstories.isnull()))
df_train.numberofstories.loc[conditions] = 2

# what's it looking like?
print(f'AFTER\n{df_train.numberofstories.value_counts()}\n'
      f'{df_train.numberofstories.isnull().sum()} remaining null values')
# %%
# Impute `fireplace_count` for properties flagged as having a fireplace.
# NOTE(review): as in the number-of-stories cell, observed counts are
# overwritten with the mode; confirm that only nulls should not be touched.
print(f'BEFORE\n{df_train.fireplace_count.value_counts()}\n'
      f'{df_train.fireplace_count.isnull().sum()} remaining null values\n')

has_fireplace = df_train.fireplaceflag == 1
# build both masks before anything is overwritten
known = has_fireplace & (df_train.fireplace_count.isna() == False)
missing = has_fireplace & (df_train.fireplace_count.isna() == True)

# value_counts(): index = fireplace count, data = number of occurrences.
# Taking `[0]` returns an occurrence count (that is how 8165.0 ended up in the
# column); the mode is the index label of the largest count.
fire_counts = df_train.loc[known, 'fireplace_count'].value_counts()
if len(fire_counts) > 0:
    mode_fire_count = fire_counts.to_pandas().idxmax()
else:
    # a flagged property has at least one fireplace
    mode_fire_count = 1

# known values -> most common fireplace count
df_train['fireplace_count'].loc[known] = mode_fire_count
# null values -> most common fireplace count
df_train.fireplace_count.loc[missing] = mode_fire_count

# after
print(f'AFTER\n{df_train.fireplace_count.value_counts()}\n'
      f'{df_train.fireplace_count.isnull().sum()} remaining null values')

# %%
# set basic sns
color = sns.color_palette()
sns.set(style="darkgrid")
# convert dataframe to pandas for ease of use with sns
pd_train = df_train.to_pandas()
# set ax plot
ax = sns.countplot(x="buildingqualitytypeid", data=pd_train)
# adjust fringe aesthetics
plt.xticks(rotation='vertical')
# the plotted column is buildingqualitytypeid, so title it as such
plt.title("Frequency of building quality type id", fontsize=15)
# display the graph
plt.show()

# %%
# let's look more into year built vs building quality type
plt.plot(pd_train.yearbuilt, pd_train.buildingqualitytypeid, 'ro')
plt.xlabel("yearbuilt")
plt.ylabel("buildingqualitytypeid")
plt.title("Building quality type id by year built", fontsize=15)
# display the graph
plt.show()

# %% [markdown]
# ### Final adjustments
# - filling nans
#
# # -----current: test ready-----
# - converting to pandas
#   - to see what's going on
#   - figuring out what can and what can't be replicated in cuML

# %%
from sklearn import neighbors
# from cuml.preprocessing.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split


# location seems to be related to building quality (KNN classifier)
def fillna_knn(df, base, target):
    """Predict the missing values of a categorical column with a KNN classifier.

    Rows where ``target`` is present are split 80/20 (stratified on the
    target); a 3-fold grid search over ``n_neighbors`` is run on the 80% and
    the refitted best model predicts ``target`` for the rows where it is null.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``target`` and every column in ``base``.
    base : list of str
        Feature columns handed to the classifier.
    target : str
        Column whose nulls are to be predicted.

    Returns
    -------
    numpy.ndarray
        One prediction per row with a null ``target``, in ``df`` row order.
        ``df`` itself is not modified. Empty when nothing is missing.

    Notes
    -----
    ``stratify=Y`` makes ``train_test_split`` reject classes with a single
    member, which is why the callers drop rare classes first.
    """
    missing_values_boolflag = df[target].isnull()  # True for rows to predict
    number_of_missing_val = missing_values_boolflag.sum()
    print("# of miss", number_of_missing_val)
    if number_of_missing_val == 0:
        # the classifier cannot predict on zero rows
        return np.empty(0, dtype=df[target].dtype)

    not_missing_rows = df.loc[~missing_values_boolflag, [target] + base]
    Y = not_missing_rows[target]
    X = not_missing_rows[base]
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                        test_size=0.20,
                                                        random_state=3192,
                                                        stratify=Y)

    param_grid = dict(metric=['euclidean'],
                      weights=['distance'],
                      n_neighbors=[5, 10, 15, 20, 25])
    # No random_state: it has no effect without shuffling and recent
    # scikit-learn releases raise a ValueError for that combination.
    cv = StratifiedKFold(n_splits=3, shuffle=False)
    grid = GridSearchCV(neighbors.KNeighborsClassifier(n_jobs=-1),
                        param_grid=param_grid,
                        cv=cv,
                        scoring='f1_weighted',
                        refit=True,
                        return_train_score=True,
                        verbose=1,
                        n_jobs=-1,
                        pre_dispatch='n_jobs')
    grid.fit(X_train, Y_train)
    print("grid.best_estimator_", grid.best_estimator_)
    print("grid.best_params_", grid.best_params_)
    print("grid.scorer_", grid.scorer_)
    # report the 20% hold-out instead of predicting it and discarding the result
    print("held-out f1_weighted", grid.score(X_test, Y_test))

    return grid.predict(df.loc[missing_values_boolflag, base])


# %%
print('CURRENT DF SITUATION\n')

print(f'SHAPE = {df_train.shape}')
print(f'NULL COUNT = {df_train.buildingqualitytypeid.isnull().sum()}\n'
      f'VALUE COUNTS\n{df_train.buildingqualitytypeid.value_counts()}\n')

df_train['buildingqualitytypeid'].head()
# %%
# mark missing building-quality ids with a -1 sentinel so they survive the
# value_counts / filtering steps below
df_train['buildingqualitytypeid'] = df_train['buildingqualitytypeid'].fillna(-1)

print(f'NULL COUNT = {df_train.buildingqualitytypeid.isnull().sum()}\n'
      f'VALUE COUNTS\n{df_train.buildingqualitytypeid.value_counts()}')

# %% [markdown]
# # -----current: break-----
# - break 1 of 2

# %%
# make safe copy
test = df_train.copy()
# switch to pandas (figuring out what's going on)
df_train = test.to_pandas()

# .info() prints by itself and returns None; wrapping it in print() only adds
# a stray "None" line
df_train.info()

# %%
# drop building quality types that are not seen more than 3 times in the data

# BACK TO cuDF
df_train = cudf.from_pandas(df_train)

print(df_train.buildingqualitytypeid.value_counts())
print()
print(df_train.buildingqualitytypeid.isnull().sum())
print(df_train.shape)
print()

# one value_counts() pass replaces the per-id row scan (and the 5 second sleep
# per id) that was used to find the rare types
quality_counts = df_train.buildingqualitytypeid.value_counts().to_pandas()
for tid, n_rows in quality_counts.items():
    if n_rows > 3:
        continue
    print(f'{tid} count too low @ {n_rows}')
    df_train = df_train.loc[df_train.buildingqualitytypeid != tid]

print()
print(df_train.buildingqualitytypeid.value_counts())
print()

# turn the sentinel back into real nulls
df_train['buildingqualitytypeid'] = df_train['buildingqualitytypeid'].replace(-1, np.nan)
print(df_train.buildingqualitytypeid.isnull().sum())
print(df_train.shape)

# BACK TO PANDAS
df_train = df_train.to_pandas()

# %% [markdown]
# # -----current: break-----
# - break 2 of 2
#
# Everything below runs in pandas.

# %%
def impute_with_knn_classifier(df, target, base, min_class_size=3):
    """Fill the nulls of a categorical column in place using ``fillna_knn``.

    A working copy has every class with ``min_class_size`` rows or fewer
    removed (``fillna_knn`` stratifies on the target). Nulls are protected
    from the group filter with a -1 sentinel, because ``groupby`` drops rows
    whose key is NaN. Predictions are written back to ``df`` by index label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update in place. Assumed to have a unique index - TODO confirm.
    target : str
        Column whose nulls are imputed.
    base : list of str
        Feature columns handed to ``fillna_knn``.
    min_class_size : int, default 3
        Classes with this many rows or fewer are left out of the working copy.
    """
    print(df[target].isnull().sum())
    print(df.shape)

    temp = df.copy()
    temp[target] = temp[target].fillna(-1)
    temp = temp.groupby(target).filter(lambda grp: len(grp) > min_class_size)
    temp[target] = temp[target].replace(-1, np.nan)
    print(temp[target].isnull().sum())
    print(temp.shape)

    # labels of the rows fillna_knn will predict, in the order it predicts them
    to_fill = temp.index[temp[target].isnull().values]
    if len(to_fill) > 0:
        missing_values = fillna_knn(temp, base=base, target=target)
        print("predicted output shape", missing_values.shape)
        # Assign by label, not through df's own null mask: the two differ
        # whenever the group filter removed rows from the working copy.
        df.loc[to_fill, target] = missing_values

    print(df[target].isnull().sum())


# %%
# The imputed ids are written into df_train itself (not a side copy) because
# the yearbuilt imputation below uses buildingqualitytypeid as a feature.
impute_with_knn_classifier(df_train,
                           target='buildingqualitytypeid',
                           base=['latitude', 'longitude'])

# %% [markdown]
# # BELOW NOT (really) RUN
# - if run, was in pandas

# %%
impute_with_knn_classifier(df_train,
                           target='heating_system_id',
                           base=['latitude', 'longitude'])

# %%
impute_with_knn_classifier(df_train,
                           target='ac_id',
                           base=['latitude', 'longitude'])

# %%
# yearbuilt
impute_with_knn_classifier(df_train,
                           target='yearbuilt',
                           base=['latitude', 'longitude',
                                 'buildingqualitytypeid', 'propertylandusetypeid'])
# %%
# location seems to be related to building quality (KNN regressor)
from sklearn.model_selection import KFold


def fillna_knnr(df, base, target):
    """Predict the missing values of a numeric column with a KNN regressor.

    Rows where ``target`` is present are split 80/20; a 3-fold grid search
    over ``n_neighbors`` is run on the 80% and the refitted best model
    predicts ``target`` for the rows where it is null.

    Relies on ``neighbors``, ``GridSearchCV`` and ``train_test_split`` imported
    by the cell that defines ``fillna_knn``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``target`` and every column in ``base``.
    base : list of str
        Feature columns handed to the regressor.
    target : str
        Column whose nulls are to be predicted.

    Returns
    -------
    numpy.ndarray
        One prediction per row with a null ``target``, in ``df`` row order.
        ``df`` itself is not modified. Empty when nothing is missing.
    """
    missing_values_boolflag = df[target].isnull()  # True for rows to predict
    number_of_missing_val = missing_values_boolflag.sum()
    print("# of miss", number_of_missing_val)
    if number_of_missing_val == 0:
        # the regressor cannot predict on zero rows
        return np.empty(0, dtype=df[target].dtype)

    not_missing_rows = df.loc[~missing_values_boolflag, [target] + base]
    Y = not_missing_rows[target]
    X = not_missing_rows[base]
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                        test_size=0.20,
                                                        random_state=3192)

    param_grid = dict(metric=['euclidean'],
                      weights=['distance'],
                      n_neighbors=[5, 10, 15, 20, 25])
    # No random_state: it has no effect without shuffling and recent
    # scikit-learn releases raise a ValueError for that combination.
    cv = KFold(n_splits=3, shuffle=False)
    grid = GridSearchCV(neighbors.KNeighborsRegressor(n_jobs=-1),
                        param_grid=param_grid,
                        cv=cv,
                        scoring='neg_mean_absolute_error',
                        refit=True,
                        return_train_score=True,
                        verbose=1,
                        n_jobs=-1,
                        pre_dispatch='n_jobs')
    grid.fit(X_train, Y_train)
    print("grid.best_estimator_", grid.best_estimator_)
    print("grid.best_params_", grid.best_params_)
    print("grid.scorer_", grid.scorer_)
    # report the 20% hold-out instead of predicting it and discarding the result
    print("held-out neg_mean_absolute_error", grid.score(X_test, Y_test))

    return grid.predict(df.loc[missing_values_boolflag, base])


def impute_with_knn_regressor(df, target, base, rows=None, decimals=None):
    """Fill the nulls of a numeric column in place using ``fillna_knnr``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update in place. Assumed to have a unique index - TODO confirm.
    target : str
        Column whose nulls are imputed.
    base : list of str
        Feature columns handed to ``fillna_knnr``.
    rows : boolean mask, optional
        Restricts both the training rows and the rows being filled.
    decimals : int, optional
        Round the predictions to this many decimals before storing them.
    """
    print(df[target].isnull().sum())
    print(df.shape)

    temp = df.copy() if rows is None else df.loc[rows, df.columns].copy()
    print(temp[target].isnull().sum())
    print(temp.shape)

    # labels of the rows fillna_knnr will predict, in the order it predicts them
    to_fill = temp.index[temp[target].isnull().values]
    if len(to_fill) > 0:
        missing_values = fillna_knnr(temp, base=base, target=target)
        print("predicted output shape", missing_values.shape)
        if decimals is not None:
            missing_values = missing_values.round(decimals)
        # Assign by label: when `rows` is given, df's own null mask selects
        # more rows than were predicted.
        df.loc[to_fill, target] = missing_values

    print(df[target].isnull().sum())


# %%
# garage_sqft: only properties that actually have a garage are imputed
impute_with_knn_regressor(df_train,
                          target='garage_sqft',
                          base=['latitude', 'longitude', 'garagecarcnt'],
                          rows=df_train.garagecarcnt > 0)

# %%
df_train = df_train.drop('parcelid', axis=1)

# %%
# All the other columns with missing values seem to be numeric and need
# regression to be imputed; categorical columns are one-hot encoded later.

# Identify numerical columns to produce a heatmap
catcols = ['ac_id', 'buildingqualitytypeid', 'deck_flag', 'fips', 'heating_system_id', 'has_hottub_or_spa',
           'just_hottub_or_spa', 'pool_with_spa_tub_yes', 'pool_with_spa_tub_no', 'propertylandusetypeid',
           'basement_flag', 'fireplaceflag', 'taxdelinquencyflag']
numcols = [x for x in df_train.columns if x not in catcols]

# %%
# total_finished_living_area_sqft
# NOTE(review): earlier cells address the pool column as 'pool_count', so
# 'poolcnt' (and possibly 'basementsqft') may be stale pre-rename names.
# Several of these features also still contain nulls - verify before running.
impute_with_knn_regressor(df_train,
                          target='total_finished_living_area_sqft',
                          base=['latitude', 'longitude', 'basementsqft', 'numberofstories', 'poolcnt',
                                'garagecarcnt', 'garage_sqft', 'propertylandusetypeid'])

# %%
# total_bath   1165
# full_bath    1182
# half_bath    1182
# roomcnt      1416
# bedroomcnt   1421
# (this cell's body used to be pasted twice; one pass is enough)
impute_with_knn_regressor(df_train,
                          target='total_bath',
                          base=['propertylandusetypeid', 'total_finished_living_area_sqft'])

# %%
# drop half_bath and full_bath, as they only hold redundant values of total_bath
df_train = df_train.drop(['full_bath', 'half_bath'], axis=1)

# %%
# bedroomcnt   1421
impute_with_knn_regressor(df_train,
                          target='bedroomcnt',
                          base=['propertylandusetypeid', 'total_finished_living_area_sqft', 'total_bath'])

# %%
df_train['total_bath'] = df_train.total_bath.round(1)
df_train['bedroomcnt'] = df_train.bedroomcnt.round(1)

# %%
# recalculate roomcnt (1416 missing) now that total_bath and bedroomcnt are imputed
roomcnt_missing = df_train.roomcnt.isnull()
# both sides are restricted to the same rows so the assignment is unambiguous
df_train.loc[roomcnt_missing, 'roomcnt'] = (df_train.total_bath + df_train.bedroomcnt)[roomcnt_missing]

# %%
print(df_train.shape)
# keep rows with both tax values; .copy() so later .loc assignments hit a real
# frame rather than a slice of the previous one
df_train = df_train.loc[(df_train.total_parcel_tax.notnull()) & (df_train.land_tax.notnull()),
                        df_train.columns].copy()

print(df_train.shape)

# %%
# lot_area_sqft
impute_with_knn_regressor(df_train,
                          target='lot_area_sqft',
                          base=['latitude', 'longitude', 'propertylandusetypeid',
                                'total_finished_living_area_sqft', 'roomcnt', 'numberofstories'],
                          decimals=2)

# %%
# predict structure_tax and recalculate total_parcel_tax = land_tax + structure_tax
impute_with_knn_regressor(df_train,
                          target='structure_tax',
                          base=['latitude', 'longitude', 'lot_area_sqft', 'propertylandusetypeid',
                                'total_finished_living_area_sqft', 'roomcnt', 'numberofstories'],
                          decimals=2)

# %%
# total_parcel_tax = land_tax + structure_tax
df_train['total_parcel_tax'] = df_train['structure_tax'] + df_train['land_tax']

# %%
# age of the property
df_train['age'] = 2016 - df_train['yearbuilt']
df_train = df_train.drop(['yearbuilt'], axis=1)

# %%
# total_property_tax_2016
impute_with_knn_regressor(df_train,
                          target='total_property_tax_2016',
                          base=['latitude', 'longitude', 'lot_area_sqft', 'propertylandusetypeid',
                                'total_finished_living_area_sqft', 'roomcnt', 'numberofstories'],
                          decimals=2)
'total_property_tax_2016')\n", + "\n", + "print(\"predicted output shape\",missing_values.shape)\n", + "missing_values_boolflag = df_train['total_property_tax_2016'].isnull()\n", + "df_train.loc[ missing_values_boolflag, 'total_property_tax_2016' ] = missing_values.round(2)\n", + "print(df_train.total_property_tax_2016.isnull().sum())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "YlaxWegqz7I-" + }, + "outputs": [], + "source": [ + "# Count missing values per column and list only the columns that still have gaps, fewest first.\n", + "missing_df = (\n", + "    df_train.isnull().sum(axis=0)\n", + "    .rename_axis('column_name')\n", + "    .reset_index(name='missing_count')\n", + "    .query('missing_count > 0')\n", + "    .sort_values(by='missing_count')\n", + ")\n", + "print(missing_df)\n", + "print(missing_df.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "dIl_nqKVz7NQ" + }, + "outputs": [], + "source": [ + "# Both columns reported above are missing 92% of their data and there is no related variable to impute them from, so drop them here.\n", + "df_train = df_train.drop(columns=['finished_living_area_entryfloor_sqft2','finished_living_area_entryfloor_sqft1'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "HQJd7rgKz7Qq" + }, + "outputs": [], + "source": [ + "# Separate the categorical id/flag columns from the remaining (numerical) columns used for a heatmap.\n", + "catcols = ['ac_id','buildingqualitytypeid','deck_flag','fips','pool_with_spa_tub_no','pool_with_spa_tub_yes','has_hottub_or_spa',\n", + " 'just_hottub_or_spa','heating_system_id','propertylandusetypeid','basement_flag','fireplaceflag','taxdelinquencyflag']\n", + "numcols = list(df_train.columns[~df_train.columns.isin(catcols)])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "VUN3a6uJz7Ut" +
}, + "outputs": [], + "source": [ + "# Two variables are stored as object dtype; convert them to numeric.\n", + "df_train[['census_tractnumber','block_number']] = df_train[['census_tractnumber','block_number']].apply(pd.to_numeric)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "zGx77rRAz7ZZ" + }, + "outputs": [], + "source": [ + "# Drop the categorical columns (and the target): the xgboost feature selection cannot handle them.\n", + "train_x = df_train.drop(catcols+['logerror'], axis=1).astype(float)\n", + "train_y = df_train['logerror'].astype(float)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "es_Ew2YJz7dT" + }, + "outputs": [], + "source": [ + "pd.options.display.max_rows = 65\n", + "\n", + "# One row per feature: its name and its dtype (the first column holds names, not counts).\n", + "dtype_df = train_x.dtypes.reset_index()\n", + "dtype_df.columns = [\"Column Name\", \"Column Type\"]\n", + "#dtype_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "bvWIhR38z7fW" + }, + "outputs": [], + "source": [ + "# Recode the 0/1 (or False/True) indicator columns as \"No\"/\"Yes\" labels; any other value is left untouched.\n", + "flag_columns = ['has_hottub_or_spa','just_hottub_or_spa','deck_flag','basement_flag','fireplaceflag']\n", + "for flag_col in flag_columns:\n", + "    # Build both masks before writing so neither comparison runs against already-recoded labels.\n", + "    is_no = df_train[flag_col] == 0\n", + "    is_yes = df_train[flag_col] == 1\n", + "    df_train.loc[is_no, flag_col] = \"No\"\n", + "    df_train.loc[is_yes, flag_col] = \"Yes\"" + ] + }, + { +
"cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Ef9JjrmMz7jw" + }, + "outputs": [], + "source": [ + "#ac_id,heating_system_id,propertylandusetypeid\n", + "dummieslist=['has_hottub_or_spa','just_hottub_or_spa',\n", + " 'deck_flag','fips','basement_flag','fireplaceflag','taxdelinquencyflag']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Z51Zrt2Uz7oD" + }, + "outputs": [], + "source": [ + "df_train[dummieslist] = df_train[dummieslist].astype(object)\n", + "dummies = pd.get_dummies(df_train[dummieslist], prefix= dummieslist)\n", + "dummies.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "VHBi5Gg6z7tu" + }, + "outputs": [], + "source": [ + "dummies2=['pool_with_spa_tub_no','pool_with_spa_tub_yes']\n", + "df_train[dummies2] = df_train[dummies2].astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "oocTPKI9z7rk" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import MySQLdb\n", + "from sqlalchemy import create_engine\n", + "\n", + "# Never commit database credentials: read the full SQLAlchemy URL from the environment,\n", + "# e.g. ZILLOW_MYSQL_URL='mysql+mysqldb://USER:PASSWORD@localhost/sakila'.\n", + "engineString = os.environ['ZILLOW_MYSQL_URL']\n", + "engine = create_engine(engineString)\n", + "\n", + "# Write through the single transactional connection so it is committed and closed on exit\n", + "# (no extra connection is opened and left dangling).\n", + "with engine.connect() as con, con.begin():\n", + "    df_train.to_sql('df_train_f1', con, chunksize=10000, index=False, if_exists='replace')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "zj5ZLSPlz7XC" + }, + "outputs": [], + "source": [ + "numcols2=['basementsqft','total_bath','bedroomcnt','total_finished_living_area_sqft','fireplace_count','garagecarcnt',\n", + " 'garage_sqft','latitude','longitude','lot_area_sqft','poolcnt','pool_sqft','roomcnt','unitcnt','patio_sqft','storage_sqft',\n", + " 
'numberofstories','structure_tax','total_parcel_tax','land_tax','total_property_tax_2016','taxdelinquencyyear','transaction_month',\n", + " 'census_tractnumber','block_number','age']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "fp53dotszhgA" + }, + "outputs": [], + "source": [ + "# Regression target\n", + "Y = df_train.loc[:, 'logerror']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "O0Uaei4rzhj6" + }, + "outputs": [], + "source": [ + "# buildingqualitytypeid has an order, so it is turned into integer codes rather than dummies\n", + "le = LabelEncoder().fit(df_train['buildingqualitytypeid'])\n", + "df_train['buildingqualitytypeid'] = le.transform(df_train['buildingqualitytypeid'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "g4-g-uvtzhds" + }, + "outputs": [], + "source": [ + "#df_train.ac_id.value_counts()\n", + "#df_train.propertylandusetypeid.value_counts()\n", + "#'buildingqualitytypeid','ac_id','heating_system_id','propertylandusetypeid'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "SzliXafdzhRd" + }, + "outputs": [], + "source": [ + "# Feature matrix: one-hot flags, pool indicators, numeric columns, then the id-coded columns (in that order).\n", + "id_cols = ['buildingqualitytypeid','ac_id','heating_system_id','propertylandusetypeid']\n", + "X = pd.concat([dummies, df_train[dummies2 + numcols2 + id_cols]], axis=1)\n", + "X.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "DBsZjyQd0W1N" + }, + "outputs": [], + "source": [ + "# Hold out 10% of the rows for testing (fixed seed).\n", + "X_train, X_test, Y_train, Y_test = train_test_split(\n", + "    X, Y, test_size=0.10, random_state=3192\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ihXFZWcn0W5D" + }, + "outputs": [], + "source": [ + "# top features\n", + "import xgboost as xgb\n", + "xgb_params = {\n", + " 'eta': 0.05,\n", + " 'max_depth': 8,\n", + " 
'subsample': 0.7,\n", + " 'colsample_bytree': 0.7,\n", + " # 'reg:linear' is the deprecated alias of the squared-error objective\n", + " 'objective': 'reg:squarederror',\n", + " # set once here instead of defining 1 and overriding it with 0 at train time\n", + " 'silent': 0,\n", + " 'seed' : 0\n", + "}\n", + "dtrain = xgb.DMatrix(X_train, Y_train, feature_names=X_train.columns.values)\n", + "model = xgb.train(xgb_params, dtrain, num_boost_round=50)\n", + "# plot the important features #\n", + "fig, ax = plt.subplots(figsize=(12,18))\n", + "# max_num_features=50 raised an error here, so every feature is plotted\n", + "xgb.plot_importance(model, height=0.8, ax=ax)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "TQEEzNkX0W9w" + }, + "outputs": [], + "source": [ + "#top features\n", + "xgboost_selection=['total_finished_living_area_sqft','latitude','structure_tax','total_property_tax_2016',\n", + "'total_parcel_tax','land_tax','longitude','lot_area_sqft','census_tractnumber','age','total_bath','bedroomcnt',\n", + "'block_number','transaction_month','roomcnt','taxdelinquencyyear','unitcnt','taxdelinquencyflag_No',\n", + "'fips_LA','garage_sqft','pool_with_spa_tub_no','has_hottub_or_spa_No','garagecarcnt','deck_flag_No',\n", + "'poolcnt','pool_sqft'\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Rr_6EO4G0XEj" + }, + "outputs": [], + "source": [ + "# feature selection\n", + "#c_id,heating_system_id,propertylandusetypeid\n", + "from sklearn.ensemble import ExtraTreesRegressor\n", + "from sklearn.feature_selection import SelectFromModel\n", + "reg = ExtraTreesRegressor(n_estimators=500, max_depth=8, max_features='sqrt',\n", + " min_samples_split=100 ,min_samples_leaf=10, bootstrap=True,n_jobs=-1, random_state=3192)\n", + "reg = reg.fit(X_train, Y_train)\n", + "#print(\"importance\",reg.feature_importances_) \n", + "# named selector so the xgboost booster bound to model above is not silently replaced\n", + "selector = SelectFromModel(reg, prefit=True)\n", + "X_new = selector.transform(X_train)\n", + "print(X_train.shape)\n", + "print(X_new.shape) \n", + "\n", +
"# Rank features by mean importance across the forest and plot the top 26 with the per-tree spread as error bars.\n", + "importances = reg.feature_importances_\n", + "std = np.std([tree.feature_importances_ for tree in reg.estimators_], axis=0)\n", + "indices = np.argsort(importances)[::-1][:26]\n", + "feat_names = X.columns.values\n", + "bar_positions = range(len(indices))\n", + "imp_fig, imp_ax = plt.subplots(figsize=(12,12))\n", + "imp_ax.set_title(\"Feature importances\")\n", + "imp_ax.bar(bar_positions, importances[indices], color=\"r\", yerr=std[indices], align=\"center\")\n", + "imp_ax.set_xticks(bar_positions)\n", + "imp_ax.set_xticklabels(feat_names[indices], rotation='vertical')\n", + "imp_ax.set_xlim([-1, len(indices)])\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "i4FCNOG70XIU" + }, + "outputs": [], + "source": [ + "tree_selection=[\n", + " 'total_finished_living_area_sqft','structure_tax','total_property_tax_2016','total_bath','total_parcel_tax',\n", + " 'age','latitude','census_tractnumber','bedroomcnt','longitude','land_tax','propertylandusetypeid','block_number',\n", + " 'buildingqualitytypeid','numberofstories','heating_system_id','unitcnt','transaction_month','lot_area_sqft','roomcnt',\n", + " 'garage_sqft','garagecarcnt','pool_with_spa_tub_no','poolcnt','fips_LA','taxdelinquencyyear','patio_sqft',\n", + " 'taxdelinquencyflag_No','taxdelinquencyflag_Yes'\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "TmIS1WAS0XMW" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "from sklearn.feature_selection import RFECV\n", + "from sklearn.linear_model import Lasso, LinearRegression, Ridge\n", + "from sklearn.metrics import make_scorer, mean_absolute_error, r2_score\n", + "from sklearn.model_selection import KFold\n", + "\n", + "#model=Lasso(alpha=0.2, fit_intercept=True, normalize=True, precompute=False, copy_X=True,\n", + " # max_iter=1000, \n", + " # tol=0.0001, warm_start=False, positive=False, random_state=3192, 
selection='cyclic')\n", + "\n", + "#Ridge(random_state=3192,solver='auto',fit_intercept=True,normalize=True,alpha=0.1)\n", + "#LinearRegression(n_jobs=-1,fit_intercept=True, normalize=True, copy_X=True)\n", + "\n", + "\n", + "rfecv = RFECV(estimator=LinearRegression(n_jobs=-1,fit_intercept=True, normalize=True, copy_X=True), step=2, cv=KFold(4),scoring='neg_mean_absolute_error')\n", + "rfecv.fit(X_train, Y_train)\n", + "\n", + "print(\"Optimal number of features : %d\" % rfecv.n_features_)\n", + "\n", + "# Plot number of features VS. cross-validation scores\n", + "plt.figure()\n", + "plt.xlabel(\"Number of features selected\")\n", + "\n", + "# The scorer is neg_mean_absolute_error (a regression score), not a classification count.\n", + "plt.ylabel(\"Cross validation score (negative mean absolute error)\")\n", + "plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "DIw8O00U0XPR" + }, + "outputs": [], + "source": [ + "# Names of the columns RFECV kept (support_ is a boolean mask over the feature columns).\n", + "rfe_selection = list(X.columns[rfecv.support_])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "gHA0x5_80XWy" + }, + "outputs": [], + "source": [ + "#Linear regression with rfe_selection selection\n", + "#rfe_selection, tree_selection, xgboost_selection\n", + "from sklearn.linear_model import LinearRegression\n", + "# GridSearchCV is used below; import it here so the cell does not rely on a later cell's import.\n", + "from sklearn.model_selection import GridSearchCV, train_test_split\n", + "from sklearn.metrics import r2_score,mean_absolute_error,make_scorer,mean_squared_error\n", + "\n", + "# just to check whether normalized /not normalized data gives better results\n", + "parameters = {'fit_intercept':[True], 'normalize':[True,False], 'copy_X':[True]}\n", + "scoring = {'MAE':'neg_mean_absolute_error','MSE': make_scorer(mean_squared_error,greater_is_better=False)}\n", + "\n", + "grid1 = GridSearchCV(LinearRegression(n_jobs=-1),param_grid=parameters, 
scoring=scoring,cv=5,refit='MAE',\n", + " return_train_score=True,\n", + " verbose=0,n_jobs=-1,pre_dispatch='n_jobs')\n", + "\n", + "grid1.fit(X_train[rfe_selection], Y_train)\n", + "#print(\"5. grid best_score_\",abs(grid.best_score_))\n", + "Y_pred = grid1.predict(X_test[rfe_selection])\n", + "print(\"MAE on test data\",mean_absolute_error(Y_test,Y_pred))\n", + "print(\"MSE on test data\",mean_squared_error(Y_test,Y_pred))\n", + "print(\"R Squared data \",r2_score(Y_test,Y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ekn4pBs60XcT" + }, + "outputs": [], + "source": [ + "#pca selection\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.preprocessing import scale\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "# Standardise every feature, then fit a full PCA to inspect the explained variance.\n", + "scaled_x = scale(X)\n", + "pca = PCA(n_components=None, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None)\n", + "pca.fit(scaled_x)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "yFuT-wUN0XfV" + }, + "outputs": [], + "source": [ + "# The amount of variance that each PC explains\n", + "var= pca.explained_variance_ratio_\n", + "# Cumulative variance explained, in percent\n", + "var1=np.cumsum(np.round(var, decimals=4)*100)\n", + "print(var1)\n", + "plt.plot(var1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "iPN4OBUe0XlD" + }, + "outputs": [], + "source": [ + "#Looking at above plot I'm taking 28 variables\n", + "\n", + "# Seeded because svd_solver='auto' may pick the randomized solver; fit_transform fits by itself,\n", + "# so no separate fit() call is needed beforehand.\n", + "pca = PCA(n_components=28, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=3192)\n", + "pca1=pca.fit_transform(scaled_x)\n", + "\n", + "pca = PCA(n_components=28, copy=True, 
whiten=True, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=3192)\n", + "# fit_transform fits the model itself, so a separate fit() beforehand only doubled the work\n", + "pca2=pca.fit_transform(scaled_x)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "EE4ednPC0XjX" + }, + "outputs": [], + "source": [ + "# NOTE: scale() and PCA were fitted on all rows before this split, so the held-out 10% influenced the projection.\n", + "pcaX_train, pcaX_test, pcaY_train, pcaY_test = train_test_split(pca1, Y, test_size=0.10, random_state=3192)\n", + "pca2X_train, pca2X_test, pca2Y_train, pca2Y_test = train_test_split(pca2, Y, test_size=0.10, random_state=3192)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "erYMXvTG0XaK" + }, + "outputs": [], + "source": [ + "from sklearn.ensemble import GradientBoostingRegressor\n", + "from sklearn.metrics import mean_absolute_error,make_scorer,mean_squared_error\n", + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "# Gradient boosting on the (non-whitened) PCA features; learning rate 0.005 for 1200 trees.\n", + "param_grid={'n_estimators':[1200],'max_features':[22]}\n", + "\n", + "# Score with negative MAE so that abs(best_score_) is a mean absolute error; without an explicit\n", + "# scoring the search silently falls back to the estimator's default R^2 score.\n", + "grid13 = GridSearchCV(GradientBoostingRegressor(subsample=0.8,min_samples_leaf=50,min_samples_split=50,max_depth=9,loss='ls',criterion='friedman_mse',learning_rate=0.005,random_state=3192),\n", + " param_grid=param_grid, scoring='neg_mean_absolute_error', cv=5,refit=True,\n", + " return_train_score=True,\n", + " verbose=2,n_jobs=-1,pre_dispatch='n_jobs')\n", + "\n", + "grid13.fit(pcaX_train, pcaY_train)\n", + "print(\"5. 
grid best_score_\",abs(grid13.best_score_))\n", + "print(\"best params\",grid13.best_params_)\n", + "print(\"best score\",grid13.best_score_)\n", + "Y_pred = grid13.predict(pcaX_test)\n", + "print(\"MAE on test data\",mean_absolute_error(pcaY_test,Y_pred))\n", + "print(\"MSE on test data\",mean_squared_error(pcaY_test,Y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "BgtbLCcR0XUx" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "FjdSCEFP0XCM" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "WzATgLxmam5w" + }, + "source": [ + "In this competition, Zillow is asking you to predict the log-error between their Zestimate and the actual sale price, given all the features of a home. The log error is defined as\n", + "\n", + "$$\\text{logerror} = \\log(\\text{Zestimate}) - \\log(\\text{SalePrice})$$\n", + "\n", + "and it is recorded in the transactions file train.csv. In this competition, you are going to predict the logerror for the months in Fall 2017. Since all the real estate transactions in the U.S. are publicly available, we will close the competition (no longer accepting submissions) before the evaluation period begins.\n", + "\n", + "### Train/Test split\n", + "\n", + "- You are provided with a full list of real estate properties in three counties (Los Angeles, Orange and Ventura, California), with data from 2016.\n", + "- The train data has all the transactions before October 15, 2016, plus some of the transactions after October 15, 2016.\n", + "- The test data in the public leaderboard has the rest of the transactions between October 15 and December 31, 2016.\n", + "- The rest of the test data, which is used for calculating the private leaderboard, is all the properties from October 15, 2017, to December 15, 2017. 
This period is called the \"sales tracking period\", during which we will not be taking any submissions.\n", + "- You are asked to predict 6 time points for all properties: October 2016 (201610), November 2016 (201611), December 2016 (201612), October 2017 (201710), November 2017 (201711), and December 2017 (201712).\n", + "- Not all the properties are sold in each time period. If a property was not sold in a certain time period, that particular row will be ignored when calculating your score.\n", + "- If a property is sold multiple times within 31 days, we take the first reasonable value as the ground truth. By \"reasonable\", we mean if the data seems wrong, we will take the transaction that has a value that makes more sense.\n", + "\n", + "### File descriptions\n", + "\n", + "- `properties_2016.csv` - all the properties with their home features for 2016. Note: Some 2017 new properties don't have any data yet except for their parcelids. Those data points should be populated when `properties_2017.csv` is available.\n", + "- `properties_2017.csv` - all the properties with their home features for 2017 (released on 10/2/2017)\n", + "- `train_2016.csv` - the training set with transactions from 1/1/2016 to 12/31/2016\n", + "- `train_2017.csv` - the training set with transactions from 1/1/2017 to 9/15/2017 (released on 10/2/2017)\n", + "- `sample_submission.csv` - a sample submission file in the correct format" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "zillow_kaggle_zestimate_comp.ipynb", + "provenance": [], + "version": "0.3.2" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}