From 3499d18fec1c345ff455907a2e06e191af9146fa Mon Sep 17 00:00:00 2001 From: Siddharth Singha Roy Date: Tue, 6 Oct 2020 03:41:37 +0530 Subject: [PATCH] Delete ToxicBot_Training.ipynb --- ToxicBot/classifier/ToxicBot_Training.ipynb | 1974 ------------------- 1 file changed, 1974 deletions(-) delete mode 100644 ToxicBot/classifier/ToxicBot_Training.ipynb diff --git a/ToxicBot/classifier/ToxicBot_Training.ipynb b/ToxicBot/classifier/ToxicBot_Training.ipynb deleted file mode 100644 index 6194187..0000000 --- a/ToxicBot/classifier/ToxicBot_Training.ipynb +++ /dev/null @@ -1,1974 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "ToxicBot Training.ipynb", - "provenance": [], - "collapsed_sections": [ - "flL1nSHsyZGy", - "20_p0ENZydJG", - "5LmOh6SBzP10", - "fSE4a7vuzacB" - ] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU" - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "flL1nSHsyZGy" - }, - "source": [ - "# Import the Dataset" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "T4yzf2hcxpO7" - }, - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import tensorflow as tf\n", - "from keras.preprocessing.text import Tokenizer\n", - "from keras.preprocessing.sequence import pad_sequences\n", - "from keras import Sequential\n", - "from keras.layers import Dense, Embedding, GlobalMaxPool1D, LSTM\n", - "from keras.losses import BinaryCrossentropy\n", - "from keras.metrics import AUC\n", - "from keras.optimizers import Adam\n", - "from keras.models import model_from_json\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import roc_auc_score\n", - "\n", - "import re\n", - "import gc\n", - "import pickle" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "hNvk_-rPyGBl", - "outputId": "85150b77-4fa1-4715-fa67-30deb8d80de2", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - } - }, - "source": [ - "print(\"Num GPUs Available: \", len(tf.config.experimental.list_physical_devices('GPU')))\n", - "tf.debugging.set_log_device_placement(True)" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Num GPUs Available: 1\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "E55YwcRAyMH1", - "outputId": "1eaa8892-2229-42a3-908c-4332069d67ad", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 221 - } - }, - "source": [ - "! pip install kaggle" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Requirement already satisfied: kaggle in /usr/local/lib/python3.6/dist-packages (1.5.8)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from kaggle) (4.41.1)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from kaggle) (2.23.0)\n", - "Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.6/dist-packages (from kaggle) (1.15.0)\n", - "Requirement already satisfied: slugify in /usr/local/lib/python3.6/dist-packages (from kaggle) (0.0.1)\n", - "Requirement already satisfied: python-dateutil in /usr/local/lib/python3.6/dist-packages (from kaggle) (2.8.1)\n", - "Requirement already satisfied: certifi in /usr/local/lib/python3.6/dist-packages (from kaggle) (2020.6.20)\n", - "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from kaggle) (1.24.3)\n", - "Requirement already satisfied: python-slugify in /usr/local/lib/python3.6/dist-packages (from kaggle) (4.0.1)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->kaggle) (2.10)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->kaggle) (3.0.4)\n", - "Requirement already satisfied: text-unidecode>=1.3 in /usr/local/lib/python3.6/dist-packages (from python-slugify->kaggle) (1.3)\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ZTBhDHo-yNzm", - "outputId": "13ebf6e0-d209-46b5-d26c-8d6b1b8bed88", - "colab": { - "resources": { - "http://localhost:8080/nbextensions/google.colab/files.js": { - "data": "Ly8gQ29weXJpZ2h0IDIwMTcgR29vZ2xlIExMQwovLwovLyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKLy8geW91IG1heSBub3QgdXNlIHRoaXMgZmlsZSBleGNlcHQgaW4gY29tcGxpYW5jZSB3aXRoIHRoZSBMaWNlbnNlLgovLyBZb3UgbWF5IG9idGFpbiBhIGNvcHkgb2YgdGhlIExpY2Vuc2UgYXQKLy8KLy8gICAgICBodHRwOi8vd3d3LmFwYWNoZS5vcmcvbGljZW5zZXMvTElDRU5TRS0yLjAKLy8KLy8gVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQovLyBkaXN0cmlidXRlZCB1bmRlciB0aGUgTGljZW5zZSBpcyBkaXN0cmlidXRlZCBvbiBhbiAiQVMgSVMiIEJBU0lTLAovLyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KLy8gU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAovLyBsaW1pdGF0aW9ucyB1bmRlciB0aGUgTGljZW5zZS4KCi8qKgogKiBAZmlsZW92ZXJ2aWV3IEhlbHBlcnMgZm9yIGdvb2dsZS5jb2xhYiBQeXRob24gbW9kdWxlLgogKi8KKGZ1bmN0aW9uKHNjb3BlKSB7CmZ1bmN0aW9uIHNwYW4odGV4dCwgc3R5bGVBdHRyaWJ1dGVzID0ge30pIHsKICBjb25zdCBlbGVtZW50ID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnc3BhbicpOwogIGVsZW1lbnQudGV4dENvbnRlbnQgPSB0ZXh0OwogIGZvciAoY29uc3Qga2V5IG9mIE9iamVjdC5rZXlzKHN0eWxlQXR0cmlidXRlcykpIHsKICAgIGVsZW1lbnQuc3R5bGVba2V5XSA9IHN0eWxlQXR0cmlidXRlc1trZXldOwogIH0KICByZXR1cm4gZWxlbWVudDsKfQoKLy8gTWF4IG51bWJlciBvZiBieXRlcyB3aGljaCB3aWxsIGJlIHVwbG9hZGVkIGF0IGEgdGltZS4KY29uc3QgTUFYX1BBWUxPQURfU0laRSA9IDEwMCAqIDEwMjQ7CgpmdW5jdGlvbiBfdXBsb2FkRmlsZXMoaW5wdXRJZCwgb3V0cHV0SWQpIHsKICBjb25zdCBzdGVwcyA9IHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCk7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICAvLyBDYWNoZSBzdGVwcyBvbiB0aGUgb3V0cHV0RWxlbWVudCB0byBtYWtlIGl0IGF2YWlsYWJsZSBmb3IgdGhlIG5leHQgY2FsbAogIC8vIHRvIHVwbG9hZEZpbGVzQ29udGludWUgZnJvbSBQeXRob24uCiAgb3V0cHV0RWxlbWVudC5zdGVwcyA9IHN0ZXBzOwoKICByZXR1cm4gX3VwbG9hZEZpbGVzQ29udGludWUob3V0cHV0SWQpOwp9CgovLyBUaGlzIGlzIHJvdWdobHkgYW4gYXN5bmMgZ2VuZXJhdG9yIChub3Qgc3VwcG9ydGVkIGluIHRoZSBicm93c2VyIHlldCksCi8vIHdoZXJlIHRoZXJlIGFyZSBtdWx0aXBsZSBhc3luY2hyb25vdXMgc3RlcHMgYW5kIHRoZSBQeXRob24gc2lkZSBpcyBnb2luZwovLyB0byBwb2xsIGZvciBjb21wbGV0aW9uIG9mIGVhY2ggc3RlcC4KLy8gVGhpcyB1c2VzIGEgUHJvbWlzZSB0byBibG9jayB0aGUgcHl0aG9uIHNpZGUgb24gY29tcGxldGlvbiBvZiBlYWNoIHN0ZXAsCi8vIHRoZW4gcGFzc2VzIHRoZSByZXN1bHQgb2YgdGhlIHByZXZpb3VzIHN0ZXAgYXMgdGhlIGlucHV0IHRvIHRoZSBuZXh0IHN0ZXAuCmZ1bmN0aW9uIF91cGxvYWRGaWxlc0NvbnRpbnVlKG91dHB1dElkKSB7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICBjb25zdCBzdGVwcyA9IG91dHB1dEVsZW1lbnQuc3RlcHM7CgogIGNvbnN0IG5leHQgPSBzdGVwcy5uZXh0KG91dHB1dEVsZW1lbnQubGFzdFByb21pc2VWYWx1ZSk7CiAgcmV0dXJuIFByb21pc2UucmVzb2x2ZShuZXh0LnZhbHVlLnByb21pc2UpLnRoZW4oKHZhbHVlKSA9PiB7CiAgICAvLyBDYWNoZSB0aGUgbGFzdCBwcm9taXNlIHZhbHVlIHRvIG1ha2UgaXQgYXZhaWxhYmxlIHRvIHRoZSBuZXh0CiAgICAvLyBzdGVwIG9mIHRoZSBnZW5lcmF0b3IuCiAgICBvdXRwdXRFbGVtZW50Lmxhc3RQcm9taXNlVmFsdWUgPSB2YWx1ZTsKICAgIHJldHVybiBuZXh0LnZhbHVlLnJlc3BvbnNlOwogIH0pOwp9CgovKioKICogR2VuZXJhdG9yIGZ1bmN0aW9uIHdoaWNoIGlzIGNhbGxlZCBiZXR3ZWVuIGVhY2ggYXN5bmMgc3RlcCBvZiB0aGUgdXBsb2FkCiAqIHByb2Nlc3MuCiAqIEBwYXJhbSB7c3RyaW5nfSBpbnB1dElkIEVsZW1lbnQgSUQgb2YgdGhlIGlucHV0IGZpbGUgcGlja2VyIGVsZW1lbnQuCiAqIEBwYXJhbSB7c3RyaW5nfSBvdXRwdXRJZCBFbGVtZW50IElEIG9mIHRoZSBvdXRwdXQgZGlzcGxheS4KICogQHJldHVybiB7IUl0ZXJhYmxlPCFPYmplY3Q+fSBJdGVyYWJsZSBvZiBuZXh0IHN0ZXBzLgogKi8KZnVuY3Rpb24qIHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCkgewogIGNvbnN0IGlucHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKGlucHV0SWQpOwogIGlucHV0RWxlbWVudC5kaXNhYmxlZCA9IGZhbHNlOwoKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIG91dHB1dEVsZW1lbnQuaW5uZXJIVE1MID0gJyc7CgogIGNvbnN0IHBpY2tlZFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgaW5wdXRFbGVtZW50LmFkZEV2ZW50TGlzdGVuZXIoJ2NoYW5nZScsIChlKSA9PiB7CiAgICAgIHJlc29sdmUoZS50YXJnZXQuZmlsZXMpOwogICAgfSk7CiAgfSk7CgogIGNvbnN0IGNhbmNlbCA9IGRvY3VtZW50LmNyZWF0ZUVsZW1lbnQoJ2J1dHRvbicpOwogIGlucHV0RWxlbWVudC5wYXJlbnRFbGVtZW50LmFwcGVuZENoaWxkKGNhbmNlbCk7CiAgY2FuY2VsLnRleHRDb250ZW50ID0gJ0NhbmNlbCB1cGxvYWQnOwogIGNvbnN0IGNhbmNlbFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgY2FuY2VsLm9uY2xpY2sgPSAoKSA9PiB7CiAgICAgIHJlc29sdmUobnVsbCk7CiAgICB9OwogIH0pOwoKICAvLyBXYWl0IGZvciB0aGUgdXNlciB0byBwaWNrIHRoZSBmaWxlcy4KICBjb25zdCBmaWxlcyA9IHlpZWxkIHsKICAgIHByb21pc2U6IFByb21pc2UucmFjZShbcGlja2VkUHJvbWlzZSwgY2FuY2VsUHJvbWlzZV0pLAogICAgcmVzcG9uc2U6IHsKICAgICAgYWN0aW9uOiAnc3RhcnRpbmcnLAogICAgfQogIH07CgogIGNhbmNlbC5yZW1vdmUoKTsKCiAgLy8gRGlzYWJsZSB0aGUgaW5wdXQgZWxlbWVudCBzaW5jZSBmdXJ0aGVyIHBpY2tzIGFyZSBub3QgYWxsb3dlZC4KICBpbnB1dEVsZW1lbnQuZGlzYWJsZWQgPSB0cnVlOwoKICBpZiAoIWZpbGVzKSB7CiAgICByZXR1cm4gewogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbXBsZXRlJywKICAgICAgfQogICAgfTsKICB9CgogIGZvciAoY29uc3QgZmlsZSBvZiBmaWxlcykgewogICAgY29uc3QgbGkgPSBkb2N1bWVudC5jcmVhdGVFbGVtZW50KCdsaScpOwogICAgbGkuYXBwZW5kKHNwYW4oZmlsZS5uYW1lLCB7Zm9udFdlaWdodDogJ2JvbGQnfSkpOwogICAgbGkuYXBwZW5kKHNwYW4oCiAgICAgICAgYCgke2ZpbGUudHlwZSB8fCAnbi9hJ30pIC0gJHtmaWxlLnNpemV9IGJ5dGVzLCBgICsKICAgICAgICBgbGFzdCBtb2RpZmllZDogJHsKICAgICAgICAgICAgZmlsZS5sYXN0TW9kaWZpZWREYXRlID8gZmlsZS5sYXN0TW9kaWZpZWREYXRlLnRvTG9jYWxlRGF0ZVN0cmluZygpIDoKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgJ24vYSd9IC0gYCkpOwogICAgY29uc3QgcGVyY2VudCA9IHNwYW4oJzAlIGRvbmUnKTsKICAgIGxpLmFwcGVuZENoaWxkKHBlcmNlbnQpOwoKICAgIG91dHB1dEVsZW1lbnQuYXBwZW5kQ2hpbGQobGkpOwoKICAgIGNvbnN0IGZpbGVEYXRhUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICAgIGNvbnN0IHJlYWRlciA9IG5ldyBGaWxlUmVhZGVyKCk7CiAgICAgIHJlYWRlci5vbmxvYWQgPSAoZSkgPT4gewogICAgICAgIHJlc29sdmUoZS50YXJnZXQucmVzdWx0KTsKICAgICAgfTsKICAgICAgcmVhZGVyLnJlYWRBc0FycmF5QnVmZmVyKGZpbGUpOwogICAgfSk7CiAgICAvLyBXYWl0IGZvciB0aGUgZGF0YSB0byBiZSByZWFkeS4KICAgIGxldCBmaWxlRGF0YSA9IHlpZWxkIHsKICAgICAgcHJvbWlzZTogZmlsZURhdGFQcm9taXNlLAogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbnRpbnVlJywKICAgICAgfQogICAgfTsKCiAgICAvLyBVc2UgYSBjaHVua2VkIHNlbmRpbmcgdG8gYXZvaWQgbWVzc2FnZSBzaXplIGxpbWl0cy4gU2VlIGIvNjIxMTU2NjAuCiAgICBsZXQgcG9zaXRpb24gPSAwOwogICAgd2hpbGUgKHBvc2l0aW9uIDwgZmlsZURhdGEuYnl0ZUxlbmd0aCkgewogICAgICBjb25zdCBsZW5ndGggPSBNYXRoLm1pbihmaWxlRGF0YS5ieXRlTGVuZ3RoIC0gcG9zaXRpb24sIE1BWF9QQVlMT0FEX1NJWkUpOwogICAgICBjb25zdCBjaHVuayA9IG5ldyBVaW50OEFycmF5KGZpbGVEYXRhLCBwb3NpdGlvbiwgbGVuZ3RoKTsKICAgICAgcG9zaXRpb24gKz0gbGVuZ3RoOwoKICAgICAgY29uc3QgYmFzZTY0ID0gYnRvYShTdHJpbmcuZnJvbUNoYXJDb2RlLmFwcGx5KG51bGwsIGNodW5rKSk7CiAgICAgIHlpZWxkIHsKICAgICAgICByZXNwb25zZTogewogICAgICAgICAgYWN0aW9uOiAnYXBwZW5kJywKICAgICAgICAgIGZpbGU6IGZpbGUubmFtZSwKICAgICAgICAgIGRhdGE6IGJhc2U2NCwKICAgICAgICB9LAogICAgICB9OwogICAgICBwZXJjZW50LnRleHRDb250ZW50ID0KICAgICAgICAgIGAke01hdGgucm91bmQoKHBvc2l0aW9uIC8gZmlsZURhdGEuYnl0ZUxlbmd0aCkgKiAxMDApfSUgZG9uZWA7CiAgICB9CiAgfQoKICAvLyBBbGwgZG9uZS4KICB5aWVsZCB7CiAgICByZXNwb25zZTogewogICAgICBhY3Rpb246ICdjb21wbGV0ZScsCiAgICB9CiAgfTsKfQoKc2NvcGUuZ29vZ2xlID0gc2NvcGUuZ29vZ2xlIHx8IHt9OwpzY29wZS5nb29nbGUuY29sYWIgPSBzY29wZS5nb29nbGUuY29sYWIgfHwge307CnNjb3BlLmdvb2dsZS5jb2xhYi5fZmlsZXMgPSB7CiAgX3VwbG9hZEZpbGVzLAogIF91cGxvYWRGaWxlc0NvbnRpbnVlLAp9Owp9KShzZWxmKTsK", - "ok": true, - "headers": [ - [ - "content-type", - "application/javascript" - ] - ], - "status": 200, - "status_text": "" - } - }, - "base_uri": "https://localhost:8080/", - "height": 89 - } - }, - "source": [ - "from google.colab import files\n", - "files.upload()" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " Upload widget is only available when the cell has been executed in the\n", - " current browser session. Please rerun this cell to enable.\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": { - "tags": [] - } - }, - { - "output_type": "stream", - "text": [ - "Saving kaggle.json to kaggle.json\n" - ], - "name": "stdout" - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "{'kaggle.json': b'{\"username\":\"sid200026\",\"key\":\"c1398985906d76ebf510dbc0e65e5ceb\"}'}" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 4 - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "d2AU8nMfyPUG" - }, - "source": [ - "! mkdir ~/.kaggle\n", - "! cp kaggle.json ~/.kaggle/" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "0XtLlLkuyQtx" - }, - "source": [ - "! chmod 600 ~/.kaggle/kaggle.json" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "r91NqgxUyRzG", - "outputId": "a795fcdd-9640-448f-b029-7f6cab871a61", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 238 - } - }, - "source": [ - "! kaggle competitions download -c jigsaw-toxic-comment-classification-challenge" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Warning: Looks like you're using an outdated API Version, please consider updating (server 1.5.6 / client 1.5.4)\n", - "Downloading test.csv.zip to /content\n", - " 21% 5.00M/23.4M [00:00<00:01, 17.7MB/s]\n", - "100% 23.4M/23.4M [00:00<00:00, 67.5MB/s]\n", - "Downloading sample_submission.csv.zip to /content\n", - " 0% 0.00/1.39M [00:00] 2.03G 1.94MB/s in 16m 57s \n", - "\n", - "2020-10-03 12:56:41 (2.04 MB/s) - ‘glove.840B.300d.zip’ saved [2176768927/2176768927]\n", - "\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "TStVpry02lDb", - "outputId": "47a803f8-462a-4fee-8f10-9c76ff59639e", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "source": [ - "! unzip glove.840B.300d.zip " - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Archive: glove.840B.300d.zip\n", - " inflating: glove.840B.300d.txt \n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5LmOh6SBzP10" - }, - "source": [ - "# Data Fetching" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ERRmMO0fzSjM", - "outputId": "944560c7-4543-447f-f3f6-165f924ee00c", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "source": [ - "train = pd.read_csv('dataset/train.csv', dtype={'comment_text':'string'})\n", - "train.head()" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idcomment_texttoxicsevere_toxicobscenethreatinsultidentity_hate
00000997932d777bfExplanation\n", - "Why the edits made under my userna...000000
1000103f0d9cfb60fD'aww! He matches this background colour I'm s...000000
2000113f07ec002fdHey man, I'm really not trying to edit war. It...000000
30001b41b1c6bb37e\"\n", - "More\n", - "I can't make any real suggestions on im...000000
40001d958c54c6e35You, sir, are my hero. Any chance you remember...000000
\n", - "
" - ], - "text/plain": [ - " id ... identity_hate\n", - "0 0000997932d777bf ... 0\n", - "1 000103f0d9cfb60f ... 0\n", - "2 000113f07ec002fd ... 0\n", - "3 0001b41b1c6bb37e ... 0\n", - "4 0001d958c54c6e35 ... 0\n", - "\n", - "[5 rows x 8 columns]" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 14 - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ZLWQ36_XzUIF", - "outputId": "5486798e-273d-4dff-cdbf-10786edeed73", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "source": [ - "train = train.drop(columns='id')\n", - "train.head()" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
comment_texttoxicsevere_toxicobscenethreatinsultidentity_hate
0Explanation\n", - "Why the edits made under my userna...000000
1D'aww! He matches this background colour I'm s...000000
2Hey man, I'm really not trying to edit war. It...000000
3\"\n", - "More\n", - "I can't make any real suggestions on im...000000
4You, sir, are my hero. Any chance you remember...000000
\n", - "
" - ], - "text/plain": [ - " comment_text ... identity_hate\n", - "0 Explanation\n", - "Why the edits made under my userna... ... 0\n", - "1 D'aww! He matches this background colour I'm s... ... 0\n", - "2 Hey man, I'm really not trying to edit war. It... ... 0\n", - "3 \"\n", - "More\n", - "I can't make any real suggestions on im... ... 0\n", - "4 You, sir, are my hero. Any chance you remember... ... 0\n", - "\n", - "[5 rows x 7 columns]" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 15 - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "PQzzIypxzWoH", - "outputId": "17847626-3e5c-4c76-be6c-0ee1afd6b5ba", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "source": [ - "test = pd.read_csv('dataset/test.csv', dtype={'comment_text':'string'})\n", - "test.head()" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idcomment_text
000001cee341fdb12Yo bitch Ja Rule is more succesful then you'll...
10000247867823ef7== From RfC == \n", - "\n", - " The title is fine as it is, ...
200013b17ad220c46\" \n", - "\n", - " == Sources == \n", - "\n", - " * Zawe Ashton on Lapland...
300017563c3f7919a:If you have a look back at the source, the in...
400017695ad8997ebI don't anonymously edit articles at all.
\n", - "
" - ], - "text/plain": [ - " id comment_text\n", - "0 00001cee341fdb12 Yo bitch Ja Rule is more succesful then you'll...\n", - "1 0000247867823ef7 == From RfC == \n", - "\n", - " The title is fine as it is, ...\n", - "2 00013b17ad220c46 \" \n", - "\n", - " == Sources == \n", - "\n", - " * Zawe Ashton on Lapland...\n", - "3 00017563c3f7919a :If you have a look back at the source, the in...\n", - "4 00017695ad8997eb I don't anonymously edit articles at all." - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 16 - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "JkvG1xtszXvB", - "outputId": "cd835f63-592d-41fd-fd1f-992564a9e2f9", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "source": [ - "test_labels = pd.read_csv('dataset/test_labels.csv')\n", - "test_labels.head()" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idtoxicsevere_toxicobscenethreatinsultidentity_hate
000001cee341fdb12-1-1-1-1-1-1
10000247867823ef7-1-1-1-1-1-1
200013b17ad220c46-1-1-1-1-1-1
300017563c3f7919a-1-1-1-1-1-1
400017695ad8997eb-1-1-1-1-1-1
\n", - "
" - ], - "text/plain": [ - " id toxic severe_toxic ... threat insult identity_hate\n", - "0 00001cee341fdb12 -1 -1 ... -1 -1 -1\n", - "1 0000247867823ef7 -1 -1 ... -1 -1 -1\n", - "2 00013b17ad220c46 -1 -1 ... -1 -1 -1\n", - "3 00017563c3f7919a -1 -1 ... -1 -1 -1\n", - "4 00017695ad8997eb -1 -1 ... -1 -1 -1\n", - "\n", - "[5 rows x 7 columns]" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 17 - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "BlnJqWef-AQj", - "outputId": "b606aa53-b640-4460-aa76-dd7f46663cc1", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "source": [ - "test_labels = test_labels[test_labels['toxic'] != -1] \n", - "test_labels.head()" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idtoxicsevere_toxicobscenethreatinsultidentity_hate
50001ea8717f6de06000000
7000247e83dcc1211000000
110002f87b16116a7f000000
130003e1cccfd5a40a000000
1400059ace3e3e9a53000000
\n", - "
" - ], - "text/plain": [ - " id toxic severe_toxic ... threat insult identity_hate\n", - "5 0001ea8717f6de06 0 0 ... 0 0 0\n", - "7 000247e83dcc1211 0 0 ... 0 0 0\n", - "11 0002f87b16116a7f 0 0 ... 0 0 0\n", - "13 0003e1cccfd5a40a 0 0 ... 0 0 0\n", - "14 00059ace3e3e9a53 0 0 ... 0 0 0\n", - "\n", - "[5 rows x 7 columns]" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 18 - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "4mRrv72l-BnS", - "outputId": "ed1e935f-a91d-4882-f4b6-41a2eedc660e", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "source": [ - "test_labels.shape" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "(63978, 7)" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 19 - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "MJpYN-cw-C2c", - "outputId": "c78e0c28-19a5-4187-e518-faa2932668ea", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "source": [ - "test = pd.merge(test, test_labels, how='inner', on ='id')\n", - "test = test.drop(columns=['id'])\n", - "test.head()" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
comment_texttoxicsevere_toxicobscenethreatinsultidentity_hate
0Thank you for understanding. I think very high...000000
1:Dear god this site is horrible.000000
2\"::: Somebody will invariably try to add Relig...000000
3\" \n", - "\n", - " It says it right there that it IS a type....000000
4\" \n", - "\n", - " == Before adding a new product to the lis...000000
\n", - "
" - ], - "text/plain": [ - " comment_text ... identity_hate\n", - "0 Thank you for understanding. I think very high... ... 0\n", - "1 :Dear god this site is horrible. ... 0\n", - "2 \"::: Somebody will invariably try to add Relig... ... 0\n", - "3 \" \n", - "\n", - " It says it right there that it IS a type.... ... 0\n", - "4 \" \n", - "\n", - " == Before adding a new product to the lis... ... 0\n", - "\n", - "[5 rows x 7 columns]" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 20 - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Un65ephE-E9N", - "outputId": "c9cc5d68-defa-431a-ece9-1fae7526e983", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "source": [ - "test.shape" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "(63978, 7)" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 21 - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "jTa9Ny4i-Gdz", - "outputId": "9c12e263-a468-4f15-e92a-6f6885f73753", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "source": [ - "train = pd.concat([train,test], ignore_index=True)\n", - "train.head()" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
comment_texttoxicsevere_toxicobscenethreatinsultidentity_hate
0Explanation\n", - "Why the edits made under my userna...000000
1D'aww! He matches this background colour I'm s...000000
2Hey man, I'm really not trying to edit war. It...000000
3\"\n", - "More\n", - "I can't make any real suggestions on im...000000
4You, sir, are my hero. Any chance you remember...000000
\n", - "
" - ], - "text/plain": [ - " comment_text ... identity_hate\n", - "0 Explanation\n", - "Why the edits made under my userna... ... 0\n", - "1 D'aww! He matches this background colour I'm s... ... 0\n", - "2 Hey man, I'm really not trying to edit war. It... ... 0\n", - "3 \"\n", - "More\n", - "I can't make any real suggestions on im... ... 0\n", - "4 You, sir, are my hero. Any chance you remember... ... 0\n", - "\n", - "[5 rows x 7 columns]" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 22 - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "BztS2U7j-Ul2", - "outputId": "b2bf5abc-40f3-4666-d95e-8bf94a37898d", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "source": [ - "train.shape" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "(223549, 7)" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 23 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fSE4a7vuzacB" - }, - "source": [ - "# Preprocessing" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ZWfi-BUGzb6h" - }, - "source": [ - "X = train['comment_text'].values\n", - "Y = train.iloc[:,1:].values" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "cFV5DRx3zet3", - "outputId": "c65c020e-2145-44f2-bac5-dc647402a453", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "source": [ - "print(X.shape)" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "(223549,)\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "n-421Qp1zfpa", - "outputId": "a59eb74d-74aa-4cf4-fdf7-9674126f399b", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "source": [ - "print(Y.shape)\n", - "Y" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "(223549, 6)\n" - ], - "name": "stdout" - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "array([[0, 0, 0, 0, 0, 0],\n", - " [0, 0, 0, 0, 0, 0],\n", - " [0, 0, 0, 0, 0, 0],\n", - " ...,\n", - " [0, 0, 0, 0, 0, 0],\n", - " [1, 0, 1, 0, 1, 0],\n", - " [0, 0, 0, 0, 0, 0]])" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 26 - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "1_j1m17XzkLM" - }, - "source": [ - "X_train, y_train = X,Y" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "yYMTHx72zlLQ" - }, - "source": [ - "tokenizer = Tokenizer()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "y-FTzQ3Tzmhj" - }, - "source": [ - "tokenizer.fit_on_texts(X_train)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "nWYvM5yRznqx" - }, - "source": [ - "X_train_seq = tokenizer.texts_to_sequences(X_train)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "NtuwDPS-zoyC", - "outputId": "844bb1a7-c846-4c29-c4b3-cb7ca0812706", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "source": [ - "len(X_train_seq)" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "223549" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 31 - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "PJvTZeyLzqOw", - "outputId": "01feb3b6-c743-4edf-8c4a-b9b0aa972e56", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "source": [ - "print(len(tokenizer.word_index))" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "300257\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "MfSfTYMDzsjb" - }, - "source": [ - "X_train_seq = pad_sequences(X_train_seq, maxlen=250)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "6axAq78wzulo", - "outputId": "82211a0e-5d9f-4d05-e00f-3289d28dc1ac", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "source": [ - "X_train_seq.shape" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "(223549, 250)" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 34 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Jb3W1FA2zwj3" - }, - "source": [ - "# Pre-Trained Embedding" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ESKpfaJYzzhg", - "outputId": "e007f8dd-086d-4166-e294-dd4efbe01505", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "source": [ - "vocab_size = len(tokenizer.word_index) + 1\n", - "vocab_size" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "300258" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 35 - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "lrQeeEf40eA7" - }, - "source": [ - "embeddings_index = dict()\n", - "glove = open('glove.840B.300d.txt')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "dXH3JUI_0hjr", - "outputId": "6005dceb-c7ae-45b0-e895-a908cdc88509", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "source": [ - "for line in glove:\n", - " word, coefs = line.split(maxsplit=1)\n", - " coefs = np.fromstring(coefs, \"f\", sep=\" \")\n", - " embeddings_index[word] = coefs" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:3: DeprecationWarning: string or file could not be read to its end due to unmatched data; this will raise a ValueError in the future.\n", - " This is separate from the ipykernel package so we can avoid doing imports until\n" - ], - "name": "stderr" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "eD0JgutO3zHj", - "outputId": "f19f1a81-3de2-4b71-d1bb-19a4143390dc", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "source": [ - "print(\"Found %s word vectors.\" % len(embeddings_index))" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Found 2195884 word vectors.\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "oxH29B3p0mYh" - }, - "source": [ - "glove.close()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "gwmkYd4g0px1", - "outputId": "ff22d8ff-abcb-4c7f-cb76-81db6d55eea4", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "source": [ - "embedding_matrix = np.zeros((vocab_size, 300))\n", - "miss = 0\n", - "\n", - "for word, i in tokenizer.word_index.items():\n", - " embedding_vector = embeddings_index.get(word)\n", - " if embedding_vector is not None:\n", - " if embedding_vector.shape[0] != 0:\n", - " embedding_matrix[i] = embedding_vector\n", - " else:\n", - " miss+=1\n", - "\n", - "print(miss)" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "11\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "w7EauSqc7jDF", - "outputId": "12bf181b-adba-495e-b8a1-61e61df7e65e", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "source": [ - "embedding_matrix.shape" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "(300258, 300)" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 41 - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "n5-16xZ2z1n2" - }, - "source": [ - "model = Sequential()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "qLdb6ADp1CH6" - }, - "source": [ - "model.add(Embedding(input_dim=vocab_size, output_dim = 300, input_length = 250, weights=[embedding_matrix], trainable = False))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "ub6RlS2e1L4j" - }, - "source": [ - "model.add(LSTM(units=150,return_sequences=True, dropout=0.1))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "-WvAfOf71O3A" - }, - "source": [ - "model.add(GlobalMaxPool1D())" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "Uhwsjl9O1P4h" - }, - "source": [ - "model.add(Dense(units = 64, activation='relu'))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "qPXjhT601RTK" - }, - "source": [ - "model.add(Dense(units = 16, activation='relu'))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "YQB50pkQ1Sc9" - }, - "source": [ - "model.add(Dense(units = 6, activation='sigmoid'))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "Uufbr7XM1Txq" - }, - "source": [ - "model.compile(loss=BinaryCrossentropy(),optimizer=Adam(),metrics=[AUC()])" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "IjSy820P1VH8", - "outputId": "271d1d30-8da1-4f35-f603-5080290fdf22", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "source": [ - "print(model.summary())" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Model: \"sequential\"\n", - "_________________________________________________________________\n", - "Layer (type) Output Shape Param # \n", - "=================================================================\n", - "embedding (Embedding) (None, 250, 300) 90077400 \n", - "_________________________________________________________________\n", - "lstm (LSTM) (None, 250, 150) 270600 \n", - "_________________________________________________________________\n", - "global_max_pooling1d (Global (None, 150) 0 \n", - "_________________________________________________________________\n", - "dense (Dense) (None, 64) 9664 \n", - "_________________________________________________________________\n", - "dense_1 (Dense) (None, 16) 1040 \n", - "_________________________________________________________________\n", - "dense_2 (Dense) (None, 6) 102 \n", - "=================================================================\n", - "Total params: 90,358,806\n", - "Trainable params: 281,406\n", - "Non-trainable params: 90,077,400\n", - "_________________________________________________________________\n", - "None\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "blUIiMMx1Wk3" - }, - "source": [ - "history = model.fit(np.array(X_train_seq), np.array(y_train), batch_size=256, epochs=10)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "58VK__zr1ZKO" - }, - "source": [ - "model_json = model.to_json()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "j4wyJ7XZ1aRn" - }, - "source": [ - "with open('ToxicBot_GloVeEmbedding.json', 'w') as json_file:\n", - " json_file.write(model_json)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "HvSHcnNP1bae" - }, - "source": [ - "model.save_weights(\"ToxicBot_Weights.h5\")" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "_BYtu3H_1c_T" - }, - "source": [ - "json_file = open('ToxicBot_GloVeEmbedding.json', 'r')\n", - "loaded_model_json = json_file.read()\n", - "json_file.close()\n", - "loaded_model = model_from_json(loaded_model_json)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "xtphsM9W1eYk" - }, - "source": [ - "with open('ToxicBot_Tokenizer.pickle', 'wb') as handle:\n", - " pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "TkwNqv5G1fTC" - }, - "source": [ - "with open('ToxicBot_Tokenizer.pickle', 'rb') as handle:\n", - " tokenizer = pickle.load(handle)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "3p0p-ZUk1gPG" - }, - "source": [ - "loaded_model.load_weights(\"ToxicBot_Weights.h5\")" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "mPoZN8UV1hX6" - }, - "source": [ - "loaded_model.compile(loss=BinaryCrossentropy(),optimizer=Adam(),metrics=[AUC()])" - ], - "execution_count": null, - "outputs": [] - } - ] -} \ No newline at end of file