From 29a7727668d4485bf30e5789482dd7a346648aad Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Fri, 11 Feb 2022 18:37:54 +0100 Subject: [PATCH] Adding the Le Wagon recap as complementary analysis (not super usefull I guess) --- .../valentin/Complementary_exploration.ipynb | 645 ++++++++++++++++++ 1 file changed, 645 insertions(+) create mode 100644 notebooks/valentin/Complementary_exploration.ipynb diff --git a/notebooks/valentin/Complementary_exploration.ipynb b/notebooks/valentin/Complementary_exploration.ipynb new file mode 100644 index 0000000..22ff084 --- /dev/null +++ b/notebooks/valentin/Complementary_exploration.ipynb @@ -0,0 +1,645 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "animated-bonus", + "metadata": {}, + "source": [ + "# Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "id": "going-brunei", + "metadata": { + "ExecuteTime": { + "end_time": "2022-02-11T15:05:50.989675Z", + "start_time": "2022-02-11T15:05:50.856495Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.naive_bayes import ComplementNB\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.model_selection import cross_validate\n", + "from evalstudent import metrics\n", + "from evalstudent import utils\n", + "from sklearn.decomposition import LatentDirichletAllocation\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.pipeline import Pipeline\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "markdown", + "id": "descending-preview", + "metadata": {}, + "source": [ + "# EDA (Exploratory Data Analysis)" + ] + }, + { + "cell_type": "markdown", + "id": "private-there", + "metadata": {}, + "source": [ + "## Loading data" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "answering-washer", + "metadata": { + "ExecuteTime": { + "end_time": "2022-02-11T10:28:29.374187Z", + "start_time": "2022-02-11T10:28:29.194338Z" + }, + "collapsed": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[01;34m.\u001b[00m\r\n", + "├── MANIFEST.in\r\n", + "├── Makefile\r\n", + "├── README.md\r\n", + "├── \u001b[01;34mevalstudent\u001b[00m\r\n", + "│   ├── __init__.py\r\n", + "│   ├── \u001b[01;34m__pycache__\u001b[00m\r\n", + "│   ├── \u001b[01;34mdata\u001b[00m\r\n", + "│   ├── metrics.py\r\n", + "│   └── utils.py\r\n", + "├── \u001b[01;34mevalstudent.egg-info\u001b[00m\r\n", + "│   ├── PKG-INFO\r\n", + "│   ├── SOURCES.txt\r\n", + "│   ├── dependency_links.txt\r\n", + "│   ├── not-zip-safe\r\n", + "│   └── top_level.txt\r\n", + "├── \u001b[01;34mnotebooks\u001b[00m\r\n", + "│   ├── \u001b[01;34mvalentin\u001b[00m\r\n", + "│   └── \u001b[01;34myoann\u001b[00m\r\n", + "├── \u001b[01;34mraw_data\u001b[00m\r\n", + "│   ├── sample_submission.csv\r\n", + "│   ├── \u001b[01;34mtest\u001b[00m\r\n", + "│   ├── \u001b[01;34mtrain\u001b[00m\r\n", + "│   └── train.csv\r\n", + "├── requirements.txt\r\n", + "├── \u001b[01;34mscripts\u001b[00m\r\n", + "│   └── evalstudent-run\r\n", + "├── setup.py\r\n", + "└── \u001b[01;34mtests\u001b[00m\r\n", + " └── __init__.py\r\n", + "\r\n", + "12 directories, 17 files\r\n" + ] + } + ], + "source": [ + "!cd ../..;tree -L 2" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "central-teaching", + "metadata": { + "ExecuteTime": { + "end_time": "2022-02-11T10:10:53.620392Z", + "start_time": "2022-02-11T10:10:52.658641Z" + } + }, + "outputs": [], + "source": [ + "train_df = pd.read_csv(\"../../raw_data/train.csv\", dtype = {\"discourse_id\": int, \"discourse_start\": int, \"discourse_end\": int})" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "sought-armor", + "metadata": { + "ExecuteTime": { + "end_time": "2022-02-11T10:10:54.082538Z", + "start_time": "2022-02-11T10:10:54.031332Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddiscourse_iddiscourse_startdiscourse_enddiscourse_textdiscourse_typediscourse_type_numpredictionstring
0423A1CA112E216226276605248229Modern humans today are always on their phone....LeadLead 11 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1423A1CA112E21622627653021230312They are some really bad consequences when stu...PositionPosition 145 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2423A1CA112E21622627671020313401Some certain areas in the United States ban ph...EvidenceEvidence 160 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3423A1CA112E21622627696365402758When people have phones, they know about certa...EvidenceEvidence 276 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4423A1CA112E21622627759780759886Driving is one of the way how to get around. P...ClaimClaim 1139 140 141 142 143 144 145 146 147 148 149 15...
\n", + "
" + ], + "text/plain": [ + " id discourse_id discourse_start discourse_end \\\n", + "0 423A1CA112E2 1622627660524 8 229 \n", + "1 423A1CA112E2 1622627653021 230 312 \n", + "2 423A1CA112E2 1622627671020 313 401 \n", + "3 423A1CA112E2 1622627696365 402 758 \n", + "4 423A1CA112E2 1622627759780 759 886 \n", + "\n", + " discourse_text discourse_type \\\n", + "0 Modern humans today are always on their phone.... Lead \n", + "1 They are some really bad consequences when stu... Position \n", + "2 Some certain areas in the United States ban ph... Evidence \n", + "3 When people have phones, they know about certa... Evidence \n", + "4 Driving is one of the way how to get around. P... Claim \n", + "\n", + " discourse_type_num predictionstring \n", + "0 Lead 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1... \n", + "1 Position 1 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 \n", + "2 Evidence 1 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 \n", + "3 Evidence 2 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9... \n", + "4 Claim 1 139 140 141 142 143 144 145 146 147 148 149 15... " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "integrated-toilet", + "metadata": {}, + "source": [ + "## Displaying classes" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "vocal-heart", + "metadata": { + "ExecuteTime": { + "end_time": "2022-02-11T10:57:48.181989Z", + "start_time": "2022-02-11T10:57:48.105433Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "|Lead |Position |Claim |Counterclaim |Rebuttal |Evidence |Concluding Statement

Phones

|Modern humans today are always on their phone. They are always on their phone more than 5 hours a day no stop .All they do is text back and forward and just have group Chats on social media. They even do it while driving. |They are some really bad consequences when stuff happens when it comes to a phone. |Some certain areas in the United States ban phones from class rooms just because of it.

|When people have phones, they know about certain apps that they have .Apps like Facebook Twitter Instagram and Snapchat. So like if a friend moves away and you want to be in contact you can still be in contact by posting videos or text messages. People always have different ways how to communicate with a phone. Phones have changed due to our generation.

|Driving is one of the way how to get around. People always be on their phones while doing it. Which can cause serious Problems. |That's why there's a thing that's called no texting while driving. That's a really important thing to remember. Some people still do it because they think It's stupid. No matter what they do they still have to obey it because that's the only way how did he save.

|Sometimes on the news there is either an accident or a suicide. It might involve someone not looking where they're going or tweet that someone sent. It either injury or death. If a mysterious number says I'm going to kill you and they know where you live but you don't know the person's contact

,It makes you puzzled and make you start to freak out. Which can end up really badly.


|Phones are fine to use and it's also the best way to come over help. |If you go through a problem and you can't find help you ,always have a phone there with you. Even though phones are used almost every day as long as you're safe it would come into use if you get into trouble. Make sure you do not be like this phone while you're in the middle of driving. |The news always updated when people do something stupid around that involves their phones. The safest way is the best way to stay safe." + ], + "text/plain": [ + "" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "utils.display_classes(\"423A1CA112E2\", train_df)" + ] + }, + { + "cell_type": "markdown", + "id": "victorian-washer", + "metadata": {}, + "source": [ + "## Class balance" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "collective-agenda", + "metadata": { + "ExecuteTime": { + "end_time": "2022-02-11T10:25:20.721332Z", + "start_time": "2022-02-11T10:25:20.663621Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "discourse_type\n", + "Claim 35\n", + "Concluding Statement 9\n", + "Counterclaim 4\n", + "Evidence 32\n", + "Lead 6\n", + "Position 11\n", + "Rebuttal 3\n", + "Name: id, dtype: int64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(train_df.groupby(\"discourse_type\").id.count()/train_df.shape[0]*100).map(round)" + ] + }, + { + "cell_type": "markdown", + "id": "simplified-inquiry", + "metadata": {}, + "source": [ + "## Lead position in essays" + ] + }, + { + "cell_type": "code", + "execution_count": 154, + "id": "electric-marshall", + "metadata": { + "ExecuteTime": { + "end_time": "2022-02-11T17:18:33.687065Z", + "start_time": "2022-02-11T17:18:33.233849Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "6.2266523374529825" + ] + }, + "execution_count": 154, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Lead start character\n", + "train_df[train_df[\"discourse_type\"] == \"Lead\"][\"discourse_start\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "id": "distinguished-scholar", + "metadata": { + "ExecuteTime": { + "end_time": "2022-02-11T17:19:00.752119Z", + "start_time": "2022-02-11T17:19:00.682175Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "298.78355722729714" + ] + }, + "execution_count": 156, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Lead end character\n", + "train_df[train_df[\"discourse_type\"] == \"Lead\"][\"discourse_end\"].mean()" + ] + }, + { + "cell_type": "markdown", + "id": "thirty-asian", + "metadata": {}, + "source": [ + "## Essays length" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "id": "portuguese-association", + "metadata": { + "ExecuteTime": { + "end_time": "2022-02-11T17:19:39.928291Z", + "start_time": "2022-02-11T17:19:36.155400Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "2364.0979864050278" + ] + }, + "execution_count": 157, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "essay_ids = train_df.id.unique()\n", + "essay_len = []\n", + "for essay_id in essay_ids:\n", + " essay_text = open(f\"../../raw_data/train/{essay_id}.txt\").read()\n", + " essay_len.append(len(essay_text))\n", + "\n", + "np.mean(essay_len)" + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "id": "artistic-reduction", + "metadata": { + "ExecuteTime": { + "end_time": "2022-02-11T17:19:46.774793Z", + "start_time": "2022-02-11T17:19:46.047263Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 158, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAVLElEQVR4nO3df5Bd5X3f8fc3KBDC2pIAzw4jqRFpVHcoahO0Y+g48aysFAtwLZI6Lh7GCExGkw5OcSET5HpaZ9p6KjclHnuS2qMExqJDvWBiDxrAtVWZrcd/CBsRgvhhzILlWDuyVBtZzhoSR+m3f9xnN1fb/aH7c/fqeb9m7uw5z3PuOd9zVvrcc5977tnITCRJdfippS5AktQ/hr4kVcTQl6SKGPqSVBFDX5IqsmKpC1jIxRdfnOvXr5+Z//GPf8wFF1ywdAW1wFp7w1p7Z5DqtdaFHTx48PuZ+aY5OzNz2T42bdqUzR5//PEcFNbaG9baO4NUr7UuDHgy58nVRYd3IuLeiDgeEc82tf1+RHwzIp6JiC9ExKqmvg9FxEREvBgR72hq31raJiJiZ0cvY5KktpzJmP5ngK2z2vYBl2fmPwa+BXwIICIuA24A/lF5zn+LiHMi4hzgj4BrgMuA95ZlJUl9tGjoZ+ZXgVdntX05M0+V2QPA2jK9DRjLzL/OzG8DE8BbymMiM1/JzJ8AY2VZSVIfdeOD3PcDD5TpNTReBKYdKW0A353VfuVcK4uIHcAOgOHhYcbHx2f6pqamTptfzqy1N6y1dwapXmttX0ehHxEfBk4B93enHMjM3cBugJGRkRwdHZ3pGx8fp3l+ObPW3rDW3hmkeq21fW2HfkTcDLwT2FI+LQaYBNY1Lba2tLFAuySpT9r6clZEbAV+F3hXZr7W1LUXuCEizouIS4ENwNeBbwAbIuLSiDiXxoe9ezsrXZLUqkXP9CPis8AocHFEHAE+QuNqnfOAfREBcCAzfyszn4uIB4HnaQz73JaZf1vW8wHgS8A5wL2Z+VwP9keStIBFQz8z3ztH8z0LLP9R4KNztD8GPNZSdZKkrlrWt2FYSut3PjozfXjXdUtYiSR1jzdck6SKGPqSVBFDX5IqYuhLUkUMfUmqiFfvtMireiQNMs/0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkWqv2TTSzAl1aT60D8TzS8MkjTIDP0mhruks51j+pJUEUNfkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBG/kdsB79sjadAseqYfEfdGxPGIeLap7cKI2BcRL5Wfq0t7RMQnI2IiIp6JiCuanrO9LP9SRGzvze5IkhZyJmf6nwH+ELivqW0nsD8zd0XEzjJ/F3ANsKE8rgQ+BVwZERcCHwFGgAQORsTezDzRrR1phffYkVSrRc/0M/OrwKuzmrcBe8r0HuD6pvb7suEAsCoiLgHeAezLzFdL0O8DtnahfklSCyIzF18oYj3wSGZeXuZ/mJmrynQAJzJzVUQ8AuzKzK+Vvv003gGMAj+Tmf+ptP874PXM/K9zbGsHsANgeHh409jY2Ezf1NQUQ0NDbe/stEOTJztex2wb16w8bb5btfaDtfbGINUKg1WvtS5s8+bNBzNzZK6+jj/IzcyMiMVfOc58fbuB3QAjIyM5Ojo60zc+Pk7zfLtu7sHwzuEbR0+b71at/WCtvTFItcJg1Wut7Wv3ks1jZdiG8vN4aZ8E1jUtt7a0zdcuSeqjdkN/LzB9Bc524OGm9pvKVTxXAScz8yjwJeDqiFhdrvS5urRJkvpo0eGdiPgsjTH5iyPiCI2rcHYBD0bErcB3gPeUxR8DrgUmgNeAWwAy89WI+I/AN8py/yEzZ384LEnqsUVDPzPfO0/XljmWTeC2edZzL3BvS9VJkrrK2zBIUkUMfUmqiKEvSRUx9CWpIoa+JFXEWyt3ibdZljQIPNOXpIoY+pJUEUNfkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekihj6klQRQ1+SKmLo98D6nY9yaPLkaX83V5KWA0NfkiqyopMnR8S/AX4TSOAQcAtwCTAGXAQcBN6XmT+JiPOA+4BNwA+Af5mZhzvZfiuW6qy7ebuHd123JDVI0rS2z/QjYg3wr4GRzLwcOAe4AfgY8PHM/AXgBHBrecqtwInS/vGynCSpjzod3lkBnB8RK4CfBY4CbwceKv17gOvL9LYyT+nfEhHR4fYlSS2IzGz/yRG3Ax8FXge+DNwOHChn80TEOuCLmXl5RDwLbM3MI6XvZeDKzPz+rHXuAHYADA8PbxobG5vpm5qaYmhoqK1aD02ebOt57Ro+H469fnrbxjUr+1rDmerkuPabtfbOINVrrQvbvHnzwcwcmauv7TH9iFhN4+z9UuCHwOeAre2ub1pm7gZ2A4yMjOTo6OhM3/j4OM3zrbi5z2P6d248xd2HTj+8h28c7WsNZ6qT49pv1to7g1Svtbavk+GdXwW+nZn/JzP/Bvg88FZgVRnuAVgLTJbpSWAdQOlfSeMDXUlSn3QS+n8BXBURP1vG5rcAzwOPA+8uy2wHHi7Te8s8pf8r2cnYkiSpZW2HfmY+QeMD2adoXK75UzSGZe4C7oiICRqXbd5TnnIPcFFpvwPY2UHdkqQ2dHSdfmZ+BPjIrOZXgLfMsexfAb/RyfYkSZ3xG7mSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekihj6klQRQ1+SKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkirS0Z9LVGvW73x0ZvrwruuWsBJJtfJMX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0JekinQU+hGxKiIeiohvRsQLEfFPI+LCiNgXES+Vn6vLshERn4yIiYh4JiKu6M4uSJLOVKdn+p8A/mdm/kPgnwAvADuB/Zm5Adhf5gGuATaUxw7gUx1uW5LUorZDPyJWAm8D7gHIzJ9k5g+BbcCestge4PoyvQ24LxsOAKsi4pJ2ty9Jal1kZntPjPhFYDfwPI2z/IPA7cBkZq4qywRwIjNXRcQjwK7M/Frp2w/clZlPzlrvDhrvBBgeHt40NjY20zc1NcXQ0FBb9R6aPNnW89o1fD4ce33+/o1rVvavmEV0clz7zVp7Z5DqtdaFbd68+WBmjszV18m9d1YAVwC/nZlPRMQn+LuhHAAyMyOipVeVzNxN48WEkZGRHB0dnekbHx+neb4VNzfd96Yf7tx4irsPLXB4D/14ZnKp78PTyXHtN2vtnUGq11rb18mY/hHgSGY+UeYfovEicGx62Kb8PF76J4F1Tc9fW9okSX3Sduhn5veA70bEm0vTFhpDPXuB7aVtO/Bwmd4L3FSu4rkKOJmZR9vdviSpdZ3eWvm3gfsj4lzgFeAWGi8kD0bErcB3gPeUZR8DrgUmgNfKspKkPuoo9DPzaWCuDwu2zLFsArd1sj1JUmf8Rq4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekihj6klQRQ1+SKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SarIiqUuQLB+56Mz04d3XbeElUg623mmL0kVMfQlqSKGviRVxNCXpIp0HPoRcU5E/FlEPFLmL42IJyJiIiIeiIhzS/t5ZX6i9K/vdNuSpNZ040z/duCFpvmPAR/PzF8ATgC3lvZbgROl/eNlOUlSH3UU+hGxFrgO+JMyH8DbgYfKInuA68v0tjJP6d9Slpck9UlkZvtPjngI+M/AG4DfAW4GDpSzeSJiHfDFzLw8Ip4FtmbmkdL3MnBlZn5/1jp3ADsAhoeHN42Njc30TU1NMTQ01FathyZPtvW8dg2fD8deb/15G9es7H4xi+jkuPabtfbOINVrrQvbvHnzwcwcmauv7S9nRcQ7geOZeTAiRttdz2yZuRvYDTAyMpKjo3+36vHxcZrnW3Fz0xeg+uHOjae4+1Drh/fwjaPdL2YRnRzXfrPW3hmkeq21fZ18I/etwLsi4lrgZ4A3Ap8AVkXEisw8BawFJsvyk8A64EhErABWAj/oYPuSpBa1PaafmR/KzLWZuR64AfhKZt4IPA68uyy2HXi4TO8t85T+r2QnY0tnqfU7H515SFK39eI6/buAOyJiArgIuKe03wNcVNrvAHb2YNuSpAV05YZrmTkOjJfpV4C3zLHMXwG/0Y3tSZLa4102lzHvvimp27wNgyRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKeBuGAeEtGSR1g2f6klQRQ1+SKmLoS1JFDH1JqoihL0kV8eqdAeSVPJLa5Zm+JFXE0Jekihj6klQRQ1+SKmLoS1JFDH1JqoihL0kVOauv02++nl2S1MGZfkSsi4jHI+L5iHguIm4v7RdGxL6IeKn8XF3aIyI+GRETEfFMRFzRrZ2QJJ2ZToZ3TgF3ZuZlwFXAbRFxGbAT2J+ZG4D9ZR7gGmBDeewAPtXBtiVJbWg79DPzaGY+Vab/EngBWANsA/aUxfYA15fpbcB92XAAWBURl7S7fUlS6yIzO19JxHrgq8DlwF9k5qrSHsCJzFwVEY8AuzLza6VvP3BXZj45a107aLwTYHh4eNPY2NhM39TUFENDQ2dc16HJkx3sVWeGz4djr/d/uxvXrGz5Oa0e16Vkrb0zSPVa68I2b958MDNH5urr+IPciBgC/hT4YGb+qJHzDZmZEdHSq0pm7gZ2A4yMjOTo6OhM3/j4OM3zi7l5CT/IvXPjKe4+1P/PyQ/fONryc1o9rkvJWntnkOq11vZ1dMlmRPw0jcC/PzM/X5qPTQ/blJ/HS/sksK7p6WtLmySpTzq5eieAe4AXMvMPmrr2AtvL9Hbg4ab2m8pVPFcBJzPzaLvblyS1rpPxh7cC7wMORcTTpe3fAruAByPiVuA7wHtK32PAtcAE8BpwSwfbliS1oe3QLx/IxjzdW+ZYPoHb2t2ezox/YEXSQrwNgyRV5Ky+DUPtPOuXNJtn+pJUEUNfkipi6EtSRQx9SaqIoS9JFfHqnUp4JY8k8ExfkqrimX6FPOuX6uWZviRVxNCXpIoY+pVbv/NRDk2ePG3Ip5XnTj8kDQZDX5IqYuhLUkW8ekdd4RVB0mAw9DXD4JbOfoa+5uQLgHR2ckxfkipi6EtSRQx9SaqIY/palOP70tnD0FdL/PatNNgc3pGkihj6klQRQ1+SKmLoS1JF+h76EbE1Il6MiImI2Nnv7UtSzfp69U5EnAP8EfDPgCPANyJib2Y+38861Fte4iktX/2+ZPMtwERmvgIQEWPANsDQP0t16xLP+V482nmB8UVJNYvM7N/GIt4NbM3M3yzz7wOuzMwPNC2zA9hRZt8MvNi0iouB7/ep3E5Za29Ya+8MUr3WurCfy8w3zdWx7L6clZm7gd1z9UXEk5k50ueS2mKtvWGtvTNI9Vpr+/r9Qe4ksK5pfm1pkyT1Qb9D/xvAhoi4NCLOBW4A9va5BkmqVl+HdzLzVER8APgScA5wb2Y+18Iq5hz2WaastTestXcGqV5rbVNfP8iVJC0tv5ErSRUx9CWpIgMR+svh1g0RsS4iHo+I5yPiuYi4vbT/XkRMRsTT5XFt03M+VGp+MSLe0c/9iYjDEXGo1PRkabswIvZFxEvl5+rSHhHxyVLPMxFxRdN6tpflX4qI7T2o881Nx+7piPhRRHxwOR3XiLg3Io5HxLNNbV07lhGxqfyuJspzo8u1/n5EfLPU84WIWFXa10fE603H+NOL1TTffnex1q793qNxwcgTpf2BaFw80s1aH2iq83BEPF3al/S4Liozl/WDxge+LwM/D5wL/Dlw2RLUcQlwRZl+A/At4DLg94DfmWP5y0qt5wGXln04p1/7AxwGLp7V9l+AnWV6J/CxMn0t8EUggKuAJ0r7hcAr5efqMr26x7/r7wE/t5yOK/A24Arg2V4cS+DrZdkoz72my7VeDawo0x9rqnV983Kz1jNnTfPtdxdr7drvHXgQuKFMfxr4V92sdVb/3cC/Xw7HdbHHIJzpz9y6ITN/AkzfuqGvMvNoZj5Vpv8SeAFYs8BTtgFjmfnXmfltYILGvizl/mwD9pTpPcD1Te33ZcMBYFVEXAK8A9iXma9m5glgH7C1h/VtAV7OzO8ssEzfj2tmfhV4dY46Oj6Wpe+NmXkgG//j72taV1dqzcwvZ+apMnuAxvdj5rVITfPtd1dqXUBLv/dyBv124KFe11q29R7gswuto1/HdTGDEPprgO82zR9h4bDtuYhYD/wS8ERp+kB563xv09uy+eru1/4k8OWIOBiNW1sADGfm0TL9PWB4mdQ67QZO/4+zHI/rtG4dyzVlenZ7r7yfxhnmtEsj4s8i4n9HxK+UtoVqmm+/u6kbv/eLgB82vdj18rj+CnAsM19qaluOxxUYjNBfViJiCPhT4IOZ+SPgU8DfB34ROErjbd5y8MuZeQVwDXBbRLytubOcaSyb63XLeOu7gM+VpuV6XP8/y+1YziciPgycAu4vTUeBv5eZvwTcAfyPiHjjma6vR/s9ML/3Ju/l9JOV5XhcZwxC6C+bWzdExE/TCPz7M/PzAJl5LDP/NjP/L/DHNN5uwvx192V/MnOy/DwOfKHUday8xZx+q3l8OdRaXAM8lZnHSt3L8rg26daxnOT04Zae1B0RNwPvBG4soUIZKvlBmT5IY2z8HyxS03z73RVd/L3/gMbQ2opZ7V1V1v/rwANN+7DsjmuzQQj9ZXHrhjJudw/wQmb+QVP7JU2L/Row/en+XuCGiDgvIi4FNtD4EKfn+xMRF0TEG6anaXyQ92zZzvRVI9uBh5tqvSkargJOlreaXwKujojV5W321aWtF047W1qOx3WWrhzL0vejiLiq/Bu7qWldXRERW4HfBd6Vma81tb8pGn/jgoj4eRrH8pVFappvv7tVa1d+7+WF7XHg3b2qtfhV4JuZOTNssxyP62l69QlxNx80roj4Fo1XzA8vUQ2/TOMt1zPA0+VxLfDfgUOlfS9wSdNzPlxqfpGmKzJ6vT80rmT48/J4bnobNMY59wMvAf8LuLC0B40/bvNy2ZeRpnW9n8aHZhPALT06thfQODNb2dS2bI4rjRejo8Df0BiHvbWbxxIYoRFuLwN/SPmmfBdrnaAx7j397/bTZdl/Uf59PA08BfzzxWqab7+7WGvXfu/l/8HXy/5/Djivm7WW9s8AvzVr2SU9ros9vA2DJFVkEIZ3JEldYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekivw/iMZwlcESlyQAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "pd.Series(essay_len).hist(bins=100)" + ] + }, + { + "cell_type": "markdown", + "id": "impressive-astronomy", + "metadata": {}, + "source": [ + "## Topics" + ] + }, + { + "cell_type": "markdown", + "id": "behind-poster", + "metadata": {}, + "source": [ + "Here are the topics we're looking for: https://www.kaggle.com/c/feedback-prize-2021/discussion/301481" + ] + }, + { + "cell_type": "markdown", + "id": "posted-slide", + "metadata": {}, + "source": [ + "Working on a subset of the data:" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "greenhouse-donna", + "metadata": { + "ExecuteTime": { + "end_time": "2022-02-11T14:35:51.242458Z", + "start_time": "2022-02-11T14:35:51.096750Z" + } + }, + "outputs": [], + "source": [ + "ids = train_df.id.values.copy()\n", + "np.random.shuffle(ids) # We need to shuffle because the data is actually sorted by topics\n", + "ids_subset = ids[0:150]\n", + "\n", + "sub_data = [open(f'../../raw_data/train/{essay_id}.txt').read() for essay_id in ids_subset]" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "later-springfield", + "metadata": { + "ExecuteTime": { + "end_time": "2022-02-11T14:36:27.920378Z", + "start_time": "2022-02-11T14:36:27.681473Z" + } + }, + "outputs": [], + "source": [ + "vect = CountVectorizer()\n", + "X = vect.fit_transform(sub_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "arbitrary-preserve", + "metadata": { + "ExecuteTime": { + "end_time": "2022-02-11T14:36:30.149544Z", + "start_time": "2022-02-11T14:36:28.262263Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Topic 0:\n", + "[('the', 172.82254500117247), ('that', 80.52770479122563), ('face', 80.17858026214711), ('to', 72.7622234827799), ('it', 71.89678834929373), ('in', 65.34423092672304), ('of', 63.38028921658119), ('they', 50.219396460404965), ('and', 48.393616639433084), ('is', 47.09359500962151)]\n", + "Topic 1:\n", + "[('to', 613.091957932786), ('the', 501.07731620732557), ('and', 389.82072626593487), ('that', 322.018139933217), ('of', 255.37354591792155), ('in', 249.06198441339708), ('can', 200.4700481663499), ('they', 196.992662287954), ('have', 180.94145395796957), ('is', 173.85007426666556)]\n", + "Topic 2:\n", + "[('plurality', 0.06666666666687518), ('pedigree', 0.06666666666687518), ('continuing', 0.06666666666677169), ('intimidating', 0.06666666666675286), ('handedly', 0.0666666666667507), ('thirds', 0.0666666666667507), ('machine', 0.06666666666674344), ('forth', 0.0666666666667403), ('therefor', 0.0666666666667403), ('vanquished', 0.0666666666667403)]\n", + "Topic 3:\n", + "[('the', 1012.74692348668), ('of', 301.47484990797426), ('to', 291.982866322596), ('electoral', 232.06666660482145), ('college', 216.19927404885863), ('is', 211.53911691883192), ('for', 197.82239377838144), ('that', 192.50095957240535), ('vote', 192.06666655182994), ('in', 190.39765287076094)]\n", + "Topic 4:\n", + "[('the', 440.86271044357176), ('to', 264.7530838861564), ('of', 237.54446483785907), ('and', 217.43825573529787), ('that', 173.40241049137137), ('in', 158.85064890071112), ('is', 144.0351122317272), ('venus', 111.06666653640555), ('on', 95.41904355413175), ('be', 94.29470502734371)]\n", + "Topic 5:\n", + "[('the', 136.22089146218764), ('cars', 70.51274356274251), ('car', 62.60425931245278), ('to', 60.325207137546585), ('and', 55.04844122901495), ('in', 42.22907240974827), ('driverless', 42.066666529121036), ('are', 40.74510256924329), ('they', 38.41989980607021), ('not', 35.45683510970714)]\n", + "Topic 6:\n", + "[('to', 495.3159532482881), ('students', 465.3708299610961), ('the', 462.4438560945296), ('and', 312.939369499045), ('they', 251.77932460013807), ('their', 224.15830417475303), ('in', 201.45922863826956), ('school', 196.56015643840635), ('of', 182.81389736692233), ('online', 181.27794879254282)]\n", + "Topic 7:\n", + "[('seagoing', 20.066666607537133), ('you', 18.037471568403703), ('cowboy', 11.066666590129229), ('coyboy', 8.066666641968167), ('places', 7.633341556927242), ('amazing', 7.622580202233638), ('things', 6.099029400050452), ('ship', 6.06666667608841), ('animals', 6.0666666502745805), ('war', 6.066666611701502)]\n", + "Topic 8:\n", + "[('average', 11.308399771585469), ('grade', 9.625821215277004), ('passing', 5.047121645629949), ('participate', 3.8439501881743223), ('child', 3.4755662456490746), ('sport', 3.3030891317954163), ('school_name', 3.066666609082168), ('lastly', 2.086080274940592), ('barely', 2.0666666043854436), ('letter', 2.0666666043854436)]\n", + "Topic 9:\n", + "[('you', 519.4367958436296), ('to', 421.48116546565325), ('the', 275.7971811285436), ('and', 258.21894414462685), ('is', 198.09580449424888), ('it', 189.60314278271116), ('can', 158.11660545360803), ('for', 145.94543135239417), ('of', 143.89389097076636), ('that', 143.4358858385895)]\n", + "Topic 10:\n", + "[('citizens', 4.054726841300221), ('elector', 3.714380116691819), ('represented', 3.0666666308258255), ('feel', 2.8494660975029418), ('may', 2.794665651696974), ('causing', 2.6635390365227667), ('changed', 2.6348577914714233), ('due', 2.385720344823054), ('provide', 2.066666707613025), ('correctly', 2.066666646373739)]\n", + "Topic 11:\n", + "[('to', 198.7503361187078), ('the', 177.29841854517497), ('of', 109.16196585529299), ('in', 101.0329253668334), ('and', 75.14367776677948), ('is', 65.80781809901187), ('it', 64.39209858332921), ('be', 61.74673793319909), ('would', 60.97001650279835), ('that', 59.40936099637912)]\n", + "Topic 12:\n", + "[('extracurricular', 12.852125879071167), ('activities', 8.608069537000686), ('many', 5.927141299566947), ('agree', 5.647290225906643), ('decision', 4.637508637210874), ('principal', 3.3901718731068224), ('participate', 2.9162404932021024), ('future', 2.722822024447124), ('develop', 2.661094725588789), ('way', 2.4960322734718945)]\n", + "Topic 13:\n", + "[('plurality', 0.06666666666687518), ('pedigree', 0.06666666666687518), ('continuing', 0.06666666666677169), ('intimidating', 0.06666666666675286), ('handedly', 0.0666666666667507), ('thirds', 0.0666666666667507), ('machine', 0.06666666666674344), ('forth', 0.0666666666667403), ('therefor', 0.0666666666667403), ('vanquished', 0.0666666666667403)]\n", + "Topic 14:\n", + "[('to', 108.13720613982179), ('and', 107.44568646510484), ('the', 93.33015730788745), ('in', 67.27347279952798), ('of', 58.68887518305737), ('it', 50.207002389842174), ('that', 48.88016796500254), ('with', 37.87874207159876), ('have', 36.989560324822854), ('be', 34.61769483547835)]\n" + ] + } + ], + "source": [ + "model = LatentDirichletAllocation()\n", + "model = LatentDirichletAllocation(n_components=15)\n", + "model.fit(X)\n", + "\n", + "def print_topics(model, vect):\n", + " for idx, topic in enumerate(model.components_):\n", + " print(\"Topic %d:\" % (idx))\n", + " print([(vect.get_feature_names()[i], topic[i])\n", + " for i in topic.argsort()[:-10 - 1:-1]])\n", + "\n", + "print_topics(model,vect)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}