diff --git a/src/Full_BERT_Type_Classifier.ipynb b/src/Full_BERT_Type_Classifier.ipynb new file mode 100644 index 0000000..bde170c --- /dev/null +++ b/src/Full_BERT_Type_Classifier.ipynb @@ -0,0 +1,417 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Full BERT Type Classifier", + "provenance": [], + "collapsed_sections": [], + "machine_shape": "hm" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "TPU" + }, + "cells": [ + { + "cell_type": "code", + "metadata": { + "id": "-S-iFkX0m3NE", + "colab_type": "code", + "outputId": "5ba5e679-de51-4a02-a1cb-cfd87e6a8063", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 672 + } + }, + "source": [ + "!pip install transformers" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Collecting transformers\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)\n", + "\u001b[K |████████████████████████████████| 573kB 3.1MB/s \n", + "\u001b[?25hCollecting sacremoses\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/99/50/93509f906a40bffd7d175f97fd75ea328ad9bd91f48f59c4bd084c94a25e/sacremoses-0.0.41.tar.gz (883kB)\n", + "\u001b[K |████████████████████████████████| 890kB 8.3MB/s \n", + "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.38.0)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.21.0)\n", + "Collecting sentencepiece\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/98/2c/8df20f3ac6c22ac224fff307ebc102818206c53fc454ecd37d8ac2060df5/sentencepiece-0.1.86-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)\n", + "\u001b[K |████████████████████████████████| 1.0MB 17.3MB/s \n", + "\u001b[?25hRequirement already 
satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)\n", + "Requirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from transformers) (0.7)\n", + "Requirement already satisfied: boto3 in /usr/local/lib/python3.6/dist-packages (from transformers) (1.12.43)\n", + "Collecting tokenizers==0.5.2\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)\n", + "\u001b[K |████████████████████████████████| 3.7MB 20.6MB/s \n", + "\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.18.3)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.12.0)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.1.1)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (0.14.1)\n", + "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2020.4.5.1)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (3.0.4)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.8)\n", + "Requirement already satisfied: botocore<1.16.0,>=1.15.43 in /usr/local/lib/python3.6/dist-packages (from boto3->transformers) (1.15.43)\n", + 
"Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /usr/local/lib/python3.6/dist-packages (from boto3->transformers) (0.3.3)\n", + "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.6/dist-packages (from boto3->transformers) (0.9.5)\n", + "Requirement already satisfied: docutils<0.16,>=0.10 in /usr/local/lib/python3.6/dist-packages (from botocore<1.16.0,>=1.15.43->boto3->transformers) (0.15.2)\n", + "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.6/dist-packages (from botocore<1.16.0,>=1.15.43->boto3->transformers) (2.8.1)\n", + "Building wheels for collected packages: sacremoses\n", + " Building wheel for sacremoses (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for sacremoses: filename=sacremoses-0.0.41-cp36-none-any.whl size=893334 sha256=b87f354dc0f3703370c24ace917b1e7a2cb844f78be0a3a56a7f6bdd8197ac05\n", + " Stored in directory: /root/.cache/pip/wheels/22/5a/d4/b020a81249de7dc63758a34222feaa668dbe8ebfe9170cc9b1\n", + "Successfully built sacremoses\n", + "Installing collected packages: sacremoses, sentencepiece, tokenizers, transformers\n", + "Successfully installed sacremoses-0.0.41 sentencepiece-0.1.86 tokenizers-0.5.2 transformers-2.8.0\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "HK8wfsrNm5-6", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.linear_model import LogisticRegression\n", + "import torch\n", + "import transformers as ppb\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "aiw1oGrym6w5", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#Read in dataframes, classify one real dataset vs all fantasy datasets\n", + "df_real = 
pd.read_csv(\"current_history_NYT.csv\")\n", + "df_dorothy = pd.read_csv(\"dorothy.csv\")\n", + "df_arthur = pd.read_csv(\"arthur.csv\")\n", + "df_wonder = pd.read_csv(\"bookofwonder.csv\")\n", + "df_irish = pd.read_csv(\"irishfairy.csv\")\n", + "df_iceandfire = pd.read_csv(\"iceandfire.csv\")" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "1uFmH1_iifNc", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#Since the lines in the realistic dataset may contain footnote numbers and formatting,\n", + "#code removes formatting, but not numbers since numbers may be important to history\n", + "#Referenced for formatting: https://stackoverflow.com/questions/13682044/remove-unwanted-parts-from-strings-in-a-column\n", + "df_real[\"Sentences\"] = df_real[\"Sentences\"].str.replace(\"*\", \"\")" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "TBhTWP4cu-3x", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#For BERT features, limit datasets with more than 1200 lines to 1200.\n", + "#This is done to avoid exceeding the RAM provided by free Colab\n", + "df_real = df_real[:1200]\n", + "df_dorothy = df_dorothy[:1200]\n", + "df_arthur = df_arthur[:1200]\n", + "df_wonder = df_wonder[:1200]\n", + "df_irish = df_irish[:1200]\n", + "df_iceandfire = df_iceandfire[:1200]" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "3gqfM9O8wDpT", + "colab_type": "code", + "colab": {} + }, + "source": [ + "from sklearn.utils import shuffle\n", + "\n", + "#Create batch dataframes that store combined realistic and fantasy data\n", + "dorothy_batch = df_real.append(df_dorothy, ignore_index=True)\n", + "arthur_batch = df_real.append(df_arthur, ignore_index=True)\n", + "wonder_batch = df_real.append(df_wonder, ignore_index=True)\n", + "irish_batch = df_real.append(df_irish, ignore_index=True)\n", + "iceandfire_batch = 
df_real.append(df_iceandfire, ignore_index=True)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "oNN6oJVQm-dJ", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#Place all batch columns into variables\n", + "\n", + "dorothy_sentences = dorothy_batch[\"Sentences\"]\n", + "dorothy_labels = dorothy_batch[\"Label\"]\n", + "\n", + "arthur_sentences = arthur_batch[\"Sentences\"]\n", + "arthur_labels = arthur_batch[\"Label\"]\n", + "\n", + "wonder_sentences = wonder_batch[\"Sentences\"]\n", + "wonder_labels = wonder_batch[\"Label\"]\n", + "\n", + "irish_sentences = irish_batch[\"Sentences\"]\n", + "irish_labels = irish_batch[\"Label\"]\n", + "\n", + "iceandfire_sentences = iceandfire_batch[\"Sentences\"]\n", + "iceandfire_labels = iceandfire_batch[\"Label\"]" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "qWoAY8A-nCNc", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#Code from this point downward is a modified version of base code for a BERT classifier from below link:\n", + "#https://colab.research.google.com/github/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb#scrollTo=izA3-6kffbdT\n", + "\n", + "model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')\n", + "\n", + "# Load pretrained model/tokenizer\n", + "tokenizer = tokenizer_class.from_pretrained(pretrained_weights)\n", + "model = model_class.from_pretrained(pretrained_weights)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "vTlNoa6enEAS", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#Create tokenized inputs for BERT\n", + "tokenized = iceandfire_sentences.apply((lambda x: tokenizer.encode(x, add_special_tokens=False)))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + 
"metadata": { + "id": "GhI6XFMPnHJE", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#Pad all sentences to greatest length because BERT needs all inputs to be the same length\n", + "max_len = 0\n", + "for i in tokenized.values:\n", + " if len(i) > max_len:\n", + " max_len = len(i)\n", + "\n", + "padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "rnunMGDTnKNC", + "colab_type": "code", + "outputId": "6f9a4e72-f873-45c7-a276-2ccc247356a1", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 33 + } + }, + "source": [ + "#Create attention mask on padded that tells BERT to avoid calculating attention on padding\n", + "attention_mask = np.where(padded != 0, 1, 0)\n", + "attention_mask.shape" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(2400, 180)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 50 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "f34MeBHcnOIF", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#Place variables in tensors since the library is a pytorch base\n", + "input_ids = torch.tensor(padded) \n", + "attention_mask = torch.tensor(attention_mask)\n", + "\n", + "#torch.no_grad disables autograd on the last_hidden_states variable\n", + "with torch.no_grad():\n", + " last_hidden_states = model(input_ids, attention_mask=attention_mask)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "B9jSKLZznQJp", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#Get only the [CLS] token feature\n", + "features = last_hidden_states[0][:,0,:].numpy()" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "xXhFmq0cnRUz", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#Assign labels variable 
based on current dataset\n", + "labels = iceandfire_labels" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "r9sJYh58nSpx", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#Split using .25 test split\n", + "train_features, test_features, train_labels, test_labels = train_test_split(features, labels)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "ApftcACunh_y", + "colab_type": "code", + "outputId": "31eb3a87-e20e-4877-f163-81733f9f9784", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 100 + } + }, + "source": [ + "#Fit logistic regression model with 100 epochs\n", + "lr_clf = LogisticRegression()\n", + "lr_clf.fit(train_features, train_labels)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, l1_ratio=None, max_iter=100,\n", + " multi_class='auto', n_jobs=None, penalty='l2',\n", + " random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n", + " warm_start=False)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 56 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "8hX31EiYnlKU", + "colab_type": "code", + "outputId": "b418e43f-e5c7-40fa-e9ce-cbe36fec1362", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 33 + } + }, + "source": [ + "#Get test accuracy\n", + "lr_clf.score(test_features, test_labels)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.9416666666666667" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 57 + } + ] + } + ] +} \ No newline at end of file diff --git a/src/NSPandAuthorship.ipynb b/src/NSPandAuthorship.ipynb new file mode 100644 index 0000000..88e7940 --- /dev/null +++ b/src/NSPandAuthorship.ipynb @@ -0,0 
+1,1295 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 683 + }, + "colab_type": "code", + "id": "A08cLa726vN0", + "outputId": "d814fa84-740b-45b0-99c1-038cf64d1746" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: transformers in /opt/conda/lib/python3.7/site-packages (2.8.0)\n", + "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.7/site-packages (from transformers) (2020.4.4)\n", + "Requirement already satisfied: requests in /opt/conda/lib/python3.7/site-packages (from transformers) (2.23.0)\n", + "Requirement already satisfied: tokenizers==0.5.2 in /opt/conda/lib/python3.7/site-packages (from transformers) (0.5.2)\n", + "Requirement already satisfied: sentencepiece in /opt/conda/lib/python3.7/site-packages (from transformers) (0.1.86)\n", + "Requirement already satisfied: numpy in /opt/conda/lib/python3.7/site-packages (from transformers) (1.18.3)\n", + "Requirement already satisfied: tqdm>=4.27 in /opt/conda/lib/python3.7/site-packages (from transformers) (4.45.0)\n", + "Requirement already satisfied: filelock in /opt/conda/lib/python3.7/site-packages (from transformers) (3.0.10)\n", + "Requirement already satisfied: boto3 in /opt/conda/lib/python3.7/site-packages (from transformers) (1.13.2)\n", + "Requirement already satisfied: sacremoses in /opt/conda/lib/python3.7/site-packages (from transformers) (0.0.43)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /opt/conda/lib/python3.7/site-packages (from requests->transformers) (1.25.9)\n", + "Requirement already satisfied: idna<3,>=2.5 in /opt/conda/lib/python3.7/site-packages (from requests->transformers) (2.9)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /opt/conda/lib/python3.7/site-packages (from requests->transformers) (3.0.4)\n", + "Requirement already satisfied: 
certifi>=2017.4.17 in /opt/conda/lib/python3.7/site-packages (from requests->transformers) (2020.4.5.1)\n", + "Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /opt/conda/lib/python3.7/site-packages (from boto3->transformers) (0.3.3)\n", + "Requirement already satisfied: botocore<1.17.0,>=1.16.2 in /opt/conda/lib/python3.7/site-packages (from boto3->transformers) (1.16.2)\n", + "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /opt/conda/lib/python3.7/site-packages (from boto3->transformers) (0.9.5)\n", + "Requirement already satisfied: click in /opt/conda/lib/python3.7/site-packages (from sacremoses->transformers) (7.1.1)\n", + "Requirement already satisfied: joblib in /opt/conda/lib/python3.7/site-packages (from sacremoses->transformers) (0.14.1)\n", + "Requirement already satisfied: six in /opt/conda/lib/python3.7/site-packages (from sacremoses->transformers) (1.14.0)\n", + "Requirement already satisfied: docutils<0.16,>=0.10 in /opt/conda/lib/python3.7/site-packages (from botocore<1.17.0,>=1.16.2->boto3->transformers) (0.15.2)\n", + "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /opt/conda/lib/python3.7/site-packages (from botocore<1.17.0,>=1.16.2->boto3->transformers) (2.8.1)\n", + "Requirement already satisfied: torch in /opt/conda/lib/python3.7/site-packages (1.5.0)\n", + "Requirement already satisfied: future in /opt/conda/lib/python3.7/site-packages (from torch) (0.18.2)\n", + "Requirement already satisfied: numpy in /opt/conda/lib/python3.7/site-packages (from torch) (1.18.3)\n" + ] + } + ], + "source": [ + "!pip install transformers\n", + "!pip install torch" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "A3BwpCu663-q" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.utils 
import shuffle\n", + "import torch\n", + "import transformers as ppb\n", + "import warnings\n", + "import collections\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import tensorflow as tf\n", + "from tensorflow.keras import layers\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "4bztdPRBcI1f" + }, + "outputs": [], + "source": [ + "#Read in dataframes, classify one real dataset vs all fantasy datasets\n", + "df_bird = pd.read_csv(\"bird_history.csv\")\n", + "df_NYT = pd.read_csv(\"current_history_NYT.csv\")\n", + "df_dorothy = pd.read_csv(\"dorothy.csv\")\n", + "df_arthur = pd.read_csv(\"arthur.csv\")\n", + "df_wonder = pd.read_csv(\"bookofwonder.csv\")\n", + "df_irish = pd.read_csv(\"irishfairy.csv\")\n", + "df_iceandfire = pd.read_csv(\"iceandfire.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Ae6UNU7RdWzo" + }, + "outputs": [], + "source": [ + "#Since the lines in the realistic dataset may contain footnote numbers and formatting,\n", + "#code removes formatting, but not numbers since numbers may be important to history\n", + "#Referenced for formatting: https://stackoverflow.com/questions/13682044/remove-unwanted-parts-from-strings-in-a-column\n", + "df_NYT[\"Sentences\"] = df_NYT[\"Sentences\"].str.replace(\"*\", \"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "nyL8UGe9_Wrq" + }, + "outputs": [], + "source": [ + "#Modifiable lists of which realistic and fantasy dataframes to consider when randomly choosing next sentences\n", + "real_list = [df_bird, df_NYT]\n", + "fantasy_list = [df_dorothy, df_arthur, df_wonder, df_irish, df_iceandfire]\n", + "\n", + "#List of datasets to pass to getTrain and use in NSP fine tuning\n", + "#Authorship labels assigned in 
order corresponding to this full list of datasets\n", + "df_list = [df_bird, df_NYT, df_dorothy, df_arthur, df_wonder, df_irish, df_iceandfire]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "SCoeCtLEdevm" + }, + "outputs": [], + "source": [ + "#Called in getTrain. \n", + "#Contains code that fills in half correct next sentences and half random sentences from opposite genre\n", + "def fillNSP(fill_list, ref_df, opposite_df):\n", + " half = int(len(fill_list) / 2)\n", + " sequence_list = []\n", + " #Code to get indices from https://www.geeksforgeeks.org/how-to-get-rows-index-names-in-pandas-dataframe/\n", + " index_list = fill_list.index.values.tolist()\n", + " #Fill in accurate next lines\n", + " for j in range(half):\n", + " current_sentence = fill_list.iloc[j].strip()\n", + " \n", + " #Check index first to preempt edge case where trying to access next sentence at end of df\n", + " index = index_list[j]\n", + " if(index + 1 < len(ref_df)):\n", + " next_sentence = ref_df[\"Sentences\"][index + 1].strip()\n", + " else:\n", + " #If at end of df, just step back 1 and use current sentence as next sentence\n", + " next_sentence = fill_list.iloc[j].strip()\n", + " current_sentence = ref_df[\"Sentences\"][index - 1].strip()\n", + " #Add formatting for first sentence\n", + " sequence = \"[CLS] \" + current_sentence + \" [SEP] \" + next_sentence + \" [SEP]\"\n", + " sequence_list.append(sequence)\n", + " \n", + " #Fill in random next_lines from the opposite genre\n", + " for j in range(half, len(fill_list)):\n", + " #Get index of df first since np.random.choice can't choose a random dataframe directly\n", + " random_df_index = np.random.choice(range(len(opposite_df)), 1)\n", + " random_df_index = random_df_index[0]\n", + " random_df = opposite_df[random_df_index]\n", + " random_index = np.random.choice(range(len(random_df)), 1)\n", + " random_index = random_index[0]\n", + " next_sentence = 
random_df[\"Sentences\"][random_index].strip()\n", + " \n", + " current_sentence = fill_list.iloc[j].strip()\n", + " #Add formatting for first sentence\n", + " sequence = \"[CLS] \" + current_sentence + \" [SEP] \" + next_sentence + \" [SEP]\"\n", + " sequence_list.append(sequence)\n", + " return sequence_list\n", + "\n", + "def getFeatures(df_list, real_list, fantasy_list, max_size):\n", + " #df_list is a full list of dataframes to get examples from\n", + " #real_list is a list of realistic dataframes to use\n", + " #fantasy_list is a list of fantasy dataframes to use\n", + " #max_size is the maximum number of examples to grab from any given dataset\n", + " #Returns a list of lists of next sentence prediction formatted examples, to be split into train/test hidden states outputs after being passed through the model.\n", + " #Return will be ordered in same order as the input. Authorship labels not added in this function\n", + "\n", + "\n", + " #Shuffling dataframe references https://stackoverflow.com/questions/29576430/shuffle-dataframe-rows\n", + "\n", + " #Split off the testing examples after shuffling\n", + " shuffled = []\n", + " for i in range(len(df_list)):\n", + " shuffled.append(shuffle(df_list[i]))\n", + "\n", + " sample_list = []\n", + "\n", + " for df in shuffled:\n", + " #Get size of split using max train and test_size\n", + " #If dataset is big enough, use max_size examples, else just split whole dataset\n", + " if(max_size < len(df)):\n", + " #Just grab max_size first examples since the dataframes are shuffled\n", + " df = df[:max_size]\n", + "\n", + " #Append tuples to fill in subsequent sentences\n", + " sample_list.append((df[\"Sentences\"], df[\"Label\"]))\n", + " \n", + " #Define lists of lists to fill and return\n", + " nsp_inputs = []\n", + "\n", + " #Fill train sequences\n", + " for i in range(len(sample_list)):\n", + " ref_df = df_list[i]\n", + " X = sample_list[i][0]\n", + " y = sample_list[i][1]\n", + "\n", + " #Since all dataframes have 
same label of realistic vs fantasy,\n", + " #use first index label to get whether currently working on a fantasy dataframe\n", + " fantasy = y.iloc[0]\n", + " if(fantasy == 1):\n", + " opposite_df = real_list\n", + " else:\n", + " opposite_df = fantasy_list\n", + " \n", + " sequence_list = fillNSP(X, ref_df, opposite_df)\n", + " nsp_inputs.append(sequence_list)\n", + " return nsp_inputs\n", + "\n", + "def assignLabels(nsp_inputs, used_labels, mixed_label=7):\n", + " #nsp_inputs is a list of lists of inputs divided by dataframe returned by getFeatures()\n", + " #used_labels is the corresponding authorship labels used in getFeatures, since getFeatures can use a subset of the data\n", + " #mixed_label is an index corresponding to an extra class beyond the datasets. It represents data that comes from two different authors\n", + " #The mixed label is used since we are using NSP examples as inputs into the authorship classifier, and it can be changed if the test requires it.\n", + " #Returns three values:\n", + " # 1. A list of all of the examples from all of the dataframes concatenated together\n", + " # 2. A list of nsp_labels where 0 indicates that the second sentence follows and 1 indicates the second sentence is random\n", + " # 3. 
A list of authorship labels corresponding to each example\n", + "\n", + " nsp_examples = []\n", + " nsp_labels = []\n", + " author_labels = []\n", + "\n", + " for i in range(len(nsp_inputs)):\n", + " examples = nsp_inputs[i]\n", + " author = used_labels[i]\n", + " half = float(len(examples) / 2)\n", + " for j in range(len(examples)):\n", + " nsp_examples.append(examples[j])\n", + " if(j < half):\n", + " nsp_labels.append(0)\n", + " author_labels.append(author)\n", + " else:\n", + " nsp_labels.append(1)\n", + " author_labels.append(mixed_label)\n", + " return nsp_examples, nsp_labels, author_labels\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "foDKfkTWxZzs" + }, + "outputs": [], + "source": [ + "#Use functions defined in previous block to get features, then assign labels and format features\n", + "\n", + "#Returns a list of lists of examples drawn from the dataframes listed in the first argument\n", + "nsp_inputs = getFeatures(df_list, real_list, fantasy_list, 2000)\n", + "\n", + "#Manually create a list of the authorship labels corresponding to the list passed in above\n", + "#For example, if df_arthur wasn't used in the first argument, then 3 should be excluded from this list\n", + "used_labels = [0, 1, 2, 3, 4, 5, 6]\n", + "\n", + "#Assign authorship and NSP labels here\n", + "nsp_examples, nsp_labels, author_labels = assignLabels(nsp_inputs, used_labels)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 164, + "referenced_widgets": [ + "1f075ce511ae4d0abf083036a65b8820", + "59de8356cd2e4edba767cb5e67b18192", + "4d31a904f4d54aa3be7bce714a928e9f", + "24217de2454b41b8b30db1fcd5a8ac3e", + "3a372648383b4862b1ac4ebb005f21e8", + "4b6ab1c05635435ab75367352fd7887f", + "67a4f93c6e2f4c9e884b17a32f2a839d", + "2e728b3a85e34a0f829b6fae1cfbe850", + "3829a22cc9db44b0a5209ac0bf5204c2", + 
"2ce5d9d3e16b4a069c35a798de6720e6", + "01d5ea780cde4d0595f863ba8a1f54d9", + "08c8a2ffdf85425ca4dd406859ae0b25", + "5d919eb39e224117b54cbea5912d9169", + "692368553b724434875f7317a58180cf", + "bc855c20a3404821847b147a9364e2b8", + "5554699bce4a411dad80f0bdb3c68938", + "258a65e8793148b3ab3149bf9b3e3607", + "05411df4e5974735944644e51d967223", + "d59b727af104495e885723dd8804750b", + "1068ee75eb234e81ad91f61c68c0ab0a", + "8961f8d3ee08481d80e78214d96bca7c", + "7753fbaa8b0843bfaf69d4b3a7f831eb", + "646da4c4a06e471186b995e96fae83ac", + "9061259e86b8416fa1dba1dd355145c1" + ] + }, + "colab_type": "code", + "id": "HjAeVWm44tL8", + "outputId": "327c99cb-2462-4d3e-ca9b-31e87f172c3c" + }, + "outputs": [], + "source": [ + "#Perform Tokenization\n", + "#Code modified from example given in https://huggingface.co/transformers/model_doc/bert.html\n", + "from transformers import BertTokenizer, BertForNextSentencePrediction,BertConfig\n", + "\n", + "config = BertConfig.from_pretrained('bert-base-uncased',output_hidden_states=True, output_attentions=True)\n", + "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n", + "model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased',config=config)\n", + "\n", + "#Encode all examples in place\n", + "for i in range(len(nsp_examples)):\n", + " nsp_examples[i] = tokenizer.encode(nsp_examples[i])\n", + "\n", + "#After encoding, need to do padding and attention mask before creating the tensor since example len needs to match\n", + "#Code to do padding and create attention mask came from the below link:\n", + "#https://colab.research.google.com/github/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb#scrollTo=izA3-6kffbdT\n", + "max_val = 0\n", + "for example in nsp_examples:\n", + " if(len(example) > max_val):\n", + " max_val = len(example)\n", + "\n", + "\n", + "padded = np.array([i + [0]*(max_val-len(i)) for i in nsp_examples])\n", + "attention_mask = 
np.where(padded != 0, 1, 0)\n", + "\n", + "#Create tensors to pass into model\n", + "input_ids = torch.tensor(padded) \n", + "attention_mask = torch.tensor(attention_mask)\n", + "nsp_labels = torch.tensor(nsp_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "283\n" + ] + } + ], + "source": [ + "print(max_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 + }, + "colab_type": "code", + "id": "cjPGijC_QEV4", + "outputId": "a9054b39-5a00-47f7-e62a-2a633f2d095d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10977\n", + "(10977, 283, 768)\n" + ] + } + ], + "source": [ + "#Code to do remove grad variable restriction from below link:\n", + "#https://colab.research.google.com/github/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb#scrollTo=izA3-6kffbdT\n", + "with torch.no_grad():\n", + " outputs = model(input_ids, attention_mask=attention_mask, next_sentence_label=nsp_labels)\n", + "\n", + "#Get BERT loss on the NSP task\n", + "loss = outputs[0]\n", + "hidden_states = outputs[2]\n", + "embedding_output = hidden_states[0]\n", + "print(len(embedding_output))\n", + "features= embedding_output.numpy()\n", + "print(np.shape(features))\n", + "\n", + "#Referenced to fix an error with model inputs:\n", + "#https://stackoverflow.com/questions/58682026/failed-to-find-data-adapter-that-can-handle-input-class-numpy-ndarray-cl\n", + "labels=np.asarray(author_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(loss)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "colab_type": "code", + 
"id": "rB7vr3Ho65mQ", + "outputId": "6df09b48-a54e-403f-fcaf-d6d59ab1fe16" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train on 8232 samples\n", + "Epoch 1/10\n", + "8232/8232 [==============================] - 345s 42ms/sample - loss: 1.0541 - acc: 0.6166\n", + "Epoch 2/10\n", + "8232/8232 [==============================] - 339s 41ms/sample - loss: 0.4656 - acc: 0.8256\n", + "Epoch 3/10\n", + "8232/8232 [==============================] - 351s 43ms/sample - loss: 0.2326 - acc: 0.9173\n", + "Epoch 4/10\n", + "8232/8232 [==============================] - 353s 43ms/sample - loss: 0.1152 - acc: 0.9614\n", + "Epoch 5/10\n", + "8232/8232 [==============================] - 351s 43ms/sample - loss: 0.0587 - acc: 0.9801\n", + "Epoch 6/10\n", + "8232/8232 [==============================] - 350s 42ms/sample - loss: 0.0291 - acc: 0.9921\n", + "Epoch 7/10\n", + "8232/8232 [==============================] - 350s 43ms/sample - loss: 0.0188 - acc: 0.9951\n", + "Epoch 8/10\n", + "8232/8232 [==============================] - 336s 41ms/sample - loss: 0.0147 - acc: 0.9960\n", + "Epoch 9/10\n", + "8232/8232 [==============================] - 355s 43ms/sample - loss: 0.0102 - acc: 0.9970\n", + "Epoch 10/10\n", + "8232/8232 [==============================] - 350s 43ms/sample - loss: 0.0058 - acc: 0.9984\n", + "2745/2745 [==============================] - 24s 9ms/sample - loss: 1.0556 - acc: 0.8186\n" + ] + }, + { + "data": { + "text/plain": [ + "[1.0555759338299, 0.81857926]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Referenced during creation of the model: https://keras.io/layers/recurrent/\n", + "\n", + "#Do 0.25 test split\n", + "train_features, test_features, train_labels, test_labels = train_test_split(features, labels)\n", + "\n", + "data_dim = 768\n", + "timesteps = max_val\n", + "a = tf.keras.Sequential()\n", + "a.add(layers.LSTM(32, return_sequences=True,\n", + " 
input_shape=(timesteps, data_dim))) \n", + "a.add(layers.Flatten())\n", + "a.add(layers.Dense(8, activation='softmax'))\n", + "a.compile(loss='sparse_categorical_crossentropy',\n", + " optimizer='rmsprop',\n", + " metrics=['accuracy'])\n", + "\n", + "a.fit(train_features,train_labels,epochs=10)\n", + "a.evaluate(test_features, test_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "M-xf5REM7HZp" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2745\n" + ] + } + ], + "source": [ + "print(len(test_features))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "TPU", + "colab": { + "collapsed_sections": [], + "machine_shape": "hm", + "name": "TestingNSP.ipynb", + "provenance": [] + }, + "environment": { + "name": "tf-gpu.1-15.m47", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/tf-gpu.1-15:m47" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "01d5ea780cde4d0595f863ba8a1f54d9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "IntProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "IntProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "Downloading: 100%", + "description_tooltip": null, + "layout": 
"IPY_MODEL_692368553b724434875f7317a58180cf", + "max": 231508, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_5d919eb39e224117b54cbea5912d9169", + "value": 231508 + } + }, + "05411df4e5974735944644e51d967223": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "08c8a2ffdf85425ca4dd406859ae0b25": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5554699bce4a411dad80f0bdb3c68938", + "placeholder": "​", + "style": "IPY_MODEL_bc855c20a3404821847b147a9364e2b8", + "value": " 
232k/232k [00:00<00:00, 1.75MB/s]" + } + }, + "1068ee75eb234e81ad91f61c68c0ab0a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9061259e86b8416fa1dba1dd355145c1", + "placeholder": "​", + "style": "IPY_MODEL_646da4c4a06e471186b995e96fae83ac", + "value": " 440M/440M [00:06<00:00, 72.0MB/s]" + } + }, + "1f075ce511ae4d0abf083036a65b8820": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_4d31a904f4d54aa3be7bce714a928e9f", + "IPY_MODEL_24217de2454b41b8b30db1fcd5a8ac3e" + ], + "layout": "IPY_MODEL_59de8356cd2e4edba767cb5e67b18192" + } + }, + "24217de2454b41b8b30db1fcd5a8ac3e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2e728b3a85e34a0f829b6fae1cfbe850", + "placeholder": "​", + "style": "IPY_MODEL_67a4f93c6e2f4c9e884b17a32f2a839d", + "value": " 433/433 [00:00<00:00, 18.2kB/s]" + } + }, + "258a65e8793148b3ab3149bf9b3e3607": { + "model_module": 
"@jupyter-widgets/controls", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_d59b727af104495e885723dd8804750b", + "IPY_MODEL_1068ee75eb234e81ad91f61c68c0ab0a" + ], + "layout": "IPY_MODEL_05411df4e5974735944644e51d967223" + } + }, + "2ce5d9d3e16b4a069c35a798de6720e6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2e728b3a85e34a0f829b6fae1cfbe850": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3829a22cc9db44b0a5209ac0bf5204c2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_01d5ea780cde4d0595f863ba8a1f54d9", + "IPY_MODEL_08c8a2ffdf85425ca4dd406859ae0b25" + ], + "layout": "IPY_MODEL_2ce5d9d3e16b4a069c35a798de6720e6" + } + }, + "3a372648383b4862b1ac4ebb005f21e8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + 
"description_width": "initial" + } + }, + "4b6ab1c05635435ab75367352fd7887f": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4d31a904f4d54aa3be7bce714a928e9f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "IntProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "IntProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "Downloading: 100%", + "description_tooltip": null, + "layout": "IPY_MODEL_4b6ab1c05635435ab75367352fd7887f", + "max": 433, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3a372648383b4862b1ac4ebb005f21e8", + "value": 433 + } + }, + "5554699bce4a411dad80f0bdb3c68938": { + 
"model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "59de8356cd2e4edba767cb5e67b18192": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": 
null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5d919eb39e224117b54cbea5912d9169": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "initial" + } + }, + "646da4c4a06e471186b995e96fae83ac": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "67a4f93c6e2f4c9e884b17a32f2a839d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "692368553b724434875f7317a58180cf": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7753fbaa8b0843bfaf69d4b3a7f831eb": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + 
"overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8961f8d3ee08481d80e78214d96bca7c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "initial" + } + }, + "9061259e86b8416fa1dba1dd355145c1": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bc855c20a3404821847b147a9364e2b8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d59b727af104495e885723dd8804750b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "IntProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "IntProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "Downloading: 100%", + "description_tooltip": null, + "layout": "IPY_MODEL_7753fbaa8b0843bfaf69d4b3a7f831eb", + "max": 440473133, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_8961f8d3ee08481d80e78214d96bca7c", + "value": 440473133 + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/src/Processing.ipynb b/src/Processing.ipynb new file mode 100644 index 0000000..d2c9491 --- /dev/null +++ b/src/Processing.ipynb @@ -0,0 +1,448 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Note: This notebook creates datasets out of the corresponding .txt files for the fantasy datasets\n", + "\n", + "!pip install nltk\n", + "\n", + "import nltk\n", + "nltk.download('punkt')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Referenced https://pythonspot.com/tokenizing-words-and-sentences-with-nltk/ to see how sent_tokenize works\n", + "from nltk.tokenize import sent_tokenize, word_tokenize" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Arthur Dataset**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Replacing 
def load_book_text(path):
    """Read a whole Project Gutenberg text into one space-joined string.

    The ``utf-8-sig`` encoding strips the BOM (U+FEFF) that some Gutenberg
    files start with; newlines become spaces so sentences that wrap across
    lines are not split apart by the tokenizer.

    Refs:
      https://stackoverflow.com/questions/8369219 (strip newlines)
      https://stackoverflow.com/questions/17912307 (\ufeff BOM fix)
    """
    with open(path, 'r', encoding="utf-8-sig") as fh:
        return fh.read().replace('\n', ' ')


def clean_book_sentences(sentences, banned=("Illustration",), label=1):
    """Clean sentence tokens from a book and attach a class label to each.

    Replaces the copy-pasted per-book cleaning cells (Arthur, Wizard of Oz)
    with one parameterized function: strips double quotes and terminal
    punctuation, drops any sentence containing a word in ``banned``, stops
    at the Gutenberg '***' end-of-book marker (everything after it is
    licensing boilerplate), and lowercases what remains.

    Returns (cleaned_sentences, labels); label 1 marks a fantasy example.
    Notebook usage: clean_book_sentences(sent_tokenize(load_book_text(
    "arthurmodded.txt"))), then write_labeled_csv(..., "arthur.csv").
    """
    final, labels = [], []
    for raw in sentences:
        # Strip newlines, quote marks, and sentence-terminal punctuation.
        s = raw.replace('\n', ' ').replace('"', '').replace('""', '')
        s = s.replace('.', '').replace('!', '').replace('?', '')
        # Skip unwanted lines (e.g. "[Illustration: ...]" captions).
        if any(word in s for word in banned):
            continue
        # Detected end-of-book boundary from Gutenberg.
        if s.startswith("***"):
            break
        final.append(s.lower())
        labels.append(label)
    return final, labels


def write_labeled_csv(sentences, labels, out_path):
    """Write (sentence, label) pairs to CSV with Sentences/Label headers."""
    frame = pd.DataFrame({'Sentences': sentences, 'Label': labels})
    frame.to_csv(out_path, index=False)
def clean_irishfairy_sentences(sentences, label=1):
    """Clean sentence tokens from the Irish Fairy Tales text.

    Strips straight and smart quotes, normalizes curly apostrophes, and
    drops terminal punctuation; filters out lines containing
    'Illustration' or 'CHAPTER'; stops at the Gutenberg '***' end-of-book
    marker; lowercases what remains.

    Fix: the original cell contained a leftover debug ``print(sentence)``
    for any sentence still containing '"' -- removed.

    Returns (cleaned_sentences, labels); label 1 marks a fantasy example.
    Notebook usage: clean_irishfairy_sentences(sent_tokenize(data)), then
    write the pairs to irishfairy.csv with pandas as before.
    """
    final, labels = [], []
    for raw in sentences:
        s = raw.replace('\n', ' ').replace('"', '').replace('""', '')
        s = s.replace('\u201c', '').replace('\u201d', '')   # smart double quotes
        s = s.replace('\u2019', "'")                        # curly apostrophe
        s = s.replace('.', '').replace('!', '').replace('?', '')
        # Drop captions and chapter headings.
        if "Illustration" in s or "CHAPTER" in s:
            continue
        # Everything past the '***' marker is Gutenberg boilerplate.
        if s.startswith("***"):
            break
        final.append(s.lower())
        labels.append(label)
    return final, labels
into sentence tokens\n", + "tokenized = sent_tokenize(data)\n", + "\n", + "#Replace unwanted tokens and punctuation\n", + "for i in range(len(tokenized)):\n", + " tokenized[i] = tokenized[i].replace('\\n', ' ')\n", + " tokenized[i] = tokenized[i].replace(\"\\\"\", '')\n", + " tokenized[i] = tokenized[i].replace('\"\"', '')\n", + " tokenized[i] = tokenized[i].replace('“', '')\n", + " tokenized[i] = tokenized[i].replace('”', '')\n", + " tokenized[i] = tokenized[i].replace(\"’\", \"'\")\n", + " tokenized[i] = tokenized[i].replace('.', '')\n", + " tokenized[i] = tokenized[i].replace('!', '')\n", + " tokenized[i] = tokenized[i].replace('?', '')\n", + "\n", + "#Filter out lines that contain unwanted words or formats, like illustrations\n", + "filtered = []\n", + "for sentence in tokenized:\n", + " if(\"Illustration\" not in sentence and \"CHAPTER\" not in sentence and not sentence.isupper()):\n", + " filtered.append(sentence)\n", + "\n", + "#Stop using sentences if they exist beyond detected end of book boundary from Gutenberg\n", + "filtered2 = []\n", + "for sentence in filtered:\n", + " if(sentence[0:3] != \"***\"):\n", + " filtered2.append(sentence)\n", + " else:\n", + " break\n", + "\n", + "#Final processing\n", + "final = []\n", + "labels = []\n", + "for sentence in filtered2:\n", + " final.append(sentence.lower())\n", + " labels.append(1)\n", + "#print(final)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import csv\n", + "import numpy as np\n", + "import pandas as pd \n", + "frame_data = zip(final, labels)\n", + "\n", + "df = pd.DataFrame(frame_data)\n", + "df_new = df.rename(columns={0: 'Sentences', 1: 'Label'})\n", + "df_new.to_csv('bookofwonder.csv', index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**World of Ice and Fire Dataset**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"#Replacing newlines referenced from: https://stackoverflow.com/questions/8369219/how-to-read-a-text-file-into-a-string-variable-and-strip-newlines\n", + "#\\uffef token fix https://stackoverflow.com/questions/17912307/u-ufeff-in-python-string\n", + "with open(\"iceandfire.txt\", 'r', encoding=\"utf-8-sig\") as file:\n", + " data = file.read().replace('\\n', ' ')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Split file into sentence tokens\n", + "tokenized = sent_tokenize(data)\n", + "\n", + "#Replace unwanted tokens and punctuation\n", + "for i in range(len(tokenized)):\n", + " tokenized[i] = tokenized[i].replace('\\n', ' ')\n", + " tokenized[i] = tokenized[i].replace(\"\\\"\", '')\n", + " tokenized[i] = tokenized[i].replace('\"\"', '')\n", + " tokenized[i] = tokenized[i].replace('“', '')\n", + " tokenized[i] = tokenized[i].replace('”', '')\n", + " tokenized[i] = tokenized[i].replace(\"’\", \"'\")\n", + " tokenized[i] = tokenized[i].replace('.', '')\n", + " tokenized[i] = tokenized[i].replace('!', '')\n", + " tokenized[i] = tokenized[i].replace('?', '')\n", + " \n", + "#Filter out lines that contain unwanted words or formats, like illustrations\n", + "filtered = []\n", + "for sentence in tokenized:\n", + " if(\"illustration\" not in sentence and \"CHAPTER\" not in sentence and len(sentence) > 0):\n", + " filtered.append(sentence)\n", + "\n", + "#Stop using sentences if they exist beyond detected end of book boundary from Gutenberg\n", + "filtered2 = []\n", + "for sentence in filtered:\n", + " if(sentence[0:3] != \"***\"):\n", + " filtered2.append(sentence)\n", + " else:\n", + " break\n", + "\n", + "#Final processing\n", + "final = []\n", + "labels = []\n", + "for sentence in filtered2:\n", + " final.append(sentence.lower())\n", + " labels.append(1)\n", + "#print(final)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import 
csv\n", + "import numpy as np\n", + "import pandas as pd \n", + "frame_data = zip(final, labels)\n", + "\n", + "df = pd.DataFrame(frame_data)\n", + "df_new = df.rename(columns={0: 'Sentences', 1: 'Label'})\n", + "df_new.to_csv('iceandfire.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}