diff --git a/src/Full_BERT_Type_Classifier.ipynb b/src/Full_BERT_Type_Classifier.ipynb new file mode 100644 index 0000000..bde170c --- /dev/null +++ b/src/Full_BERT_Type_Classifier.ipynb @@ -0,0 +1,417 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Full BERT Type Classifier", + "provenance": [], + "collapsed_sections": [], + "machine_shape": "hm" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "TPU" + }, + "cells": [ + { + "cell_type": "code", + "metadata": { + "id": "-S-iFkX0m3NE", + "colab_type": "code", + "outputId": "5ba5e679-de51-4a02-a1cb-cfd87e6a8063", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 672 + } + }, + "source": [ + "!pip install transformers" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Collecting transformers\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)\n", + "\u001b[K |████████████████████████████████| 573kB 3.1MB/s \n", + "\u001b[?25hCollecting sacremoses\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/99/50/93509f906a40bffd7d175f97fd75ea328ad9bd91f48f59c4bd084c94a25e/sacremoses-0.0.41.tar.gz (883kB)\n", + "\u001b[K |████████████████████████████████| 890kB 8.3MB/s \n", + "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.38.0)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.21.0)\n", + "Collecting sentencepiece\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/98/2c/8df20f3ac6c22ac224fff307ebc102818206c53fc454ecd37d8ac2060df5/sentencepiece-0.1.86-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)\n", + "\u001b[K |████████████████████████████████| 1.0MB 17.3MB/s \n", + "\u001b[?25hRequirement already 
satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)\n", + "Requirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from transformers) (0.7)\n", + "Requirement already satisfied: boto3 in /usr/local/lib/python3.6/dist-packages (from transformers) (1.12.43)\n", + "Collecting tokenizers==0.5.2\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)\n", + "\u001b[K |████████████████████████████████| 3.7MB 20.6MB/s \n", + "\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.18.3)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.12.0)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.1.1)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (0.14.1)\n", + "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2020.4.5.1)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (3.0.4)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.8)\n", + "Requirement already satisfied: botocore<1.16.0,>=1.15.43 in /usr/local/lib/python3.6/dist-packages (from boto3->transformers) (1.15.43)\n", + 
"Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /usr/local/lib/python3.6/dist-packages (from boto3->transformers) (0.3.3)\n", + "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.6/dist-packages (from boto3->transformers) (0.9.5)\n", + "Requirement already satisfied: docutils<0.16,>=0.10 in /usr/local/lib/python3.6/dist-packages (from botocore<1.16.0,>=1.15.43->boto3->transformers) (0.15.2)\n", + "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.6/dist-packages (from botocore<1.16.0,>=1.15.43->boto3->transformers) (2.8.1)\n", + "Building wheels for collected packages: sacremoses\n", + " Building wheel for sacremoses (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for sacremoses: filename=sacremoses-0.0.41-cp36-none-any.whl size=893334 sha256=b87f354dc0f3703370c24ace917b1e7a2cb844f78be0a3a56a7f6bdd8197ac05\n", + " Stored in directory: /root/.cache/pip/wheels/22/5a/d4/b020a81249de7dc63758a34222feaa668dbe8ebfe9170cc9b1\n", + "Successfully built sacremoses\n", + "Installing collected packages: sacremoses, sentencepiece, tokenizers, transformers\n", + "Successfully installed sacremoses-0.0.41 sentencepiece-0.1.86 tokenizers-0.5.2 transformers-2.8.0\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "HK8wfsrNm5-6", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.linear_model import LogisticRegression\n", + "import torch\n", + "import transformers as ppb\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "aiw1oGrym6w5", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#Read in dataframes, classify one real dataset vs all fantasy datasets\n", + "df_real = 
pd.read_csv(\"current_history_NYT.csv\")\n", + "df_dorothy = pd.read_csv(\"dorothy.csv\")\n", + "df_arthur = pd.read_csv(\"arthur.csv\")\n", + "df_wonder = pd.read_csv(\"bookofwonder.csv\")\n", + "df_irish = pd.read_csv(\"irishfairy.csv\")\n", + "df_iceandfire = pd.read_csv(\"iceandfire.csv\")" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "1uFmH1_iifNc", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#Since the lines in the realistic dataset may contain footnote numbers and formatting,\n", + "#code removes formatting, but not numbers since numbers may be important to history\n", + "#Referenced for formatting: https://stackoverflow.com/questions/13682044/remove-unwanted-parts-from-strings-in-a-column\n", + "df_real[\"Sentences\"] = df_real[\"Sentences\"].str.replace(\"*\", \"\")" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "TBhTWP4cu-3x", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#For BERT features, limit datasets with more than 1200 lines to 1200.\n", + "#This is done to avoid exceeding the RAM provided by free Colab\n", + "df_real = df_real[:1200]\n", + "df_dorothy = df_dorothy[:1200]\n", + "df_arthur = df_arthur[:1200]\n", + "df_wonder = df_wonder[:1200]\n", + "df_irish = df_irish[:1200]\n", + "df_iceandfire = df_iceandfire[:1200]" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "3gqfM9O8wDpT", + "colab_type": "code", + "colab": {} + }, + "source": [ + "from sklearn.utils import shuffle\n", + "\n", + "#Create batch dataframes that store combined realistic and fantasy data\n", + "dorothy_batch = df_real.append(df_dorothy, ignore_index=True)\n", + "arthur_batch = df_real.append(df_arthur, ignore_index=True)\n", + "wonder_batch = df_real.append(df_wonder, ignore_index=True)\n", + "irish_batch = df_real.append(df_irish, ignore_index=True)\n", + "iceandfire_batch = 
df_real.append(df_iceandfire, ignore_index=True)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "oNN6oJVQm-dJ", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#Place all batch columns into variables\n", + "\n", + "dorothy_sentences = dorothy_batch[\"Sentences\"]\n", + "dorothy_labels = dorothy_batch[\"Label\"]\n", + "\n", + "arthur_sentences = arthur_batch[\"Sentences\"]\n", + "arthur_labels = arthur_batch[\"Label\"]\n", + "\n", + "wonder_sentences = wonder_batch[\"Sentences\"]\n", + "wonder_labels = wonder_batch[\"Label\"]\n", + "\n", + "irish_sentences = irish_batch[\"Sentences\"]\n", + "irish_labels = irish_batch[\"Label\"]\n", + "\n", + "iceandfire_sentences = iceandfire_batch[\"Sentences\"]\n", + "iceandfire_labels = iceandfire_batch[\"Label\"]" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "qWoAY8A-nCNc", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#Code from this point downward is a modified version of base code for a BERT classifier from below link:\n", + "#https://colab.research.google.com/github/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb#scrollTo=izA3-6kffbdT\n", + "\n", + "model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')\n", + "\n", + "# Load pretrained model/tokenizer\n", + "tokenizer = tokenizer_class.from_pretrained(pretrained_weights)\n", + "model = model_class.from_pretrained(pretrained_weights)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "vTlNoa6enEAS", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#Create tokenized inputs for BERT\n", + "tokenized = iceandfire_sentences.apply((lambda x: tokenizer.encode(x, add_special_tokens=False)))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + 
"metadata": { + "id": "GhI6XFMPnHJE", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#Pad all sentences to greatest length because BERT needs all inputs to be the same length\n", + "max_len = 0\n", + "for i in tokenized.values:\n", + " if len(i) > max_len:\n", + " max_len = len(i)\n", + "\n", + "padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "rnunMGDTnKNC", + "colab_type": "code", + "outputId": "6f9a4e72-f873-45c7-a276-2ccc247356a1", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 33 + } + }, + "source": [ + "#Create attention mask on padded that tells BERT to avoid calculating attention on padding\n", + "attention_mask = np.where(padded != 0, 1, 0)\n", + "attention_mask.shape" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(2400, 180)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 50 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "f34MeBHcnOIF", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#Place variables in tensors since the library is a pytorch base\n", + "input_ids = torch.tensor(padded) \n", + "attention_mask = torch.tensor(attention_mask)\n", + "\n", + "#torch.no_grad disables autograd on the last_hidden_states variable\n", + "with torch.no_grad():\n", + " last_hidden_states = model(input_ids, attention_mask=attention_mask)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "B9jSKLZznQJp", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#Get only the [CLS] token feature\n", + "features = last_hidden_states[0][:,0,:].numpy()" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "xXhFmq0cnRUz", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#Assign labels variable 
based on current dataset\n", + "labels = iceandfire_labels" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "r9sJYh58nSpx", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#Split using .25 test split\n", + "train_features, test_features, train_labels, test_labels = train_test_split(features, labels)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "ApftcACunh_y", + "colab_type": "code", + "outputId": "31eb3a87-e20e-4877-f163-81733f9f9784", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 100 + } + }, + "source": [ + "#Fit logistic regression model with 100 epochs\n", + "lr_clf = LogisticRegression()\n", + "lr_clf.fit(train_features, train_labels)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, l1_ratio=None, max_iter=100,\n", + " multi_class='auto', n_jobs=None, penalty='l2',\n", + " random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n", + " warm_start=False)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 56 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "8hX31EiYnlKU", + "colab_type": "code", + "outputId": "b418e43f-e5c7-40fa-e9ce-cbe36fec1362", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 33 + } + }, + "source": [ + "#Get test accuracy\n", + "lr_clf.score(test_features, test_labels)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.9416666666666667" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 57 + } + ] + } + ] +} \ No newline at end of file diff --git a/src/NSPandAuthorship.ipynb b/src/NSPandAuthorship.ipynb new file mode 100644 index 0000000..88e7940 --- /dev/null +++ b/src/NSPandAuthorship.ipynb @@ -0,0 
+1,1295 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 683 + }, + "colab_type": "code", + "id": "A08cLa726vN0", + "outputId": "d814fa84-740b-45b0-99c1-038cf64d1746" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: transformers in /opt/conda/lib/python3.7/site-packages (2.8.0)\n", + "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.7/site-packages (from transformers) (2020.4.4)\n", + "Requirement already satisfied: requests in /opt/conda/lib/python3.7/site-packages (from transformers) (2.23.0)\n", + "Requirement already satisfied: tokenizers==0.5.2 in /opt/conda/lib/python3.7/site-packages (from transformers) (0.5.2)\n", + "Requirement already satisfied: sentencepiece in /opt/conda/lib/python3.7/site-packages (from transformers) (0.1.86)\n", + "Requirement already satisfied: numpy in /opt/conda/lib/python3.7/site-packages (from transformers) (1.18.3)\n", + "Requirement already satisfied: tqdm>=4.27 in /opt/conda/lib/python3.7/site-packages (from transformers) (4.45.0)\n", + "Requirement already satisfied: filelock in /opt/conda/lib/python3.7/site-packages (from transformers) (3.0.10)\n", + "Requirement already satisfied: boto3 in /opt/conda/lib/python3.7/site-packages (from transformers) (1.13.2)\n", + "Requirement already satisfied: sacremoses in /opt/conda/lib/python3.7/site-packages (from transformers) (0.0.43)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /opt/conda/lib/python3.7/site-packages (from requests->transformers) (1.25.9)\n", + "Requirement already satisfied: idna<3,>=2.5 in /opt/conda/lib/python3.7/site-packages (from requests->transformers) (2.9)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /opt/conda/lib/python3.7/site-packages (from requests->transformers) (3.0.4)\n", + "Requirement already satisfied: 
certifi>=2017.4.17 in /opt/conda/lib/python3.7/site-packages (from requests->transformers) (2020.4.5.1)\n", + "Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /opt/conda/lib/python3.7/site-packages (from boto3->transformers) (0.3.3)\n", + "Requirement already satisfied: botocore<1.17.0,>=1.16.2 in /opt/conda/lib/python3.7/site-packages (from boto3->transformers) (1.16.2)\n", + "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /opt/conda/lib/python3.7/site-packages (from boto3->transformers) (0.9.5)\n", + "Requirement already satisfied: click in /opt/conda/lib/python3.7/site-packages (from sacremoses->transformers) (7.1.1)\n", + "Requirement already satisfied: joblib in /opt/conda/lib/python3.7/site-packages (from sacremoses->transformers) (0.14.1)\n", + "Requirement already satisfied: six in /opt/conda/lib/python3.7/site-packages (from sacremoses->transformers) (1.14.0)\n", + "Requirement already satisfied: docutils<0.16,>=0.10 in /opt/conda/lib/python3.7/site-packages (from botocore<1.17.0,>=1.16.2->boto3->transformers) (0.15.2)\n", + "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /opt/conda/lib/python3.7/site-packages (from botocore<1.17.0,>=1.16.2->boto3->transformers) (2.8.1)\n", + "Requirement already satisfied: torch in /opt/conda/lib/python3.7/site-packages (1.5.0)\n", + "Requirement already satisfied: future in /opt/conda/lib/python3.7/site-packages (from torch) (0.18.2)\n", + "Requirement already satisfied: numpy in /opt/conda/lib/python3.7/site-packages (from torch) (1.18.3)\n" + ] + } + ], + "source": [ + "!pip install transformers\n", + "!pip install torch" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "A3BwpCu663-q" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.utils 
import shuffle\n", + "import torch\n", + "import transformers as ppb\n", + "import warnings\n", + "import collections\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import tensorflow as tf\n", + "from tensorflow.keras import layers\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "4bztdPRBcI1f" + }, + "outputs": [], + "source": [ + "#Read in dataframes, classify one real dataset vs all fantasy datasets\n", + "df_bird = pd.read_csv(\"bird_history.csv\")\n", + "df_NYT = pd.read_csv(\"current_history_NYT.csv\")\n", + "df_dorothy = pd.read_csv(\"dorothy.csv\")\n", + "df_arthur = pd.read_csv(\"arthur.csv\")\n", + "df_wonder = pd.read_csv(\"bookofwonder.csv\")\n", + "df_irish = pd.read_csv(\"irishfairy.csv\")\n", + "df_iceandfire = pd.read_csv(\"iceandfire.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Ae6UNU7RdWzo" + }, + "outputs": [], + "source": [ + "#Since the lines in the realistic dataset may contain footnote numbers and formatting,\n", + "#code removes formatting, but not numbers since numbers may be important to history\n", + "#Referenced for formatting: https://stackoverflow.com/questions/13682044/remove-unwanted-parts-from-strings-in-a-column\n", + "df_NYT[\"Sentences\"] = df_NYT[\"Sentences\"].str.replace(\"*\", \"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "nyL8UGe9_Wrq" + }, + "outputs": [], + "source": [ + "#Modifiable lists of which realistic and fantasy dataframes to consider when randomly choosing next sentences\n", + "real_list = [df_bird, df_NYT]\n", + "fantasy_list = [df_dorothy, df_arthur, df_wonder, df_irish, df_iceandfire]\n", + "\n", + "#List of datasets to pass to getTrain and use in NSP fine tuning\n", + "#Authorship labels assigned in 
order corresponding to this full list of datasets\n", + "df_list = [df_bird, df_NYT, df_dorothy, df_arthur, df_wonder, df_irish, df_iceandfire]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "SCoeCtLEdevm" + }, + "outputs": [], + "source": [ + "#Called in getTrain. \n", + "#Contains code that fills in half correct next sentences and half random sentences from opposite genre\n", + "def fillNSP(fill_list, ref_df, opposite_df):\n", + " half = int(len(fill_list) / 2)\n", + " sequence_list = []\n", + " #Code to get indices from https://www.geeksforgeeks.org/how-to-get-rows-index-names-in-pandas-dataframe/\n", + " index_list = fill_list.index.values.tolist()\n", + " #Fill in accurate next lines\n", + " for j in range(half):\n", + " current_sentence = fill_list.iloc[j].strip()\n", + " \n", + " #Check index first to preempt edge case where trying to access next sentence at end of df\n", + " index = index_list[j]\n", + " if(index + 1 < len(ref_df)):\n", + " next_sentence = ref_df[\"Sentences\"][index + 1].strip()\n", + " else:\n", + " #If at end of df, just step back 1 and use current sentence as next sentence\n", + " next_sentence = fill_list.iloc[j].strip()\n", + " current_sentence = ref_df[\"Sentences\"][index - 1].strip()\n", + " #Add formatting for first sentence\n", + " sequence = \"[CLS] \" + current_sentence + \" [SEP] \" + next_sentence + \" [SEP]\"\n", + " sequence_list.append(sequence)\n", + " \n", + " #Fill in random next_lines from the opposite genre\n", + " for j in range(half, len(fill_list)):\n", + " #Get index of df first since np.random.choice can't choose a random dataframe directly\n", + " random_df_index = np.random.choice(range(len(opposite_df)), 1)\n", + " random_df_index = random_df_index[0]\n", + " random_df = opposite_df[random_df_index]\n", + " random_index = np.random.choice(range(len(random_df)), 1)\n", + " random_index = random_index[0]\n", + " next_sentence = 
random_df[\"Sentences\"][random_index].strip()\n", + " \n", + " current_sentence = fill_list.iloc[j].strip()\n", + " #Add formatting for first sentence\n", + " sequence = \"[CLS] \" + current_sentence + \" [SEP] \" + next_sentence + \" [SEP]\"\n", + " sequence_list.append(sequence)\n", + " return sequence_list\n", + "\n", + "def getFeatures(df_list, real_list, fantasy_list, max_size):\n", + " #df_list is a full list of dataframes to get examples from\n", + " #real_list is a list of realistic dataframes to use\n", + " #fantasy_list is a list of fantasy dataframes to use\n", + " #max_size is the maximum number of examples to grab from any given dataset\n", + " #Returns a list of lists of next sentence prediction formatted examples, to be split into train/test hidden states outputs after being passed through the model.\n", + " #Return will be ordered in same order as the input. Authorship labels not added in this function\n", + "\n", + "\n", + " #Shuffling dataframe references https://stackoverflow.com/questions/29576430/shuffle-dataframe-rows\n", + "\n", + " #Split off the testing examples after shuffling\n", + " shuffled = []\n", + " for i in range(len(df_list)):\n", + " shuffled.append(shuffle(df_list[i]))\n", + "\n", + " sample_list = []\n", + "\n", + " for df in shuffled:\n", + " #Get size of split using max train and test_size\n", + " #If dataset is big enough, use max_size examples, else just split whole dataset\n", + " if(max_size < len(df)):\n", + " #Just grab max_size first examples since the dataframes are shuffled\n", + " df = df[:max_size]\n", + "\n", + " #Append tuples to fill in subsequent sentences\n", + " sample_list.append((df[\"Sentences\"], df[\"Label\"]))\n", + " \n", + " #Define lists of lists to fill and return\n", + " nsp_inputs = []\n", + "\n", + " #Fill train sequences\n", + " for i in range(len(sample_list)):\n", + " ref_df = df_list[i]\n", + " X = sample_list[i][0]\n", + " y = sample_list[i][1]\n", + "\n", + " #Since all dataframes have 
same label of realistic vs fantasy,\n", + " #use first index label to get whether currently working on a fantasy dataframe\n", + " fantasy = y.iloc[0]\n", + " if(fantasy == 1):\n", + " opposite_df = real_list\n", + " else:\n", + " opposite_df = fantasy_list\n", + " \n", + " sequence_list = fillNSP(X, ref_df, opposite_df)\n", + " nsp_inputs.append(sequence_list)\n", + " return nsp_inputs\n", + "\n", + "def assignLabels(nsp_inputs, used_labels, mixed_label=7):\n", + " #nsp_inputs is a list of lists of inputs divided by dataframe returned by getFeatures()\n", + " #used_labels is the corresponding authorship labels used in getFeatures, since getFeatures can use a subset of the data\n", + " #mixed_label is an index corresponding to an extra class beyond the datasets. It represents data that comes from two different authors\n", + " #The mixed label is used since we are using NSP examples as inputs into the authorship classifier, and it can be changed if the test requires it.\n", + " #Returns three values:\n", + " # 1. A list of all of the examples from all of the dataframes concatenated together\n", + " # 2. A list of nsp_labels where 0 indicates that the second sentence follows and 1 indicates the second sentence is random\n", + " # 3. 
A list of authorship labels corresponding to each example\n", + "\n", + " nsp_examples = []\n", + " nsp_labels = []\n", + " author_labels = []\n", + "\n", + " for i in range(len(nsp_inputs)):\n", + " examples = nsp_inputs[i]\n", + " author = used_labels[i]\n", + " half = float(len(examples) / 2)\n", + " for j in range(len(examples)):\n", + " nsp_examples.append(examples[j])\n", + " if(j < half):\n", + " nsp_labels.append(0)\n", + " author_labels.append(author)\n", + " else:\n", + " nsp_labels.append(1)\n", + " author_labels.append(mixed_label)\n", + " return nsp_examples, nsp_labels, author_labels\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "foDKfkTWxZzs" + }, + "outputs": [], + "source": [ + "#Use functions defined in previous block to get features, then assign labels and format features\n", + "\n", + "#Returns a list of lists of examples drawn from the dataframes listed in the first argument\n", + "nsp_inputs = getFeatures(df_list, real_list, fantasy_list, 2000)\n", + "\n", + "#Manually create a list of the authorship labels corresponding to the list passed in above\n", + "#For example, if df_arthur wasn't used in the first argument, then 3 should be excluded from this list\n", + "used_labels = [0, 1, 2, 3, 4, 5, 6]\n", + "\n", + "#Assign authorship and NSP labels here\n", + "nsp_examples, nsp_labels, author_labels = assignLabels(nsp_inputs, used_labels)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 164, + "referenced_widgets": [ + "1f075ce511ae4d0abf083036a65b8820", + "59de8356cd2e4edba767cb5e67b18192", + "4d31a904f4d54aa3be7bce714a928e9f", + "24217de2454b41b8b30db1fcd5a8ac3e", + "3a372648383b4862b1ac4ebb005f21e8", + "4b6ab1c05635435ab75367352fd7887f", + "67a4f93c6e2f4c9e884b17a32f2a839d", + "2e728b3a85e34a0f829b6fae1cfbe850", + "3829a22cc9db44b0a5209ac0bf5204c2", + 
"2ce5d9d3e16b4a069c35a798de6720e6", + "01d5ea780cde4d0595f863ba8a1f54d9", + "08c8a2ffdf85425ca4dd406859ae0b25", + "5d919eb39e224117b54cbea5912d9169", + "692368553b724434875f7317a58180cf", + "bc855c20a3404821847b147a9364e2b8", + "5554699bce4a411dad80f0bdb3c68938", + "258a65e8793148b3ab3149bf9b3e3607", + "05411df4e5974735944644e51d967223", + "d59b727af104495e885723dd8804750b", + "1068ee75eb234e81ad91f61c68c0ab0a", + "8961f8d3ee08481d80e78214d96bca7c", + "7753fbaa8b0843bfaf69d4b3a7f831eb", + "646da4c4a06e471186b995e96fae83ac", + "9061259e86b8416fa1dba1dd355145c1" + ] + }, + "colab_type": "code", + "id": "HjAeVWm44tL8", + "outputId": "327c99cb-2462-4d3e-ca9b-31e87f172c3c" + }, + "outputs": [], + "source": [ + "#Perform Tokenization\n", + "#Code modified from example given in https://huggingface.co/transformers/model_doc/bert.html\n", + "from transformers import BertTokenizer, BertForNextSentencePrediction,BertConfig\n", + "\n", + "config = BertConfig.from_pretrained('bert-base-uncased',output_hidden_states=True, output_attentions=True)\n", + "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n", + "model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased',config=config)\n", + "\n", + "#Encode all examples in place\n", + "for i in range(len(nsp_examples)):\n", + " nsp_examples[i] = tokenizer.encode(nsp_examples[i])\n", + "\n", + "#After encoding, need to do padding and attention mask before creating the tensor since example len needs to match\n", + "#Code to do padding and create attention mask came from the below link:\n", + "#https://colab.research.google.com/github/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb#scrollTo=izA3-6kffbdT\n", + "max_val = 0\n", + "for example in nsp_examples:\n", + " if(len(example) > max_val):\n", + " max_val = len(example)\n", + "\n", + "\n", + "padded = np.array([i + [0]*(max_val-len(i)) for i in nsp_examples])\n", + "attention_mask = 
np.where(padded != 0, 1, 0)\n", + "\n", + "#Create tensors to pass into model\n", + "input_ids = torch.tensor(padded) \n", + "attention_mask = torch.tensor(attention_mask)\n", + "nsp_labels = torch.tensor(nsp_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "283\n" + ] + } + ], + "source": [ + "print(max_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 + }, + "colab_type": "code", + "id": "cjPGijC_QEV4", + "outputId": "a9054b39-5a00-47f7-e62a-2a633f2d095d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10977\n", + "(10977, 283, 768)\n" + ] + } + ], + "source": [ + "#Code to do remove grad variable restriction from below link:\n", + "#https://colab.research.google.com/github/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb#scrollTo=izA3-6kffbdT\n", + "with torch.no_grad():\n", + " outputs = model(input_ids, attention_mask=attention_mask, next_sentence_label=nsp_labels)\n", + "\n", + "#Get BERT loss on the NSP task\n", + "loss = outputs[0]\n", + "hidden_states = outputs[2]\n", + "embedding_output = hidden_states[0]\n", + "print(len(embedding_output))\n", + "features= embedding_output.numpy()\n", + "print(np.shape(features))\n", + "\n", + "#Referenced to fix an error with model inputs:\n", + "#https://stackoverflow.com/questions/58682026/failed-to-find-data-adapter-that-can-handle-input-class-numpy-ndarray-cl\n", + "labels=np.asarray(author_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(loss)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "colab_type": "code", + 
"id": "rB7vr3Ho65mQ", + "outputId": "6df09b48-a54e-403f-fcaf-d6d59ab1fe16" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train on 8232 samples\n", + "Epoch 1/10\n", + "8232/8232 [==============================] - 345s 42ms/sample - loss: 1.0541 - acc: 0.6166\n", + "Epoch 2/10\n", + "8232/8232 [==============================] - 339s 41ms/sample - loss: 0.4656 - acc: 0.8256\n", + "Epoch 3/10\n", + "8232/8232 [==============================] - 351s 43ms/sample - loss: 0.2326 - acc: 0.9173\n", + "Epoch 4/10\n", + "8232/8232 [==============================] - 353s 43ms/sample - loss: 0.1152 - acc: 0.9614\n", + "Epoch 5/10\n", + "8232/8232 [==============================] - 351s 43ms/sample - loss: 0.0587 - acc: 0.9801\n", + "Epoch 6/10\n", + "8232/8232 [==============================] - 350s 42ms/sample - loss: 0.0291 - acc: 0.9921\n", + "Epoch 7/10\n", + "8232/8232 [==============================] - 350s 43ms/sample - loss: 0.0188 - acc: 0.9951\n", + "Epoch 8/10\n", + "8232/8232 [==============================] - 336s 41ms/sample - loss: 0.0147 - acc: 0.9960\n", + "Epoch 9/10\n", + "8232/8232 [==============================] - 355s 43ms/sample - loss: 0.0102 - acc: 0.9970\n", + "Epoch 10/10\n", + "8232/8232 [==============================] - 350s 43ms/sample - loss: 0.0058 - acc: 0.9984\n", + "2745/2745 [==============================] - 24s 9ms/sample - loss: 1.0556 - acc: 0.8186\n" + ] + }, + { + "data": { + "text/plain": [ + "[1.0555759338299, 0.81857926]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Referenced during creation of the model: https://keras.io/layers/recurrent/\n", + "\n", + "#Do 0.25 test split\n", + "train_features, test_features, train_labels, test_labels = train_test_split(features, labels)\n", + "\n", + "data_dim = 768\n", + "timesteps = max_val\n", + "a = tf.keras.Sequential()\n", + "a.add(layers.LSTM(32, return_sequences=True,\n", + " 
input_shape=(timesteps, data_dim))) \n", + "a.add(layers.Flatten())\n", + "a.add(layers.Dense(8, activation='softmax'))\n", + "a.compile(loss='sparse_categorical_crossentropy',\n", + " optimizer='rmsprop',\n", + " metrics=['accuracy'])\n", + "\n", + "a.fit(train_features,train_labels,epochs=10)\n", + "a.evaluate(test_features, test_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "M-xf5REM7HZp" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2745\n" + ] + } + ], + "source": [ + "print(len(test_features))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "TPU", + "colab": { + "collapsed_sections": [], + "machine_shape": "hm", + "name": "TestingNSP.ipynb", + "provenance": [] + }, + "environment": { + "name": "tf-gpu.1-15.m47", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/tf-gpu.1-15:m47" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "01d5ea780cde4d0595f863ba8a1f54d9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "IntProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "IntProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "Downloading: 100%", + "description_tooltip": null, + "layout": 
"IPY_MODEL_692368553b724434875f7317a58180cf", + "max": 231508, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_5d919eb39e224117b54cbea5912d9169", + "value": 231508 + } + }, + "05411df4e5974735944644e51d967223": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "08c8a2ffdf85425ca4dd406859ae0b25": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5554699bce4a411dad80f0bdb3c68938", + "placeholder": "​", + "style": "IPY_MODEL_bc855c20a3404821847b147a9364e2b8", + "value": " 
232k/232k [00:00<00:00, 1.75MB/s]" + } + }, + "1068ee75eb234e81ad91f61c68c0ab0a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9061259e86b8416fa1dba1dd355145c1", + "placeholder": "​", + "style": "IPY_MODEL_646da4c4a06e471186b995e96fae83ac", + "value": " 440M/440M [00:06<00:00, 72.0MB/s]" + } + }, + "1f075ce511ae4d0abf083036a65b8820": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_4d31a904f4d54aa3be7bce714a928e9f", + "IPY_MODEL_24217de2454b41b8b30db1fcd5a8ac3e" + ], + "layout": "IPY_MODEL_59de8356cd2e4edba767cb5e67b18192" + } + }, + "24217de2454b41b8b30db1fcd5a8ac3e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2e728b3a85e34a0f829b6fae1cfbe850", + "placeholder": "​", + "style": "IPY_MODEL_67a4f93c6e2f4c9e884b17a32f2a839d", + "value": " 433/433 [00:00<00:00, 18.2kB/s]" + } + }, + "258a65e8793148b3ab3149bf9b3e3607": { + "model_module": 
"@jupyter-widgets/controls", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_d59b727af104495e885723dd8804750b", + "IPY_MODEL_1068ee75eb234e81ad91f61c68c0ab0a" + ], + "layout": "IPY_MODEL_05411df4e5974735944644e51d967223" + } + }, + "2ce5d9d3e16b4a069c35a798de6720e6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2e728b3a85e34a0f829b6fae1cfbe850": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3829a22cc9db44b0a5209ac0bf5204c2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_01d5ea780cde4d0595f863ba8a1f54d9", + "IPY_MODEL_08c8a2ffdf85425ca4dd406859ae0b25" + ], + "layout": "IPY_MODEL_2ce5d9d3e16b4a069c35a798de6720e6" + } + }, + "3a372648383b4862b1ac4ebb005f21e8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + 
"description_width": "initial" + } + }, + "4b6ab1c05635435ab75367352fd7887f": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4d31a904f4d54aa3be7bce714a928e9f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "IntProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "IntProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "Downloading: 100%", + "description_tooltip": null, + "layout": "IPY_MODEL_4b6ab1c05635435ab75367352fd7887f", + "max": 433, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3a372648383b4862b1ac4ebb005f21e8", + "value": 433 + } + }, + "5554699bce4a411dad80f0bdb3c68938": { + 
"model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "59de8356cd2e4edba767cb5e67b18192": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": 
null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5d919eb39e224117b54cbea5912d9169": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "initial" + } + }, + "646da4c4a06e471186b995e96fae83ac": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "67a4f93c6e2f4c9e884b17a32f2a839d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "692368553b724434875f7317a58180cf": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7753fbaa8b0843bfaf69d4b3a7f831eb": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + 
"overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8961f8d3ee08481d80e78214d96bca7c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "initial" + } + }, + "9061259e86b8416fa1dba1dd355145c1": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bc855c20a3404821847b147a9364e2b8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d59b727af104495e885723dd8804750b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "IntProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "IntProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "Downloading: 100%", + "description_tooltip": null, + "layout": "IPY_MODEL_7753fbaa8b0843bfaf69d4b3a7f831eb", + "max": 440473133, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_8961f8d3ee08481d80e78214d96bca7c", + "value": 440473133 + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/src/Processing.ipynb b/src/Processing.ipynb new file mode 100644 index 0000000..d2c9491 --- /dev/null +++ b/src/Processing.ipynb @@ -0,0 +1,448 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Note: This notebook creates datasets out of the corresponding .txt files for the fantasy datasets\n", + "\n", + "!pip install nltk\n", + "\n", + "import nltk\n", + "nltk.download('punkt')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Referenced https://pythonspot.com/tokenizing-words-and-sentences-with-nltk/ to see how sent_tokenize works\n", + "from nltk.tokenize import sent_tokenize, word_tokenize" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Arthur Dataset**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Replacing 
def load_book_text(path):
    """Read a whole Project Gutenberg text into one space-joined string.

    The ``utf-8-sig`` encoding strips the BOM (U+FEFF) that some Gutenberg
    files start with; newlines become spaces so sentences that wrap across
    lines are not split apart by the tokenizer.

    Refs:
      https://stackoverflow.com/questions/8369219 (strip newlines)
      https://stackoverflow.com/questions/17912307 (\ufeff BOM fix)
    """
    with open(path, 'r', encoding="utf-8-sig") as fh:
        return fh.read().replace('\n', ' ')


def clean_book_sentences(sentences, banned=("Illustration",), label=1):
    """Clean sentence tokens from a book and attach a class label to each.

    Replaces the copy-pasted per-book cleaning cells (Arthur, Wizard of Oz)
    with one parameterized function: strips double quotes and terminal
    punctuation, drops any sentence containing a word in ``banned``, stops
    at the Gutenberg '***' end-of-book marker (everything after it is
    licensing boilerplate), and lowercases what remains.

    Returns (cleaned_sentences, labels); label 1 marks a fantasy example.
    Notebook usage: clean_book_sentences(sent_tokenize(load_book_text(
    "arthurmodded.txt"))), then write_labeled_csv(..., "arthur.csv").
    """
    final, labels = [], []
    for raw in sentences:
        # Strip newlines, quote marks, and sentence-terminal punctuation.
        s = raw.replace('\n', ' ').replace('"', '').replace('""', '')
        s = s.replace('.', '').replace('!', '').replace('?', '')
        # Skip unwanted lines (e.g. "[Illustration: ...]" captions).
        if any(word in s for word in banned):
            continue
        # Detected end-of-book boundary from Gutenberg.
        if s.startswith("***"):
            break
        final.append(s.lower())
        labels.append(label)
    return final, labels


def write_labeled_csv(sentences, labels, out_path):
    """Write (sentence, label) pairs to CSV with Sentences/Label headers."""
    frame = pd.DataFrame({'Sentences': sentences, 'Label': labels})
    frame.to_csv(out_path, index=False)
def clean_irishfairy_sentences(sentences, label=1):
    """Clean sentence tokens from the Irish Fairy Tales text.

    Strips straight and smart quotes, normalizes curly apostrophes, and
    drops terminal punctuation; filters out lines containing
    'Illustration' or 'CHAPTER'; stops at the Gutenberg '***' end-of-book
    marker; lowercases what remains.

    Fix: the original cell contained a leftover debug ``print(sentence)``
    for any sentence still containing '"' -- removed.

    Returns (cleaned_sentences, labels); label 1 marks a fantasy example.
    Notebook usage: clean_irishfairy_sentences(sent_tokenize(data)), then
    write the pairs to irishfairy.csv with pandas as before.
    """
    final, labels = [], []
    for raw in sentences:
        s = raw.replace('\n', ' ').replace('"', '').replace('""', '')
        s = s.replace('\u201c', '').replace('\u201d', '')   # smart double quotes
        s = s.replace('\u2019', "'")                        # curly apostrophe
        s = s.replace('.', '').replace('!', '').replace('?', '')
        # Drop captions and chapter headings.
        if "Illustration" in s or "CHAPTER" in s:
            continue
        # Everything past the '***' marker is Gutenberg boilerplate.
        if s.startswith("***"):
            break
        final.append(s.lower())
        labels.append(label)
    return final, labels
into sentence tokens\n", + "tokenized = sent_tokenize(data)\n", + "\n", + "#Replace unwanted tokens and punctuation\n", + "for i in range(len(tokenized)):\n", + " tokenized[i] = tokenized[i].replace('\\n', ' ')\n", + " tokenized[i] = tokenized[i].replace(\"\\\"\", '')\n", + " tokenized[i] = tokenized[i].replace('\"\"', '')\n", + " tokenized[i] = tokenized[i].replace('“', '')\n", + " tokenized[i] = tokenized[i].replace('”', '')\n", + " tokenized[i] = tokenized[i].replace(\"’\", \"'\")\n", + " tokenized[i] = tokenized[i].replace('.', '')\n", + " tokenized[i] = tokenized[i].replace('!', '')\n", + " tokenized[i] = tokenized[i].replace('?', '')\n", + "\n", + "#Filter out lines that contain unwanted words or formats, like illustrations\n", + "filtered = []\n", + "for sentence in tokenized:\n", + " if(\"Illustration\" not in sentence and \"CHAPTER\" not in sentence and not sentence.isupper()):\n", + " filtered.append(sentence)\n", + "\n", + "#Stop using sentences if they exist beyond detected end of book boundary from Gutenberg\n", + "filtered2 = []\n", + "for sentence in filtered:\n", + " if(sentence[0:3] != \"***\"):\n", + " filtered2.append(sentence)\n", + " else:\n", + " break\n", + "\n", + "#Final processing\n", + "final = []\n", + "labels = []\n", + "for sentence in filtered2:\n", + " final.append(sentence.lower())\n", + " labels.append(1)\n", + "#print(final)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import csv\n", + "import numpy as np\n", + "import pandas as pd \n", + "frame_data = zip(final, labels)\n", + "\n", + "df = pd.DataFrame(frame_data)\n", + "df_new = df.rename(columns={0: 'Sentences', 1: 'Label'})\n", + "df_new.to_csv('bookofwonder.csv', index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**World of Ice and Fire Dataset**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"#Replacing newlines referenced from: https://stackoverflow.com/questions/8369219/how-to-read-a-text-file-into-a-string-variable-and-strip-newlines\n", + "#\\uffef token fix https://stackoverflow.com/questions/17912307/u-ufeff-in-python-string\n", + "with open(\"iceandfire.txt\", 'r', encoding=\"utf-8-sig\") as file:\n", + " data = file.read().replace('\\n', ' ')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Split file into sentence tokens\n", + "tokenized = sent_tokenize(data)\n", + "\n", + "#Replace unwanted tokens and punctuation\n", + "for i in range(len(tokenized)):\n", + " tokenized[i] = tokenized[i].replace('\\n', ' ')\n", + " tokenized[i] = tokenized[i].replace(\"\\\"\", '')\n", + " tokenized[i] = tokenized[i].replace('\"\"', '')\n", + " tokenized[i] = tokenized[i].replace('“', '')\n", + " tokenized[i] = tokenized[i].replace('”', '')\n", + " tokenized[i] = tokenized[i].replace(\"’\", \"'\")\n", + " tokenized[i] = tokenized[i].replace('.', '')\n", + " tokenized[i] = tokenized[i].replace('!', '')\n", + " tokenized[i] = tokenized[i].replace('?', '')\n", + " \n", + "#Filter out lines that contain unwanted words or formats, like illustrations\n", + "filtered = []\n", + "for sentence in tokenized:\n", + " if(\"illustration\" not in sentence and \"CHAPTER\" not in sentence and len(sentence) > 0):\n", + " filtered.append(sentence)\n", + "\n", + "#Stop using sentences if they exist beyond detected end of book boundary from Gutenberg\n", + "filtered2 = []\n", + "for sentence in filtered:\n", + " if(sentence[0:3] != \"***\"):\n", + " filtered2.append(sentence)\n", + " else:\n", + " break\n", + "\n", + "#Final processing\n", + "final = []\n", + "labels = []\n", + "for sentence in filtered2:\n", + " final.append(sentence.lower())\n", + " labels.append(1)\n", + "#print(final)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import 
csv\n", + "import numpy as np\n", + "import pandas as pd \n", + "frame_data = zip(final, labels)\n", + "\n", + "df = pd.DataFrame(frame_data)\n", + "df_new = df.rename(columns={0: 'Sentences', 1: 'Label'})\n", + "df_new.to_csv('iceandfire.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}