diff --git a/evalstudent/utils.py b/evalstudent/utils.py
index ecdb44d..9188713 100644
--- a/evalstudent/utils.py
+++ b/evalstudent/utils.py
@@ -1,3 +1,4 @@
+import os
import pandas as pd
pd.options.mode.chained_assignment = None # Disabling pandas warnings triggered by display_classes
from IPython.core.display import HTML
@@ -20,15 +21,15 @@ def display_classes(essay_id, train_df):
Prints the essay text (keeping its exact original formatting) using colors to highlight discourse elements and their classes.
Uses only `predictionstring`, which is useful to display models predictions.
'''
-
+
# Handling submission format :
discourse_type = "class" if "discourse_type" not in train_df.columns else "discourse_type"
-
+
elements_df = train_df[train_df["id"] == essay_id]
essay_text = open(f'../../raw_data/train/{essay_id}.txt').read()
essay_words = essay_text.split()
formatted_essay = ""
-
+
# First we make sure discourse elements are in the text order
elements_df["prediction_list"] = elements_df["predictionstring"].map(lambda x : x.split())
elements_df["start_word_index"] = elements_df["prediction_list"].map(lambda x : int(x[0]))
@@ -38,7 +39,7 @@ def display_classes(essay_id, train_df):
# and then we highlight the exact part of the essay corresponding to the discourse class.
end_char = 0
for i, element in elements_df.iterrows():
- start_word = essay_words[element["start_word_index"]]
+ start_word = essay_words[element["start_word_index"]]
start_char = essay_text[end_char:].find(start_word) + len(essay_text[:end_char])
formatted_essay += essay_text[end_char:start_char]
for word_index in element["prediction_list"]:
@@ -68,4 +69,116 @@ def generate_predictionstring(discourse_start, discourse_end, essay_text):
word_end = word_start + len(essay_text[char_start:char_end].split())
word_end = min( word_end, len(essay_text.split()) )
predictionstring = " ".join( [str(x) for x in range(word_start,word_end)] )
- return predictionstring
\ No newline at end of file
+ return predictionstring
+
+## ADD ARTHUR ##
+
+def get_essay(id,mode='train'):
+ """Function to get the full text of an essay from the .txt file.
+
+ Args:
+ id (str): id of the essay
+ mode (str, optional): determines whether to access *train* or *test* texts. \
+ Defaults to 'train'.
+
+ Returns:
+ str: full text of the essay
+ """
+ with open(os.path.join(os.path.dirname(os.path.dirname(__file__)),
+ 'raw_data',
+ mode,
+ f'{id}.txt'),'r') as file:
+ txt = file.read()
+ return txt
+
+def slicering(ps,txt):
+ """
+ Allow for predictionstring to match with corresponding words of an essay.
+ Given a predictionstring of a portion of a text and the full text, the
+ function returns the portion of the text corresponding to the predictionstring.
+
+ Args:
+ ps (str): predictionstring of a discourse
+ txt (str): full text of an essay
+
+ Returns:
+ str: portion of the text corresponding to the predictionstring
+ """
+ ps_l = ps.split()
+ txt = txt.split()
+
+ return ' '.join(txt[int(ps_l[0]):int(ps_l[-1])+1])
+
+
+def css():
+ """
+ Apply custom.css to the notebook
+
+ Returns:
+ str: HTML style tag
+ """
+ styles = open("./styles/custom.css", "r").read()
+ return HTML('')
+
+
+
+def render_html(df):
+ """
+ Transforms each discourse into an html string with appropriate tags for
+ visualization.
+
+ Args:
+ df (DataFrame): dataframe containing discourse_type and discourse_text
+
+ Returns:
+ str: html string
+ """
+ if 'class' in df.keys():
+ class_='class'
+ else:
+ class_='discourse_type'
+
+ html = "<{0} style='padding: 2px'>{1} [{0}] {0}>"\
+ .format(df[class_],df['discourse_text'])
+
+ return html
+
+
+def comparison_text(prediction, ground_truth):
+ """
+ Allow for visual comparison of an essay with predicted classes vs the essay
+ with the true classes
+
+ Args:
+ prediction (str): essay with predicted classes in html formatting
+ ground_truth (str): essay with true classes in html formatting
+
+ Returns:
+ html: visual table
+ """
+
+
+ html = f"""
+
+
Legend -->
+
Lead
+
Position
+
Claim
+
Counterclaim
+
Rebuttal
+
Evidence
+
Concluding_Statement
+
+
+
+
+
Prediction
+
{prediction}
+
+
+
Ground Truth
+
{ground_truth}
+
+
+ """
+ return HTML(html)
diff --git a/notebooks/arthur/findings.ipynb b/notebooks/arthur/findings.ipynb
new file mode 100644
index 0000000..b21b496
--- /dev/null
+++ b/notebooks/arthur/findings.ipynb
@@ -0,0 +1,1488 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "ace5385f",
+ "metadata": {},
+ "source": [
+ "**Key learnings** \n",
+ " \n",
+ "This notebook gathers all the key learnings from the project, from EDA to deep dives on the predictions."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6f786b60",
+ "metadata": {
+ "toc": true
+ },
+ "source": [
+ "Table of Contents \n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f925070b",
+ "metadata": {},
+ "source": [
+ "# Imports and data loading"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "id": "6571aa17",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T16:33:56.995010Z",
+ "start_time": "2022-02-15T16:33:56.964549Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The autoreload extension is already loaded. To reload it, use:\n",
+ " %reload_ext autoreload\n"
+ ]
+ }
+ ],
+ "source": [
+ "# imports\n",
+ "%load_ext autoreload\n",
+ "%autoreload 2 \n",
+ "\n",
+ "import pickle\n",
+ "import os\n",
+ "import random\n",
+ "from tqdm.notebook import tqdm\n",
+ "\n",
+ "import pandas as pd \n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt \n",
+ "import seaborn as sns\n",
+ "\n",
+ "from IPython.display import HTML \n",
+ "from termcolor import colored"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "e01ed419",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T16:30:05.554782Z",
+ "start_time": "2022-02-15T16:30:05.533905Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# utils var\n",
+ "sns.set_style('whitegrid')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "155795d2",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T16:30:06.263710Z",
+ "start_time": "2022-02-15T16:30:05.556927Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# data loading\n",
+ "df = pd.read_csv('../raw_data/train.csv')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "69d9f13b",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T16:30:07.610686Z",
+ "start_time": "2022-02-15T16:30:06.267018Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "#Max len of essay \n",
+ "SEQ_LEN = 1024 ## THIS SHOULD NOT BE CHANGED without appropriate changes in the preprocessing \n",
+ "\n",
+ "#Train, val, test split proportion\n",
+ "VAL_SPLIT = 0.8\n",
+ "TEST_SPLIT = 0.9\n",
+ "\n",
+ "#only to get length at this stage\n",
+ "df_essays = pd.read_csv('../raw_data/preprocessed_v3.csv')\n",
+ "LEN=len(df_essays)\n",
+ "del df_essays\n",
+ "\n",
+ "idx_val=int(LEN*VAL_SPLIT)\n",
+ "idx_test=int(LEN*TEST_SPLIT)\n",
+ "\n",
+ "idx_train=list(range(0,idx_val))\n",
+ "idx_val=list(range(idx_val,idx_test))\n",
+ "idx_test=list(range(idx_test,LEN))\n",
+ "\n",
+ "assert(len(idx_test)+len(idx_train)+len(idx_val)==LEN)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "c57dcd1e",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T16:30:07.638833Z",
+ "start_time": "2022-02-15T16:30:07.612877Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "#labels map\n",
+ "labels_mapping = {'B-Lead' : 0,\n",
+ " 'B-Position' : 1,\n",
+ " 'B-Evidence' : 2,\n",
+ " 'B-Claim' : 3,\n",
+ " 'B-Concluding_Statement' : 4,\n",
+ " 'B-Counterclaim' : 5,\n",
+ " 'B-Rebuttal' : 6,\n",
+ " 'I-Lead' : 7,\n",
+ " 'I-Position' : 8,\n",
+ " 'I-Evidence' : 9,\n",
+ " 'I-Claim' : 10,\n",
+ " 'I-Concluding_Statement' : 11,\n",
+ " 'I-Counterclaim' : 12,\n",
+ " 'I-Rebuttal': 13,\n",
+ " 'O':14,\n",
+ " 'PAD':15}\n",
+ "\n",
+ "reversed_mapping = {v:(k[2:] if v<14 else k) for k,v in labels_mapping.items()}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0454f0cf",
+ "metadata": {
+ "heading_collapsed": true
+ },
+ "source": [
+ "# Exploratory data analysis"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c5be3194",
+ "metadata": {
+ "hidden": true
+ },
+ "source": [
+ "## Are there many sentences that overlap 2 different discourses? "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "3f56b957",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T16:30:08.773477Z",
+ "start_time": "2022-02-15T16:30:07.640570Z"
+ },
+ "hidden": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Proportion of sentences overlapping 2 different discourses : 33.05%\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Check whether the discourse type often changes in the middle of a sentence\n",
+ "count=0\n",
+ "for i in range(len(df)-1):\n",
+ " if '.' in df.loc[i,'discourse_text'][-3:]:\n",
+ " count+=1 \n",
+ " \n",
+ "print(f'Proportion of sentences overlapping 2 different discourses : {(1-count/len(df))*100:.2f}%')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b13d9207",
+ "metadata": {
+ "hidden": true
+ },
+ "source": [
+ "## Length distribution of discourses"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "07d102e2",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T16:30:09.260800Z",
+ "start_time": "2022-02-15T16:30:08.775627Z"
+ },
+ "hidden": true
+ },
+ "outputs": [],
+ "source": [
+ "#create discourse_len feature\n",
+ "df['discourse_w_len']=df['discourse_text'].apply(lambda txt : len(txt.split()))\n",
+ "df['discourse_c_len']=df['discourse_text'].apply(len)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "8c47696c",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T16:30:12.028809Z",
+ "start_time": "2022-02-15T16:30:09.263079Z"
+ },
+ "hidden": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# plotting length both in terms of words and characters\n",
+ "\n",
+ "fig, axs = plt.subplots(1,2,figsize=(15,5),tight_layout=True)\n",
+ "\n",
+ "sns.histplot(x=df['discourse_w_len'],kde=True,ax=axs[0],color='lightblue')\n",
+ "sns.histplot(x=df['discourse_c_len'],kde=True,ax=axs[1],color='orange')\n",
+ "\n",
+ "axs[0].set_title('Length in term of words',size=14)\n",
+ "axs[1].set_title('Length in term of characters',size=14)\n",
+ "\n",
+ "axs[0].set_xlim((0,300))\n",
+ "axs[1].set_xlim((0,1500))\n",
+ "\n",
+ "plt.suptitle('Distribution of lengths',size=18,weight='bold');"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b6eebfa1",
+ "metadata": {
+ "hidden": true
+ },
+ "source": [
+ "### Length distribution across discourse types"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "444f6d11",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T16:30:12.342088Z",
+ "start_time": "2022-02-15T16:30:12.031135Z"
+ },
+ "hidden": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plt.figure(figsize=(16,8),tight_layout=True)\n",
+ "sns.boxplot(y=df['discourse_w_len'],x=df['discourse_type'],showfliers=False)\n",
+ "\n",
+ "plt.xlabel('Discourse type',size=16,labelpad=20)\n",
+ "plt.ylabel('# words',size=16,labelpad=20)\n",
+ "\n",
+ "plt.xticks(size=14)\n",
+ "plt.yticks(size=14)\n",
+ "\n",
+ "plt.title('Length distribution across discourse types',size=18,pad=20,weight='bold');\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "19c3bc9a",
+ "metadata": {
+ "hidden": true
+ },
+ "source": [
+ "## Length distribution of essays"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "ae5dc133",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T16:30:13.224269Z",
+ "start_time": "2022-02-15T16:30:12.347075Z"
+ },
+ "hidden": true,
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "## agg by essay id\n",
+ "df_essays = df.groupby('id').agg({'discourse_c_len':sum,'discourse_w_len':sum})\n",
+ "\n",
+ "#plot\n",
+ "fig, axs = plt.subplots(1,2,figsize=(15,5),tight_layout=True)\n",
+ "\n",
+ "sns.histplot(x=df_essays['discourse_w_len'],kde=True,ax=axs[0],color='lightblue')\n",
+ "sns.histplot(x=df_essays['discourse_c_len'],kde=True,ax=axs[1],color='orange')\n",
+ "\n",
+ "axs[0].set_title('Length in term of words',size=14)\n",
+ "axs[1].set_title('Length in term of characters',size=14)\n",
+ "\n",
+ "#sns.lineplot(x=[1024,1024],y=[1,740],color='r',ax=axs[0])\n",
+ "axs[0].axvline(1024,ls='--',lw=1.5,color='r')\n",
+ "axs[0].arrow(x=1200,y=600,dx=-160,dy=-100,width=10,length_includes_head=True,)\n",
+ "axs[0].annotate('SEQ_LEN',(1200,600),size=12)\n",
+ "axs[0].annotate('1024',(1030,760),size=12)\n",
+ "\n",
+ "plt.suptitle('Distribution of lengths',size=18, weight='bold');"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dbb613a2",
+ "metadata": {
+ "hidden": true
+ },
+ "source": [
+ "## Distribution of discourse types"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "95058dec",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T16:30:13.453224Z",
+ "start_time": "2022-02-15T16:30:13.226426Z"
+ },
+ "hidden": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAwYAAAI7CAYAAABWVYwTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAABZiklEQVR4nO3dd1iV9f/H8ddhoyiKkqaooeXeImriJDNN08q+mqUNK3OVZYmauLepFY6yHKWW5iotv47ckIIzTDEL94icKEPGOef3hz/OVwRZCTfK83FdXpfc94dz3uc+B7hf92fcJqvVahUAAACAAs3O6AIAAAAAGI9gAAAAAIBgAAAAAIBgAAAAAEAEAwAAAAAiGAAPHIvFYnQJqeS3emA8PhPGHAOO+7/HMcSDjmAA5IHWrVurSpUqtn/VqlVTw4YN1bNnT+3cuTNV21WrVqlKlSpq3bp1tp7j+vXrGjNmjNasWZNp2/SeI6W20NDQbD3v3Rw/flyvvfaazp8/b9uWchxWrVp1T57jXpo1a5aaNm2qWrVqqWPHjlk+AciLY/kgWbt2rT744AOjy8iW0NBQ23t6L4SHh6tLly6ptuXmZ8ZsNuubb77RxIkT78nj/ZvjkZ9/B2Rm586d6tWrl9FlALnKwegCgILE3d1dzs7OSkxMVHR0tEJDQxUWFqaJEyfq2WeflSS5urqqVKlS8vT0zNZjv/TSSzp27Jhq1qyZaducPkdW/fPPP3rmmWeUlJSUarunp6eSk5Pl6uqaK8+bUydPntRnn30m6daxcXR0lJ1dzq+blCpVSpLk5OR0T+p7UCxYsECTJk2Sr6+v0aVki5OTk+09/bcOHTqkrl275umV54kTJ2rRokW23zH/1r85Hvn1d0BmNm7cqAEDBqhs2bJGlwLkKoIBkIeGDBmi5557TpJ09epVjRgxQhs3btSYMWPUqlUrFStWTO3atVO7du2y/dixsbFZbpvT58iqxMTENKFAkpYtW5Zrz/lvXLx40fb/X3/9VYUKFfpXj7djx45/W9IDKSYmxugScqRevXr37D2Ni4vL8+Eo9/q4/5vjkV9/B2Tmfv3sAtnFUCLAIMWLF9fEiRNVqFAhxcXFad26dZLSH5qyf/9+9ezZU40aNVLdunXVsWNHff/997b9rVu31rlz5yRJQ4cOtX1vjx49VKVKFX3xxRfq1KmT6tevr7lz52Y4XOnvv//Wm2++qdq1a+uJJ55I9Yf87NmztiEEZ8+etW1PeZ6goCCdPXtW/v7+tn3+/v4aMmSIrc47hxHcuHFD48ePV8uWLVWzZk21a9dOX3/9tW6/KfuQIUNUpUoVzZ07V/Pnz1eLFi1Up04dvf3224qKisrwOCcmJiooKEht2rRRzZo15e/vr88++0yJiYmSpKCgIL388su29vXq1VNQUFC6j2W1WjV79mw1a9ZMdevW1aBBg3Tjxo007e4cFhIdHa1Ro0apVatWqlWrlpo3b67AwEBFR0en+r5ff/1V3bp1U+3atdW4cWP169dPJ06cSNXm4MGDeuONN+Tj46N69eqpV69eCg8Pz/D5bz+GKe9FynCQp556SnPmzFG9evXk7++vmJiYTD9v0q0gOnr0aDVu3Fi1a9dWt27dtGvXrru+D0FBQZo5c6YkKSwsTFWqVNHPP/+s6tWrq0qVKtqzZ4+tbWRkpKpUqaLq1avrn3/+sX1ee/XqpXXr1qlt27aqXbu2evXqpdOnT6d6nuDgYD333HOqWbOm/Pz8NG7cOMXFxdn2X7hwQYMGDVLz5s1Vq1YttW7dWpMnT7Z9HtKT3tCZlM/8Dz/8oGnTpqlJkyaqV6+ePvzww7ueRIaGhqpnz56p3qc7P2vXrl3T+++/r7p168rPz09z585Ntf/SpUsaNGiQ7f1//fXXdeTIkbvWPmTIEK1evVqStHr1atvPblBQkKpUqaKhQ4dqwIABqlOnjvr27StJ+vPPP/Xmm2+qUaNGqlWrltq0
aaPZs2fbfib/zfG483fA7e/tjh071LFjR9WqVUvPPfec9u3bl+q1BAcHq1OnTrb94eHhatq0aZrfR7ezWq2aN2+e2rVrpzp16qhJkyZ666239Mcff6Rqd+jQIfXo0cP2szd06FBduXLFVuPQoUMlSefOnbtvh0IBWUGPAWAgNzc31apVS6GhoQoPD1f37t3TtImKilKvXr0UFxenQoUKydHRUceOHVNgYKDc3NzUvn17eXp66u+//5bZbJa7u3uaIUJBQUGyt7eX2WxWrVq1dOHChbvWNHLkSCUnJ8vOzk5nzpzRiBEjVKxYMbVt2zZLr8nBwUGenp62q/Cenp5yd3dPt+3NmzfVvXt3HTt2TJJUuHBhHT9+XBMmTNCJEyc0atSoVO2XLl2q8+fPy9XVVTdv3tTWrVvl5ORkGwZ0J6vVqrffflshISG2xz979qxmzZqlw4cPa86cOXJzc1Px4sV19epVSbeGAbm5uaX7eDNnzrSd3BYqVEjr1q3T5s2bMz0mw4YN0y+//CJ7e3u5u7vr4sWL+v7773XhwgV99dVXkm6d9Lz11lsym81ycXFRbGysfvnlFx06dEhr1qxRsWLFtHv3br3xxhtKSkqSo6Oj7fvCwsI0f/58NWzYMNNa7nT27Fl98sknKlq0qEqVKqXY2NhMP29Wq1V9+/bV7t275eDgoMKFC+vAgQN64403tHDhwnTrcHNzU+HChRUbGytHR0d5eHioTJkyatq0qXbs2KENGzbYvm/Dhg2SpMaNG+uhhx6yPUZERITef/99ubq6KjExUcHBwXrllVe0du1aubm5adeuXbZjWLRoUV27dk2LFi3SiRMnNG/ePElSnz59FBERIUdHRxUpUkTnzp3T/PnzFRcXp9GjR2f7+H322Wf6+++/5ezsrLi4OK1Zs0YlS5ZUQEBAmrZOTk6ZftYCAwOVkJAgs9msixcvatq0aapRo4aaNm2qmzdvqmfPnoqMjJSTk5OcnZ0VEhKiAwcOaNWqVfL29k7znO7u7nJ1dVV8fLxcXV1VtGhROTj870//2rVrZbFY5OzsrMqVK+vmzZt6/fXX9c8//6hQoUJydnbW6dOn9emnn8rLy0vPPPPMPTset/vzzz/Vp08fubi4KDExUYcPH9a7776rrVu3ytHRUb/99pt69+6t5ORkOTg4KDIyUr169VJycnKGj/vNN99oypQpkm5djImJidH27dv1+++/a+PGjXJzc9Nff/2lHj16KD4+XoULF1ZcXJxWrVql33//XStXrpSrq6vc3d0VHR0te3t7lSxZ8r4bCgVkFT0GgMFKliwpSbp8+XK6+3/77TfFxcWpdu3a2rt3r/bs2aP+/furZcuWunnzpqRb3fOlS5eWdOsK4Z3d9UWKFNHWrVsVHByc6fjuihUrKiQkRHv27FGzZs0k3ZqYm1WlS5fW0qVLbV8vXbrUdrXtTosWLdKxY8fk7u6uH374Qfv379fkyZMlSd99912aK+GXLl3S999/r3379uk///mPJNlO+tPz3//+VyEhIXJ0dNSCBQu0f/9+LVy4UI6Ojtq2bZvWr1+v1157TZ9++qnte3bs2KHXXnstzWMlJiZq4cKFkqQuXbpo79692rJly11Dz+1SrqSvXr1au3bt0vfff68GDRrIy8tLCQkJkqRPPvlEZrNZTz31lPbu3avg4GBVqFBBly9f1tatWyVJY8eOVVJSklq2bKk9e/Zoz549atmypRITE9OEqKxKSkrSgAEDtGfPHs2cOTNLn7edO3dq9+7dKl++vHbu3KmwsDCNGjVKycnJtuB0p9dee812XFOGotSrV8827n3jxo22K9KbNm2SJHXs2DHVY1y+fFn9+vXT/v379cMPP6hQoUI6f/68VqxYIUmaMWOGzGazhg4dqj179igkJESVKlVScHCw9u7dq2vXrtlCwc6dO7Vr1y7NmTNHvr6+KlKkSI6OX0JCgtavX6+wsDDbz8vdPpP16tXL9LNWtWpV7dq1
S7/++qttPPv27dsl3fr8REZGql69etq9e7f27Nmj3r17Ky4uTl9++WW6zzl06FA99dRTkqSnnnpKO3bssP2ukG69/wsXLtSePXv02muv6cyZM6pcubKaNm1qe4727dtLUpqfx397PG4XFRWloUOHat++fbbfARcvXtSff/4pSfriiy+UnJysqlWrKjg4WHv27FHTpk1T9QalJ+Vnb/jw4dq9e7eCg4Pl5+cnPz8/W2/jrFmzFB8fr1deeUV79+5VaGioGjVqpGPHjmndunVq166draetdOnS2rFjR64OxQSMRI8BYDCTySTp7svgVatWTY6OjgoPD9fLL7+spk2bqkmTJurbt6/s7e2z9Bx+fn7y8PDIUtuePXvaTnbfeOMN7dy5U3/99VemV+ZyIuWE94UXXlC1atUkSZ07d9aSJUsUHh6uLVu2qHbt2rb2DRs2tH3t7++v77//PsO5FSmP/8QTT+jxxx+XJDVp0kRPPPGE/vvf/2rr1q22k57MHD9+3DYkon///rK3t9fDDz+s559/PtPgVLt2bdvV7FatWsnX11ezZ89WsWLFJN0ad/77779Lkl599VU5OjrK3d1dixYtUvHixeXk5KTTp0/rr7/+kiQNHjzYdsVy8ODB2rZtm/766y+dPn1a5cuXz9LruV3KVWAPD48sfd7CwsIk3Zpk3rlzZ0n/+/zu27cvVY9GZvz9/VWkSBFFRUXpwIEDeuihh3TkyBG5uLioTZs2qdo6OTnp7bfflslkUtWqVfXkk0/aAmXXrl116NAhSdKXX36p+fPnS7q1Wpck7d69Wz4+PipfvrxOnz6tbt26qWXLlvL19dUXX3yR43kl/v7+tmPeqlUr7dy5M1vzfe7UrVs3Wy116tTRuXPnbJ+7lOP+xx9/2E5MU4ZA5XQ1o9KlS9suFri7u8vd3V3z5s1TQkKCwsPDtX//fttQpay8rpweDxcXF7300kuSpCeffNLWw5Dyvfv375ckvfLKKypevLgkaeDAgfrvf/+b4ePWrl1bW7du1fTp07V//341btxYo0aNUrly5WxtUo7rjz/+qPXr10v635yC0NBQ22ccKAjoMQAMljKONeWP3Z3KlSunoKAgPfroo9q/f7+CgoL00ksvqXXr1vr111+z9BzZWX0opQdDkm0Yh9lsttWZnpyGhpReEi8vr1TbU76+sxfl9mPk4uIiSanmItwppeasPn5Gbh8nnd4xysjUqVP1xBNP6PLly/ruu+/03nvvyc/Pz7Z85PXr122vIyUsSLeGmqSsbHR7rbe/ntv/f+nSpbvWkNF7dPvnIyuft5S5ETdv3lRUVJSioqJsQ8eSkpJ07dq1DI/H7ZydnW3hbMOGDbbeglatWqUZZuPu7p4qcKQc+xs3buj69eu2cHLp0iVbXfHx8ZJuhRjp1pXnxo0b6/Tp01q4cKH69u2rpk2b2oYaZVd2P5PZeTxnZ2dJ/wtdKcc9Li7O9vpShiWlvL7suvN3g9ls1rhx4+Tr66uXX35Zq1atsg09ysrryunxcHd3t10kuX2YTsprT/n5u/3n7eGHH870cXv37q1XX31V9vb2WrdunUaMGKEnnnhCvXr1soXGlON67do123FNCSQ5Pa7A/YoeA8BAiYmJtivFt18Zv1PLli3VqFEjXbx4UWFhYbYhMoMHD1ZwcHCmz5PyBzorbr/vQMqJpslkUpEiRWwnISm1p7hzsmXKH/jMlCxZUidPnrRNnE6RMpHw9hNwSanGRmflOUqUKCFJWX78jNx+wh4VFWU7Ic9s8rN06+QrZXhEWFiYwsLCtHjxYi1cuFBNmzaVj4+PTCaTrFar/vnnH9tY8b179+rq1auqWbNmqlrPnj2rSpUqpXotKc8jyfZYGb1Ht7vz85HZ5y3luLZq1Uqff/65pFufB5PJlGFPwd3es86dO2vZsmXauHGjbRnMO4cRSbeCXsrcB+l/q0kVK1ZMHh4esrOzk8Vi0apVq1SjRg1Jt644Fy5c2PYY
FStW1KxZs5SYmKjQ0FDt2rVLy5Yt05QpU9SyZUvbcc2q7H4mM2uT0TK5Kce9R48eGj58uKRbQ3cyW143o+e8871funSpFi1apEcffVTz5s1T6dKlNWPGDFtvVWayezyy+n0lSpTQ33//rb///tu27fbfVXdjb2+v999/XwMHDtThw4e1Z88efffddwoODtbcuXP1wQcf2B47KChITz75pCSl+pxl97UA9zN6DACDxMTEaNy4cbp+/bpcXV3TPRGSpIULF6p+/fp68cUXVbx4cb3wwgu2ScrR0dG2K8Epf1hjYmLSXB3Ozh+1BQsW6J9//lFiYqJtYmz16tVtExdTHDhwQNKtcceRkZGpHuP2P/Lp1ZPCz89PkrR8+XIdPXpUkrRmzRrbWObbVzfKiZTH37Rpk3bv3i3p1pCSX375JduP7+3tbRuONWfOHCUnJ+vs2bO28e13c/bsWTVr1kw+Pj46dOiQWrdurf79+9uufF69elWFChWy3X9iwYIFSkxM1I0bNzRixAj1799fP/74o8qVK6cKFSpIkj7++GPFx8fr5s2b+vjjjyVJlStXtg2PSBkKlvIenTlzJtWqP3e6/fORlc9bgwYNJN1aRSll+E5QUJDq1aun/v373/V5UoYipYSUlM9F/fr19cgjj+j8+fM6cOCAihUrpubNm6f5frPZrM8++0wWi0WRkZG23oUGDRrI0dHRFq7nz5+vpKQkRUVFqXXr1mratKlCQ0NtQ0maNWumy5cvq127durfv7/tCvXtwTe33D78L6OfjfSkHPf//ve/OnPmjCwWi4YNG6Z69epp3LhxmT5nTEyMrFZrqmGLd/5uSBnT7+LiIg8PD128eNF2nI2862/Ka1+0aJGuXbumhIQEzZgxI8PvsVqt6tatm+rWrat58+apQYMGeuutt1S3bl1J/3u/b3/s2NhYxcTE6Nlnn1WjRo20du1aSf87hinLzebG0EogPyAYAHlo0qRJat68ufz8/NS4cWPbJOHhw4ffdSiRv7+/nJycdPToUT3++OO2ZSwlqUOHDraT8JSJiilXPnMqPj5eLVq0UMOGDW2THlOWMUxZRSml5qefflrdu3dPMxyhePHitqttL774ot5///10n6tHjx7y9vbWtWvXbMupfvjhh5Kkl19+OUs3a8tI+/bt5ePjo6SkJL3yyitq0KCBXnnlFSUlJal169a2q4NZYW9vbzsOK1askI+Pj9q0aZPpMAkvLy/Vrl1bVqtVr776qho3bqzGjRvr3LlzKlmypC28DBw4UPb29tq6dat8fHzUtGlTRUZG6qGHHtILL7wgSfroo49kb2+vLVu2yNfXVw0bNtSWLVvk7OycavJx48aNJd2aVPnUU0+pQ4cOd11p6U5Z+bw1a9ZM9erVU0JCgrp06aKGDRtq7ty5SkpKynDORspn9MiRI2rQoIG2bdtm29epUyfb/9u2bZtuz4OTk5MWLVqkBg0a6Omnn1ZMTIzKli1rGwPet29fmUwm/fTTT2rYsKH8/f117do1lShRQvXr11ft2rVVtmxZxcXFqWPHjmrSpIlatmyp+Ph4VapUyfbZzk23D/1q3ry5pk6dmuXv7dy5s8qVK6dLly6pTZs28vX11U8//WSbtH43Kcd906ZNatCgQZqlOm+XctL8+++/q1GjRmrZsqUt+Bu5lv8bb7whR0dHHT16VH5+fmrYsKFtboCU/sUPk8mkdu3ayWKxKCgoSD4+PvLx8dGGDRtkZ2dnuxjz1ltvycnJSWFhYWrcuLH8/Px08uRJubi42H4+U47h1atX1bBhQy1ZsiQPXjWQ9wgGQB6Kjo5WVFSULl26JFdXVzVp0kRfffWVunTpctfvKVeunBYtWqQnn3xSxYoVU3x8vB555BENGDAg1cng22+/rYoVK8pkMql48eI5vqI1a9YstWrVSpJUvnx52/j4FB9//LF8fX3l6Oio5ORkjR07Vi1atEj1GE5OTnr33Xfl6ekpq9V615NSNzc3
LVu2TD169NDDDz+sxMREeXt7a/jw4bahEv+Gg4OD5s+fr759+6p8+fJKSEhQ2bJl1b9/f3322WfZHh7Qo0cPDRs2TKVKlZLJZFL79u01fvz4TL9v2rRp6tOnjypUqKC4uDgVL15cbdu21TfffGMbHpKyZn3KiVnhwoXVpk2bVG1atGihxYsXq1mzZnJ2dpaDg4P8/Py0ZMkS21VP6daSl61bt5arq6sSEhL0zjvvpLsUbnqy+nn74osv1K1bN3l6eiohIUFVqlTR9OnTMwwG/v7+atmypQoVKiQXF5dUPUtNmza1/f9uS2J6enoqKChInp6ecnJykp+fn77++mvb56tFixaaNWuWLYgVKVJEnTp10vz58+Xo6CgHBwd99dVX6tGjh8qUKaOYmBiVKlVKzz33nBYsWGAb05+bSpUqpV69etkuBGQ1sEm3ruIvWrRIHTp0kLu7u5KTk1WnTh198cUX8vHxuev3Pffcc/Lx8ZGLi4uKFi2a4ZX/Tp066e2335anp6fs7OxUp04d2zKuKRPLjVC9enXNmjVLjz32mEwmk6pXr55qJaa7LR/6yiuvaOLEiapRo4YsFoscHR3l4+OjuXPn2gJ01apVtXDhQvn6+srBwUFOTk7y9/fXN998Y3uf6tWrp06dOqlIkSJycHBguVI8sEzWfzNLCgCAe2Do0KFatWqVypYtq82bN6cKbSk3mCpbtqy2bNliYJUwypIlS3Tu3Dk99NBDevnll+Xg4KDt27frrbfeUqFChbRv374M51kAyBomHwMADDNkyBBt377dtoJUz549meiJNBITE20rR3366adydna2zRHo2LEjoQC4RwgGAADDlC5dWrGxsSpZsqSef/55vfLKK0aXhHzo1VdfVWxsrH766SedO3dOSUlJKlu2rNq2bat3333X6PKABwZDiQAAAAAw+RgAAAAAwQAAAACACAYAAAAARDAAAAAAIIIBAAAAABEMAAAAAIhgAAAAAEAEAwAAAAAiGAAAAAAQwQAAAACACAYAAAAARDAAAAAAIIIBAAAAABEMAAAAAIhgAAAAAEAEAwAAAACSHIwuIKsOHjwoZ2dno8sAAAAA7lsJCQmqW7duuvvum2Dg7OysatWqGV0GAAAAcN+KiIi46z6GEgEAAAAgGAAAAAAgGAAAAAAQwQAAAACACAYAAAAARDAAAAAAIIIBAAAAABEMAAAAAIhgAAAAAEAEAwAAAAAiGAAAAAAQwQAAAACACAYAAAAARDAAAAAAIMkhK42effZZubm5SZK8vLzUtWtXjR8/Xvb29vLz81P//v1lsVg0atQo/fHHH3JyctK4ceNUoUIFHTx4MMttAQAAABgj02CQkJAgq9WqRYsW2bZ16tRJQUFBKleunN566y0dOXJEZ8+eVWJiopYtW6aDBw9q0qRJmjNnjkaOHJnltgAAAACMkWkwOHr0qOLj4/X6668rOTlZAwYMUGJiosqXLy9J8vPz06+//qqLFy+qWbNmkqS6devq999/V0xMTJbb3ivJFosc7BghdS9wLAEAAAqOTIOBi4uLevXqpRdeeEEnT57Um2++qaJFi9r2Fy5cWGfOnFFMTIxtuJEk2dvbp9mWUdvk5GQ5OGRpZFPGL8jOTnP27PjXjwOpT8PmRpcAAACAPJLpmbi3t7cqVKggk8kkb29vFSlSRNeuXbPtj42NVdGiRXXz5k3FxsbatlssFrm5uaXallHbzEJBQkKCIiIiMn1B1apVy7QNsi4rxxwAAAD3v0yDwYoVK3Ts2DGNGjVKUVFRio+PV6FChXT69GmVK1dOwcHB6t+/v/7++29t3bpV7du318GDB1W5cmW5ubnJ0dExS20z4+zszEm/ATjmAAAAD46MLvpmGgy6dOmioUOH6sUXX5TJZNKECRNkZ2enDz74QGazWX5+fqpTp45q1aqlkJAQdevWTVarVRMmTJAkjR49OsttAQAAABjDZLVarUYXkRURERFZvnrNHIN7gzkGAAAA
D5aMzqlZcgYAAAAAwQAAAAAAwQAAAACACAYAAAAARDAAAAAAIIIBAAAAABEMAAAAAIhgAAAAAEAEAwAAAAAiGAAAAAAQwQAAAACACAYAAAAARDAAAAAAIIIBAAAAABEMAAAAAIhgAAAAAEAEAwAAAAAiGAAAAAAQwQAAAACACAYAAAAARDAAAAAAIIIBAAAAABEMAAAAAIhgAAAAAEAEAwAAAAAiGAAAAAAQwQAAAACACAbIY1ar2egSHhgcSwAAcC85GF0AChaTyV7x8ZuNLuOB4Orqb3QJAADgAUKPAQAAAACCAQAAAACCAQAAAAARDAAAAACIYAAAAABABAMAAAAAIhgAAAAAEMEAAAAAgAgGAAAAAEQwAAAAACCCAQAAAAARDAAAAACIYAAAAABABAMAAAAAIhgAAAAAEMEAAAAAgAgGAAAAAEQwAAAAACCCAQAAAAARDAAAAACIYAAAAABABAMAAAAAIhgAAAAAEMEAAAAAgAgGAAAAAEQwAAAAACCCAQAAAAARDAAAAACIYAAAAABABAMAAAAAIhgAAAAAEMEAAAAAgAgGAAAAAEQwAAAAACCCAQAAAAARDAAAAAAoi8Hg8uXLatGihSIjI3Xq1Cm9+OKL6t69u0aOHCmLxSJJmjlzprp06aJu3bopPDxckrLVFgAAAIBxMg0GSUlJGjFihFxcXCRJEydO1MCBA/Xtt9/KarVq8+bNOnz4sMLCwrR8+XJNnz5do0ePznZbAAAAAMbJNBhMnjxZ3bp100MPPSRJOnz4sHx9fSVJzZs316+//qp9+/bJz89PJpNJZcqUkdls1pUrV7LVFgAAAIBxHDLauWrVKnl4eKhZs2aaO3euJMlqtcpkMkmSChcurBs3bigmJkbFihWzfV/K9uy09fDwyLDQhIQERUREZPqCqlWrlmkbZF1Wjnl28P7cW/f6/QEAAAVXhsFg5cqVMplM2rVrlyIiIhQQEJDq6n5sbKyKFi0qNzc3xcbGptpepEgR2dnZZbltZpydnTmpNADHPH/j/QEAANmR0UXFDIcSLVmyRIsXL9aiRYtUrVo1TZ48Wc2bN1doaKgkaceOHfLx8VH9+vUVHBwsi8Wi8+fPy2KxyMPDQ9WrV89yWwAAAADGybDHID0BAQEKDAzU9OnTVbFiRbVt21b29vby8fFR165dZbFYNGLEiGy3BQAAAGAck9VqtRpdRFZERERkedjEnD07crmagqFPw+a58rjx8Ztz5XELGldXf6NLAAAA95mMzqm5wRkAAAAAggEAAAAAggEAAAAAEQwAAAAAiGAAAAAAQAQDAAAAACIYAAAAABDBAAAAAIAIBgAAAABEMAAAAAAgggEAAAAAEQwAAAAAiGAAAAAAQAQDAAAAACIYAAAAABDBAAAAAIAIBgAAAABEMAAAAAAgggEAAAAAEQwAAAAAiGAAAAAAQAQDAAAAACIYAAAAABDBAAAAAIAIBgAAAABEMAAAAAAgggEAAAAAEQwAAAAAiGAAAAAAQAQDAAAAACIYAAAAABDBAAAAAIAIBgAAAABEMAAAAAAgggEAAAAAEQwAAAAAiGAAAAAAQAQDAAAAACIYAAAAABDBAAAAAIAIBgAAAABEMAAAAAAgggEAAAAAEQwAAAAAiGAAAAAAQAQDAAAAACIYAAAAABDBAAAAAIAIBgAAAABEMAAAAAAgggEAAAAAEQwAAAAAiGAAAAAAQAQDAAAAACIYAAAAABDBAAAAAIAIBgAAAABEMAAAAAAgggEAAAAAEQwAAAAAiGAAAAAAQAQDAAAAACIYAAAAABDBAAAAAIAkh8wamM1mDR8+XCdOnJDJZNLo0aPl7OysIUOGyGQy6bHHHtPIkSNlZ2enmTNnatu2bXJwcNCwYcNUu3ZtnTp1KsttAQAAABgj02CwdetWSdLSpUsVGhqqGTNmyGq1auDAgWrUqJFGjBihzZs3q0yZMgoLC9Py5ct14cIFDRgwQCtXrtTEiROz3BYAAACA
MTINBk888YRatmwpSTp//ryKFi2qX3/9Vb6+vpKk5s2bKyQkRN7e3vLz85PJZFKZMmVkNpt15coVHT58OMttPTw8cu+VAgAAALirTIOBJDk4OCggIECbNm3SZ599ppCQEJlMJklS4cKFdePGDcXExKhYsWK270nZbrVas9w2o2CQkJCgiIiITGutVq1aVl4Ssigrxzw7eH/urXv9/gAAgIIrS8FAkiZPnqwPPvhA//nPf5SQkGDbHhsbq6JFi8rNzU2xsbGpthcpUkR2dnZZbpsRZ2dnTioNwDHP33h/AABAdmR0UTHTVYl++OEHffHFF5IkV1dXmUwm1axZU6GhoZKkHTt2yMfHR/Xr11dwcLAsFovOnz8vi8UiDw8PVa9ePcttAQAAABgj0x6DJ598UkOHDtVLL72k5ORkDRs2TJUqVVJgYKCmT5+uihUrqm3btrK3t5ePj4+6du0qi8WiESNGSJICAgKy3BYAAACAMUxWq9VqdBFZERERkeVhE3P27MjlagqGPg2b58rjxsdvzpXHLWhcXf2NLgEAANxnMjqn5gZnAAAAAAgGAAAAAAgGAAAAAEQwAAAAACCCAQAAAAARDAAAAACIYAAAAABABAMAAAAAIhgAAAAAEMEAAAAAgAgGAAAAAEQwAAAAACCCAQAAAAARDAAAAACIYAAAAABABAMAAAAAIhgAAAAAEMEAAAAAgAgGAAAAAEQwAAAAACCCAQAAAAARDAAAAACIYAAAAABABAMAAAAAIhgAAAAAEMEAAAAAgAgGAAAAAEQwAAAAACCCAQAAAAARDAAAAACIYAAAAABABAMAAAAAIhgAAAAAEMEAAAAAgAgGAAAAAEQwAAAAACCCAQAAAAARDAAAAACIYAAAAABABAMAAAAAIhgAAAAAEMEAAAAAgAgGAAAAAEQwAAAAACCCAQAAAAARDAAAAACIYAAAAABABAMAAAAAIhgAAAAAEMEAAAAAgAgGAAAAAEQwAAAAACCCAQAAAAARDAAAAACIYAAAAABABAMAAAAAIhgAAAAAEMEAAAAAgAgGAAAAAEQwAAAAACCCAQAAAAARDAAAAACIYAAAAABABAMAAAAAkhwy2pmUlKRhw4bp3LlzSkxMVJ8+ffToo49qyJAhMplMeuyxxzRy5EjZ2dlp5syZ2rZtmxwcHDRs2DDVrl1bp06dynJbAAAAAMbJMBisWbNGxYoV09SpU3Xt2jV17txZVatW1cCBA9WoUSONGDFCmzdvVpkyZRQWFqbly5frwoULGjBggFauXKmJEydmuS0AAAAA42QYDJ566im1bdtWkmS1WmVvb6/Dhw/L19dXktS8eXOFhITI29tbfn5+MplMKlOmjMxms65cuZKtth4eHrn8UgEAAADcTYbBoHDhwpKkmJgYvfPOOxo4cKAmT54sk8lk23/jxg3FxMSoWLFiqb7vxo0bslqtWW6bWTBISEhQREREpi+oWrVqmbZB1mXlmGcH78+9da/fHwAAUHBlGAwk6cKFC+rXr5+6d++ujh07aurUqbZ9sbGxKlq0qNzc3BQbG5tqe5EiRWRnZ5fltplxdnbmpNIAHPP8jfcHAABkR0YXFTNclejSpUt6/fXX9eGHH6pLly6SpOrVqys0NFSStGPHDvn4+Kh+/foKDg6WxWLR+fPnZbFY5OHhka22AAAAAIyTYY/B559/ruvXr2v27NmaPXu2JOmjjz7SuHHjNH36dFWsWFFt27aVvb29fHx81LVrV1ksFo0YMUKSFBAQoMDAwCy1BQAAAGAck9VqtRpdRFZERERkedjEnD07crmagqFPw+a58rjx8Ztz5XELGldXf6NLAAAA95mMzqm5wRkAAAAAggEAAAAAggEAAAAAEQwAAAAAiGAAAAAAQAQDAAAAACIYAAAAABDBAAAAAIAIBgAAAABEMAAAAAAgggEAAAAAEQwAAAAAiGAAAAAAQAQDAAAAACIYAAAAABDBAAAAAIAIBgAAAABEMAAAAAAg
ggEAAAAAEQwAAAAAiGAAAAAAQAQDAAAAACIYAAAAABDBAAAAAIAIBgAAAABEMAAAAAAgggEAAAAAEQwAAAAAiGAA4P9ZzBajS3hgcCwBAPcjB6MLAJA/2NnbacfWCKPLeCA0b1XN6BIAAMg2egwAAAAAEAwAAAAAEAwAAAAAiGAAAAAAQAQDAAAAACIYAAAAABDBAAAAAIAIBgAAAABEMAAAAAAgggEAAAAAEQwAAAAAiGAAAAAAQAQDALgvWJOTjS7hgcGxBID0ORhdAAAgcyYHB12cMc7oMh4Inu8NN7oEAMiX6DEAAAAAQDAAAAAAQDAAAAAAIIIBAAAAABEMAAAAAIhgAAAAAEAEAwAAAAAiGAAAAAAQwQAAAACACAYAAAAARDAAAAAAIIIBAAAAABEMAAAAAIhgAAAAAEAEAwAAAAAiGAAAAAAQwQAAAACACAYAAAAARDAAAAAAIIIBAAAAAGUxGPz222/q0aOHJOnUqVN68cUX1b17d40cOVIWi0WSNHPmTHXp0kXdunVTeHh4ttsCAAAAME6mweDLL7/U8OHDlZCQIEmaOHGiBg4cqG+//VZWq1WbN2/W4cOHFRYWpuXLl2v69OkaPXp0ttsCAAAAME6mwaB8+fIKCgqyfX348GH5+vpKkpo3b65ff/1V+/btk5+fn0wmk8qUKSOz2awrV65kqy0AAAAA4zhk1qBt27Y6e/as7Wur1SqTySRJKly4sG7cuKGYmBgVK1bM1iZle3baenh4ZFhHQkKCIiIiMn1B1apVy7QNsi4rxzw7eH/urXv5/vDe3Fv87ORv9/r9AYAHQabB4E52dv/rZIiNjVXRokXl5uam2NjYVNuLFCmSrbaZcXZ25g+jATjm+RvvT/7Fe5O/8f4AKKgyujCS7VWJqlevrtDQUEnSjh075OPjo/r16ys4OFgWi0Xnz5+XxWKRh4dHttoCAAAAME62ewwCAgIUGBio6dOnq2LFimrbtq3s7e3l4+Ojrl27ymKxaMSIEdluCwAAAMA4JqvVajW6iKyIiIjIctfvnD07crmagqFPw+a58rjx8Ztz5XELGldX/3v+mDu2Mu76XmjeKneGqVycMS5XHreg8XxvuNElAIBhMjqn5gZnAAD8S8lmi9ElPDA4loBxsj2UCAAApOZgb6epP4QZXcYD4cPOvkaXABRY9BgAAAAAIBgAAAAAIBgAAAAAEMEAAAAAgAgGAAAAAEQwAAAAACCCAQAAAAARDAAAAACIYAAAAABABAMAAAAAIhgAAAAAEMEAAAAAgAgGAAAAAEQwAAAAACCCAQAAAAARDAAAAACIYAAAAABABAMAAAAAIhgAAAAAEMEAAAAAgAgGAAAAAEQwAAAAACCCAQAAAAARDAAAwAPMak4yuoQHBsfywedgdAEAAAC5xWTvqIs/DTa6jAeCZ4cpRpeAXEaPAQAAAACCAQAAAACCAQAAAAARDAAAAACIYAAAAABABAMAAAAAIhgAAAAAEMEAAAAAgAgGAAAAAEQwAAAAgEHMliSjS3hg3Itj6XAP6gAAAACyzd7OUT8cHGp0GQ+EznUn/uvHoMcAAAAAAMEAAAAAAMEAAAAAgAgGAAAAAEQwAAAAACCCAQAAAAARDAAAAACIYAAAAABABAMAAAAAIhgAAAAAEMEAAAAAgAgGAAAAAEQwAAAAACCCAQAAAAARDAAAAACIYAAAAABABAMAAAAAIhgAAAAAEMEAAAAAgAgGAAAAAEQwAAAAACCCAQAAAAARDAAAAACIYAAAAABABAMAAAAAIhgAAAAAkORg1BNbLBaNGjVKf/zxh5ycnDRu3DhVqFDBqHIAAACAAs2wHoNffvlFiYmJWrZsmQYNGqRJkyYZVQoAAABQ4BkWDPbt26dmzZpJkurWravff//dqFIAAACAAs9ktVqtRjzxRx99pCeffFItWrSQJLVs2VK//PKLHBzSH9108OBBOTs752WJAAAAwAMlISFBdevWTXefYXMM3Nzc
FBsba/vaYrHcNRRIuusLAAAAAPDvGTaUqH79+tqxY4ekW70BlStXNqoUAAAAoMAzbChRyqpEx44dk9Vq1YQJE1SpUiUjSgEAAAAKPMOCAQAAAID8gxucAQAAACAYAAAAACAYAAAAABDBAACQQ+fPn1d609TMZrMOHz5sQEUAgH+DYAAAyBF/f39dvXo1zfYzZ86oe/fuBlQEAPg3DLvBWUFy6dIlzZs3TydOnFBiYmKa/fPnzzegKgDIvmXLlmnOnDmSJKvVqs6dO8vOLvU1phs3buixxx4zojzgvnL9+nWdPn063XOD+vXrG1ARCjqCQR547733dO7cOfn7+8vFxcXocpCOo0ePavHixTp16pQ+/vhjbdq0SeXLl1fz5s2NLg3IV5577jk5OzvLYrFo2LBheuONN1SkSBHbfpPJpEKFCqlx48YGVllwVa1aVSaTKUttIyIicrkaZGTJkiWaNGmSkpKS0uwzmUy8PwawWCxZbnvnBZEHBcEgDxw6dEjfffedqlWrZnQpSMfOnTs1YMAAtWvXTr/99psSExN15coVTZo0SRMnTlTHjh2NLrFAu3btmubNm6dDhw4pOTk5zZj2JUuWGFRZweTo6KjOnTtLkry8vFS/fn05OPCnJL9YsGCB7f+HDx/WwoUL1adPH9WsWVMODg46cuSI5syZox49ehhYJSRp1qxZeuONN/Taa69x0TCfqF69eoEP1vw2zwO1a9fWuXPnCAb51IwZMzR06FB17dpV69evlyS988478vT01OzZswkGBhs8eLAOHz6sjh07ys3NzehycBtfX1/t3LnzrqHt3XffNaiygqtJkya2/48ZM0aTJ09W06ZNbdtq1KihcuXK6aOPPtJrr71mRIn4fyaTSR07dlTRokWNLgX/75tvvjG6BMMRDPLAhAkT1L17d23evFllypRJk0b79+9vUGWQpOPHj+vxxx9Ps71p06aaOHGiARXhdrt379bixYtVu3Zto0vBHcaPH68lS5aoatWqKly4cKp9Wb3qhtwTFRWlEiVKpNnu6uqq6OhoAyrC7fr06aNp06bpo48+UpkyZYwuB7p1sSMr/v7771yuxDgEgzzwySef6MqVK/rjjz906tSpVPtMJhPBwGBeXl46ePCgypUrl2r7li1b0mxD3itdujQnmfnU6tWrNWnSJD3zzDNGl4J0tGzZUsOGDdOwYcNUpUoVWa1WHTp0SOPHj1e7du2MLq/Aq1Spkj777DP5+/unu/9BHapyv4iMjNSUKVP0559/2uYeWK1WJSYm6tq1aw/s+0MwyAO//PKLvvrqKybj5VMDBw7U4MGDdejQIZnNZq1cuVJnzpzRhg0bNHXqVKPLK/A++OADjR49WgMGDJCXl5ecnJxS7Se8GcfR0ZGenHxszJgxGjlypF555RXbiY29vb06deqk4cOHG1wdAgMD1bhxYz377LPMMciHAgMDZbFY1Lt3b02YMEGDBw/WuXPn9O233z7QowlM1vTuToN7qn379po0aRJ/QPOxo0ePav78+YqMjJTZbJa3t7deffVV1alTx+jSCryqVaum2WYymWS1Wlm5w2CzZs3S8ePHNWbMmDRDiZB/xMTE6MSJE5Ikb29v5urkE/Xq1dOaNWu4uJFP1a5dW8uWLVO1atX04osv6p133lGTJk20fPlyrV69Wt9++63RJeYKgkEeWLdunYKCgvTKK6/Iy8tL9vb2qfbfPlkMxoiLi9P169dVunRpSVJwcLDq1q3LH9B84Ny5cxnuL1u2bB5Vgjt1795d4eHhslgsKl68uBwdHVPt37ZtmzGFwebixYs6ceKEzGazbVtiYqKOHDmiPn36GFgZxo0bJzc3Nw0cONDoUpCO+vXra82aNfLy8tJHH32kSpUq6fXXX9e5c+fUqVMn7d271+gScwXBIA+kd8UzBVc8jXfw4EH17t1bL7zwgj744ANJ0tNPP63o6Gh9+eWXrCaVD1itVu3YsUPHjx+X2WxWxYoV5efn
l2ZYEfLW6tWrM9z/7LPP5lElSM93332n8ePHKzk52dbLJt36u1OnTh0tXbrU4AoLtkGDBmnjxo1yd3dP96IhSzEb680335SXl5cGDx6sH374QWvXrtXixYu1fv16jR8/XiEhIUaXmCsIBijw/vOf/8jX11eDBg1KNcl16tSpOnDgwAPbXXi/OH/+vPr06aPTp0/L29tbZrNZp06dUunSpfX111+rVKlSRpdY4FksFp07d04PP/ywLBYLgS2faN26tZ577jm99dZbat26tZYvX67Y2FgNHjxY7dq105tvvml0iQXazJkzM9zPwiTGioyMVN++fdW1a1d169ZNzz//vKKionTz5k317dv3gX1/CAa55MyZM/Ly8pLJZNKZM2cybMv4QmPVrVtXa9euTfM+nD59Ws8884wOHjxoTGGQJL399tsym836+OOP5e7uLkm6evWqBg8eLFdXV3322WcGV1hwJScna9q0aVq8eLHMZrM2bNigjz/+WA4ODho7dqwKFSpkdIkFWs2aNbV+/Xp5eXmpd+/e6tSpk9q3b6+9e/dq2LBh2rhxo9ElAvlefHy8XF1dFRcXp7CwMBUrVkx169Y1uqxcw6pEuaRNmzYKCQlRiRIl1KZNm1TduBKTJ/MTLy8vhYSEqFu3bqm2h4aGytPT06CqkCI0NFTff/+9LRRIUvHixfXBBx+oe/fuBlaGTz/9VMHBwZo/f77eeustSVKPHj0UGBioSZMmacyYMQZXWLCVKFFCV65ckZeXlypWrKiIiAi1b99epUqV0j///GN0eQXS4MGDNWLECLm5uWnw4MEZtp0yZUoeVYX0+Pv7a+XKlSpWrJgkqVChQmrZsqWioqLUpEkT7dq1y9gCcwnBIJds3rxZHh4etv8j/+rTp48CAgK0f/9+1ahRQ9Kt9aPXrVvHiU0+4O7urmvXrqXZfu3atTSTXZG3fv75Z02dOlUNGjSwbfPx8dGECRPUt29ffn4M1r59ewUEBGjcuHFq1qyZPvjgA1WrVk1bt25VhQoVjC6vQLp9HsGdcwpgvHXr1tkWTTh37pxGjhwpZ2fnVG3Onz8vB4cH9/T5wX1lBrt9pZTb/3/nTTKOHDnCqioGe/rpp+Xh4aGlS5dq+fLlcnR0VIUKFbRw4ULVr1/f6PIKvA4dOmj48OEKDAy0Lfn722+/ady4cerQoYPB1RVsV69eveuddW/evGlARbjdoEGDVLRoUV27dk3+/v564YUXNGbMGBUrVuyBXoc9P7v9uPMe5D+NGzfWzp07bV/b2dmlCXBVq1ZVQEBAXpeWZ5hjkAf27t2rUaNGKTIyMs0+BwcHHTp0yICqgPtDYmKiRowYoTVr1tiG49nb26tbt2768MMP01zNQd7p27evihcvrnHjxtmW9itWrJgGDRoke3t7zZkzx+gSgXzLYrFo48aN+uuvv2zLyaZcNIyIiNCCBQsMrrBgmzlzpl5//fV050pduXLFNirkQUMwyAPPPvusHn74Yb344ot69913NWXKFEVFRWnmzJkKDAxU+/btjS6xQEtMTNSqVat06NAhJScn684fCcZ55g/Xr1/XyZMn5ezsrPLly8vV1dXokgq8qKgo9evXT2fPntX169f1yCOP6MKFC/Ly8tLnn39Ob2g+sG3bNi1cuFCnT5/WokWLtHz5cj388MPq2rWr0aUVeKNGjdKqVatUvXp1hYeHq169ejp9+rQuXbqkl156ibtTG6xatWoKCQlJEwDOnj2rjh076sCBAwZVlrsYSpQH/vrrL3388ceqVKmSatSoIUdHR7300ksqUaKE5s6dSzAw2LBhw/TLL7+oWbNm3NAsn9i1a5caNmwoBweHNBO8YmNjdeXKFdvX3CDQOKVKldKKFSu0a9cuHT9+XMnJyfL29pafn5/s7OyMLq/A+/HHHzV+/Hj17NlTBw4ckMVikaenpyZNmqT4+Hi9+uqrRpdYoK1fv14ff/yxnnzyST311FMaNWqUKlasqICAAMXH
xxtdXoG0evVqrVixQtKt3ps+ffqkmU9w8eJFPfTQQ0aUlycIBnnA1dXV9keyYsWKOnr0qFq0aKHatWvbblMP42zevFmzZs1S06ZNjS4F/++1116zrer12muv3bUdq3rlD1WrVlXFihVtX1+8eFGSuMeEwb766iuNHj1a7dq107x58yTJdlFqypQpBAODxcTEqFatWpKkypUr67ffftNjjz2m3r176/XXXze4uoKpbdu2OnfunCRp3759ql+/vgoXLpyqTeHChfXkk08aUV6eIBjkgcaNG2vatGkKDAxUvXr1NH/+fHXp0kWbNm2yLYMF4xQtWpQTmHzm6NGj6f4f+cumTZs0YsSINKtGsRRz/nD69GnVrFkzzfZq1arp0qVLBlSE25UvX16HDx/Www8/rMcee0zh4eHq0qWLLBaLYmJijC6vQCpUqJDtxmVly5bV008/XeBu2EgwyAOBgYEaPHiwNm7cqG7dumnFihVq2rSp7O3tNWrUKKPLK/D69eun8ePH66OPPlK5cuXSLIHJkAhj3bmWdIqoqCh17tz5gV1L+n4wfvx4tW7dWi+//DKTwPOhypUra/v27Xr55ZdTbV+5cqWqVKliUFVI0atXLw0aNEgTJkxQ+/bt9eyzz8pkMungwYOplgCGMcxms9asWXPX/V26dMnDavIOk48NYLVa9ddff3GlOp9o0aKFLl++bFsV4k5c9cx7t68lvWbNGrVt2zbdtaRPnTqVamk55K2GDRvq+++/l7e3t9GlIB179+5V79691bhxY23fvl0dO3bUyZMnFRERoc8//1yNGzc2usQCb9++fXJxcVGNGjW0c+dOLV++XMWKFdOAAQO4wabBWrduneprs9msy5cvy8HBQfXr19f8+fMNqix3EQxySXauYjJ50lhhYWEZ7vf19c2jSpDiypUrmjp1qqRbk8HatWsnFxeXVG0KFy6sTp062cboIu/NmDFD0dHR+uijj7jZXD516dIlLVmyRJGRkTKbzfL29lb37t1VpkwZo0sD7jtxcXEaOXKkKlWqpLffftvocnIFwSCXVK1aNUvtGIebf0RFRenEiROqW7euYmJiVLJkSaNLgm6tJd2rVy+WJ82Hjh49qp49e+rmzZsqWbKkTCZTqv3c9T1/io2N1cmTJ213ekfe6d69e5qfk7tZsmRJLleDnDh16pS6dev2wA5jZY5BLkmZMBkeHq7KlSunutr5yy+/qESJEqpXr55R5eE2sbGxGjp0qDZu3Cg7Oztt2LBBEyZM0NWrVzVr1qx07+yK3HX7cqUNGjTQwYMH79qWHjfjfPjhh6pYsaI6dOjAHIP7yP79+/XWW29xUcoAjz/+uNEl4F86cuSILBaL0WXkGoJBLjGbzRo6dKjWrl2rr7/+OtVwlJ9//lnr16/X888/r9GjR6e53Tby1uTJk3X16lVt3rxZHTp0kCQNGTJEAQEBGjdunGbMmGFwhQUPy5XeH86cOaM1a9aofPnyRpcC3BdSVrxJERcXp+vXr6t06dKSpODgYNWtW5d76uQD6fXuxMbG6s8//8zw79L9jmCQS+bNm6fdu3frm2++UcOGDVPtmzFjhrp27ar33ntPjz76KGtJG2zLli2aO3duqru0VqhQQaNGjVLPnj0NrKzgYrnS+0Pr1q0VHBys7t27G10KcN85ePCgevfurRdeeEEffPCBJGnixImKjo7Wl19+qWrVqhlcYcGWXu+Ok5OTatWq9UD3VBMMcsnq1asVGBiYJhSkaNy4sQYPHqx58+YRDAx28+bNdCdOJiYmiik4+cOJEydUsmRJFSlSRL/++qs2bdqkmjVr6vnnnze6tAKtVKlSmjRpkn744QeVK1cuTe/nlClTDKoMyP8mTJigF154QYMGDbJt+/nnnzV16lSNHTtW3377rYHV4c7enYKCYJBLLly4oOrVq2fYxsfHR6NHj86jinA3/v7+mjZtWqqTmJMnT2rs2LFq2bKlcYVBkrRq1SoFBgZq/vz5
cnd319tvv60GDRpow4YNunDhQoH95Z0fXLt2TU8//bTRZeA2WZkQyfC7/OHYsWOaNm1amuEqXbt2ZeJxPrFixQotXbpUx48fl6OjoypWrKhevXrpiSeeMLq0XEMwyCUlS5bU2bNnUw1PudP58+dVvHjxPKwK6QkMDNTQoUPVqFEjWa1Wde7cWXFxcfLz89NHH31kdHkF3hdffKFx48apUaNGmjhxoh599FEtWLBAu3fvVkBAAMHAQBMnTjS6BNwhq2Ofs7oyDnKPl5eXQkJC1K1bt1TbQ0NDuYdBPjB9+nQtXbpUPXv2VL9+/WSxWBQeHq7BgwfrnXfeeWBHexAMckmbNm0UFBSk+vXrpztMJSkpSTNnzlTz5s0NqA63c3NzU1BQkM6cOaPIyEglJyfL29tblSpVMro06FbvW8qNmLZu3apnnnlG0q0/qtHR0UaWBknbtm3T119/rVOnTmnRokVavny5Hn74YXXt2tXo0gok5uTcP/r06aOAgADt37/ftnRsRESE1q1bpzFjxhhcHZYvX67JkyerVatWtm3+/v6qVq2axo0bRzBA9vTt21ddunTRc889px49eqhmzZoqUqSIoqOjFR4eriVLlighIUHTp083utQC6fz582m22dvbq3LlymnacCMgY5UrV04hISF66KGHdPr0afn7+0uSfvjhB1WsWNHg6gq2H3/8UePHj1fPnj21f/9+WSwWeXp6atKkSYqPj39g/3AC98LTTz8tDw8PLV26VMuXL5ejo6MqVKighQsXqn79+kaXB0kPP/xwmm0VKlRQYmKiAdXkDW5wlouio6M1depUrVu3TvHx8ZIkq9Uqd3d3dejQQf369ZOHh4fBVRZMVatWzXJXOuNxjbV+/Xp98MEHMpvNat26tWbNmqXJkydr6dKlmjlzppo2bWp0iQVWx44d1bdvX7Vr10716tXTmjVrVK5cOa1fv15TpkzRli1bjC4RyLc++ugjvfXWW6pQoYLRpeD/3X5/gsWLF+vnn3/W+PHj9eijj0q6tUTz0KFD1bp1a73++utGlZmrCAZ5IDExUWfOnNH169dVvHhxlS9fXnZ2dkaXVaCdPn3a9v+dO3dq0aJFGjJkiGrWrCkHBwcdOXJEkydPVteuXVmKMR+4cuWKoqKibMv3HT9+XEWLFuXu1AarU6eOfvrpJ5UrVy5VMDh16pQ6duyo8PBwo0sE8i1fX1+tWrVKXl5eRpeC/3fnRUOr1SqTySRnZ2eZTCbdvHlTJpNJ7u7u3PkYOefk5MR49Xzm9hsyffXVV/rkk09Up04d27bHH39cY8eOVd++fQkG+YCLi4sOHDiglStXymw2y9vb23YzOhincuXK2r59u15++eVU21euXKkqVaoYVBVwf3j11Vc1cuRI9ezZU2XLlk1z9/By5coZVFnB9c033xhdguEIBijwYmJilJycnO72pKQkAyrC7Y4ePao33nhDjo6Oqlmzpsxms7Zs2aLZs2dr0aJFeuyxx4wuscAKCAhQ7969tWvXLiUlJWn27Nk6efKkIiIi9PnnnxtdHpCvffbZZ5KkkJAQSf9bKSrlKjXDWPOer69vmm1RUVE6ceKE6tatq5iYmAe+p5qhRCjwxowZo+3bt2vAgAGqWrWqrFarDh06pKCgID333HN67733jC6xQOvRo4e8vLw0duxYOTjcupaRnJyswMBA/f3331qwYIHBFRZsly5d0pIlSxQZGWnrzenevTuT9oFMnDt3LsP9GS13jtwXGxuroUOHauPGjbKzs9OGDRs0YcIEXb16VbNmzVKJEiWMLjFXEAxQ4CUnJ+uzzz7TihUrdOXKFUm37kPRvXt39enTh/W+DVanTh2tXr06zQpEx48f1/PPP68DBw4YVBlmzpypXr16ydXVNdX2mJgYzZw5U0OGDDGoMuD+sW/fPp08eVJt27bV+fPn9cgjj8jJycnosgq8ESNG6MSJE5o0aZI6dOigNWvWyGKxKCAgQA8//LBmzJhh
dIm5gqFEKPAcHBz0/vvv6/3337cFA1aLyj9Slim9MxicPHlSbm5uBlVVcP3111+6ePGiJGnWrFmqXLmyihQpkqbN0qVLCQZABi5fvqzevXvrr7/+UmJionx9fTVjxgwdO3ZM8+fPZ7Uig23ZskVz585N1XNToUIFjRo1Sj179jSwstxFMECBtGLFCj3zzDNycnLSihUrMmzbpUuXPKoK6enatauGDx+uAQMGqHbt2pKk3377TTNnzkxzx1DkvsuXL6e6u+4777yTpk2hQoUe2KX8gHtl7NixKlu2rBYvXqwmTZpIkqZOnarBgwdr3Lhx+vLLLw2usGC7efNmujeoTUxM1IM82IZggAJp9uzZ8vf3l5OTk2bPnn3XdiaTiWBgsF69eik+Pl7Tp0+33enY09NTvXr14gZaBmjUqJHt7rqtW7fWihUr6GEDcmDXrl1asmSJXFxcbNvc3Nw0aNAg/ec//zGwMki37nI8bdo0TZkyxbbt5MmTGjt2rFq0aGFgZbmLYIAC6fYbL/3yyy/cVyIf+vHHH7Vp0yY5OjrK399foaGhunz5spydnRlClE9kdAOzv//+W6VLl87DaoD7i52dne3mp7e7ePFimqVLkfcCAwM1dOhQNWrUSFarVZ07d1ZcXJz8/Pw0fPhwo8vLNQQDFHjNmzfXU089paefflr16tUzuhxImjt3rj799FM1adJEycnJGjp0qI4dO6b333/f6NJwm8jISE2ZMkV//vmn7Y6hVqtViYmJunbtGsstAhno0KGDxo0bp9GjR8tkMikmJkYhISEaM2aM2rVrZ3R5BVpMTIzs7e0VFBSkM2fOKDIyUsnJyfL29pabm5vGjh2radOmGV1mrmBVIhR469at0/r167Vjxw55eHjYQkKNGjWMLq3AeuKJJ9S/f3917txZkrRx40YNHTpUe/fuZZWofKR79+6yWCx69tlnNWHCBA0ePFjnzp3Tt99+q1GjRtnePwBpJSYmavr06VqyZIntnjkODg7q0qWLhgwZkmqIEfLG33//rSFDhig0NFTSrQuHU6ZMkbu7u8xmsxYsWKA5c+bIwcHB1uZBQzAA/l98fLy2bt2qDRs2KDg4WCVLltTTTz+d7uRK5K6aNWtq8+bNKlWqlKRbS8rWrl1bW7dutW2D8WrXrq1ly5apWrVqevHFF/XOO++oSZMmWr58uVavXq1vv/3W6BKBfO/mzZs6c+aMzGazypUrp8KFC+vKlSvM3TFA37599eeff+qdd96Ro6Oj5s6dq8qVK2vgwIHq27evjh49qi5duui9995T8eLFjS43VzCUCPh/rq6uat++vSpVqqSKFSvq66+/1tdff00wMEBycrLtZmbSratozs7OSkxMNLAq3MnBwcG2VGnFihUVERGhJk2a6PHHH9fkyZMNrg7I36pVq6aQkBB5eHikuoP72bNn1bFjR+7RYoB9+/bpk08+sa0SVaNGDXXu3FlHjx6V1WrVsmXLVKtWLYOrzF0EA0BSeHi4Nm7cqE2bNikqKkrNmzfXhAkT1KpVK6NLA/KtBg0aaN68eRo8eLBq1qyptWvX6tVXX9Vvv/3G5EkgHatXr7YtkW21WtWnT59UF0GkW5OPH3roISPKK/CuX7+uSpUq2b4uV66ckpKSVK5cOU2fPj3d5UsfNAQDFHgtW7bUpUuX1KRJE/Xp00dPPPEEq97kAz/99JMKFy5s+9pisei///1vmu51lpM1zpAhQ9S3b19999136tatm7755hv5+Pjo5s2b6tu3r9HlAflO27Ztde7cOUm3rk7Xr18/1e85SSpcuLCefPJJI8or8KxWq+zt7VNts7e3V79+/QpEKJCYY4ACavPmzWrevLkcHR21dOlStW3bNs14wdjYWM2aNUuDBw82qMqCq3Xr1llqZzKZtHnz5lyuBpmJj4+Xq6ur4uLiFBYWpmLFiqlu3bpGlwXka6tXr1b79u3pXctHqlatqpCQEJUoUcK2rV69elqzZo3KlStn
YGV5h2CAAqlatWoKDg5O9cPfsmVLLVmyxHb780uXLqlZs2YsuQjchb+/v1auXKlixYql2h4VFaXOnTtr165dxhQG3CciIyP1+++/Kzk5Oc3ddOkNzXtVq1bV0KFDU/XijB07Vv369SswvdUMJUKBlF4ejo6Otq3FDiB969at07Zt2yRJ586d08iRI9Nc8Tx//nyacdMAUps7d66mT58ud3f3NMOJTCbTA3vimZ+VKVNGX3/9daptJUqU0NKlS1Nte5DfH35zAwCyrHHjxtq5c6ftazs7uzRjcqtWraqAgIC8Lg24ryxYsEAffvihevXqZXQp+H8Z3c29oCAYAACyzMPDQxMnTpQklS1bVq+//roKFSpkcFXA/ScpKYlJxsh37IwuAABw/7l8+bLefvttWyg4evSoFixYoB9//FE3b940uDog/+vUqZOWLFmS7tBWwCj0GKDAymw5zJiYGKNKA/KtuLg4DR48WJs3b9bPP/+sihUr6scff9SwYcNUokQJubi4aNasWVq0aBF3qQYycPXqVW3cuFFr165V2bJl0yyHuWTJEoMqQ0HGqkQokLK6HKbEmEPgdlOmTNGWLVs0atQo+fr6KjExUc2aNdMjjzyiJUuWyMnJScOHD1diYqKmTJlidLlAvjVz5swM9/fv3z+PKgH+hx4DFEic7AM5s3HjRo0ePVqNGzeWJIWEhOjGjRt6+eWX5eTkJOnWMn7c4AzIGCf+yI8IBgCALIuKipK3t7ft6127dsnOzk7NmjWzbfP09GQoHpCJzG6eSY8bjMDkYwBAlpUsWVJRUVG2r3fu3KkaNWqkuvnPkSNHmF8AZMLe3j7VP6vVqtOnT2vDhg0qXbq00eWhgKLHAACQZU899ZSmTZumYcOGKSQkRKdOndKoUaNs+6OiojRjxoxszeMBCqKUZX/vtGDBAh05ciSPqwFuYfIxACDL4uLiNGTIEG3atEkmk0nPPvusxo8fL+nWZMrPP/9cVapU0cKFC1WkSBGDqwXuP2fPnlXHjh114MABo0tBAUQwAABkW8ocAjc3N9u2sLAwRUdHq1WrVnJwoEMayIjFYkmzLTY2Vl988YXWrVvHIhkwBL+5AQDZdnsgSOHr62tAJcD9qXr16jKZTGm2Ozs7a9y4cQZUBNBjAAAAkOfCwsJSfW0ymeTo6KhHH3003eAN5AWCAQAAgEEiIyMVGRkps9ksb29vVa1a1eiSUIAxlAgAACCPRUdHKyAgQNu2bZO7u7vMZrNiY2Pl4+Oj2bNnM3kfhuA+BgAAAHls7NixunjxotatW6fQ0FDt3btXa9euVXx8/F2XMgVyG0OJAAA5UrVq1XQnT0qSo6OjPD091a5dO7377rtydHTM4+qA/K1Bgwb6+uuvVbNmzVTbw8PD9eabbyo0NNSgylCQMZQIAJAjo0aN0syZMzVgwADVrVtXVqtVv//+u4KCgvT888+rcuXKmjVrlqxWqz788EOjywXyFRcXl3S3m0wmmc3mPK4GuIUeAwBAjrRp00aBgYFq3rx5qu2//vqrRo0apY0bN+rAgQMaMGCAgoODDaoSyJ8CAwP1xx9/aPLkyfL29pYkHT9+XEOGDFGFChU0depUgytEQUSPAQAgRy5duqRSpUql2e7h4aF//vlHkuTp6anY2Ni8Lg3I9z788EP169dP7dq1sy1PGhsbqxYtWigwMNDg6lBQEQwAADnStGlTjR49WpMmTVL58uUlSadPn9b48ePVuHFjmc1mrVixQpUrVza4UiB/CQ8PV5UqVbRo0SL98ccfioyMVGJiory8vOTj42N0eSjAWJUIAJAj48aNk7Ozs5588kk1bNhQPj4+atu2rVxcXDR27Fht375dS5cuVUBAgNGlAvlCcnKyPvzwQ3Xt2lW//fabJKlKlSpq3769tm/frh49emj48OHMMYBhmGMAAPhXTpw4oWPHjsne3l6PPvqoHnnkEUnSzZs35ezsfNeVi4CCZu7cuVq8eLGmTZumhg0b
ptm/e/duvffee+rdu7deffXVvC8QBR7BAACQY1arVVFRUUpKSkqzr1y5cgZUBORf7dq10/vvv682bdrctc3q1as1b948/fTTT3lYGXALcwwAADmyfft2jRgxwjbROIXVapXJZFJERIRBlQH504ULF1S9evUM2/j4+Gj06NF5VBGQGsEAAJAj48ePV7169dSnTx/bqioA7q5kyZI6e/asypYte9c258+fV/HixfOwKuB/CAYAgByJiorSvHnzGDIEZFGbNm0UFBSk+vXrp3s38KSkJM2cOTPNvUGAvMIcAwBAjrzxxhvq0KGDOnfubHQpwH3hxo0b6tKli5ycnNSjRw/VrFlTRYoUUXR0tMLDw7VkyRIlJCTou+++k6enp9HlogAiGAAAcmT27Nn68ssv1axZM5UvXz7NFdB3333XoMqA/Cs6OlpTp07VunXrFB8fL+nWvBx3d3d16NBB/fr1k4eHh8FVoqAiGAAAcqRHjx533WcymfTNN9/kYTXA/SUxMVFnzpzR9evXVbx4cZUvX152dtxeCsYiGAAAAABg8jEAIOtWrFihZ555Rk5OTlqxYsVd25lMJj3//PN5WBkA4N+ixwAAkGWtW7fWypUrVbx4cbVu3fqu7UwmkzZv3pyHlQEA/i2CAQAAAACGEgEAsm7Pnj1ZbtuwYcNcrAQAcK/RYwAAyLKqVaum+tpkMslqtcrZ2Vn29vaKi4uTvb29ChcurLCwMIOqBADkBD0GAIAsO3z4sO3/q1at0sqVKzV27Fg99thjkqRTp04pMDBQrVq1MqpEAEAO0WMAAMiRxx9/XPPnz0/Ti3Ds2DH16NFDoaGhBlUGAMgJ7qQBAMixqKioNNuOHz8uJycnA6oBAPwbDCUCAORI9+7dNXjwYPXs2VNVqlSRJB06dEiLFy/WO++8Y3B1AIDsYigRACDHli1bpuXLlysyMlKS9Nhjj+mll15Sp06dDK4MAJBdBAMAAAAADCUCAOTMp59+muH+d999N48qAQDcCwQDAECO7N27N9XXZrNZZ8+e1fXr19WuXTuDqgIA5BTBAACQI4sWLUp3++TJk5WcnJzH1QAA/i3mGAAA7qkzZ87o2WefTdOjAADI37iPAQDgntq6datcXFyMLgMAkE0MJQIA5EiLFi1kMplSbYuNjVVMTIwCAgIMqgoAkFMMJQIA5MiqVatSBQOTySRHR0fVrFlTFSpUMLAyAEBOEAwAAAAAMJQIAJB13bt3TzN86G6WLFmSy9UAAO4lggEAIMsef/xxo0sAAOQShhIBAHIsLi5O169fV+nSpSVJwcHBqlu3rtzc3AyuDACQXSxXCgDIkYMHD6pVq1ZavHixbdvEiRP11FNPKSIiwsDKAAA5QY8BACBH/vOf/8jX11eDBg1KNe9g6tSpOnDggL799lsDqwMAZBc9BgCAHDl27Ji6du2aZjJy165ddeTIEYOqAgDkFMEAAJAjXl5eCgkJSbM9NDRUnp6eBlQEAPg3WJUIAJAjffr0UUBAgPbv368aNWpIkiIiIrRu3TqNGTPG4OoAANnFHAMAQI7t2rVLS5cuVWRkpBwdHVWhQgX17NlT9evXN7o0AEA2EQwAAAAAMJQIAJAziYmJWrVqlQ4dOqTk5GTdeZ1pypQpBlUGAMgJggEAIEeGDRumX375Rc2aNeOGZgDwAGAoEQAgR+rVq6eZM2eqadOmRpcCALgHWK4UAJAjRYsWValSpYwuAwBwjxAMAAA50q9fP40fP15//fWXEhISZLFYUv0DANxfGEoEAMiRFi1a6PLlyzKbzenuj4iIyOOKAAD/BsEAAJAjYWFhGe739fXNo0oAAPcCwQAA8K9ERkYqMjJSZrNZ3t7eqlq1qtElAQBygOVKAQA5Eh0drYCAAG3btk3u7u4ym82KjY2Vj4+PZs+erSJFihhdIgAgG5h8DADIkbFjx+rixYtat26dQkNDtXfvXq1du1bx8fGaOHGi0eUBALKJoUQAgBxp0KCBvv76
a9WsWTPV9vDwcL355psKDQ01qDIAQE7QYwAAyBEXF5d0t5tMpruuVAQAyL8IBgCAHGndurXGjBmjEydO2LYdP35cY8eOVatWrQysDACQEwwlAgDkyPXr19WvXz/t2bNHbm5ukqTY2Fi1aNFCU6ZMUdGiRQ2uEACQHQQDAMC/8scffygyMlIuLi7y9vaWt7e30SUBAHKAoUQAgGwLDw9XQkKCJKlKlSpq3769rFaroqOjDa4MAJBTBAMAQJYlJyfrww8/VNeuXfXbb7+l2vfTTz/pxRdf1PDhw5l8DAD3IYIBACDL5s+fr9DQUH3zzTfy9fVNtW/GjBlasGCBNm/erEWLFhlUIQAgp5hjAADIsnbt2un9999XmzZt7tpm9erVmjdvnn766ac8rAwA8G/RYwAAyLILFy6oevXqGbbx8fHR2bNn86giAMC9QjAAAGRZyZIlMz3pP3/+vIoXL55HFQEA7hWCAQAgy9q0aaOgoCAlJSWluz8pKUkzZ85U8+bN87gyAMC/xRwDAECW3bhxQ126dJGTk5N69OihmjVrqkiRIoqOjlZ4eLiWLFmihIQEfffdd/L09DS6XABANhAMAADZEh0dralTp2rdunWKj4+XJFmtVrm7u6tDhw7q16+fPDw8DK4SAJBdBAMAQI4kJibqzJkzun79uooXL67y5cvLzo4RqgBwvyIYAAAAAGDyMQAAAACCAQAAAAARDAAAAACIYAAAAABABAMAAAAAkv4PsRwyQTi3wOsAAAAASUVORK5CYII=\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plt.figure(figsize=(13,7))\n",
+ "\n",
+ "sns.countplot(x=df['discourse_type'],palette='Set3',\n",
+ " order=df['discourse_type'].value_counts().index);\n",
+ "\n",
+ "plt.xticks(rotation=90,size=14,);\n",
+ "plt.xlabel(None)\n",
+ "plt.ylabel(None)\n",
+ "\n",
+ "plt.title('Distribution of discourse types in the training set',size=16,pad=20,weight='bold');"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e4834267",
+ "metadata": {
+ "hidden": true
+ },
+ "source": [
+ " __Learning__ \n",
+ " \n",
+ "The classes are __imbalanced__. Plan of action:\n",
+ " \n",
+ " Evaluate the impact of training on an imbalanced dataset on the results. One hypothesis is that it might not have a negative impact, since essays naturally have some classes that are longer than others. \n",
+ " Create a balanced dataset and iterate. One way to do so would be to compute the length per discourse type for each essay, take the standard deviation (std) of these lengths, and train on the essays with a lower std. \n",
+ " \n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "140623ae",
+ "metadata": {
+ "hidden": true
+ },
+ "source": [
+ "### Distribution of discourse types across essays, on average"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "bb9e98a4",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T16:30:14.444030Z",
+ "start_time": "2022-02-15T16:30:13.456024Z"
+ },
+ "hidden": true,
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAsgAAAHwCAYAAAC7apkrAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAABpRElEQVR4nO3deVxN+eMG8Oe2R2mTSJYKFU2pZM2WPYxsw5eRsYvBWCYylqzZM3YZ2xDGEsNM9p0hlMoQKutYUkJadFvu7w+vzu9eFXdG3aPb8369vLjnnu59Ojf19Lmf8zkSmUwmAxERERERAQA0xA5ARERERPQlYUEmIiIiIpLDgkxEREREJIcFmYiIiIhIDgsyEREREZEcFmQiIhJFXl6e2BGIiArFgkxUygwZMgR2dnZwcHBAYmKi2HFEceXKFdjZ2SEoKAgA0L9/f9jZ2eHNmzef9biZmZmYOHEiXFxc4OLighkzZij9sVOmTIGdnR2mTJkCAAgPD4ednR3s7Ow+K5M6ysrKwooVK/DLL7+IHYWIqFAsyESlSGJiIi5dugTg/ehbaGioyInEcePGDQDAV199hby8PNy6dQs1atSAkZHRZz3uoUOH8McffyAjIwNaWlrQ0dH5z4+lo6MDCwsLWFhYfFYmdTRu3DisXr0aWVlZYkchIiqUltgBiEh5Bw8eRG5uLnR1dZGVlYX9+/fD19dX7FgqJ1+Q7927h4yMDHz11Vef/bgvXrwAADg6OmLfvn2f9VguLi44d+7cZ2dSR+np6WJHICL6KI4gE5UiBw4cAABMmDAB2traePjwIa5cuSLc37FjR9jZ2WHr1q0KH9epUyfY2dlh9+7dAIDk5GRMnDgRDRo0gIuLCwYPHoxbt24J+4eGhsLOzg5DhgxBQEAAnJ2d0aNHD8hkMjx//hw//PADmjZtCkdHR7Rq1QoLFiyAVCoVPv7ly5cYP348XFxc4OHhgQ0bNmDJkiWws7PDypUrhf1u3LiBAQMGwMnJCY0bN4a/vz9SUlKK/PzzpywcPnwYANCiRQt07twZAPDHH38I0xsKk5iYCH9/fzRr1gxfffUVvL29cejQIeH+AQMGCNn+/vtv2NnZITw8vNDHSk1NxeTJk+Hm5obGjRtj1apV+PCipIVNsYiLi8PIkSPRrFkzODk5oUOHDli/fn2Bj922bRs6duwIR0dHtGzZEnPnzkVaWprCPr///jt69OgBZ2dnNGnSBP7+/kLBL+r5AcDT0xN2dnbCuw8rV66EnZ0d/P39MWbMGDg7O2PUqFEA3n8ddOvWDS4uLmjYsCEGDBiAq1evKjzew4cPMXLkSLi4uKBBgwYYM2YMHj9+XOTrMGDAAOFrdtWqVcLraWdnBxcXF7x7907Yd8eOHbCzs4OXlxeA/5/GsnbtWqxYsQJNmjSBm5sbpk+fjszMTIXn2bJlC9q2bQtHR0d07NgR27dvLzJT/uea/zUfFhaGDh06wMnJCUOGDMGjR48U9r1w4QJ69OgBR0dHeHh4YO7cucjIyFD4HO3s7LB+/Xp069YNrq6uCA4OLvR5c3JyEBQUhBYtWuCrr75Ct27dEBYWprDP6dOn8c0336BBgwZwc3NDr169cOLEiQL5P/VaXbt2Dd9++y0aNGgAZ2dndO7cGbt27QIAxMTECF8vT58+FT7m7NmzsLOzg7u7u8L/cSJ1x4JMVErExMQgPj4e+vr66N27N1q0aAEACiOd3bt3BwChQAJAbGws7t27B11dXXTq1Anv3r2Dj48P/vjjD2RlZUFTUxMXL15E//79cf/+fYXnDA8Px65du6ClpQVbW1tIJBL4+vri8OHDSEtLQ/ny5fHs2TNs3rwZGzduBPD+B/7QoUMRFhaGjIwMZGRkYMmSJTh48KDCY8fHxwtlSUtLCxkZGQgNDcXAgQOL/EFsYWEBc3NzAIC+vj4sLCygr68PAKhYsWKRUyxevnyJXr16ITQ0FC9fvoSWlhZiY2Mx
adIkbNiwAQBgYmKC8uXLAwC0tbVhYWFR5BSL0aNH48CBA0hLS4NUKsXKlStx9OjRQvfN9+7dOwwePBinT5/G27dvoa+vjwcPHmDZsmVCBgBYvnw55s6di/v370NXVxcvXrzAtm3b8MMPPwj7rFmzBn5+frh58yY0NDTw6tUrhIaGom/fvh/9BeNjDh06hJMnT0JDQwN16tTBiRMn4O/vj9u3b0NHRwdSqRRXrlzBsGHDhAKcnJyMfv364fTp05DJZMjNzcWxY8fwv//9D69evSr0eUxMTKCtrQ0AKF++PCwsLNCmTRsYGRkhIyNDYdT92LFjAICvv/5a4TG2b98uTNFIS0vD7t27MXHiROH+VatWITAwEP/88w/Kly+PBw8eYM6cOVi3bt0nj0NsbCwmTJiAFy9eQCqV4sKFCxg4cKDwC8qlS5cwfPhw3Lx5E/r6+nj9+jW2bduGMWPGFHislStX4sGDB5BKpUW+wzF9+nSsW7cOSUlJKFeuHG7fvo3x48cLv7zdvHkTo0ePRnR0NGQyGWQyGW7cuIGxY8ciMjISAJR6rRITEzF8+HBcvXoVMpkMGhoaiI+Px8yZMxEREQEnJyfY2toCgMLXcv5r0LFjx8+ackRU2rAgE5US+aPHbdq0Qfny5eHt7Q3g/Q+z/B/e3bp1g4aGBqKiooRRoD///BPA+5FDQ0ND7N+/HwkJCXBxccHly5dx9epVjBgxAhkZGQpFDQCys7MRGBiIa9euwd/fH0lJSbCwsICjoyPOnz+P8PBwDB06FAAQHR0NADhz5gxu3boFDQ0NbNy4EZGRkVi9enWBEwpXr16NzMxMDBw4ENeuXUN4eDgaNWqEu3fvFhhBy3fu3DksW7YMADBw4ECcO3cOTk5O0NDQwNGjR+Hv71/ox61cuRIvXryAlZUVTpw4gcjISIwbNw4A8PPPP+P58+dYsWIFBg0aBOD/p0e4uLgUeKzo6GhhBDQwMBCRkZHYsmXLJ+fTJiQk4MWLF6hUqRKuXr2K8PBwBAQEoFmzZtDU1AQAvH79WvhFw9/fHxEREdi3bx+0tLQQERGB+/fvIzExEatXrwYAoSSdOHECVlZWePLkCVasWPHRHEXJzs7Gli1bcPXqVQwaNAiXL18GAAwaNAjh4eEIDw9Hhw4d0Lp1ayQlJQF4P0qbnJyMDh064MqVK7h69Sq+/vprJCUlYceOHYU+z4oVK4TjOmjQIJw7dw46Ojro1KkTgP8vZ69fv8bVq1chkUjQpUsXhcdISUnBli1bEBkZiXnz5gEATp48idjYWKSmpiI4OBgaGhrYs2cPwsPD8fvvv0NbWxvBwcEFRpo/9PLlS4wePRqRkZE4cOAAypUrh6dPn2Lv3r0AgKCgIOTm5sLf3x9Xr17FxYsXYWtriwsXLuDatWsKj2VoaIjTp0/jwoULaNiwYYHnSkhIQGhoKCpUqIBjx44hPDxc+D/4888/A3h/Qmpubi46dOiAa9eu4erVq+jbty88PT3x+vVrAFDqtXrw4AHq1auHLl26CK9V/usQExMDAArfUwAgNzcXp06dAgB07dr1o8eNSN1wDjJRKSCVSoWimz+a1qpVKxgbG+P169cICwvDN998g8qVK6Nx48b466+/cPjwYeHtYuD/f/jll7s7d+4IpSR/xPbDKQWampro3LkzJBIJTE1NAQDr1q1DTk4Obt68ievXrwsnDea/xRwREQEAaNCgATw8PAAAbdu2hYuLC65fvy48dn6O33//HUeOHAEAoeiHh4cLeT8UFRUFAKhfvz7y8vLw999/w9bWFgYGBkUev9OnTwN4vwKIlZUVAGDkyJHYsWMHkpKScP78efTu3bvIj5eX/4tAtWrV0KNHDwAQ3ur/8C1teTVr1oShoSFevHiBPn36oEWLFmjYsCHWrVsnjMxFR0dDKpVCV1cXPj4+AIC6devi2LFjqFKlilD6cnJyYG5uDl9fX0gkElhZWWHIkCGYNWsWTp8+jYCAAKU+F3mV
K1cWSpyRkZEw4hkSEoKHDx+iSZMmGDt2LGrVqiV8TP5reOnSJbRt2xYAhCkS4eHhGD16tNLP3717d+zatQunT5+GVCrFqVOnkJOTA1dXV+E1y+fu7o4mTZoAAHr16oXVq1fj6dOniIyMRFJSErKysqChoaHw/Hl5eUhPT8fff/8Nd3f3InPo6Ohg5MiRkEgksLe3R/v27XHgwAFERkaiT58+wvz3DRs2YNOmTQDeT7kB3hfVBg0aCI/l4eEh/L8pTP7xy8zMRP/+/RXue/z4MZ4+fYqvvvoKEokEJ06cwJAhQ9CkSRP06tVLYURamdeqUaNGaNSoEdLS0nD58mVERETgyZMnAP5/Tni3bt0QFBSEqKgoJCYm4sGDB0hJSUGVKlU+esyI1BELMlEpcPr0aWG0aPjw4QXu37dvH7755hsA74twfkFu0KABnjx5AjMzM6Gs5i+Flj/9QZ78HFbgfVH68G3VtWvXYtOmTUhNTYWlpSVMTEwAQJhHm19yK1WqpPBxVapUUSjI+TnyP6+P5cgnP5925MiRwr/j4uJgZ2eH7t27Y8GCBQU+7uXLlwCgULQ0NDRgaWmJpKQk4X5l5H9+FStWVNj+4ef7ofLly+OXX37BvHnzEBMTg9jYWKxfvx7Gxsb46aef8PXXXwvHokKFCtDQ+P83+KpWrSr8O38KhaWlpcI++Z/bpz6XnJycQrfnT13J161bNzx//hxbt27FqVOnhJFEJycnLF++HFWrVhVew9TUVKEk5ivqNSxK/fr1UbNmTTx48ADnz58X3tovbOSysGP/9OlTpKamCpny8vIKXQbxU7mMjIyEKSD5jw0Ab9++RWpqqrB2c3Jy8icf+8Nj+qH8rNnZ2UVmbdCgAebOnYs1a9bg4sWLuHjxIgDAxsYGy5Ytg4ODg1KvVXp6OmbOnIkjR44gJycHtWvXFv5v5//ftbCwQNOmTXHhwgUcPXpUmHud/0syUVnCgkxUCuzfv/+j90dFRSEhIQG2trZo3749Zs2ahb///ls4Aadz587Q0nr/393MzAzA+xOJpk2bBuD9urTa2toKhQsA9PT0FG6fPXsWy5cvh5mZGcLCwmBra4vdu3dj+vTpwj75j//8+XOFj3327JnCbTMzMzx//hwrV65E+/btAbwv7eXKlSvy87SwsMCLFy+gqakJMzMzZGZmIjU1FcbGxtDV1S1yDnLFihXx7NkzYcQMeF+g8qehfFi4Pib/OT4sQ8qsSV2/fn1s2bIFb9++xeXLl3H69GkcOXIEU6dOhaenJ4yNjQG8/6VBKpUKBebw4cMwNDSEk5OTcHyfPn2KvLw84TX7559/FD4X+ddS/rHevn1baLYPX2sAGDx4MHx8fBAXF4dr165hz549iImJwZIlSxAUFAQzMzM8ePAAU6ZMEaanZGZmQk9P7z8Vqu7duyMoKAihoaG4ePEitLW1hXc55Mm/jgCEaQTGxsbC529gYCC8mwF8+msrX0pKisK+8o9tamoKDQ0NYYnFevXqAXg/Aps/f11eYcdUXv5rWadOHWHOcW5uLnJycqCrqyvs16NHD3Tu3BmPHj3CtWvXcODAAcTExGDmzJnCibefeq1Wr16NQ4cOoXHjxggKCoKpqSkmTpwofN3k8/b2xoULF3D48GHh/wenV1BZxDnIRF+4ly9f4vz58wCA4OBgREZGKvypXr06AAhzJPX19dGhQwfIZDKhWMtPV3BzcwPwvnQ9fvwYeXl5mDp1KlxcXDB37lyF5/6w5Ny9exfA+5PYKlWqhLS0NOEHe/7IWv7jR0RECKNdR44cURg9lt9v27ZtSE9PR1paGrp3745GjRoprC4hL3+aSePGjXHu3Dn07dsXwPv5zOfOnStyDnL+6PmmTZvw5MkTyGQyBAcHIykpCdra2mjevHmhH1eY/LfQnzx5Ipwgef78eYUyVpjDhw/D3d1dKHze3t7CEn3Z2dlIS0uDs7MzdHR0kJ2dLbx9f+fOHfj5+WHIkCGIj49H
s2bNIJFIkJSUhODgYMhkMjx9+lTYv02bNgDej0Lnyz+Z69ixYwXeNcj34Ws9btw44Wuibt26GDRokHBiaP4JePmv4d69e/Hy5UtIpVIMGzYMbm5uH70ISP4va2lpaQoj2vlz6E+cOAGpVAoPDw/hHQp5UVFRwioOf/zxh1CY3dzcULduXejr6yMtLQ0hISEA3r8+rq6u6NChQ4GR7g/l5uZixYoVyMvLQ0JCAo4fPy48tra2NpycnAC8/1rKH/n19PREs2bNCkxR+tQvCa6urpBIJIiLixOmAe3ZswcuLi745ptvkJubiwULFqB+/foYM2YMatSogf79+wtzsvNfB2Veq7i4OADv38kwNjbG/fv3hbnL8lc0bNeuHQwMDBAZGYnnz5+jTp06sLe3/+jnQaSOOIJM9IU7dOgQcnJyYGJiAg8PD+GErnydOnXC+vXrcfDgQUycOBFaWlro3r07QkNDIZPJULt2bWGkC3hfzH755Rc8fvxY+GH49u1baGtro2PHjh/NUr9+fQDvR4ebN2+O3NxcYf5y/tQDDw8PODo64u+//8bgwYNRvnx5pKenw8TEBK9evRJKw/Dhw3H8+HFcuXIFjRs3hqamJjIzM1G5cmWh0H7o5s2bkMlkcHR0BADhZMC6det+NPf333+PU6dO4dGjR2jTpg309fWFojhhwoR/dTEPOzs7tG/fHseOHcPUqVOFJb7Mzc2F0cbCNGvWDIaGhnjy5Ak8PT1hZGQkTKlo1KgRKleuDAAYOnQo1qxZg6CgIGzYsAEZGRnIy8tD06ZN4erqKhy79evXIygoCOvXr0dmZiZkMhmqVq0qzLutVauWkGno0KGoUaMGHj58+Mmc+bp06YKjR49i7969CAsLg4aGhvAa5//CNWDAAOzevRvx8fFo0aIFdHV1kZ6eDgMDA2FOcmHyp4z8+uuv2L17N86ePYsKFSqgSpUqaNSokTCvvaiRy3LlymH06NHC1xYAdOjQAXXq1BFyBQcHY/bs2QgKCkJaWhpkMhkaN26s8ItDYXR0dLBt2zb89ttvCsc1/3MeNWoURowYgT/++AMnT55ETk4OsrOzYWdnJ7w+yrK1tYWXlxf+/PNPjBw5EkZGRsK0i7Zt20JTUxOdOnVCSEgIzp8/j4YNGworZwDvf6EAlHut6tevj3PnzuHkyZNo2LAh0tPThWIsv4Sgnp4eOnbsKPzCzdFjKqs4gkz0hcsfBfb09CxQjgEIa8QmJyfjzJkzAN6fxJQ/JzX/h2g+PT09bNu2DV26dIGRkRFycnLg7OyM9evXK5xgVBh3d3dMnz4dlpaWkEgkqF27NhYuXAgNDQ3ExcUhOTkZEokE69atQ7t27aCvrw9DQ0MEBASgcePGwvMDgL29PbZs2YKGDRsKV61r06YNfv3110JHDQEIazXLF2QbG5tPvnVeuXJlhIaGokePHqhYsSKys7Ph4OCApUuXYvDgwR/92MIsWrQIffr0gYGBAfT09DB69Gh8++23H/2YChUqYPv27ejevTsqVqyItLQ0VK1aFQMHDsSqVauE/caNG4effvoJNWvWRFZWFipXrgwfHx+F1SkmTJiABQsWwNHREXl5eTA2NkaPHj3w22+/CSeFaWpqYs2aNahXrx4kEgm0tLSwevVqpS993a5dO6xduxaurq7CiK+joyOWLFkilK5KlSohJCQErVu3Fl7XJk2aYMuWLahZs2aRj+3j44N69epBS0sL5ubmCitLNG3aFMD7Euzp6Vnox3fo0AE//PADdHR0YGhoiD59+mDRokXC/ePHj8ekSZNQs2ZNvHv3DpaWlhg9erTCVKCimJubY+XKlTA3N4eOjg48PDywdetW4STQli1bYvXq1XBycoJMJoOhoSG6deuGTZs2KcxdVlZgYCCGDx8OS0tLZGRkoGbNmpg2bZpwroGzszO2bNkCDw8PlCtXDlKpFHXq1MG0adOE9aqVea2GDh2KPn36wNjYGBoaGmjatCkmTZoE4P9X
wcjXrFkzAO9HwPPXGScqaySyD1eoJyL6DDdv3sT+/fthbGyMTp06wdbWFqmpqejduzcePHiA5cuXFzqvlCgjIwPffPMN4uLi4O3tjYULFyrcP2XKFOzfv7/IkzE/R2hoKPz9/VG1alXhJLeyKCcnB8OGDcNff/0Fd3f3T15ghUhdcYoFERUrMzMz7N27F5mZmVi7di2MjY3x5s0bZGdnw8TE5F/N96Wyo2PHjkhMTERGRgY0NDQwYMAAsSOVOd999x1u3bolTPMYOHCgyImIxMMpFkRUrCpXrox169YJ8yVTUlJQrlw5tGrVClu2bPnoesVUdpmbmyMnJwc1a9bEkiVLhGk0pDoWFhbIzMyEpaUlpk6dinbt2okdiUg0nGJBRERERCSHI8hERERERHJYkImIiIiI5LAgExERERHJYUEmIiIiIpLDgkxEREREJIcFmYiIiIhIDgsyEREREZEcFmQiIiIiIjksyEREREREcliQiYiIiIjksCATEREREclhQSYiIiIiksOCTEREREQkhwWZiIiIiEgOCzIRERERkRwWZCIiIiIiOVqqfsKEhATMmTMH0dHRMDY2Rv/+/TF06NBC9x0yZAguXLigsG316tVo27btR58jKioKurq6xZaZiIiIiNRPVlYW6tevX2C7SgtydnY2hg0bhkaNGmHWrFm4d+8eJk6ciEqVKuHrr78usH9cXByCgoLg7u4ubDMyMvrk8+jq6sLBwaFYsxMRERGReomNjS10u0oLcmJiIpycnDBz5kzo6emhRo0aaNq0Ka5evVqgIKelpQn7m5ubqzImEREREZVhKp2DbGVlheXLl0NPTw8ymQwRERG4evUqmjRpUmDf+Ph46OrqwtLSUpURiYiIiKiMU/kc5HwtWrTAixcv0Lp1a3To0KHA/fHx8ahQoQLGjx+PiIgIVK5cGWPGjEHLli1FSEtEREREZYVoBXnNmjV48eIFAgICEBgYiGnTpincn5CQgPT0dHh6esLX1xfHjx/HyJEjsWvXLjg7O3/0sbOysoqcU0JERERE9DESmUwmEzPAn3/+icmTJyMyMhI6OjrC9tzcXKSnp6NChQrCtpEjR8LExASBgYEffczY2FiepEdEREREH1VUZ1TpHOTExEScPHlSYZutrS2ys7ORlpamsF1TU1OhHAOAjY0NXrx4UeI5iYiIiKjsUmlBTkhIwJgxY/Dy5Uth282bN2FqagpTU1OFfceOHYuAgACFbbGxsbC2tlZFVCIiIiIqo1RakN3d3WFra4spU6YgISEBp0+fxtKlSzFy5EgAQFJSEt69ewcA8PT0xL59+3Do0CE8ePAAK1asQEREBHx8fFQZmYiIiIjKGJWepKetrY3g4GDMmjULvXv3Rvny5TFw4ECh9Hp4eCAwMBA9evSAt7c30tLSsGLFCjx//hx16tTBxo0bUb16dVVGJiIiIqIyRvST9EoCT9IjIiIiok/5Ik7SIyIiIiL60rEgfyEuX76MCRMm4PLly2JHISIiIirTRLtQCCnasmUL4uLikJGRgcaNG4sdh4iIiKjM4gjyFyIjI0PhbyIiIiISR5ktyDl5eWJHKDV4rIiIiKgsKbNTLLQ0NLD26jmxYwjeZGUKf39JuQDA172F2BGIiIiIVKbMjiATERERERWGBfkLoaWrq/A3EREREYmDBfkLUa9dK5jb1ES9dq3EjkJERERUppXZOchfmir2dVDFvo7YMYiIiIjKPI4gExERERHJYUEmIiIiIpLDgkxEREREJIcFmYiIiIhIDgsyEREREZEcFmQiIiIiIjksyEREREREcliQiYiIiIjksCATEREREclhQSYiIiIiksOCTEREREQkhwWZiIiIiEgOCzIRERERkRwWZCIiIiIiOSzIRERERERyWJCJiIiIiOSwIBMRERERyWFBJiIiIiKSw4JMRERERCSHBZmIiIiISA4LMhERERGRHJUX5ISEBHz33XdwcXFB69at8csvvxS57+3bt9Gn
Tx84OzujR48eiImJUWFSIiIiIiqLVFqQs7OzMWzYMFSpUgUHDhzAjBkzsGbNGhw8eLDAvhkZGRg6dCicnZ0RGhoKNzc3jBgxAmlpaaqMTERERERljEoLcmJiIpycnDBz5kzUqFEDrVu3RtOmTXH16tUC+4aFhUFbWxtTpkyBra0tpk6dCkNDQxw+fFiVkYmIiIiojFFpQbayssLy5cuhp6cHmUyGiIgIXL16FU2aNCmwb3R0NFxdXaGh8T6iRCKBq6srrl+/rsrIRERERFTGaIn1xC1atMCLFy/QunVrdOjQocD9SUlJsLa2VthmZmaG27dvqyoiEREREZVBohXkNWvW4MWLFwgICEBgYCCmTZumcH9mZiZ0dHQUtuno6EAqlX7ysbOyshAbG/vRfRwcHP596DLsU8eTiIiISF2IVpC/+uorAMC7d+8wefJk+Pn5KRRiXV3dAmVYKpVCT0/vk4+tq6vLAlzMeDyJiIhI3RQ1AKjyk/ROnjypsM3W1hbZ2dkFVqewsLBAUlKSwrbk5GSYm5uXeE4qOy5fvowJEybg8uXLYkchIiKiL4RKC3JCQgLGjBmDly9fCttu3rwJU1NTmJqaKuzr7OyM69evQyaTAQBkMhkiIyNRv359VUYmNbdlyxZER0djy5YtYkchIiKiL4RKC7K7uztsbW0xZcoUJCQk4PTp01i6dClGjhwJ4P2Jee/evQMAdOzYERkZGZgzZw7i4+MRGBiI9PR0eHl5qTIyqbmMjAyFv4mIiIhUWpC1tbURHBwMTU1N9O7dGzNmzMDAgQPh4+MDAPDw8EBYWBgAwMDAAOvXr8f169fRvXt3REZGIjg4GAYGBqqMTMVIJssVO0KpwWNFREQkHpWfpFelShWsW7eu0Pvu3LmjcNvJyQn79+9XRSxSAYlEE5mZJz+9owrp6eUKf39J2fT124gdgYiIqMxS6Qgy0Zfm229b4KuvquPbb1uIHYWIiIi+EKIt80b0JXB3rwV391pixyAiIqIvCEeQiYiIiIjksCATEREREclhQSYiIiIiksOCTEREREQkhwWZiIiIiEgOCzIRERERkRwWZCIiIiIiOSzIRERERERyWJCJiIiIiOSwIBMRERERyWFBJiIiIiKSw4JMRERERCSHBZmIiIiISA4LMhERERGRHBZkIiIiIiI5LMhERERERHJYkImIiIiI5LAgExERERHJYUEmIiIiIpLDgkxEREREJIcFmYiIiIhIDgsyEREREZEcFmQiIiIiIjksyEREREREcliQiYiIiIjksCATEREREclhQSYiIiIiksOCTEREREQkhwWZiIiIiEiOygvyo0ePMHLkSLi7u6NFixZYsGABsrKyCt13yJAhsLOzU/hz4sQJFScmIiIiorJES5VPJpVKMXLkSNSqVQu7du3Cy5cvMXXqVADAlClTCuwfFxeHoKAguLu7C9uMjIxUlpeIiIiIyh6VjiDHxMTg0aNHCAwMhK2tLRo2bIhx48bh0KFDBfZNS0tDYmIinJycYG5uLvzR0dFRZWQiIiIiKmNUWpBtbGwQHByM8uXLC9skEglSU1ML7BsfHw9dXV1YWlqqMiIRERERlXEqLcimpqZo2rSpcDsvLw/bt29X2JYvPj4eFSpUwPjx4+Hh4YFevXrh7NmzqoxLRERERGWQSucgfygwMBCxsbHYu3dvgfsSEhKQnp4OT09P+Pr64vjx4xg5ciR27doFZ2fnjz5uVlYWYmNjP7qPg4PDZ2Uvaz51PJXBY/7vFMcxJyIion9PlIIsk8kwb9487Ny5Ez///DNq165dYJ9JkybB19cXFSpUAADY29vj5s2bShVkXV1dlrFixuOpejzmREREJauowSiVL/OWl5eHqVOnYteuXQgKCkLbtm0L3U9TU1Mox/lsbGzw4sULVcQkIiIiojJK5QV5wYIFOHToEFauXIn27dsXud/YsWMREBCgsC02NhbW1tYlnJCIiIiIyjKVFuSoqChs3boVY8eO
haOjI5KSkoQ/AJCUlIR3794BADw9PbFv3z4cOnQIDx48wIoVKxAREQEfHx9VRiYiIiKiMkalc5CPHj0KAFi6dCmWLl2qcN/Nmzfh4eGBwMBA9OjRA97e3khLS8OKFSvw/Plz1KlTBxs3bkT16tVVGZmIiIiIyhiJTCaTiR2iuMXGxip1gtPaq+dUkKb083VvUWyPlZl5stgeS53p67cROwIREZHaK6ozqnwOMhERERHRl4wFmYiIiIhIDgsyEREREZEcFmQiIiIiIjksyEREREREcliQiYiIiIjksCATEREREclR+kIhjx8/xrp163Dp0iUkJydj586dOHjwIGrVqoXevXuXZEYiIiIiIpVRagQ5NjYW3bt3R0REBDw9PZGdnQ0AkMlkmDFjBg4cOFCSGYmIiIiIVEapEeT58+ejfv36CA4ORl5eHrZv3w4AmDp1KrKzs7F582Z4e3uXZE4iIiIiIpVQagQ5JiYGAwYMgIaGBiQSicJ9HTt2xMOHD0skHBERERGRqilVkA0MDJCUlFTofc+ePYOBgUGxhiIiIiIiEotSBbljx45YtmwZLl++DJlMBgCQSCS4f/8+Vq1ahTZt2pRoSCIiIiIiVVFqDvKkSZMQHx+P7777Dvr6+gCA4cOHIyUlBY6Ojvjxxx9LNCQRERERkaooVZD19fWxdetWnD9/HleuXMHr169hYGAANzc3eHp6QkODyykTERERkXpQeh1kAGjevDmaN28OAMjOzkZaWhrLMRERERGpFaXarVQqRVBQEH7//XcAwIULF9CsWTM0bdoUAwYMQEpKSomGJCIiIiJSFaUK8tKlS7Flyxbk5OQAAAICAmBhYYHAwEAkJSVh0aJFJRqSiIiIiEhVlJpiceTIEfj7+6Nnz56IiYnBP//8g2XLlsHLywt6enoICAgo4ZhERERERKqh1Ajyq1evUKtWLQDAmTNnoKWlhRYtWgAAjIyMkJWVVXIJiYiIiIhUSKmCXL16dURGRiI7OxuHDx+Gm5ubcHGQsLAwWFtbl2hIIiIiIiJVUaogDxkyBMuXL0eTJk3w4MEDDBo0CADQp08f7Nu3D8OGDSvRkEREREREqqLUHOTu3bujWrVqiIyMhJubG9zc3AC8X/Zt0qRJcHd3L9GQRERERESqolRBHj16NAYOHIjhw4crbP/+++9LJBQRERERkViUmmLx119/IS8vr6SzEBERERGJTqmC7OnpiT179iAtLa2k8xARERERiUqpKRbv3r3D2bNnERYWBlNTU5iamircL5FIcPDgwRIJSERERESkSkoV5AoVKqBr164lnYWIiIiISHRKFeTAwMCSzkFERERE9EVQqiBfvXr1k/twqTciIiIiUgdKFeQBAwZAIpFAJpMpbJdIJMK/Y2NjizcZEREREZEIlCrIBw4cKLAtPT0d165dw86dO7FixQqln/DRo0eYP38+IiIioK+vDy8vL4wfPx66uroF9r19+zZmzpyJ27dvw9bWFgEBAXByclL6uYiIiIiI/i2lCrK9vX2h293c3KCrq4vFixdj27Ztn3wcqVSKkSNHolatWti1axdevnyJqVOnAgCmTJmisG9GRgaGDh0KLy8vzJ8/H7t27cKIESNw/PhxGBgYKBObiIiIiOhfU2od5I9xcHBATEyMUvvGxMTg0aNHCAwMhK2tLRo2bIhx48bh0KFDBfYNCwuDtrY2pkyZAltbW0ydOhWGhoY4fPjw50YmIiIiIirSZxXktLQ0hISEwNzcXKn9bWxsEBwcjPLlywvbJBIJUlNTC+wbHR0NV1dXaGhoCPu5urri+vXrnxOZiIiIiOijlJpi4eLionBCHgDIZDK8e/cOMpkM8+bNU+rJTE1N0bRpU+F2Xl4etm/frrAtX1JSEqytrRW2mZmZ4fbt20o9FxERERHRf6FUQR48eHCBggwABgYGaNGiBWxsbP7TkwcGBiI2NhZ79+4tcF9mZiZ0dHQUtuno6EAqlX7ycbOysj65qoaD
g8O/C1vGFccqJTzm/w5XhiEiIhKHUgV5zJgxxfqk+aPOO3fuxM8//4zatWsX2EdXV7dAGZZKpdDT0/vk4+vq6rKMFTMeT9XjMSciIipZRQ1GKVWQgfdTHjZt2oSrV68iLS0NxsbGcHNzg4+PDywsLJQOkpeXh59++gmHDh1CUFAQ2rZtW+h+FhYWSEpKUtiWnJys9HxnIiIiIqL/QqmT9B4+fAhvb2/s3r0blStXRqNGjWBmZoadO3fC29sbDx8+VPoJFyxYgEOHDmHlypVo3759kfs5Ozvj+vXrwsVJZDIZIiMjUb9+faWfi4iIiIjo31JqBHnhwoUwMzPD1q1bYWJiImxPSUnBkCFDsHTpUqUuFhIVFYWtW7di4sSJcHR0VBghNjc3R1JSEgwNDaGnp4eOHTti6dKlmDNnDvr164fdu3cjPT0dXl5e/+HTJCIiIiJSjlIjyJcvX8b333+vUI6B96tSjBw5EpcvX1bqyY4ePQoAWLp0KTw8PBT+5OTkwMPDA2FhYQDenwC4fv16XL9+Hd27d0dkZCSCg4N5kRAiIiIiKlFKjSDr6ekJ6xF/SCKRICcnR6knmzx5MiZPnlzk/Xfu3FG47eTkhP379yv12ERERERExUGpEWR3d3esWbMGb968Udj++vVrrF27Fg0bNiyRcEREREREqqbUCLKfnx969eoFT09PNGrUCBUrVkRycjLCw8OhpaWFJUuWlHROIiIiIiKVUGoEuWrVqjhw4AB69+6NFy9e4PLly0hKSkLv3r3x+++/w9bWtqRzEhERERGphNLrIFeqVAm9e/fGlClTAAAvX77EnTt3UKlSpRILR0RERESkakqNID979gxdu3bFiBEjhG03b97E4MGD0a9fP6SkpJRYQCIiIiIiVVKqIAcGBgIAVq1aJWxr0aIF/vzzT6Snp2PRokUlk46IiIiISMWUKsjh4eGYNGkS7O3tFbbb2tpi3LhxOHv2bImEIyIiIiJSNaUKMgBkZmYWuj0vLw9SqbTYAhERERERiUmpgty4cWOsXLkST58+Vdj+7NkzrFy5Ek2bNi2RcEREREREqqbUKhaTJ0/G//73P7Rv3x61a9eGqakpXr16hbt376JixYrCyhZERERERKWdUgXZ0tISf/75J/bt24eoqCi8efMGVlZW+Prrr9GzZ08YGhqWdE4iIiIiIpVQeh1kAwMDDBw4EAMHDizJPEREREREolJqDrJMJsPu3btx5swZAO/XQO7cuTNcXFwwZcqUIk/gIyIiIiIqbZQqyGvXrkVAQADu378PAJg6dSoyMjIwZMgQXLhwAUuXLi3RkEREREREqqJUQd6/fz9++OEHDBo0CHFxcbhz5w6+//57fP/995g0aRKOHDlS0jmJiIiIiFRCqYKcmJgIV1dXAMDp06ehoaGB1q1bAwCqVKmCtLS0kktIRERERKRCShXkypUrIz4+HgBw9OhRODo6wtTUFADw119/oWrVqiWXkIiIiIhIhZQqyN988w3mz58PLy8v3Lx5E/369QMAjB07FsHBwcJtIiIiIqLSTqll3oYOHQpzc3NERkbC19cXXbt2BQBUqFABCxYsQLdu3Uo0JBERERGRqii9DnK3bt0KFOG5c+cWeyAiIiIiIjEVWZDnzp2LwYMHw9LSUqkiPG3atGINRkREREQkhiIL8qlTp9CrVy9YWlri1KlTH30QiUTCgkxEREREauGjBbmwfxMRERERqTOlVrEgIiIiIiorihxBdnFxgUQiUfqBIiMjiyUQEREREZGYiizIgwcPFgryu3fvsGXLFtja2qJdu3YwNzfH69evcebMGdy6dQu+vr4qC0xEREREVJKKLMhjxowR/u3n54f27dtj2bJlCvuMGDEC06ZNQ3R0dMklJCIiIiJSIaXmIB8/fhw9evQo9L5OnTrh0qVLxRqKiIiIiEgsShVkExMTREVFFXrfxYsXYWFhUZyZiIiIiIhEo9SV9Pr3749ly5YhJSUFzZs3h4mJ
CV6+fInjx4/j4MGDmD17dknnJCIiIiJSCaUK8pAhQwAAv/zyC3bs2AGJRAKZTIZKlSph9uzZ6NWrV4mGJCIiIiJSFaUKMvC+JA8ePBj37t1DamoqjI2NYW1t/Z+fWCqVokePHpg6dSqaNm1a6D4zZszAb7/9prDN398f33333X9+XiIiIiKij1G6IAPvLylta2v72U+alZWFiRMnIi4u7qP7xcXFwc/PD19//bWwzcDA4LOfn4iIiIioKP+qIBeH+Ph4TJw4ETKZ7JP73rt3D46OjjA3N1dBMiIiIiIiES41feXKFTRq1KjA1IkPJSUl4fXr1581jYOIiIiI6N8qcgT52bNnqFKlSrE/Yb9+/ZTaLz4+HlpaWvj5559x7tw5mJiY4LvvvityPWYiIiIiouJQZEH29vbGmjVr4ObmBn9/f4waNQrVqlVTWbB79+4BAOzt7TFgwABcuXIFM2bMgL6+Pjp16vTRj83KykJsbOxH93FwcCi2rGXBp46nMnjM/53iOOZERET07xVZkKVSKaKiomBjY4P9+/eja9euMDQ0LPKBjI2NizVYv3790LlzZ+Fx7e3t8fDhQ+zcufOTBVlXV5dlrJjxeKoejzkREVHJKmowqsiC3KZNGyxevBhLliyBRCIR1kL+t0/wX0kkkgKl28bGBhcuXCjW5yEiIiIikldkQV6wYAE6d+6M169fw9/fH76+vqhevbrKgi1YsAD379/H+vXrhW2xsbGwsbFRWQYiIiIiKnuKLMhaWlpo3bo1gPcrT/To0aPE5yCnpKRAV1cX5cuXR+vWrbF161b8+uuvaNWqFc6dO4cDBw5gy5YtJZqBiIiIiMo2pdZBDgwMBACcOXMGV65cQVpaGkxMTODq6ooWLVpAIpEUS5hevXqhe/fuGDNmDBo1aoSlS5dizZo1WLx4MapVq4Zly5ahQYMGxfJcRERERESFUaogZ2RkYMSIEbh69SqMjIxgamqK5ORkrF+/Hq6urvjll19Qrly5f/3kd+7cUbh96tQphdteXl7w8vL6149LRERERPRfKXWhkKVLl+Lu3bv45ZdfEB4ejsOHD+Pq1avYsGED7t27h6CgoJLOSURERESkEkoV5CNHjmDChAnw8PBQ2N68eXOMHz8eR44cKZFwRERERESqplRBfvfuHSwtLQu9z9LSEm/evCnWUEREREREYlGqIDs4OODAgQOF3hcaGoratWsXZyYiIiIiItEodZLeuHHjMHDgQDx9+hQdOnRAxYoVkZycjCNHjiAmJgarV68u6ZxERERERCqhVEF2d3fH2rVr8fPPP2PhwoWQyWSQSCRwcHDA6tWrhfWSiYiIiIhKO6UKMgC0bNkSLVu2REZGBt6+fQtDQ8P/tLQbEREREdGXTOmCnK9cuXIsxkRERESktpQ6SY+IiIiIqKxgQSYiIiIiksOCTERERGrh8uXLmDBhAi5fvix2FCrllCrI/v7+ePz4MQBAJpPB398fT58+LdFgRERERP/Gli1bEB0djS1btogdhUq5Iguyl5cXpk2bhr179+LAgQN49eoVACAvL0/hNhEREZU9stxssSMUkJGRofD3l+JLPFb0cUWuYjFmzBhER0dj7969kMlk8PHxgb29PRwdHQEAT548gb29PTQ1NVUWloiIiL4MEk1tJP3hJ3YMBbnpycLfX1I28y6LxI5A/1KRBblTp07o1KkTAMDe3h4zZsxATk4OoqKiIJPJMG7cOGhra8PGxgZ2dnZYuHChykITERERfUhfR1Phb6L/qsiCHBMTA0dHR2hovJ+FUatWLTg5OaFHjx4IDQ3Fjh07UK5cOcTFxSEuLk5lgYmIiIgK07uJJf6IeI4ubpXFjlJmXL58Gbt378Y333yDxo0bix2n2BRZkAcNGoTc3FzUq1cPEokEly5dgpGREaysrAAAurq6sLOzg52dncrCEhERERXF1cYIrjZGYscoU7Zs2YK4uDhkZGSUjYJ87do13L17F9evX0dERAT2
7NmDlStXoly5cpBIJNi2bRsaNWqEOnXqoFatWtDR0VFlbiIiIiIS2Zd6YuTnKnIVC4lEAjs7O/Tt2xcAsGzZMkRERGDNmjWQyWRISUnB9u3b0b9/f7i4uKgsMBEREVFZlZvHFTGU8bnHqcgRZHmWlpbQ0dGBrq4uXF1dYWlpiUmTJqF27dqQyWR49OjRZ4UgIiIiok/T1NDGgSh/sWMIpJJXwt9fUi7v+oGf9fFKFeRTp04J/9bQ0FC4LZFIUKNGjc8KQURERESlT6MOVrh+5hlcWlURO0qxUqogExERERF9yLquMazrGosdo9gpdalpIiIiIqKyggWZiIiIiEgOCzIRERERkRwWZCIiIiIiOSzIRERERERyWJCJiIiIiOSwIBMRERERyWFBJiIiIiKSw4JMRERERCRHtIIslUrRpUsX/PXXX0Xu8+TJEwwePBj169dHp06dcPbsWRUmJCIiIqKySJSCnJWVhQkTJiAuLq7IfWQyGUaNGgVjY2Ps3bsX3bt3x9ixY/H48WMVJiUiIiKiskZL1U8YHx+PiRMnQiaTfXS/y5cv4/79+wgJCYGBgQFq1aqFv/76C3v37sX48eNVlJaIiIiIyhqVjyBfuXIFjRo1wm+//fbR/aKjo1G3bl0YGBgI29zc3BAVFVXCCYmIiIioLFP5CHK/fv2U2i8pKQmVKlVS2GZmZobnz59/8mOzsrIQGxv70X0cHByUykHvfep4KoPH/N8pjmMOALa2ttDR0SmWx1JnUqkUCQkJxfJYtaytoa2nVyyPpc6y371D/P37xfJY1ja20NPl17ky3mVJcf/e53+t83v6v1Nc39N53JX3Ocdc5QVZWZmZmdDW1lbYpqOjg+zs7E9+rK6uLr+AihmPp+oV5zE/d7p4vjGrsxatHYr1mCcFzS22x1JX5uOnFesxX3zgSrE9ljr70bshv6eLgMdc9ZQ55kWV6C92mTddXd0CZVgqlUKPozJEREREVIK+2IJsYWGBpKQkhW3JyckwNzcXKRERERERlQVfbEF2dnbG7du3kZGRIWyLiIhA/fr1xQtFRERERGrviyrIKSkpSE9PBwA0bNgQlpaWmDJlCuLi4hAcHIzo6Gj07t1b5JREREREpM6+qILcq1cvbNq0CQCgqamJNWvWICUlBT169MDvv/+OVatWwcrKSuSURERERKTORF3F4s6dOwq3T506pXC7Ro0a2L59uyojEREREVEZ90WNIBMRERERiY0FmYiIiIhIDgsyEREREZEcFmQiIiIiIjksyEREREREcliQiYiIiIjksCATEREREclhQSYiIiIiksOCTEREREQkhwWZiIiIiEgOCzIRERERkRwWZCIiIiIiOSzIRERERERyWJCJiIiIiOSwIBMRERERyWFBJiIiIiKSw4JMRERERCSHBZmIiIiISA4LMhERERGRHBZkIiIiIiI5LMhERERERHJYkImIiIiI5LAgExERERHJYUEmIiIiIpLDgkxEREREJIcFmYiIiIhIDgsyEREREZEcFmQiIiIiIjksyEREREREcliQiYiIiIjksCATEREREclReUGWSqWYPn063N3d0axZM2zYsKHIfYcMGQI7OzuFPydOnFBhWiIiIiIqa7RU/YSLFi1CVFQUNm/ejOfPn8PPzw+Wlpbo3LlzgX3j4uIQFBQEd3d3YZuRkZEq4xIRERFRGaPSgpyRkYHdu3dj3bp1cHR0hKOjI4YOHYrt27cXKMhpaWlITEyEk5MTzM3NVRmTiIiIiMowlU6xuH37NqRSKdzc3IRtbm5uuHHjBnJzcxX2jY+Ph66uLiwtLVUZkYiIiIjKOJWOICclJcHIyAi6urrCtooVKyI7OxsvX75EpUqVhO3x8fGoUKECxo8fj4iICFSuXBljxoxBy5YtP/k8WVlZiI2N/eg+Dg4O//0TKYM+dTyVwWP+7xTHMQd43P8NHnPV4zEXB7+nqx6/1lXvc465SgtyZmYmdHR0FLbl35ZKpQrb
ExISkJ6eDk9PT/j6+uL48eMYOXIkdu3aBWdn548+j66uLr+AihmPp+rxmKsej7nq8ZiLg8dd9XjMVU+ZY15UiVZpQdbV1S1QhPNv6+vrK2yfNGkSfH19UaFCBQCAvb09bt68qVRBJiIiIiL6r1Q6B9nCwgKpqakKJTkpKQk6OjoFVqfQ1NQUynE+GxsbvHjxQiVZiYiIiKhsUmlBdnBwgLa2Nq5fvy5si4iIQL169aClpTiYPXbsWAQEBChsi42NhbW1tSqiEhEREVEZpdKCrK+vD29vb8yaNQsxMTE4efIkNm3aBB8fHwDvR5PfvXsHAPD09MS+fftw6NAhPHjwACtWrEBERISwLxERERFRSVD5hUL8/f0REBCAgQMHonz58hg9ejS8vLwAAB4eHggMDESPHj3g7e2NtLQ0rFixAs+fP0edOnWwceNGVK9eXdWRiYiIiKgMUXlB1tfXx8KFC7Fw4cIC9925c0fh9rfffotvv/1WVdGIiIiIiFQ7xYKIiIiI6EvHgkxEREREJIcFmYiIiIhIDgsyEREREZEcFmQiIiIiIjksyEREREREcliQiYiIiIjksCATEREREclhQSYiIiIiksOCTEREREQkhwWZiIiIiEgOCzIRERERkRwWZCIiIiIiOSzIRERERERyWJCJiIiIiOSwIBMRERERyWFBJiIiIiKSw4JMRERERCSHBZmIiIiISA4LMhERERGRHBZkIiIiIiI5LMhERERERHJYkImIiIiI5LAgExERERHJYUEmIiIiIpLDgkxEREREJIcFmYiIiIhIDgsyEREREZEcFmQiIiIiIjksyEREREREcliQiYiIiIjkqLwgS6VSTJ8+He7u7mjWrBk2bNhQ5L63b99Gnz594OzsjB49eiAmJkaFSYmIiIioLFJ5QV60aBGioqKwefNmzJo1C2vXrsWff/5ZYL+MjAwMHToUzs7OCA0NhZubG0aMGIG0tDRVRyYiIiKiMkSlBTkjIwO7d++Gv78/HB0d0bZtWwwdOhTbt28vsG9YWBi0tbUxZcoU2NraYurUqTA0NMThw4dVGZmIiIiIyhiVFuTbt29DKpXCzc1N2Obm5oYbN24gNzdXYd/o6Gi4urpCQ+N9RIlEAldXV1y/fl2VkYmIiIiojNFS5ZMlJSXByMgIurq6wraKFSsiOzsbL1++RKVKlRT2tba2Vvh4MzMz3L59+5PPk5WVhdjY2E/u18rA/F+kL7uUOZbKsyzGx1JnxXnMAfPKxfpwaql4v84BdOxZvI+nhpKL+Zh3sTMs1sdTV8X6tW47qPgeS40V99e6na5PsT6eOlL26zwrK6vQ7SotyJmZmdDR0VHYln9bKpUqte+H+xWmfv36nxeUiIiIiMoslU6x0NXVLVBw82/r6+srta+enl7JhiQiIiKiMk2lBdnCwgKpqakKxTcpKQk6OjowMjIqsG9SUpLCtuTkZJibc1oEEREREZUclRZkBwcHaGtrK5xoFxERgXr16kFLS3G2h7OzM65fvw6ZTAYAkMlkiIyM5PQJIiIiIipRKi3I+vr68Pb2xqxZsxATE4OTJ09i06ZN8PF5P9k8KSkJ7969AwB07NgRGRkZmDNnDuLj4xEYGIj09HR4eXmpMjIRERERlTESWf4QrYpkZmYiICAAx44dQ/ny5TF48GAMHjwYAGBnZ4fAwED06NEDABATE4OZM2ciPj4ednZ2CAgIgKOjoyrjEhEREVEZo/KCTERERET0JVP5paaJiIiIiL5kLMhERGrm6dOnKOzNwdzcXNy8eVOEREREpQunWBARqRkHBwdcvHgRpqamCtsfPHiAbt26ITo6WqRkRMUvNTUVjx49KvRCYq6uriIkInWg0ivpkaLk5GRs3LgR9+/fL/Q/9qZNm0RIRUSl0W+//Ya1a9cCeL8spre3NzQ0FN8kfPv2LWrXri1GPLVlb28PiUSi1L7FfjlzQkhICBYsWIDs7OwC90kkEh7zYpSXl6f0vh9+7ymN
WJBFNH78eDx58gRt2rThFQJV7Pbt29i+fTsePnyIJUuW4Pjx46hevTpatGghdjS19Pr1a2zcuBE3btxATk5Ogbf/Q0JCREqmPnr06AFdXV3k5eVh6tSpGDp0KAwNDYX7JRIJypUrh8aNG4uYUv1s3rxZ+PfNmzexZcsW+Pr6wtHREVpaWrh16xbWrl2LAQMGiJhSfa1evRpDhw7FoEGD+HO0hNWtW7dM/TLIgiyiGzduYOfOnXBwcBA7Sply/vx5jBkzBp06dUJ0dDSkUilSUlKwYMECBAYGomvXrmJHVDt+fn64efMmunbtCgMDA7HjqCVtbW14e3sDAKysrODq6lrgAkxU/Jo0aSL8e/bs2Vi4cCGaNWsmbKtXrx6qVauGn376CYMGDRIjolqTSCTo2rUrKlSoIHYUtffrr7+KHUGl+N1TRE5OTnjy5AkLsooFBQXB398fffr0wZEjRwAAY8eOhbm5OdasWcOCXAIuX76M7du3w8nJSewoZULDhg1x/vz5Ikfsx40bJ1Iy9ZaYmAgzM7MC2/X19fHmzRsREqk/X19fLF26FD/99BMsLS3FjqPWGjZsqNR+z58/L+EkqsGCLKL58+ejX79+OHnyJCwtLQu8dfH999+LlEy93bt3D02bNi2wvVmzZggMDBQhkfqrXLmy0m/N0eebN28eQkJCYG9vj/Llyyvcx9eh5LRq1QpTp07F1KlTYWdnB5lMhhs3bmDevHno1KmT2PHUkq2tLVasWIE2bdoUer86vNX/JUpISMCiRYsQFxcnzE2WyWSQSqV4/fq1Whx3FmQRLV++HCkpKbhz5w4ePnyocJ9EImFBLiFWVlaIiopCtWrVFLafOnWqwDYqHpMmTcKsWbMwZswYWFlZQUdHR+F+HvfitX//fixYsABff/212FHKlNmzZ2PmzJkYOHCgUBo0NTXRrVs3TJs2TeR06mn69Olo3LgxunfvzjnIKjR9+nTk5eVhxIgRmD9/Pvz8/PDkyRPs2LFDbQaauMybiOrXr49169bxpBkVO3HiBPz8/NCrVy/s2rULgwcPxuPHj3H06FEsXryYIz0lwN7evsA2iUQCmUzGM81LQJMmTbBz507UrFlT7ChlUlpaGu7fvw8AsLa25rz7EuTi4oKDBw/yl2wVc3Jywm+//QYHBwf873//w9ixY9GkSRPs2bMH+/fvx44dO8SO+Nk4giwiS0tLlCtXTuwYZU7btm2xY8cObNq0CbVr18aZM2dgbW2NkJAQODs7ix1PLZ08eVLsCGXKt99+i5UrV2L27NkFplhQyUpKSsL9+/eRm5sL4P3J2FKpFLdu3YKvr6/I6dRPz549sW/fPvzwww9iRylTtLS0hFVybGxsEBsbiyZNmqBp06ZYuHChyOmKBwuyiL7//ntMnjwZAwcOhJWVFTQ1NRXulz87mopX9erVMWHCBFSuXBkAcOHCBdja2oqcSn1VrVoVMpkM586dw71795CbmwsbGxt4eHgUmG5Bn+/ixYuIiYnB4cOHYWJiAm1tbYX7z5w5I04wNbdz507MmzcPOTk5wjskwPt3S5ydnVmQS8CrV6/w22+/Ye/evYX+HOUSkiXDzc0NGzduhJ+fHxwdHXHo0CF89913iI6Ohq6urtjxigWnWIiosLed8/Ft55ITFRWFESNGoHfv3pg0aRIAoHPnznjz5g02bNjAVUVKwNOnT+Hr64tHjx7B2toaubm5ePjwISpXroytW7fCwsJC7IhqZf/+/R+9v3v37ipKUrZ4enqiR48eGD58ODw9PbFnzx6kp6fDz88PnTp1wrBhw8SOqHZWrVr10ft5Lk/JSEhIwKhRo9CnTx/07dsXPXv2RGJiIt69e4dRo0apxXFnQaYy55tvvkHDhg0xceJEhTP6Fy9ejOvXr6vF3KkvzciRI5Gbm4slS5bAyMgIwPuRHz8/P+jr62PFihUiJ1RPeXl5ePLkCapUqYK8vDyO1pcwR0dHHDlyBFZWVhgxYgS6desG
Ly8vXLt2DVOnTsWxY8fEjkhUrDIzM6Gvr4+MjAxcuXIFxsbGqF+/vtixigWnWKjY48ePYWVlBYlEgsePH390X550UDLu3r2LpUuXFljuqk+fPnw7roSEh4dj9+7dQjkGABMTE0yaNAn9+vUTMZl6ysnJwdKlS7F9+3bk5ubi6NGjWLJkCbS0tDBnzhye+1BCzMzMkJKSAisrK2FeppeXFywsLPDixQux46kNPz8/zJgxAwYGBvDz8/vovosWLVJRqrKlTZs22LdvH4yNjQEA5cqVQ6tWrZCYmIgmTZrg0qVL4gYsBizIKtauXTtcvHgRZmZmaNeuncI8NYBn9quClZUVLl68iL59+ypsDw8Ph7m5uUip1JuRkRFev35dYPvr168LzI+lz/fzzz/jwoUL2LRpE4YPHw4AGDBgAKZPn44FCxZg9uzZIidUT15eXpg8eTLmzp2L5s2bY9KkSXBwcMDp06dRo0YNseOpDfl5xh/OOaaSExYWJpy/8OTJE8ycObPAfOOnT5+qzRU81eOzKEVOnjwJU1NT4d+ker6+vpg8eTIiIyNRr149AO8Xkw8LC2NxKCFdunTBtGnTMH36dOFqetHR0Zg7dy66dOkicjr18+eff2Lx4sVwc3MTtjVo0ADz58/HqFGj+HVeQiZOnIgKFSrg9evXaNOmDXr37o3Zs2fD2NhYbdaG/RLIH0seV9Vp3Lgxzp8/L9zW0NAo8AuKvb09Jk+erOpoJYJzkL8QH16J5tatWwo/3Kh4Xbp0Cbt27UJCQgK0tbVRo0YN+Pj4wNXVVexoakkqlWLGjBk4ePCg8I6JpqYm+vbtix9//FFtznr+Uri4uGD//v2oWbOmwjqxt2/fxv/+9z9cv35d7IhExSIvLw/Hjh1DfHy8sLRe/s/R2NhYbN68WeSE6mnVqlUYPHhwodO1UlJShIHA0owFWUTXrl1DQEAAEhISCtynpaWFGzduiJCKqOSkpqbiwYMH0NXVRfXq1aGvry92JLU0atQomJiYYO7cuXB1dcXBgwdhbGyMiRMnQlNTE2vXrhU7oto6c+YMtmzZgkePHmHbtm3Ys2cPqlSpgj59+ogdTS0FBAQgNDQUdevWRUxMDFxcXPDo0SMkJyejf//+vIJhCXFwcMDFixcLFOF//vkHXbt2VYtfwjnFQkTz5s1D9erVMXnyZIwbNw6LFi1CYmIiVq1ahenTp4sdT21JpVKEhobixo0byMnJwYe/I/KkjuJx6dIluLu7Q0tLq8AJG+np6UhJSRFuc83v4jVz5kyMHj0aTZo0QVZWFkaMGIFnz57BysoK69atEzue2vr9998xb948+Pj44Pr168jLy4O5uTkWLFiAzMxMfPfdd2JHVDtHjhzBkiVL0L59e3Ts2BEBAQGwsbHB5MmTkZmZKXY8tbJ//37s3bsXwPtRel9f3wLzjZOSklCpUiUx4hU7FmQRxcfHY8mSJbC1tUW9evWgra2N/v37w8zMDMHBwfDy8hI7olqaOnUqTpw4gebNm/MSsCVo0KBBwgmpgwYNKnI/npBa/CwsLLB3715cunQJ9+7dQ05ODqytreHh4QENDQ2x46mtX375BbNmzUKnTp2wceNGABC+py9atIgFuQSkpaXhq6++AgDUqVMH0dHRqF27NkaMGIHBgweLnE69dOjQAU+ePAEAREREwNXVtcCVOsuXL4/27duLEa/YsSCLSF9fX/hhZWNjg9u3b6Nly5ZwcnLC/fv3RU6nvk6ePInVq1ejWbNmYkdRa7dv3y7036Q69vb2sLGxEW4nJSUBAC/MUkIePXoER0fHAtsdHByQnJwsQiL1V716ddy8eRNVqlRB7dq1ERMTg169eiEvLw9paWlix1Mr5cqVEy4AUrVqVXTu3Fmt11ZnQRZR48aNsXTpUkyfPh0uLi7YtGkTevXqhePHjwtrC1Lxq1ChAguCin24Zma+xMREeHt7q8WamV+S48ePY8aMGQWW1uMSkiWrTp06OHv2LL79
9luF7fv27YOdnZ1IqdTbkCFDMHHiRMyfPx9eXl7o3r07JBIJoqKieKJ7CcrNzcXBgweLvL9Xr14qTFMyeJKeiJKSkuDn5wdPT0/07dsXgwYNwrVr16CpqYmAgAD07t1b7Ihqaffu3Th8+DB++uknVKtWrcA6vHwLunjIr5l58OBBdOjQodA1Mx8+fKiwdBB9vlatWqFZs2b49ttvC10hRH5UmYrPtWvXMGLECDRu3Bhnz55F165d8eDBA8TGxmLdunVo3Lix2BHVUkREBPT09FCvXj2cP38ee/bsgbGxMcaMGcO17UuIp6enwu3c3Fy8fPkSWlpacHV1xaZNm0RKVnxYkL8gMpkM8fHxHOEsYS1btsTLly+FJYE+xNG14pGSkoLFixcDeH9yR6dOnaCnp6ewT/ny5dGtWzdhDiEVD3d3d+zevRvW1tZiRylzkpOTERISgoSEBOTm5sLa2hr9+vWDpaWl2NGISlRGRgZmzpwJW1tbjBw5Uuw4n40FWcX+zVvJPLO/ZFy5cuWj9zds2FBFScqOVatWYciQIVzWTUWCgoLw5s0b/PTTT7xS4RcgPT0dDx48EC5MRJ+nX79+kEgkSu0bEhJSwmlI3sOHD9G3b1+1mDbHOcgq9rGz+eVxnmDJyS/AiYmJuH//PurXr4+0tDRUrFhR5GTqRX6ZNzc3N0RFRRW5L38ZLF6dOnWCj48PQkNDUbFixQJlglfxVK3IyEgMHz6c39OLSdOmTcWOQEW4deuWcOGz0o4FWcXyz+aPiYlBnTp1FN5yPnHiBMzMzODi4iJWvDIhPT0d/v7+OHbsGDQ0NHD06FHMnz8fr169wurVq2FmZiZ2RLXAZd7E8+OPP8LGxgZdunThVQpJ7eSvpJAvIyMDqampqFy5MgDgwoULqF+/PpfxLEGFjeKnp6cjLi5O6YHALx0Lsorl5ubC398fhw4dwtatWxXezv/zzz9x5MgR9OzZE7NmzSpwjXMqHgsXLsSrV69w8uRJdOnSBQAwZcoUTJ48GXPnzkVQUJDICdUDl3kTz+PHj3Hw4EFUr15d7ChEJSoqKgojRoxA7969MWnSJABAYGAg3rx5gw0bNsDBwUHkhOqpsFF8HR0dfPXVV2rzjiALsopt3LgRly9fxq+//gp3d3eF+4KCgtCnTx+MHz8etWrV4qLyJeTUqVMIDg5G1apVhW01atRAQEAAfHx8REym3u7fv4+KFSvC0NAQf/31F44fPw5HR0f07NlT7Ghqx9PTExcuXEC/fv3EjkJUoubPn4/evXtj4sSJwrY///wTixcvxpw5c7Bjxw4R06mvD0fx1RELsort378f06dPL1CO8zVu3Bh+fn7YuHEjC3IJeffuXaEnLkml0gKXnabiERoaiunTp2PTpk0wMjLCyJEj4ebmhqNHj+LZs2dl4putKllYWGDBggU4cOAAqlWrVuDdKF5OvfgoczISpxCVnLt372Lp0qUF3u7v06cPT9ArYXv37sWuXbtw7949aGtrw8bGBkOGDEHbtm3FjlYsWJBV7NmzZ6hbt+5H92nQoAFmzZqlokRlT5s2bbB06VKFkvDgwQPMmTMHrVq1Ei+YGlu/fj3mzp2LRo0aITAwELVq1cLmzZtx+fJlTJ48mQW5mL1+/RqdO3cWO0aZ8G9OvKbiZ2VlhYsXL6Jv374K28PDw7kGcglatmwZdu3aBR8fH4wePRp5eXmIiYmBn58fxo4dqxYDfCzIKlaxYkX8888/Cm/vf+jp06cwMTFRYaqyZfr06fD390ejRo0gk8ng7e2NjIwMeHh44KeffhI7nlp69uyZcJGE06dP4+uvvwbw/ofbmzdvxIymlgIDA8WOUGZwfr24fH19MXnyZERGRgrL6MXGxiIsLAyzZ88WOZ362rNnDxYuXIjWrVsL29q0aQMHBwfMnTuXBZn+vXbt2mHlypVwdXUt9G3+7OxsrFq1Ci1atBAhXdlgYGCAlStX4vHjx0hISEBOTg6sra1ha2sr
djS1Va1aNVy8eBGVKlXCo0eP0KZNGwDAgQMHeFW3EnLmzBls3boVDx8+xLZt27Bnzx5UqVIFffr0ETsaUbHp3LkzTE1NsWvXLuzZswfa2tqoUaMGtmzZAldXV7HjqbUqVaoU2FajRg1IpVIR0hQ/FmQVGzVqFHr16oUePXpgwIABcHR0hKGhId68eYOYmBiEhIQgKysLy5YtEzuqWnn69GmBbZqamqhTp06BfXjFq+I3ZswYTJo0Cbm5ucIow8KFC7Fr1y6sWrVK7Hhq5/fff8e8efPg4+ODyMhI5OXlwdzcHAsWLEBmZqZajO4QAcBPP/2E4cOH4+effxY7itqTX9/Y19cXM2fOxLx581CrVi0A71fPmTdvnlpcRQ/glfRE8ebNGyxevBhhYWHIzMwE8P4y00ZGRujSpQtGjx4NU1NTkVOqF3t7e6XnAPKEmpKRkpKCxMREYdmle/fuoUKFCrxASwno2rUrRo0ahU6dOsHFxQUHDx5EtWrVcOTIESxatAinTp0SOyJRsWjYsCFCQ0NhZWUldhS19+HPUZlMBolEAl1dXUgkErx79w4SiQRGRka8kh79N0ZGRpg7dy5mzJiBx48fIzU1FSYmJqhevTo0NDTEjqeWjh07Jvz7/Pnz2LZtG6ZMmQJHR0doaWnh1q1bWLhwId9+LkF6enq4fv069u3bh9zcXFhbWwvrUFPxevToERwdHQtsd3BwQHJysgiJiErGd999h5kzZ8LHxwdVq1YtcGGcatWqiZRM/fz6669iR1ApjiBTmdO6dWssX74czs7OCttjYmIwatQoXLhwQaRk6uv27dsYOnQotLW14ejoiNzcXNy8eRNZWVnYtm0bateuLXZEtdK7d29069YN3377rcII8rJly3Dp0iXs2bNH7IhExcLe3l7hdv4IZ/7oJt8RLFmJiYm4f/8+6tevj7S0NLV6R5AjyFTmpKWlIScnp9Dt2dnZIiRSf/PmzUPz5s0xZ84caGm9/7aTk5OD6dOnY/78+di8ebPICdXL5MmTMWLECFy6dAnZ2dlYs2YNHjx4gNjYWKxbt07seETF5uTJk2JHKJPS09Ph7++PY8eOQUNDA0ePHsX8+fPx6tUrrF69GmZmZmJH/Gx8P5/KnK5du8LPzw8HDhzA7du3ERsbi927d2Py5MkF1tKk4hETE4Nhw4YJ5RgAtLS0MGzYMERFRYkXTE01aNAAR48eRZ06deDp6YnU1FS4ubkhLCxMWG6PSB1UrVoVVatWxfPnz3H58mUYGRkhPT0d5ubmH11OlT7PwoUL8erVK5w8eVKY1jJlyhQAwNy5c8WMVmw4gkxlztSpU2FgYIBFixYhJSUFwPv1qfv16wdfX1+R06mn/OXdPlzS7cGDBzAwMBAplfpatWoVhgwZgnHjxilsT0tLw4IFC4QfZESl3cuXLzFixAjEx8dDKpWiYcOGCAoKwt27d7Fp0ybUqFFD7Ihq6dSpUwgODlb4JaRGjRoICAiAj4+PiMmKDwsylTlaWlqYMGECJkyYIBRkrhpSsvr06YNp06ZhzJgxcHJyAgBER0dj1apVHLUvJvHx8UhKSgIArF69GnXq1IGhoWGBfXbt2sWCTGpjzpw5qFq1KrZv344mTZoAABYvXgw/Pz/MnTsXGzZsEDmhenr37l2h13KQSqVQl1PbWJCpTNi7dy++/vpr6OjoYO/evR/dt1evXipKVXYMGTIEmZmZWLZsmXDlPHNzcwwZMoRr8haTly9fKlz2eOzYsQX2KVeuHAYPHqzKWEQl6tKlSwgJCYGenp6wzcDAABMnTsQ333wjYjL11qZNGyxduhSLFi0Stj148ABz5sxBy5YtRUxWfFiQqUxYs2YN2rRpAx0dHaxZs6bI/SQSCQtyMfr9999x/PhxaGtro02bNggPD8fLly+hq6vLqRXFrFGjRsJljz09PbF3716+M0JqT0NDQ7iegLykpKQCS75R8Zk+fTr8/f3RqFEjyGQyeHt7IyMjAx4e
Hpg2bZrY8YoFl3mjMicvL4/rTatAcHAwfv75ZzRp0gRaWlq4ePEiBg0ahAkTJogdrUx7/vw5KleuLHYMomIxb948xMTEYNasWejXrx9CQkKQkpKC2bNnw8PDA9OnTxc7otpJS0uDpqYm9PX18fjxYyQkJCAnJwfW1tbC+T1Lly4VO+ZnY0GmMsfDwwMdO3ZE586d4eLiInYctdW2bVt8//338Pb2BvD+Yi3+/v64du2a0lc1pP8mISEBixYtQlxcnHB5WJlMBqlUitevX3NtWFIbUqkUy5YtQ0hIiLBMp5aWFnr16oUpU6YoTL2gz/P8+XNMmTIF4eHhAIAWLVpg0aJFMDIyQm5uLjZv3oy1a9dCS0tL2Kc0Y0GmMicsLAxHjhzBuXPnYGpqKpTlevXqiR1NrTg6OuLkyZOwsLAA8H7dYycnJ5w+fVrYRiWjX79+yMvLQ/fu3TF//nz4+fnhyZMn2LFjBwICAoRfWojUxbt37/D48WPk5uaiWrVqKF++PFJSUjjNqBiNGjUKcXFxGDt2LLS1tREcHIw6derghx9+wKhRo3D79m306tUL48ePh4mJidhxPxvnIFOZ4+XlBS8vL2RmZuL06dM4evQofHx8ULFiRXTu3LnQk5vo38vJySmw7rGuri6kUqmIqcqGv//+G7/99hscHBxw4MAB2NjYoH///rC2tsbu3btZkEltODg44OLFizA1NVW4Iuc///yDrl274vr16yKmUy8RERFYvny5sFpIvXr14O3tjdu3b0Mmk+G3337DV199JXLK4sOCTGWWvr4+vLy8YGtrCxsbG2zduhVbt25lQaZST0tLS1jizcbGBrGxsWjSpAmaNm2KhQsXipyO6PPs379fWI1IJpPB19dX4Zdx4P1JepUqVRIjntpKTU2Fra2tcLtatWrIzs4WLmNf2LJvpRkLMpVJMTExOHbsGI4fP47ExES0aNEC8+fPR+vWrcWOplb++OMPlC9fXridl5eHw4cPF3jbkyuHFC83Nzds3LgRfn5+cHR0xKFDh/Ddd98hOjqaZ/ZTqdehQwc8efIEwPtRTVdXV4XvMwBQvnx5tG/fXox4aksmk0FTU1Nhm6amJkaPHq125RjgHGQqg1q1aoXk5GQ0adIEnTt3Rtu2bbnkWAnw9PRUaj+JRIKTJ0+WcJqyJSEhAaNGjUKfPn3Qt29f9OzZE4mJiXj37h1GjRqF77//XuyIRMVi//798PLy4i9+KmBvb4+LFy/CzMxM2Obi4oKDBw+iWrVqIiYrGSzIVCacPHkSLVq0gLa2Nnbt2oUOHToUOIkgPT0dq1evhp+fn0gpiYpXZmYm9PX1kZGRgStXrsDY2Bj169cXOxZRsUpISMDff/+NnJycAldx47tTxcfe3h7+/v4Ko/Vz5szB6NGj1fJdQRZkKhMcHBxw4cIFhd98W7VqhZCQEOFa8snJyWjevDmXwKJSr02bNti3bx+MjY0VticmJsLb2xuXLl0SJxhRMQsODsayZctgZGRUYJoF350qXmXtXUHOQaYyobDfA9+8eSOsEUtU2oWFheHMmTMAgCdPnmDmzJkF3nZ++vRpgZOZiEqzzZs348cff8SQIUPEjqL2Tp06JXYEleLlxIiI1EDjxo2hqakpnESjoaEh3M7/Y29v/9FLrROVNtnZ2TwZj0oEhxKIiNSAqakpAgMDAQBVq1bF4MGDUa5cOZFTEZWsbt26ISQkBJMnT+YVOqlYsSATEamRly9fYuTIkcJUitu3b+PSpUswNTVFhw4deOldUiuvXr3CsWPHcOjQIVStWrXAcmMhISEiJaPSjgWZyoxPrcmblpYmVjSiz5aRkQE/Pz+cPHkSf/75J2xsbPD7779j6tSpMDMzg56eHlavXo1t27bxUt+kNmxsbDBy5EixY5Aa4ioWVCYoe/YtUPZORCD1sGjRIpw6dQoBAQFo2LAhpFIpmjdvjpo1ayIkJAQ6OjqYNm0apFIpFi1aJHZcIqIvGkeQqUxg
6SV1d+zYMcyaNQuNGzcGAFy8eBFv377Ft99+Cx0dHQDv1yYdNWqUmDGJitWn1q3nL4P0X3EVCyIiNZCYmAhra2vh9qVLl6ChoYHmzZsL28zNzTmViNTKhyu1yGQyPHr0CEePHkXlypXFjkelGEeQiYjUQMWKFZGYmAhLS0sAwPnz51GvXj2FK1zdunWL849JreSv3PKhzZs349atWypOQ+qEI8hERGqgY8eOWLp0KW7duoUNGzbg4cOH6Nmzp3B/YmIigoKC/tV8fKLSql27djhx4oTYMagUY0EmIlIDY8aMgampKXr27ImgoCD07NkTffv2BQCsWrUKbdq0gb6+Pr7//nuRkxIVn7y8vAJ/3r59i127dsHExETseFSKcRULIiI1kj/H2MDAQNh25coVvHnzBq1bt+alpkmt2NvbF3qBEF1dXcydOxddunQRIRWpAxZkIiIiKpWuXLmicFsikUBbWxu1atVS+CWR6N9iQSYiIqJSLSEhAQkJCcjNzYW1tTXs7e3FjkSlHN9rIyIiolLpzZs3mDx5Ms6cOQMjIyPk5uYiPT0dDRo0wJo1a2BoaCh2RCqleJIeERERlUpz5sxBUlISwsLCEB4ejmvXruHQoUPIzMwscgk4ImVwigURERGVSm5ubti6dSscHR0VtsfExGDYsGEIDw8XKRmVdpxiQUSkZoo6sx8AtLW1YW5ujk6dOmHcuHHQ1tZWcTqi4qOnp1fodolEgtzcXBWnIXXCKRZERGomICAAZmZmCAgIwIEDB7B//37MmTMHlSpVwtChQ/Hjjz/i7NmzWL58udhRiT6Lp6cnZs+ejfv37wvb7t27hzlz5qB169YiJqPSjlMsiIjUTLt27TB9+nS0aNFCYftff/2FgIAAHDt2DNevX8eYMWNw4cIFkVISfb7U1FSMHj0aV69eFZZ1S09PR8uWLbFo0SJUqFBB5IRUWnGKBRGRmklOToaFhUWB7aampnjx4gUAwNzcHOnp6aqORlRsYmJiYGdnh23btuHOnTtISEiAVCqFlZUVGjRoIHY8KuU4xYKISM00a9YMs2bNwqNHj4Rtjx49wrx589C4cWPk5uZi7969qFOnjogpif6bnJwc/Pjjj+jTpw+io6MBAHZ2dvDy8sLZs2cxYMAATJs2jXOQ6bNwigURkZp5/fo1xo8fj0uXLsHQ0BAymQzp6enw8PDA/PnzcePGDUydOhVr1qyBq6ur2HGJ/pXg4GBs374dS5cuhbu7e4H7L1++jPHjx2PEiBH47rvvVB+Q1AILMhGRmrp//z7u3r0LTU1N1KpVCzVr1gQAvHv3Drq6ukWudEH0JevUqRMmTJiAdu3aFbnP/v37sXHjRvzxxx8qTEbqhFMsiIjUkEwmg76+PurWrQs7Oztoamri8ePHePz4MfT09FiOqdR69uwZ6tat+9F9GjRogH/++UdFiUgd8SQ9IiI1c/bsWcyYMUM4IS+fTCaDRCJBbGysSMmIPl/FihXxzz//oGrVqkXu8/TpU5iYmKgwFakbFmQiIjUzb948uLi4wNfXV1j6ikhdtGvXDitXroSrq2uhF7rJzs7GqlWrCixzSPRvcA4yEZGacXZ2xh9//IFq1aqJHYWo2L19+xa9evWCjo4OBgwYAEdHRxgaGuLNmzeIiYlBSEgIsrKysHPnTpibm4sdl0opFmQiIjUzdOhQdOnSBd7e3mJHISoRb968weLFixEWFobMzEwA76cQGRkZoUuXLhg9ejRMTU1FTkmlGQsyEZGaWbNmDTZs2IDmzZujevXqBd6GHjdunEjJiIqXVCrF48ePkZqaChMTE1SvXh0aGlx/gD4f5yATEamZS5cuwdHREa9evcKrV68U7uPqFaROdHR0YGtrK3YMUkMcQSYiIiIiksMRZCIiNbB37158/fXX0NHRwd69e4vcTyKRoGfPnipMRkRU+nAEmYhIDXh6emLfvn0wMTGBp6dnkftJJBKcPHlShcmIiEofFmQiIiIiIjmc
YkFEpAauXr2q9L7u7u4lmISIqPTjCDIRkRqwt7dXuC2RSCCTyaCrqwtNTU1kZGRAU1MT5cuXx5UrV0RKSURUOnAEmYhIDdy8eVP4d2hoKPbt24c5c+agdu3aAICHDx9i+vTpaN26tVgRiYhKDY4gExGpmaZNm2LTpk0FRpXv3r2LAQMGIDw8XKRkRESlAy83Q0SkhhITEwtsu3fvHnR0dERIQ0RUunCKBRGRmunXrx/8/Pzg4+MDOzs7AMCNGzewfft2jB07VuR0RERfPk6xICJSQ7/99hv27NmDhIQEAEDt2rXRv39/dOvWTeRkRERfPhZkIiIiIiI5nGJBRKRmfv7554/eP27cOBUlISIqnViQiYjUzLVr1xRu5+bm4p9//kFqaio6deokUioiotKDBZmISM1s27at0O0LFy5ETk6OitMQEZU+nINMRFRGPH78GN27dy8wwkxERIq4DjIRURlx+vRp6OnpiR2DiOiLxykWRERqpmXLlpBIJArb0tPTkZaWhsmTJ4uUioio9OAUCyIiNRMaGqpQkCUSCbS1teHo6IgaNWqImIyIqHRgQSYiIiIiksMpFkREaqBfv34FplUUJSQkpITTEBGVbizIRERqoGnTpmJHICJSG5xiQUSkhjIyMpCamorKlSsDAC5cuID69evDwMBA5GRERF8+LvNGRKRmoqKi0Lp1a2zfvl3YFhgYiI4dOyI2NlbEZEREpQNHkImI1Mw333yDhg0bYuLEiQrzkhcvXozr169jx44dIqYjIvrycQSZiEjN3L17F3369Clw0l6fPn1w69YtkVIREZUeLMhERGrGysoKFy9eLLA9PDwc5ubmIiQiIipduIoFEZGa8fX1xeTJkxEZGYl69eoBAGJjYxEWFobZs2eLnI6I6MvHOchERGro0qVL2LVrFxISEqCtrY0aNWrAx8cHrq6uYkcjIvrisSATEREREcnhFAsiIjUjlUoRGhqKGzduICcnBx+OgyxatEikZEREpQMLMhGRmpk6dSpOnDiB5s2b88IgRET/AadYEBGpGRcXF6xatQrNmjUTOwoRUanEZd6IiNRMhQoVYGFhIXYMIqJSiwWZiEjNjB49GvPmzUN8fDyysrKQl5en8IeIiD6OUyyIiNRMy5Yt8fLlS+Tm5hZ6f2xsrIoTERGVLizIRERq5sqVKx+9v2HDhipKQkRUOrEgExGpqYSEBCQkJCA3NxfW1tawt7cXOxIRUanAZd6IiNTMmzdvMHnyZJw5cwZGRkbIzc1Feno6GjRogDVr1sDQ0FDsiEREXzSepEdEpGbmzJmDpKQkhIWFITw8HNeuXcOhQ4eQmZmJwMBAseMREX3xOMWCiEjNuLm5YevWrXB0dFTYHhMTg2HDhiE8PFykZEREpQNHkImI1Iyenl6h2yUSSZErWxAR0f9jQSYiUjOenp6YPXs27t+/L2y7d+8e5syZg9atW4uYjIiodOAUCyIiNZOamorRo0fj6tWrMDAwAACkp6ejZcuWWLRoESpUqCByQiKiLxsLMhGRmrpz5w4SEhKgp6cHa2trWFtbix2JiKhU4BQLIiI1EhMTg6ysLACAnZ0dvLy8IJPJ8ObNG5GTERGVHizIRERqICcnBz/++CP69OmD6Ohohfv++OMP/O9//8O0adN4kh4RkRJYkImI1MCmTZsQHh6OX3/9tcClpIOCgrB582acPHkS27ZtEykhEVHpwTnIRERqoFOnTpgwYQLatWtX5D779+/Hxo0b8ccff6gwGRFR6cMRZCIiNfDs2TPUrVv3o/s0aNAA//zzj4oSERGVXizIRERqoGLFip8sv0+fPoWJiYmKEhERlV4syEREaqBdu3ZYuXIlsrOzC70/Ozsbq1atQosWLVScjIio9OEcZCIiNfD27Vv06tULOjo6GDBgABwdHWFoaIg3b94gJiYGISEhyMrKws6dO2Fubi52XCKiLxoLMhGRmnjz5g0WL16MsLAwZGZmAgBkMhmMjIzQpUsX
jB49GqampiKnJCL68rEgExGpGalUisePHyM1NRUmJiaoXr06NDQ4o46ISFksyEREREREcjikQEREREQkhwWZiIiIiEgOCzIRERERkRwWZCIiIiIiOf8HqSlLO4+G+NEAAAAASUVORK5CYII=\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "#agg\n",
+ "df_dist = df.groupby(['id','discourse_type']).agg({'discourse_text':'count'}).reset_index()\n",
+ "\n",
+ "#plot\n",
+ "plt.figure(figsize=(10,7),tight_layout=True)\n",
+ "sns.barplot(x='discourse_type',y='discourse_text',\n",
+ " data=df_dist,estimator=np.mean,palette='Set3',\n",
+ " order=df['discourse_type'].value_counts().index)\n",
+ "\n",
+ "plt.xticks(rotation=90,size=14);\n",
+ "plt.yticks(size=14)\n",
+ "plt.xlabel(None)\n",
+ "plt.ylabel('# of discourse',size=16);\n",
+ "\n",
+ "plt.title('Average # of discourse type per essay',size=16,weight='bold',pad=20);"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1de81c03",
+ "metadata": {},
+ "source": [
+ "# First iteration on predictions\n",
+ "\n",
+ "These learnings are derived from model_v3; the analysis is performed on the test split."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7cd82c86",
+ "metadata": {},
+ "source": [
+ "__Key take away__\n",
+ "\n",
+ "- Length of each class is not fully captured by the model when looking at average length\n",
+ "- This is most likely driven by discourses predicted with a very short length, a kind of stammering from the model\n",
+ " - could we set a minimum length threshold per class to avoid this?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ca34385c",
+ "metadata": {},
+ "source": [
+ "## Load data from dataset \n",
+ "\n",
+ "Let's also load the original train.csv file, in addition to our re-processed dataset, for comparison. We will use it for **high-level** comparisons."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "0ab81d64",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T16:30:31.919643Z",
+ "start_time": "2022-02-15T16:30:14.446380Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# load data from pickles and csv files\n",
+ "with open('../raw_data/dataset_v3.pickle','rb') as file:\n",
+ " dataset = pickle.load(file)\n",
+ " \n",
+ "with open('../raw_data/preds_on_testsplit.pickle','rb') as file:\n",
+ " y_pred_proba = pickle.load(file)\n",
+ "\n",
+ " \n",
+ "df_essays = pd.read_csv('../raw_data/preprocessed_v3.csv',converters={'predictionstring':eval,\n",
+ " 'label':eval})\n",
+ "\n",
+ "df_raw = pd.read_csv('../raw_data/train.csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "f30d457f",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T16:30:32.142023Z",
+ "start_time": "2022-02-15T16:30:31.924577Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# get X_test,y_test\n",
+ "X_test = {\n",
+ " 'input_ids' : dataset['inputs']['input_ids'][idx_test],\n",
+ " 'attention_mask' : dataset['inputs']['attention_mask'][idx_test]\n",
+ "}\n",
+ "\n",
+ "y_test = dataset['labels'][idx_test]\n",
+ "ps_test = dataset['predictionstrings'][idx_test]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "34735cd3",
+ "metadata": {},
+ "source": [
+ "## Get preds"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "d1bbaebf",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T16:30:32.172066Z",
+ "start_time": "2022-02-15T16:30:32.143482Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def get_preds(y_pred,ps):\n",
+ " \"\"\"\n",
+ " Generate readable predictions from the output of the model.\n",
+ "\n",
+ " Args:\n",
+ " y_pred (ndarray): output of the model\n",
+ " ps (ndarray): predictionstring referring to the token predicted\n",
+ "\n",
+ " Returns:\n",
+ " DataFrame : DataFrame with class and predictionstrings\n",
+ " \"\"\"\n",
+ "\n",
+ "\n",
+ " labels = []\n",
+ " predictionstrings = []\n",
+ " counts = []\n",
+ " \n",
+ " counter=dict()\n",
+ " \n",
+ " for tok,pos in zip(y_pred,ps):\n",
+ " \n",
+ " if tok <= 13:\n",
+ " lab = reversed_mapping[tok]\n",
+ " labels.append(lab)\n",
+ " predictionstrings.append(pos)\n",
+ " if len(labels)<2:\n",
+ " counts.append(str(1))\n",
+ " counter.setdefault(lab,1)\n",
+ " continue\n",
+ " if lab == labels[-2]:\n",
+ " counts.append(str(counter[lab]))\n",
+ " else: \n",
+ " try:\n",
+ " counter[lab]+=1\n",
+ " except KeyError:\n",
+ " counter.setdefault(lab,1)\n",
+ " counts.append(str(counter[lab]))\n",
+ " \n",
+ " preds = pd.DataFrame([labels,counts,predictionstrings],index=['class','count','predictionstring']).T\n",
+ " preds['class'] += ' ' + preds['count'].astype(str)\n",
+ " preds = preds.groupby('class',sort=False).agg({'predictionstring':list}).reset_index()\n",
+ " preds['class']=preds['class'].apply(lambda txt : txt.split()[0])\n",
+ " preds['predictionstring']=preds['predictionstring'].apply(lambda l_ : [str(l) for l in l_])\n",
+ " preds['predictionstring']=preds['predictionstring'].apply(lambda l_ : ' '.join(l_))\n",
+ " \n",
+ " return preds"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "6df9a277",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T16:30:57.249410Z",
+ "start_time": "2022-02-15T16:30:32.174062Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "61c288c575dd46f3952b50263c899653",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/1560 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "##creating pred_df\n",
+ "y_pred = np.argmax(y_pred_proba,axis=-1)\n",
+ "pred_df=pd.DataFrame()\n",
+ "for i,idx in tqdm(enumerate(idx_test),total=len(idx_test)):\n",
+ " \n",
+ " pred_ = get_preds(y_pred[i],ps_test[i])\n",
+ " \n",
+ " pred_['id']=df_essays.iloc[idx]['id']\n",
+ " \n",
+ " pred_df = pred_df.append(pred_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "3fb90245",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T16:30:57.281470Z",
+ "start_time": "2022-02-15T16:30:57.251720Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " class \n",
+ " predictionstring \n",
+ " id \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " Lead \n",
+ " 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18... \n",
+ " E6870101D8EE \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " Evidence \n",
+ " 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 4... \n",
+ " E6870101D8EE \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " Lead \n",
+ " 65 66 67 68 69 70 71 72 73 74 75 76 77 78 \n",
+ " E6870101D8EE \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " Claim \n",
+ " 79 80 81 82 \n",
+ " E6870101D8EE \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " Lead \n",
+ " 83 84 \n",
+ " E6870101D8EE \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " class predictionstring id\n",
+ "0 Lead 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18... E6870101D8EE\n",
+ "1 Evidence 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 4... E6870101D8EE\n",
+ "2 Lead 65 66 67 68 69 70 71 72 73 74 75 76 77 78 E6870101D8EE\n",
+ "3 Claim 79 80 81 82 E6870101D8EE\n",
+ "4 Lead 83 84 E6870101D8EE"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# let's check\n",
+ "pred_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "d009da2c",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T16:30:57.305823Z",
+ "start_time": "2022-02-15T16:30:57.283092Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "#create list of ids in the test split for later use\n",
+ "ids_test_split = pred_df.id.unique()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bd835677",
+ "metadata": {},
+ "source": [
+ "## Compare lengths per class "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "990a540a",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T16:30:57.760835Z",
+ "start_time": "2022-02-15T16:30:57.307891Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "#compute lengths\n",
+ "df_raw['discourse_length'] = df_raw['predictionstring'].apply(lambda txt : len(txt.split()))\n",
+ "pred_df['discourse_length'] = pred_df['predictionstring'].apply(lambda txt : len(txt.split()))\n",
+ "\n",
+ "#create df \n",
+ "a = df_raw.groupby('discourse_type').agg({'discourse_length':'mean'}).reset_index() #take mean of length\n",
+ "a['class'] = a['discourse_type'].replace('Concluding Statement','Concluding_Statement')\n",
+ "a.drop('discourse_type',axis=1,inplace=True)\n",
+ "\n",
+ "b = pred_df.groupby('class').agg({'discourse_length':'mean'}).reset_index() #take mean of length\n",
+ "\n",
+ "comparison_length = a.merge(b,on='class',suffixes=('_true','_pred')) \n",
+ "\n",
+ "#house cleaning for plotting purposes\n",
+ "comparison_length.rename({'discourse_length_pred':'prediction','discourse_length_true':'ground_truth'},\n",
+ " inplace=True, axis=1)\n",
+ "comparison_length.set_index(\"class\",inplace=True)\n",
+ "comparison_length = comparison_length.unstack().reset_index()\n",
+ "comparison_length.rename({'level_0':'type',0:'value'},axis=1,inplace=True)\n",
+ "comparison_length.sort_values('value',inplace=True,ascending=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "f73eed57",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T16:30:58.018528Z",
+ "start_time": "2022-02-15T16:30:57.763685Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "#plot\n",
+ "plt.figure(figsize=(10,5))\n",
+ "\n",
+ "sns.barplot(data=comparison_length, y='value', x='class',hue='type')\n",
+ "\n",
+ "#pretty plot\n",
+ "plt.xticks(rotation=90,size=12);\n",
+ "plt.yticks(size=12)\n",
+ "\n",
+ "plt.xlabel(None)\n",
+ "plt.ylabel(None)\n",
+ "\n",
+ "plt.title('Comparison average of the length per class',size=16,weight='bold',pad=20)\n",
+ "plt.legend();"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "68d9af1b",
+ "metadata": {},
+ "source": [
+ "## Compare distribution of lengths per class"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "5c8c1673",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T16:30:58.523313Z",
+ "start_time": "2022-02-15T16:30:58.020486Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "#df, replace, order\n",
+ "a = df_raw.sort_values('discourse_length',ascending=False)\n",
+ "b = pred_df.sort_values('discourse_length',ascending=False)\n",
+ "\n",
+ "a.replace('Concluding Statement','Concluding_Statement',inplace=True)\n",
+ "\n",
+ "order = a.groupby('discourse_type',sort=False).mean()\\\n",
+ " .sort_values('discourse_length',ascending=False).index\n",
+ "\n",
+ "\n",
+ "#plot\n",
+ "fig,axs = plt.subplots(1,2,figsize=(15,6), sharey= True)\n",
+ "\n",
+ "sns.boxplot(data = a,x='discourse_type',y='discourse_length',ax=axs[0],showfliers=False,order=order)\n",
+ "sns.boxplot(data = b,x='class',y='discourse_length',ax=axs[1],showfliers=False,order=order)\n",
+ "\n",
+ "#pretty plot\n",
+ "axs[0].set_xticklabels(labels = order, rotation=90,size=12)\n",
+ "axs[1].set_xticklabels(labels = order, rotation=90,size=12)\n",
+ "\n",
+ "axs[0].set_xlabel(None)\n",
+ "axs[0].set_ylabel('# words',size=14,labelpad=20)\n",
+ "axs[1].set_xlabel(None)\n",
+ "axs[1].set_ylabel(None)\n",
+ "\n",
+ "axs[0].set_title('Ground Truth',size=14,pad=10)\n",
+ "axs[1].set_title('Predictions',size=14,pad=10);\n",
+ "\n",
+ "plt.suptitle('Distribution of discourse length per class',size=16,weight='bold');"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f7626833",
+ "metadata": {},
+ "source": [
+ "## Number of discourses under a certain length"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "5cfb3f16",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T16:30:58.551971Z",
+ "start_time": "2022-02-15T16:30:58.525018Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def show_values_on_bars(axs,counts):\n",
+ " \"\"\"\n",
+ " This function adds value on top of bar. Should be customed for each use case.\n",
+ " \"\"\"\n",
+ " def _show_on_single_plot(ax,counts):\n",
+ " for p,v in zip(ax.patches,counts):\n",
+ " _x = p.get_x() + p.get_width() / 2\n",
+ " _y = p.get_y() + p.get_height() + 1\n",
+ " value = f'{p.get_height()/v*100:.1f}%'\n",
+ " ax.text(_x, _y+1.5, value, ha=\"center\",size=12)\n",
+ "\n",
+ " if isinstance(axs, np.ndarray):\n",
+ " for idx, ax in np.ndenumerate(axs):\n",
+ " _show_on_single_plot(ax,counts)\n",
+ " else:\n",
+ " _show_on_single_plot(axs,counts)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "83b314e5",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T16:30:59.131731Z",
+ "start_time": "2022-02-15T16:30:58.554103Z"
+ },
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "## define the threshold \n",
+ "LIMIT = 3\n",
+ "\n",
+ "#df, order, replace\n",
+ "a = df_raw.sort_values('discourse_type')\n",
+ "b = pred_df.sort_values('class')\n",
+ "\n",
+ "a.replace('Concluding Statement','Concluding_Statement',inplace=True)\n",
+ "\n",
+ "\n",
+ "order = a.groupby('discourse_type',sort=False).mean()\\\n",
+ " .sort_values('discourse_length',ascending=False).index\n",
+ "\n",
+ "## computing total count per class to get %\n",
+ "v_a=a.groupby('discourse_type').count().loc[order]['id']\n",
+ "v_b=b.groupby('class').count().loc[order]['id']\n",
+ "\n",
+ "##filtering according to LIMIT\n",
+ "a = a[a['discourse_length']\n",
+ ".column {\n",
+ " float: left;\n",
+ " width: 50%;\n",
+ " /*margin-top:10px;*/\n",
+ " border-right: 2px dotted black;\n",
+ " text-align:justify;\n",
+ " }\n",
+ ".column:first-child {\n",
+ " padding-right:16px;\n",
+ " padding-left:0px;\n",
+ " }\n",
+ "\n",
+ ".column:last-child {\n",
+ " padding-right:0px;\n",
+ " padding-left:15px;\n",
+ " border: none;\n",
+ " }\n",
+ "\n",
+ ".row:after {\n",
+ " content: \"\";\n",
+ " display: table;\n",
+ " clear: both;\n",
+ " text-align:left\n",
+ " }\n",
+ "\n",
+ ".title {\n",
+ " text-align:center;\n",
+ " padding-bottom: 15px;\n",
+ " padding-top: 0;\n",
+ " margin-top: 0;\n",
+ " border-bottom: 2px dotted black\n",
+ " }\n",
+ "\n",
+ "\n",
+ "div.content > * {\n",
+ " text-align: center;\n",
+ " font-weight: bold;\n",
+ " padding-right: 10px;\n",
+ " padding-left: 10px;\n",
+ " padding-top: 5px ;\n",
+ " padding-bottom:5px;\n",
+ " }\n",
+ "\n",
+ ".content {\n",
+ " display:flex;\n",
+ " justify-content:space-evenly;\n",
+ " margin: 0px auto;\n",
+ " margin-bottom: 5px;\n",
+ " background: #FFFFFF;\n",
+ " padding-block: 10px;\n",
+ " max-width:auto;\n",
+ " border: 2px solid black;\n",
+ " }\n",
+ "\n",
+ "/*styles for each discourse type*/\n",
+ "\n",
+ "Lead {\n",
+ " background-color:#ff8585;\n",
+ " }\n",
+ "Position {\n",
+ " background-color: #7cf0ff;\n",
+ " }\n",
+ "Evidence {\n",
+ " background-color: #badcfc;\n",
+ " }\n",
+ "Claim {\n",
+ " background-color: #3a3dff;\n",
+ " }\n",
+ "Concluding_Statement {\n",
+ " background-color: #ff7df9;\n",
+ " }\n",
+ "Counterclaim {\n",
+ " background-color: #4d92e0;\n",
+ " }\n",
+ "Rebuttal {\n",
+ " background-color: #ffd57c;\n",
+ " }\n",
+ ""
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "utils.css()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "51d39727",
+ "metadata": {},
+ "source": [
+ "#### Data preparation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "id": "645519e8",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T17:55:55.101288Z",
+ "start_time": "2022-02-15T17:55:54.667684Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " class \n",
+ " predictionstring \n",
+ " id \n",
+ " discourse_length \n",
+ " flag \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " Lead \n",
+ " 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18... \n",
+ " E6C2FD3578B3 \n",
+ " 119 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " Position \n",
+ " 119 120 121 122 123 124 125 126 127 128 129 13... \n",
+ " E6C2FD3578B3 \n",
+ " 28 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " Claim \n",
+ " 147 \n",
+ " E6C2FD3578B3 \n",
+ " 1 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " class predictionstring id \\\n",
+ "0 Lead 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18... E6C2FD3578B3 \n",
+ "1 Position 119 120 121 122 123 124 125 126 127 128 129 13... E6C2FD3578B3 \n",
+ "2 Claim 147 E6C2FD3578B3 \n",
+ "\n",
+ " discourse_length flag \n",
+ "0 119 0 \n",
+ "1 28 0 \n",
+ "2 1 1 "
+ ]
+ },
+ "execution_count": 65,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# flag essay with a length issue\n",
+ "pred_df['flag'] = np.where(pred_df['discourse_length']<3,1,0)\n",
+ "# list essays with > 3 stammers for exploration purposes\n",
+ "stammerers = pred_df.groupby('id').sum()\n",
+ "stammerers = stammerers[stammerers['flag']>3].index\n",
+ "\n",
+ "stam_df = pred_df[pred_df['id'].isin(stammerers)].copy()\n",
+ "stam_df.head(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "id": "c35d8930",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T17:55:57.374675Z",
+ "start_time": "2022-02-15T17:55:57.058351Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "#link with the essays\n",
+ "stam_df['text'] = stam_df['id'].apply(utils.get_essay)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "id": "2aa03cb4",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T17:55:58.683652Z",
+ "start_time": "2022-02-15T17:55:58.109043Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# connect the predictions to text\n",
+ "slicer_vect = np.vectorize(utils.slicering)\n",
+ "stam_df['discourse_text'] = slicer_vect(stam_df['predictionstring'],stam_df['text'])\n",
+ "\n",
+ "#transform into html string for later visualization\n",
+ "stam_df['html'] = stam_df.apply(utils.render_html,axis=1)\n",
+ "stam_df = stam_df.groupby('id').agg({'html':' '.join,'flag':sum})\n",
+ "#display(stam_df.head(3))\n",
+ "\n",
+ "#same for truth df\n",
+ "df_true = df_raw[df_raw['id'].isin(stammerers)].copy() #selecting only essays of interest\n",
+ "df_true['discourse_type'].replace('Concluding Statement','Concluding_Statement',inplace=True)\n",
+ "df_true['html'] = df_true.apply(utils.render_html,axis=1)\n",
+ "df_true = df_true.groupby('id').agg({'html':' '.join})\n",
+ "#display(df_true.head(3))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "id": "0dc7a92c",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T18:14:14.180797Z",
+ "start_time": "2022-02-15T18:14:14.152519Z"
+ },
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ "
Legend --> \n",
+ "
Lead \n",
+ "
Position \n",
+ "
Claim \n",
+ "
Counterclaim \n",
+ "
Rebuttal \n",
+ "
Evidence \n",
+ "
Concluding_Statement \n",
+ "
\n",
+ "\n",
+ " \n",
+ "
\n",
+ "
Prediction \n",
+ "
The used of this technology in the classroom would benifit the teachers. [Position] The teachers could teach and use the technology to see if the students were bored or confused. Another way this technology would help with is teachers could make a video teaching the lesson and use the technology to see how the students reacted. The use of this technology would be great for teachers to know if their students are confused [Claim] or bored. The teacher could teach a different way each day and see how the students reacted. Then which ever way kept the students more interested and made them pay attention more would be the way the teacher could start teaching so that the teacher gets the best results from their students. If the students are confused and don't want to ask a question the teacher could see that the student is confused and then the teacher can talk to the student one on one to help the student best understand the topic that he or [Evidence] she is confused with. The technology could be used if the teacher was gone. [Claim] The teacher could make a video teaching the lesson and the facial action coding system could see the students face and change the way the lesson is being taught to each student making the video lesson more personal towards the student. This would help the student learn the lesson how they like to learn. This would help them absorb all of the information in hope that they can retain the information for [Evidence] the test. The technology would be great for classrooms. It helps the students learn the lesson the way they like to learn. It also helps the teachers teach the students to the best of their abilities. It will help the teachers [Concluding_Statement] teach [Evidence] the [Concluding_Statement] students the best way for the students. The teachers will know if their students are bored and that the teacher need to change up the way that they are presenting the lesson to the students. 
It will also help the teachers know if a student needs one on one but the student won't ask the teacher for [Evidence] help. With [Concluding_Statement] this technology [Evidence] the teacher will be able to go to the student and help the student. These reasons are why the facial action coding [Concluding_Statement]
\n",
+ "
\n",
+ "
\n",
+ "
Ground Truth \n",
+ "
The used of this technology in the classroom would benifit the teachers. [Position] The teachers could teach and use the technology to see if the students were bored or confused [Claim] Another way this technology would help with is teachers could make a video teaching the lesson and use the technology to see how the students reacted. [Claim] The use of this technology would be great for teachers to know if their students are confused or bored. [Claim] The teacher could teach a different way each day and see how the students reacted. Then which ever way kept the students more interested and made them pay attention more would be the way the teacher could start teaching so that the teacher gets the best results from their students. If the students are confused and don't want to ask a question the teacher could see that the student is confused and then the teacher can talk to the student one on one to help the student best understand the topic that he or she is confused with. [Evidence] The technology could be used if the teacher was gone [Claim] The teacher could make a video teaching the lesson and the facial action coding system could see the students face and change the way the lesson is being taught to each student making the video lesson more personal towards the student. This would help the student learn the lesson how they like to learn. This would help them absorb all of the information in hope that they can retain the information for the test. [Evidence] The technology would be great for classrooms. It helps the students learn the lesson the way they like to learn. It also helps the teachers teach the students to the best of their abilities. It will help the teachers teach the students the best way for the students. The teachers will know if their students are bored and that the teacher need to change up the way that they are presenting the lesson to the students. 
It will also help the teachers know if a student needs one on one but the student won't ask the teacher for help. With this technology the teacher will be able to go to the student and help the student. These reasons are why the facial action coding system should be implimented into the classroom. [Concluding_Statement]
\n",
+ "
\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 92,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "## showing a comparison choosing a random essay \n",
+ "random_idx = random.choice(stam_df.index)\n",
+ "utils.comparison_text(stam_df.loc[random_idx,'html'],df_true.loc[random_idx,'html'])\n",
+ "\n",
+ "##NB : the differences come from my weak model, which apparently tends to classify a lot of tokens as O"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b9c44900",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.12"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": true,
+ "toc_position": {
+ "height": "calc(100% - 180px)",
+ "left": "10px",
+ "top": "150px",
+ "width": "220px"
+ },
+ "toc_section_display": true,
+ "toc_window_display": true
+ },
+ "varInspector": {
+ "cols": {
+ "lenName": 16,
+ "lenType": 16,
+ "lenVar": 40
+ },
+ "kernels_config": {
+ "python": {
+ "delete_cmd_postfix": "",
+ "delete_cmd_prefix": "del ",
+ "library": "var_list.py",
+ "varRefreshCmd": "print(var_dic_list())"
+ },
+ "r": {
+ "delete_cmd_postfix": ") ",
+ "delete_cmd_prefix": "rm(",
+ "library": "var_list.r",
+ "varRefreshCmd": "cat(var_dic_list()) "
+ }
+ },
+ "types_to_exclude": [
+ "module",
+ "function",
+ "builtin_function_or_method",
+ "instance",
+ "_Feature"
+ ],
+ "window_display": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/arthur/inference_kaggle.ipynb b/notebooks/arthur/inference_kaggle.ipynb
new file mode 100644
index 0000000..8f446b9
--- /dev/null
+++ b/notebooks/arthur/inference_kaggle.ipynb
@@ -0,0 +1,1228 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "59cb5ea2",
+ "metadata": {
+ "papermill": {
+ "duration": 0.022357,
+ "end_time": "2022-02-11T10:32:08.844251",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:08.821894",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "# Imports and Variables "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "9db7b815",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T13:16:56.018595Z",
+ "start_time": "2022-02-09T13:16:56.01273Z"
+ },
+ "execution": {
+ "iopub.execute_input": "2022-02-11T10:32:08.891267Z",
+ "iopub.status.busy": "2022-02-11T10:32:08.889764Z",
+ "iopub.status.idle": "2022-02-11T10:32:08.901047Z",
+ "shell.execute_reply": "2022-02-11T10:32:08.900526Z",
+ "shell.execute_reply.started": "2022-02-09T13:48:24.698621Z"
+ },
+ "papermill": {
+ "duration": 0.03576,
+ "end_time": "2022-02-11T10:32:08.901179",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:08.865419",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "#variables \n",
+ "import os \n",
+ "\n",
+ "SEQ_LEN = 1024 \n",
+ "\n",
+ "BATCH_SIZE = 8\n",
+ "\n",
+ "\n",
+ "#PATHS \n",
+ "\n",
+ "LOAD_BACKBONE_FROM = '../input/backbone/'\n",
+ "LOAD_MODEL_WEIGHTS_FROM = '../input/mymodel/mymodel'\n",
+ "LOAD_TXT_FROM = '../input/feedback-prize-2021/test/'\n",
+ "\n",
+ "#GPU and info message for tf\n",
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"]='0' \n",
+ "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "89d5ea39",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T13:16:14.992257Z",
+ "start_time": "2022-02-09T13:16:14.981509Z"
+ },
+ "execution": {
+ "iopub.execute_input": "2022-02-11T10:32:08.946557Z",
+ "iopub.status.busy": "2022-02-11T10:32:08.946069Z",
+ "iopub.status.idle": "2022-02-11T10:32:15.810602Z",
+ "shell.execute_reply": "2022-02-11T10:32:15.810107Z",
+ "shell.execute_reply.started": "2022-02-09T13:48:32.513703Z"
+ },
+ "papermill": {
+ "duration": 6.888819,
+ "end_time": "2022-02-11T10:32:15.810763",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:08.921944",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# canonicals \n",
+ "import pandas as pd \n",
+ "import numpy as np \n",
+ "\n",
+ "#deep\n",
+ "import tensorflow as tf\n",
+ "from transformers import TFAutoModel, AutoConfig, AutoTokenizer"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "621e9a0e",
+ "metadata": {
+ "papermill": {
+ "duration": 0.020572,
+ "end_time": "2022-02-11T10:32:15.852491",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:15.831919",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "# Loading and preprocessing data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "263e8156",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T13:18:16.81691Z",
+ "start_time": "2022-02-09T13:18:16.811361Z"
+ },
+ "execution": {
+ "iopub.execute_input": "2022-02-11T10:32:15.898409Z",
+ "iopub.status.busy": "2022-02-11T10:32:15.897831Z",
+ "iopub.status.idle": "2022-02-11T10:32:15.917097Z",
+ "shell.execute_reply": "2022-02-11T10:32:15.917471Z",
+ "shell.execute_reply.started": "2022-02-09T13:48:43.995593Z"
+ },
+ "papermill": {
+ "duration": 0.044566,
+ "end_time": "2022-02-11T10:32:15.917606",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:15.873040",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 0FB0700DAF44 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " D72CB1C11673 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 18409261F5C2 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " DF920E0A7337 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " D46BCB48440A \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id\n",
+ "0 0FB0700DAF44\n",
+ "1 D72CB1C11673\n",
+ "2 18409261F5C2\n",
+ "3 DF920E0A7337\n",
+ "4 D46BCB48440A"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "txt_ids = [f.split('.')[0] for f in os.listdir(LOAD_TXT_FROM)]\n",
+ "df_test = pd.DataFrame(txt_ids,columns=['id'])\n",
+ "df_test"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "1b3ca4ec",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T13:23:48.85536Z",
+ "start_time": "2022-02-09T13:23:48.85116Z"
+ },
+ "execution": {
+ "iopub.execute_input": "2022-02-11T10:32:15.965307Z",
+ "iopub.status.busy": "2022-02-11T10:32:15.964505Z",
+ "iopub.status.idle": "2022-02-11T10:32:15.966316Z",
+ "shell.execute_reply": "2022-02-11T10:32:15.966917Z",
+ "shell.execute_reply.started": "2022-02-09T13:48:47.851725Z"
+ },
+ "papermill": {
+ "duration": 0.02812,
+ "end_time": "2022-02-11T10:32:15.967056",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:15.938936",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "def get_essay(id_):\n",
+ " \"\"\"Function to get the full text of an essay from the .txt file.\n",
+ "\n",
+ " Args:\n",
+ " id_ (str): id of the essay\n",
+ " (The text is read from the directory given by the global LOAD_TXT_FROM;\n",
+ " this version of the function takes no `mode` argument.)\n",
+ "\n",
+ " Returns:\n",
+ " str: Returns the full text of the id\n",
+ " \"\"\"\n",
+ " with open(os.path.join(LOAD_TXT_FROM,f'{id_}.txt'),'r') as file:\n",
+ " txt = file.read()\n",
+ " return txt.strip()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "e3eecc9f",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-02-11T10:32:16.016664Z",
+ "iopub.status.busy": "2022-02-11T10:32:16.016099Z",
+ "iopub.status.idle": "2022-02-11T10:32:16.036234Z",
+ "shell.execute_reply": "2022-02-11T10:32:16.035802Z",
+ "shell.execute_reply.started": "2022-02-09T13:48:49.192967Z"
+ },
+ "papermill": {
+ "duration": 0.04807,
+ "end_time": "2022-02-11T10:32:16.036346",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:15.988276",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# store text in df\n",
+ "df_test['essays'] = df_test['id'].apply(get_essay)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "89373b90",
+ "metadata": {
+ "papermill": {
+ "duration": 0.0211,
+ "end_time": "2022-02-11T10:32:16.078692",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:16.057592",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "# Tokenize texts"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9386c3ca",
+ "metadata": {
+ "papermill": {
+ "duration": 0.020633,
+ "end_time": "2022-02-11T10:32:16.120312",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:16.099679",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "## Tokenizer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "511ceffc",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T13:22:17.698739Z",
+ "start_time": "2022-02-09T13:22:17.669997Z"
+ },
+ "execution": {
+ "iopub.execute_input": "2022-02-11T10:32:16.171123Z",
+ "iopub.status.busy": "2022-02-11T10:32:16.170332Z",
+ "iopub.status.idle": "2022-02-11T10:32:16.177500Z",
+ "shell.execute_reply": "2022-02-11T10:32:16.177081Z",
+ "shell.execute_reply.started": "2022-02-09T13:48:53.532126Z"
+ },
+ "papermill": {
+ "duration": 0.03649,
+ "end_time": "2022-02-11T10:32:16.177612",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:16.141122",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "def tokenize_labelize(essay,tokenizer,predictionstring=None,labels=None,max_len=SEQ_LEN):\n",
+ " \"\"\"Tokenize an essay and match each token with the corresponding label.\n",
+ "\n",
+ " Args:\n",
+ " essay (str): Text to tokenize\n",
+ " tokenizer (tokenizer): Tokenizer from HF.\n",
+ " predictionstring (pandas.Series | numpy.array, optional): As a unique string, list of index position of words with a label. Must be provided with labels. Defaults to None.\n",
+ " labels (pandas.Series | numpy.array, optional): As a unique string, list of labels of each word. Must be provided with predictionstring. Defaults to None.\n",
+ " max_len (int): Maximum sequence length for padding/truncating.\n",
+ " \n",
+ "\n",
+ " Returns:\n",
+ " dict : Returns a dictionary with input_ids, attention_mask, predictionstring, and labels if passed.\n",
+ " \"\"\"\n",
+ " \n",
+ " tokens = tokenizer(essay,\n",
+ " return_attention_mask = True,\n",
+ " return_token_type_ids = False,\n",
+ " padding = 'max_length',\n",
+ " max_length = SEQ_LEN,\n",
+ " truncation = True,\n",
+ " return_tensors='np'\n",
+ " )\n",
+ " \n",
+ " word_ids=tokens.word_ids()\n",
+ " \n",
+ " labels_mapping = {\n",
+ " 'B-Lead' : 0,\n",
+ " 'B-Position' : 1,\n",
+ " 'B-Evidence' : 2,\n",
+ " 'B-Claim' : 3,\n",
+ " 'B-Concluding_Statement' : 4,\n",
+ " 'B-Counterclaim' : 5,\n",
+ " 'B-Rebuttal' : 6,\n",
+ " 'I-Lead' : 7,\n",
+ " 'I-Position' : 8,\n",
+ " 'I-Evidence' : 9,\n",
+ " 'I-Claim' : 10,\n",
+ " 'I-Concluding_Statement' : 11,\n",
+ " 'I-Counterclaim' : 12,\n",
+ " 'I-Rebuttal': 13\n",
+ " }\n",
+ " \n",
+ " if labels:\n",
+ " match = {p:labels_mapping[l] for p,l in zip(predictionstring,labels)}\n",
+ " labels_matched = [15 if (w==None or w==word_ids[i-1]) \\\n",
+ " else match.get(str(w),14) \\\n",
+ " for i,w in enumerate(word_ids)]\n",
+ " \n",
+ " \n",
+ " return {\n",
+ " 'input_ids' : tokens['input_ids'][0],\n",
+ " 'attention_mask' : tokens['attention_mask'][0],\n",
+ " 'labels': np.array(labels_matched), \n",
+ " 'predictionstring':np.array(word_ids)\n",
+ " }\n",
+ " \n",
+ " return {\n",
+ " 'input_ids' : tokens['input_ids'][0],\n",
+ " 'attention_mask' : tokens['attention_mask'][0],\n",
+ " 'predictionstring':np.array(word_ids)\n",
+ " }"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "8b54766a",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T13:22:43.130645Z",
+ "start_time": "2022-02-09T13:22:43.113541Z"
+ },
+ "execution": {
+ "iopub.execute_input": "2022-02-11T10:32:16.222294Z",
+ "iopub.status.busy": "2022-02-11T10:32:16.221468Z",
+ "iopub.status.idle": "2022-02-11T10:32:16.225883Z",
+ "shell.execute_reply": "2022-02-11T10:32:16.225454Z",
+ "shell.execute_reply.started": "2022-02-09T13:48:55.068076Z"
+ },
+ "papermill": {
+ "duration": 0.027485,
+ "end_time": "2022-02-11T10:32:16.225992",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:16.198507",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "## vectorize the function tokenizer above\n",
+ "tokenize_labelize_vect = np.vectorize(tokenize_labelize,excluded=['SEQ_LEN'],otypes=['object'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "5b4ded61",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-02-11T10:32:16.271298Z",
+ "iopub.status.busy": "2022-02-11T10:32:16.270792Z",
+ "iopub.status.idle": "2022-02-11T10:32:16.416115Z",
+ "shell.execute_reply": "2022-02-11T10:32:16.416531Z",
+ "shell.execute_reply.started": "2022-02-09T13:48:57.546827Z"
+ },
+ "papermill": {
+ "duration": 0.169916,
+ "end_time": "2022-02-11T10:32:16.416717",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:16.246801",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "## load tokenizer\n",
+ "tokenizer = AutoTokenizer.from_pretrained(LOAD_BACKBONE_FROM)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "a5fb1846",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-02-11T10:32:16.464389Z",
+ "iopub.status.busy": "2022-02-11T10:32:16.463619Z",
+ "iopub.status.idle": "2022-02-11T10:32:16.486745Z",
+ "shell.execute_reply": "2022-02-11T10:32:16.486301Z",
+ "shell.execute_reply.started": "2022-02-09T13:49:00.259629Z"
+ },
+ "papermill": {
+ "duration": 0.048676,
+ "end_time": "2022-02-11T10:32:16.486863",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:16.438187",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "#create tokens\n",
+ "tokens_test = tokenize_labelize_vect(df_test.essays,tokenizer,max_len=SEQ_LEN)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e7ed71b2",
+ "metadata": {
+ "papermill": {
+ "duration": 0.020753,
+ "end_time": "2022-02-11T10:32:16.528989",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:16.508236",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "## Dataset creation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "54940360",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T13:29:39.724058Z",
+ "start_time": "2022-02-09T13:29:39.716081Z"
+ },
+ "execution": {
+ "iopub.execute_input": "2022-02-11T10:32:16.580729Z",
+ "iopub.status.busy": "2022-02-11T10:32:16.578379Z",
+ "iopub.status.idle": "2022-02-11T10:32:16.582622Z",
+ "shell.execute_reply": "2022-02-11T10:32:16.583062Z",
+ "shell.execute_reply.started": "2022-02-09T13:49:03.531009Z"
+ },
+ "papermill": {
+ "duration": 0.033186,
+ "end_time": "2022-02-11T10:32:16.583187",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:16.550001",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "def dataset_creator(tokens):\n",
+ " \"\"\"\n",
+ " Creates a dictionary with tokens attributes as a numpy array.\n",
+ "\n",
+ " Args:\n",
+ " tokens (list): list of dictionaries, outputs from the tokenizer\n",
+ "\n",
+ " Returns:\n",
+ " tuple: (inputs, predictionstring), or (inputs, labels_ohe, predictionstring) if labels are provided; inputs is a dict of input_ids and attention_mask arrays.\n",
+ " \"\"\"\n",
+ " \n",
+ " keys = tokens[0].keys()\n",
+ "\n",
+ " inputs = {\n",
+ " 'input_ids':[],\n",
+ " 'attention_mask':[]\n",
+ " }\n",
+ " predictionstring = []\n",
+ " labels = []\n",
+ " \n",
+ " for t in tokens:\n",
+ " inputs['input_ids'].append(t['input_ids'])\n",
+ " inputs['attention_mask'].append(t['attention_mask'])\n",
+ " predictionstring.append(t['predictionstring'])\n",
+ " if 'labels' in keys:\n",
+ " labels.append(t['labels'])\n",
+ "\n",
+ " \n",
+ " inputs['input_ids'] = np.array(inputs['input_ids'])\n",
+ " inputs['attention_mask'] = np.array(inputs['attention_mask'])\n",
+ " predictionstring = np.array(predictionstring)\n",
+ " labels = np.array(labels)\n",
+ " \n",
+ " if 'labels' in tokens[0].keys():\n",
+ " \n",
+ " #OHE labels\n",
+ " labels_ohe = np.zeros((len(labels),SEQ_LEN,16))\n",
+ " \n",
+ " dim1 = np.arange(len(labels))\n",
+ " dim2 = np.arange(SEQ_LEN)\n",
+ " \n",
+ " labels_ohe[dim1[:,None,None],dim2[None,:,None],labels[:,:,None]] = 1\n",
+ " \n",
+ " return inputs, labels_ohe, predictionstring\n",
+ " \n",
+ " return inputs, predictionstring"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "ceb5d2ba",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-02-11T10:32:16.629303Z",
+ "iopub.status.busy": "2022-02-11T10:32:16.628544Z",
+ "iopub.status.idle": "2022-02-11T10:32:16.630525Z",
+ "shell.execute_reply": "2022-02-11T10:32:16.630961Z",
+ "shell.execute_reply.started": "2022-02-09T13:49:04.545139Z"
+ },
+ "papermill": {
+ "duration": 0.026963,
+ "end_time": "2022-02-11T10:32:16.631085",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:16.604122",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "## creating test dataset\n",
+ "X_test,ps_test = dataset_creator(tokens_test)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e59a69a9",
+ "metadata": {
+ "papermill": {
+ "duration": 0.021075,
+ "end_time": "2022-02-11T10:32:16.673192",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:16.652117",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "# Model prediction"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7206db1a",
+ "metadata": {
+ "papermill": {
+ "duration": 0.020733,
+ "end_time": "2022-02-11T10:32:16.715129",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:16.694396",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "## Model architecture"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "7db24263",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T13:37:01.648522Z",
+ "start_time": "2022-02-09T13:37:01.158808Z"
+ },
+ "execution": {
+ "iopub.execute_input": "2022-02-11T10:32:16.761456Z",
+ "iopub.status.busy": "2022-02-11T10:32:16.760922Z",
+ "iopub.status.idle": "2022-02-11T10:32:30.602549Z",
+ "shell.execute_reply": "2022-02-11T10:32:30.603198Z",
+ "shell.execute_reply.started": "2022-02-09T13:49:07.057087Z"
+ },
+ "papermill": {
+ "duration": 13.867367,
+ "end_time": "2022-02-11T10:32:30.603410",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:16.736043",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "All model checkpoint layers were used when initializing TFLongformerModel.\n",
+ "\n",
+ "All the layers of TFLongformerModel were initialized from the model checkpoint at ../input/backbone/tf_model.h5.\n",
+ "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFLongformerModel for predictions without further training.\n"
+ ]
+ }
+ ],
+ "source": [
+ "## Instantiate model Longformer to be used as backbone\n",
+ "config = AutoConfig.from_pretrained(os.path.join(LOAD_BACKBONE_FROM,'config.json'))\n",
+ "backbone = TFAutoModel.from_pretrained(os.path.join(LOAD_BACKBONE_FROM,'tf_model.h5'),config=config)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "de774ab7",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-02-11T10:32:30.657658Z",
+ "iopub.status.busy": "2022-02-11T10:32:30.656163Z",
+ "iopub.status.idle": "2022-02-11T10:32:30.659590Z",
+ "shell.execute_reply": "2022-02-11T10:32:30.660033Z",
+ "shell.execute_reply.started": "2022-02-09T13:49:23.360148Z"
+ },
+ "papermill": {
+ "duration": 0.033231,
+ "end_time": "2022-02-11T10:32:30.660175",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:30.626944",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "def init_model():\n",
+ " input_ids = tf.keras.layers.Input(shape=(SEQ_LEN,),dtype='int32')\n",
+ " attention_mask = tf.keras.layers.Input(shape=(SEQ_LEN,),dtype='int32')\n",
+ " \n",
+ " x = backbone({'input_ids':input_ids,\n",
+ " 'attention_mask':attention_mask})[0]\n",
+ "\n",
+ " backbone.trainable = False\n",
+ "\n",
+ "\n",
+ " x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units = 512,\n",
+ " activation = 'tanh',\n",
+ " dropout=.2,\n",
+ " return_sequences=True))(x)\n",
+ " x_res = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units = 512,\n",
+ " activation = 'tanh',\n",
+ " dropout=.2,\n",
+ " return_sequences=True))(x)\n",
+ " \n",
+ "\n",
+ " x = tf.keras.layers.add([x,x_res])\n",
+ " output = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(16,activation = 'softmax'))(x)\n",
+ "\n",
+ " model = tf.keras.models.Model(inputs={'input_ids':input_ids,\n",
+ " 'attention_mask':attention_mask},outputs=output)\n",
+ " \n",
+ " \n",
+ " return model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "59651021",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-02-11T10:32:30.711176Z",
+ "iopub.status.busy": "2022-02-11T10:32:30.710516Z",
+ "iopub.status.idle": "2022-02-11T10:32:49.698672Z",
+ "shell.execute_reply": "2022-02-11T10:32:49.699094Z",
+ "shell.execute_reply.started": "2022-02-09T13:49:24.262996Z"
+ },
+ "papermill": {
+ "duration": 19.017529,
+ "end_time": "2022-02-11T10:32:49.699240",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:30.681711",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+ "To disable this warning, you can either:\n",
+ "\t- Avoid using `tokenizers` before the fork if possible\n",
+ "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
+ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+ "To disable this warning, you can either:\n",
+ "\t- Avoid using `tokenizers` before the fork if possible\n",
+ "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "## instantiate the model, plot the graph\n",
+ "model = init_model()\n",
+ "\n",
+ "tf.keras.utils.plot_model(model,show_shapes=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "41995070",
+ "metadata": {
+ "papermill": {
+ "duration": 0.02412,
+ "end_time": "2022-02-11T10:32:49.747839",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:49.723719",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "## Model compilation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "0d637ae6",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-02-11T10:32:49.804197Z",
+ "iopub.status.busy": "2022-02-11T10:32:49.803431Z",
+ "iopub.status.idle": "2022-02-11T10:32:49.805448Z",
+ "shell.execute_reply": "2022-02-11T10:32:49.805879Z",
+ "shell.execute_reply.started": "2022-02-09T13:49:49.416398Z"
+ },
+ "papermill": {
+ "duration": 0.034061,
+ "end_time": "2022-02-11T10:32:49.806006",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:49.771945",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# creating homemade metric\n",
+ "\n",
+ "def accuracy_masked_func(y_true,y_pred):\n",
+ " y_pred = tf.cast(tf.argmax(y_pred,axis=-1),'int32')\n",
+ " y_true = tf.cast(y_true,'int32')\n",
+ " y_true = tf.cast(tf.argmax(y_true,axis=-1),'int32') #for y_pred and y_true to match\n",
+ " mask = tf.cast(y_true != 15,'int32') #create a mask\n",
+ " matches = tf.cast(tf.equal(y_true,y_pred),'int32')*mask #calculate the matches ignoring the masking\n",
+ " accuracy = tf.math.reduce_sum(matches,axis=-1)/tf.maximum(tf.math.reduce_sum(mask,axis=-1),1)\n",
+ " \n",
+ " return accuracy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "fdb1dcd8",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-02-11T10:32:49.864353Z",
+ "iopub.status.busy": "2022-02-11T10:32:49.863561Z",
+ "iopub.status.idle": "2022-02-11T10:32:49.883890Z",
+ "shell.execute_reply": "2022-02-11T10:32:49.883455Z",
+ "shell.execute_reply.started": "2022-02-09T13:49:50.243674Z"
+ },
+ "papermill": {
+ "duration": 0.053655,
+ "end_time": "2022-02-11T10:32:49.884015",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:49.830360",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# define loss and metrics \n",
+ "loss = tf.keras.losses.CategoricalCrossentropy(name='categorical_crossentropy')\n",
+ "cat_accuracy = tf.keras.metrics.CategoricalAccuracy()\n",
+ "masked_accuracy = tf.keras.metrics.MeanMetricWrapper(fn=accuracy_masked_func)\n",
+ "\n",
+ "# RMSProp optimizer with clip value and small lr to avoid exploding gradients \n",
+ "opt = tf.keras.optimizers.RMSprop(clipvalue=.5,learning_rate=0.0001)\n",
+ "\n",
+ "#compile\n",
+ "model.compile(optimizer=opt,loss=loss,metrics=[cat_accuracy,masked_accuracy])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9c0a32e9",
+ "metadata": {
+ "papermill": {
+ "duration": 0.024517,
+ "end_time": "2022-02-11T10:32:49.937271",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:49.912754",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "## Load model pretrained weights"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "534a0fbb",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-02-11T10:32:50.025175Z",
+ "iopub.status.busy": "2022-02-11T10:32:50.024371Z",
+ "iopub.status.idle": "2022-02-11T10:32:57.777162Z",
+ "shell.execute_reply": "2022-02-11T10:32:57.776594Z",
+ "shell.execute_reply.started": "2022-02-09T13:49:56.848183Z"
+ },
+ "papermill": {
+ "duration": 7.81584,
+ "end_time": "2022-02-11T10:32:57.777304",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:49.961464",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "model.load_weights(os.path.join(LOAD_MODEL_WEIGHTS_FROM))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a25fa713",
+ "metadata": {
+ "papermill": {
+ "duration": 0.024352,
+ "end_time": "2022-02-11T10:32:57.826677",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:57.802325",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "# Get predictions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "a068cbca",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-02-11T10:32:57.880940Z",
+ "iopub.status.busy": "2022-02-11T10:32:57.880180Z",
+ "iopub.status.idle": "2022-02-11T10:33:16.530032Z",
+ "shell.execute_reply": "2022-02-11T10:33:16.530749Z",
+ "shell.execute_reply.started": "2022-02-09T13:50:05.619127Z"
+ },
+ "papermill": {
+ "duration": 18.679681,
+ "end_time": "2022-02-11T10:33:16.530952",
+ "exception": false,
+ "start_time": "2022-02-11T10:32:57.851271",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "y_pred = model.predict(X_test,batch_size=BATCH_SIZE)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "a73309fd",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-02-11T10:33:16.619754Z",
+ "iopub.status.busy": "2022-02-11T10:33:16.618610Z",
+ "iopub.status.idle": "2022-02-11T10:33:16.622436Z",
+ "shell.execute_reply": "2022-02-11T10:33:16.623209Z",
+ "shell.execute_reply.started": "2022-02-09T13:52:49.634319Z"
+ },
+ "papermill": {
+ "duration": 0.05607,
+ "end_time": "2022-02-11T10:33:16.623391",
+ "exception": false,
+ "start_time": "2022-02-11T10:33:16.567321",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "labels_mapping = {'B-Lead' : 0,\n",
+ " 'B-Position' : 1,\n",
+ " 'B-Evidence' : 2,\n",
+ " 'B-Claim' : 3,\n",
+ " 'B-Concluding_Statement' : 4,\n",
+ " 'B-Counterclaim' : 5,\n",
+ " 'B-Rebuttal' : 6,\n",
+ " 'I-Lead' : 7,\n",
+ " 'I-Position' : 8,\n",
+ " 'I-Evidence' : 9,\n",
+ " 'I-Claim' : 10,\n",
+ " 'I-Concluding_Statement' : 11,\n",
+ " 'I-Counterclaim' : 12,\n",
+ " 'I-Rebuttal': 13,\n",
+ " 'O':14,\n",
+ " 'PAD':15}\n",
+ "\n",
+ "reversed_mapping = {v:(k[2:] if v<14 else k) for k,v in labels_mapping.items()}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "022ae676",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-02-11T10:33:16.718680Z",
+ "iopub.status.busy": "2022-02-11T10:33:16.712589Z",
+ "iopub.status.idle": "2022-02-11T10:33:16.731130Z",
+ "shell.execute_reply": "2022-02-11T10:33:16.731891Z",
+ "shell.execute_reply.started": "2022-02-09T13:52:51.914194Z"
+ },
+ "papermill": {
+ "duration": 0.066045,
+ "end_time": "2022-02-11T10:33:16.732035",
+ "exception": false,
+ "start_time": "2022-02-11T10:33:16.665990",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "def get_preds(y_pred,ps):\n",
+ " \"\"\"\n",
+ " Generate readable predictions from the output of the model.\n",
+ "\n",
+ " Args:\n",
+ " y_pred (ndarray): output of the model\n",
+ " ps (ndarray): predictionstring referring to the token predicted\n",
+ "\n",
+ " Returns:\n",
+ " DataFrame : DataFrame with class and predictionstrings\n",
+ " \"\"\"\n",
+ " \n",
+ " labels = []\n",
+ " predictionstrings = []\n",
+ " counts = []\n",
+ " \n",
+ " counter=dict()\n",
+ " \n",
+ " for tok,pos in zip(y_pred,ps):\n",
+ " \n",
+ " if tok <= 13:\n",
+ " lab = reversed_mapping[tok]\n",
+ " labels.append(lab)\n",
+ " predictionstrings.append(pos)\n",
+ " if len(labels)<2:\n",
+ " counts.append(str(1))\n",
+ " counter.setdefault(lab,1)\n",
+ " continue\n",
+ " if lab == labels[-2]:\n",
+ " counts.append(str(counter[lab]))\n",
+ " else: \n",
+ " try:\n",
+ " counter[lab]+=1\n",
+ " except KeyError:\n",
+ " counter.setdefault(lab,1)\n",
+ " counts.append(str(counter[lab]))\n",
+ " \n",
+ " preds = pd.DataFrame([labels,counts,predictionstrings],index=['class','count','predictionstring']).T\n",
+ " preds['class'] += ' ' + preds['count'].astype(str)\n",
+ " preds = preds.groupby('class',sort=False).agg({'predictionstring':list}).reset_index()\n",
+ " preds['class']=preds['class'].apply(lambda txt : txt.split()[0])\n",
+ " preds['predictionstring']=preds['predictionstring'].apply(lambda l_ : [str(l) for l in l_])\n",
+ " preds['predictionstring']=preds['predictionstring'].apply(lambda l_ : ' '.join(l_))\n",
+ " \n",
+ " return preds"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "05015116",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-02-11T10:33:16.819694Z",
+ "iopub.status.busy": "2022-02-11T10:33:16.818466Z",
+ "iopub.status.idle": "2022-02-11T10:33:17.084510Z",
+ "shell.execute_reply": "2022-02-11T10:33:17.083921Z",
+ "shell.execute_reply.started": "2022-02-09T13:52:53.90828Z"
+ },
+ "papermill": {
+ "duration": 0.313023,
+ "end_time": "2022-02-11T10:33:17.084673",
+ "exception": false,
+ "start_time": "2022-02-11T10:33:16.771650",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "preds = np.argmax(y_pred,axis=-1)\n",
+ "preds_df = pd.DataFrame()\n",
+ "\n",
+ "for i,idx in enumerate(df_test.index): \n",
+ " \n",
+ " pred_ = get_preds(preds[i],ps_test[i])\n",
+ " \n",
+ " pred_['id']=df_test.iloc[idx]['id']\n",
+ " \n",
+ " preds_df = preds_df.append(pred_)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "be6138d5",
+ "metadata": {
+ "papermill": {
+ "duration": 0.026636,
+ "end_time": "2022-02-11T10:33:17.139195",
+ "exception": false,
+ "start_time": "2022-02-11T10:33:17.112559",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "# Submission"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "3d5d8852",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-02-11T10:33:17.200959Z",
+ "iopub.status.busy": "2022-02-11T10:33:17.200125Z",
+ "iopub.status.idle": "2022-02-11T10:33:17.205973Z",
+ "shell.execute_reply": "2022-02-11T10:33:17.206518Z",
+ "shell.execute_reply.started": "2022-02-09T13:53:19.555159Z"
+ },
+ "papermill": {
+ "duration": 0.040808,
+ "end_time": "2022-02-11T10:33:17.206691",
+ "exception": false,
+ "start_time": "2022-02-11T10:33:17.165883",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "sub = preds_df[['id','class','predictionstring']]\n",
+ "sub['class'] = sub['class'].replace('Concluding_Statement','Concluding Statement')\n",
+ "sub.reset_index(inplace=True,drop=True)\n",
+ "sub.to_csv('submission.csv',index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "89609b2c",
+ "metadata": {
+ "papermill": {
+ "duration": 0.024892,
+ "end_time": "2022-02-11T10:33:17.258700",
+ "exception": false,
+ "start_time": "2022-02-11T10:33:17.233808",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.12"
+ },
+ "papermill": {
+ "default_parameters": {},
+ "duration": 80.332934,
+ "end_time": "2022-02-11T10:33:21.050153",
+ "environment_variables": {},
+ "exception": null,
+ "input_path": "__notebook__.ipynb",
+ "output_path": "__notebook__.ipynb",
+ "parameters": {},
+ "start_time": "2022-02-11T10:32:00.717219",
+ "version": "2.3.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/arthur/metrics.ipynb b/notebooks/arthur/metrics.ipynb
new file mode 100644
index 0000000..bc2a2aa
--- /dev/null
+++ b/notebooks/arthur/metrics.ipynb
@@ -0,0 +1,1284 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "13115bc0",
+ "metadata": {},
+ "source": [
+ " Metrics \n",
+ " \n",
+ "This notebook wraps up the metrics used during the training of the models and the evaluation metric for the kaggle competition. It is intended to be used for evaluating the results on the test split."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "165075d8",
+ "metadata": {},
+ "source": [
+ "## Imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "c422e5a0",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-12T11:27:19.667516Z",
+ "start_time": "2022-02-12T11:27:19.664254Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import os \n",
+ "import pickle\n",
+ "\n",
+ "from tqdm.notebook import tqdm"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "ab1e01c2",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-12T11:27:22.723904Z",
+ "start_time": "2022-02-12T11:27:22.720647Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd \n",
+ "import numpy as np \n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "import tensorflow as tf\n",
+ "\n",
+ "#import seqeval.metrics #not used\n",
+ "\n",
+ "from transformers import AutoTokenizer"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bedf28c7",
+ "metadata": {
+ "heading_collapsed": true
+ },
+ "source": [
+ "## Categorical Accuracy with masking "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1a9ad0bf",
+ "metadata": {
+ "hidden": true
+ },
+ "source": [
+ "### Subclassing -- WIP"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "c7ea652e",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-12T11:27:23.696135Z",
+ "start_time": "2022-02-12T11:27:23.690834Z"
+ },
+ "hidden": true
+ },
+ "outputs": [],
+ "source": [
+ "#Create dummy data\n",
+ "\n",
+ "labels = {\n",
+ " 'B-First':0,\n",
+ " 'I-First':1,\n",
+ " 'B-Second':2,\n",
+ " 'I-Second':3,\n",
+ " 'O':4,\n",
+ " 'PAD':-100\n",
+ "}\n",
+ "\n",
+ "reversed_labels={v:k for k,v in labels.items()}\n",
+ "\n",
+ "y_true = np.array([[0,1,1,1,4,2,3,3,3,-100,-100,-100],[-100,0,1,4,4,2,3,3,3,0,1,-100]])\n",
+ "y_pred = np.array([[0,1,1,1,4,2,3,3,4,4,4,4],[-100,0,1,1,2,2,3,3,0,0,0,4]])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 204,
+ "id": "5cdcc9ff",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-04T12:00:26.638087Z",
+ "start_time": "2022-02-04T12:00:26.630017Z"
+ },
+ "hidden": true
+ },
+ "outputs": [],
+ "source": [
+ "class AccuracyMasked(tf.keras.metrics.Metric):\n",
+ " \n",
+ " def __init__(self,class_to_ignore,name='accuracy_masked',**kwargs):\n",
+ " super().__init__(name=name,**kwargs),\n",
+ " self.class_to_ignore = class_to_ignore\n",
+ " self.accs = self.add_weight(name = 'accs',initializer = 'zeros',dtype='float64') \n",
+ " \n",
+ " def update_state(self,y_true,y_pred):\n",
+ " #y_pred = tf.argmax(y_pred,axis=-1)\n",
+ " #y_true = tf.argmax(y_true,axis=-1)#for y_pred and y_true to match\n",
+ " #mask = tf.cast(tf.not_equal(y_true,self.class_to_ignore),'int32') #create a mask\n",
+ " #matches = tf.cast(tf.equal(y_true,y_pred),'int32')*mask #calculate the matches ignoring the masking\n",
+ " #accuracy = tf.math.reduce_sum(matches,axis=-1)/tf.maximum(tf.math.reduce_sum(mask,axis=-1),1)\n",
+ " accuracy=tf.math.reduce_sum(tf.cast(y_true == y_pred,'int32')*tf.cast(y_true != -100,'int32'))/tf.math.reduce_sum(tf.cast(y_true != -100,'int32'))\n",
+ " self.accs.assign_add(tf.math.reduce_mean(accuracy))\n",
+ " \n",
+ " \n",
+ " def result(self):\n",
+ " return self.accs #tf.math.reduce_mean(accuracy)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 205,
+ "id": "963ca44b",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-04T12:00:27.157765Z",
+ "start_time": "2022-02-04T12:00:27.146240Z"
+ },
+ "hidden": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 205,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "accuracymasked = AccuracyMasked(-100)\n",
+ "\n",
+ "accuracymasked(y_pred,y_true)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 181,
+ "id": "d2b72fec",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-04T11:57:45.527829Z",
+ "start_time": "2022-02-04T11:57:45.517677Z"
+ },
+ "hidden": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 181,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tf.math.reduce_sum(tf.cast(y_true == y_pred,'int32')*tf.cast(y_true != -100,'int32'))/tf.math.reduce_sum(tf.cast(y_true != -100,'int32'))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 99,
+ "id": "02560a48",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-04T10:13:51.823339Z",
+ "start_time": "2022-02-04T10:13:51.817445Z"
+ },
+ "hidden": true
+ },
+ "outputs": [],
+ "source": [
+ "mask = tf.cast(tf.not_equal(y_true,-100),'int32') #create a mask\n",
+ "matches = tf.cast(tf.equal(y_true,y_pred),'int32')*mask #calculate the matches ignoring the masking\n",
+ "accuracy = tf.math.reduce_sum(matches,axis=-1)/tf.maximum(tf.math.reduce_sum(mask,axis=-1),1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 103,
+ "id": "40ee6f96",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-04T10:13:56.068300Z",
+ "start_time": "2022-02-04T10:13:56.062516Z"
+ },
+ "hidden": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 103,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tf.math.reduce_mean(accuracy)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f16d9a53",
+ "metadata": {
+ "heading_collapsed": true,
+ "hidden": true
+ },
+ "source": [
+ "### Using simple function"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 208,
+ "id": "0f26c8fb",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-04T13:35:38.446847Z",
+ "start_time": "2022-02-04T13:35:38.439726Z"
+ },
+ "hidden": true
+ },
+ "outputs": [],
+ "source": [
+ "def accuracy_masked_func(y_true,y_pred):\n",
+ " \"\"\"\n",
+ " Compute the accuracy ignoring the class 15 (PAD).\n",
+ "\n",
+ " Args:\n",
+ " y_true (tf.Tensor): target of shape (None, 1024, 16)\n",
+ " y_pred (tf.Tensor): targets predicted of shape (None, 1024, 16)\n",
+ "\n",
+ " Returns:\n",
+ " float: accuracy \n",
+ " \"\"\"\n",
+ " y_pred = tf.cast(tf.argmax(y_pred,axis=-1),'int32')\n",
+ " y_true = tf.cast(tf.argmax(y_true,axis=-1),'int32') #for y_pred and y_true to match\n",
+ " mask = tf.cast(y_true != 15,'int32') #create a mask for 15 = PAD\n",
+ " matches = tf.cast(tf.equal(y_true,y_pred),'int32')*mask #calculate the matches ignoring the masking\n",
+ " accuracy = tf.math.reduce_sum(matches,axis=-1)/tf.maximum(tf.math.reduce_sum(mask,axis=-1),1)\n",
+ " \n",
+ " return accuracy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 207,
+ "id": "e812f1eb",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-04T13:30:47.443487Z",
+ "start_time": "2022-02-04T13:30:47.384912Z"
+ },
+ "hidden": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 207,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "accuracy_masked_func(y_true,y_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 210,
+ "id": "1d71adf6",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-04T13:35:53.801253Z",
+ "start_time": "2022-02-04T13:35:53.682910Z"
+ },
+ "hidden": true
+ },
+ "outputs": [],
+ "source": [
+ "#simpler than subclassing\n",
+ "accuracy_masked = tf.keras.metrics.MeanMetricWrapper(fn=accuracy_masked_func)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 211,
+ "id": "b5dec6c5",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-04T13:36:03.220012Z",
+ "start_time": "2022-02-04T13:36:02.983247Z"
+ },
+ "hidden": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 211,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "accuracy_masked(y_true,y_pred)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "20541fd0",
+ "metadata": {},
+ "source": [
+ "## Evaluation metric\n",
+ "\n",
+ "Create a metric to evaluate model performance according to the Kaggle\n",
+ "competition rules.\n",
+ "\n",
+ "- For each sample, all ground truths and predictions for a given class are \n",
+ " compared.\n",
+ "- If the overlap between the ground truth and prediction is >= 0.5, \n",
+ " and the overlap between the prediction and the ground truth >= 0.5, \n",
+ " the prediction is a match and considered a true positive. If multiple \n",
+ " matches exist, the match with the highest pair of overlaps is taken.\n",
+ "- Any unmatched ground truths are false negatives and any unmatched \n",
+ " predictions are false positives.\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0acd78d9",
+ "metadata": {},
+ "source": [
+ "### Get the data\n",
+ "Only for testing purposes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "04b7374c",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-12T11:27:45.796473Z",
+ "start_time": "2022-02-12T11:27:28.711604Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "#get the data\n",
+ "df_essays = pd.read_csv('../raw_data/preprocessed_v3.csv',converters={'predictionstring':eval,\n",
+ " 'label':eval})\n",
+ "with open('../raw_data/preds_on_testsplit.pickle','rb') as file:\n",
+ " y_pred = pickle.load(file)\n",
+ " \n",
+ "with open('../raw_data/dataset_v3.pickle','rb') as file:\n",
+ " dataset = pickle.load(file)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "812217ff",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-12T11:27:45.934390Z",
+ "start_time": "2022-02-12T11:27:45.928556Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "#Max len of essay \n",
+ "SEQ_LEN = 1024 ## THIS SHOULD NOT BE CHANGED without appropriate changes in the preprocessing \n",
+ "\n",
+ "#Train, val, test split proportion\n",
+ "VAL_SPLIT = 0.8\n",
+ "TEST_SPLIT = 0.9\n",
+ "\n",
+ "LEN=len(dataset['labels'])\n",
+ "\n",
+ "idx_val=int(LEN*VAL_SPLIT)\n",
+ "idx_test=int(LEN*TEST_SPLIT)\n",
+ "\n",
+ "idx_train=list(range(0,idx_val))\n",
+ "idx_val=list(range(idx_val,idx_test))\n",
+ "idx_test=list(range(idx_test,LEN))\n",
+ "\n",
+ "assert(len(idx_test)+len(idx_train)+len(idx_val)==LEN)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "fc7a9007",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-12T11:27:46.285310Z",
+ "start_time": "2022-02-12T11:27:46.072217Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "#creating X_test, y_test, ps_test\n",
+ "\n",
+ "X_test = {\n",
+ " 'input_ids' : dataset['inputs']['input_ids'][idx_test],\n",
+ " 'attention_mask' : dataset['inputs']['attention_mask'][idx_test]\n",
+ "}\n",
+ "\n",
+ "y_test = dataset['labels'][idx_test]\n",
+ "ps_test = dataset['predictionstrings'][idx_test]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "f9dad965",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-12T11:27:46.398761Z",
+ "start_time": "2022-02-12T11:27:46.394171Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "#Labels mapping\n",
+ "\n",
+ "labels_mapping = {'B-Lead' : 0,\n",
+ " 'B-Position' : 1,\n",
+ " 'B-Evidence' : 2,\n",
+ " 'B-Claim' : 3,\n",
+ " 'B-Concluding_Statement' : 4,\n",
+ " 'B-Counterclaim' : 5,\n",
+ " 'B-Rebuttal' : 6,\n",
+ " 'I-Lead' : 7,\n",
+ " 'I-Position' : 8,\n",
+ " 'I-Evidence' : 9,\n",
+ " 'I-Claim' : 10,\n",
+ " 'I-Concluding_Statement' : 11,\n",
+ " 'I-Counterclaim' : 12,\n",
+ " 'I-Rebuttal': 13,\n",
+ " 'O':14,\n",
+ " 'PAD':15}\n",
+ "\n",
+ "reversed_mapping = {v:(k[2:] if v<14 else k) for k,v in labels_mapping.items()}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d23afa93",
+ "metadata": {},
+ "source": [
+ "### Creating predictions df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "da5e8108",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-12T11:27:46.511743Z",
+ "start_time": "2022-02-12T11:27:46.504988Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def get_preds(y_pred,ps):\n",
+ " \"\"\"\n",
+ " Generate readable predictions from the output of the model.\n",
+ "\n",
+ " Args:\n",
+ " y_pred (ndarray): output of the model\n",
+ " ps (ndarray): predictionstring referring to the token predicted\n",
+ "\n",
+ " Returns:\n",
+ " DataFrame : DataFrame with class and predictionstrings\n",
+ " \"\"\"\n",
+ "\n",
+ "\n",
+ " labels = []\n",
+ " predictionstrings = []\n",
+ " counts = []\n",
+ " \n",
+ " counter=dict()\n",
+ " \n",
+ " for tok,pos in zip(y_pred,ps):\n",
+ " \n",
+ " if tok <= 13:\n",
+ " lab = reversed_mapping[tok]\n",
+ " labels.append(lab)\n",
+ " predictionstrings.append(pos)\n",
+ " if len(labels)<2:\n",
+ " counts.append(str(1))\n",
+ " counter.setdefault(lab,1)\n",
+ " continue\n",
+ " if lab == labels[-2]:\n",
+ " counts.append(str(counter[lab]))\n",
+ " else: \n",
+ " try:\n",
+ " counter[lab]+=1\n",
+ " except KeyError:\n",
+ " counter.setdefault(lab,1)\n",
+ " counts.append(str(counter[lab]))\n",
+ " \n",
+ " preds = pd.DataFrame([labels,counts,predictionstrings],index=['class','count','predictionstring']).T\n",
+ " preds['class'] += ' ' + preds['count'].astype(str)\n",
+ " preds = preds.groupby('class',sort=False).agg({'predictionstring':list}).reset_index()\n",
+ " preds['class']=preds['class'].apply(lambda txt : txt.split()[0])\n",
+ " preds['predictionstring']=preds['predictionstring'].apply(lambda l_ : [str(l) for l in l_])\n",
+ " preds['predictionstring']=preds['predictionstring'].apply(lambda l_ : ' '.join(l_))\n",
+ " \n",
+ " return preds"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "bb83207b",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-12T11:28:11.536940Z",
+ "start_time": "2022-02-12T11:27:46.654887Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "## Working essay per essay, building pred_df\n",
+ "\n",
+ "preds = np.argmax(y_pred,axis=-1)\n",
+ "pred_df=pd.DataFrame()\n",
+ "for i,idx in tqdm(enumerate(idx_test),total=len(idx_test)): ## CHANGE idx_test\n",
+ " \n",
+ " pred_ = get_preds(preds[i],ps_test[i])\n",
+ " \n",
+ " pred_['id']=df_essays.iloc[idx]['id']\n",
+ " \n",
+ " pred_df = pred_df.append(pred_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "1f1c075b",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-12T11:28:57.763276Z",
+ "start_time": "2022-02-12T11:28:32.577541Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "fde323963f1f43819251f0134373b26e",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/1560 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Working essay per essay, building true_df\n",
+ "\n",
+ "## NB : this does not yield exactly the same result as if \n",
+ "## we processed train.csv. \n",
+ "## TODO : First glance shows delta to be negligible. Yet to further investigate.\n",
+ "\n",
+ "true = np.argmax(y_test,axis=-1)\n",
+ "true_df=pd.DataFrame()\n",
+ "for i,idx in tqdm(enumerate(idx_test),total=len(idx_test)): ## CHANGE idx_test\n",
+ " \n",
+ " true_ = get_preds(true[i],ps_test[i])\n",
+ " \n",
+ " true_['id']=df_essays.iloc[idx]['id']\n",
+ " \n",
+ " true_df = true_df.append(true_)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "94cdc11d",
+ "metadata": {},
+ "source": [
+ "### Scoring functions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "b2f51aee",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-12T11:29:00.900607Z",
+ "start_time": "2022-02-12T11:29:00.894481Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def scoring(predictionstring_true,predictionstring_pred):\n",
+ " \"\"\"\n",
+ " Identify each prediction to be a True Positive, a False Positive or a False \n",
+ " Negative according to the competition rules : \n",
+ " - For each sample, all ground truths and predictions for a given class are \n",
+ " compared.\n",
+ " - If the overlap between the ground truth and prediction is >= 0.5, \n",
+ " and the overlap between the prediction and the ground truth >= 0.5, \n",
+ " the prediction is a match and considered a true positive. If multiple \n",
+ " matches exist, the match with the highest pair of overlaps is taken.\n",
+ " - Any unmatched ground truths are false negatives and any unmatched \n",
+ " predictions are false positives.\n",
+ " \n",
+ " predictionstring_true and predictionstring_pred are a possible match from an\n",
+ " outer join of all predictions.\n",
+ "\n",
+ " Args:\n",
+ " predictionstring_true (str): predictionstring of a true discourse\n",
+ " predictionstring_pred (str): predictionstring of a predicted discourse\n",
+ "\n",
+ " Returns:\n",
+ " str: TP, FP, or FN \n",
+ " \"\"\"\n",
+ " \n",
+ " if predictionstring_true is np.nan :\n",
+ " return 'FP'\n",
+ " \n",
+ " elif predictionstring_pred is np.nan :\n",
+ " return 'FN'\n",
+ " \n",
+ " else:\n",
+ " \n",
+ " ps_true = set(predictionstring_true.split(' '))\n",
+ " ps_pred = set(predictionstring_pred.split(' '))\n",
+ "\n",
+ " inter = ps_pred.intersection(ps_true)\n",
+ " overlap_1 = len(inter)/len(ps_true)\n",
+ " overlap_2 = len(inter)/len(ps_pred)\n",
+ "\n",
+ " if overlap_1 >= .5 and overlap_2 >= .5:\n",
+ " return 'TP'\n",
+ " else:\n",
+ " return 'FP'\n",
+ "\n",
+ "## vectorize the function\n",
+ "scoring_vect = np.vectorize(scoring)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "ce1ca8c1",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-12T11:29:02.131685Z",
+ "start_time": "2022-02-12T11:29:02.128171Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def f1_score(fp,fn,tp):\n",
+ " return tp/(tp+.5*(fp+fn))*100"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "id": "e9933025",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-12T11:52:47.478241Z",
+ "start_time": "2022-02-12T11:52:47.464874Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def scores_df(merged_df):\n",
+ " \"\"\"\n",
+ " Computes f1-score summary.\n",
+ "\n",
+ " Args:\n",
+ " merged_df (DataFrame): DF with all correspondences of ground truth and \n",
+ " predictions.\n",
+ "\n",
+ " Returns:\n",
+ " df, plot: Returns a dataframe summarizing f1-score per class and a\n",
+ " barplot showing f1 levels in comparison to the Macro f1-score\n",
+ " \"\"\"\n",
+ " \n",
+ " merged_df['FP'] = np.where(merged_df['score']=='FP',1,0)\n",
+ " merged_df['FN'] = np.where(merged_df['score']=='FN',1,0)\n",
+ " merged_df['TP'] = np.where(merged_df['score']=='TP',1,0)\n",
+ "\n",
+ " merged_df['key'] = merged_df['id']+merged_df['class']+merged_df['predictionstring_pred']+\\\n",
+ " merged_df['predictionstring_true']\n",
+ " \n",
+ " idx_potential_duplicates = merged_df[(merged_df['key'].duplicated()) & (merged_df['TP']==1)].index\n",
+ " \n",
+ " merged_df.drop(idx_potential_duplicates,inplace=True,axis=0)\n",
+ " \n",
+ " merged_df.drop('key',axis=1,inplace=True)\n",
+ " \n",
+ " score_df = merged_df.groupby('class').agg({'TP':sum,'FP':sum,'FN':sum})\n",
+ " score_df['F1'] = f1_score(score_df.FP,score_df.FN,score_df.TP)\n",
+ " \n",
+ " #Create a total row\n",
+ " score_df.loc['Total']=score_df.mean()\n",
+ "\n",
+ " ## weighted average\n",
+ " score_df['Support'] = true_df.groupby('class').count()['id']\n",
+ " score_df['F1_Weighted']=score_df['F1']*(score_df['Support']/score_df['Support'].sum())\n",
+ " score_df.loc['Total','Support']=score_df.Support.sum()\n",
+ " score_df.loc['Total','F1_Weighted']=score_df.F1_Weighted.sum()\n",
+ "\n",
+ " # impute correct values for the Total row for TP FN FP \n",
+ " score_df.loc['Total','FP'] = score_df.loc[:'Rebuttal','FP'].sum()\n",
+ " score_df.loc['Total','FN'] = score_df.loc[:'Rebuttal','FN'].sum()\n",
+ " score_df.loc['Total','TP'] = score_df.loc[:'Rebuttal','TP'].sum() \n",
+ " \n",
+ " #Print aggregated scores\n",
+ " print(f\"F1 Macro Score = {score_df.loc['Total','F1']:.2f}%\")\n",
+ " print(f\"F1 Micro Score = {f1_score(score_df.loc['Total','FP'],score_df.loc['Total','FN'],score_df.loc['Total','TP']):.2f}%\")\n",
+ " print(f\"F1 Weighted Score = {score_df.loc['Total','F1_Weighted']:.2f}%\")\n",
+ " \n",
+ " ## Creating figure\n",
+ " sns.barplot(y=score_df.reset_index().loc[:6,'F1'],x=score_df.index[:-1],palette='Set2')\n",
+ " plt.xticks(rotation=90)\n",
+ " locs,_=plt.xticks()\n",
+ " plt.plot([locs[0]-.5,locs[-1]+.5],[score_df.loc['Total','F1'],score_df.loc['Total','F1']],c='r');\n",
+ " plt.title('Macro F1',size= 16);\n",
+ "\n",
+ " ## FORMATTING\n",
+ " score_df[['TP','FN','FP','Support']] = score_df[['TP','FN','FP','Support']].applymap('{:.0f}'.format)\n",
+ " score_df[['F1','F1_Weighted']] = score_df[['F1','F1_Weighted']].applymap('{:.2f}%'.format)\n",
+ " \n",
+ " \n",
+ " return score_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "69ece2c1",
+ "metadata": {},
+ "source": [
+ "### Building F1 report"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "id": "74444e77",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-12T11:52:49.103916Z",
+ "start_time": "2022-02-12T11:52:48.691932Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " class \n",
+ " predictionstring_pred \n",
+ " id \n",
+ " predictionstring_true \n",
+ " score \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " Lead \n",
+ " 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18... \n",
+ " E6870101D8EE \n",
+ " NaN \n",
+ " FP \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " Lead \n",
+ " 65 66 67 68 69 70 71 72 73 74 75 76 77 78 \n",
+ " E6870101D8EE \n",
+ " NaN \n",
+ " FP \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " Lead \n",
+ " 83 84 \n",
+ " E6870101D8EE \n",
+ " NaN \n",
+ " FP \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " Evidence \n",
+ " 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 4... \n",
+ " E6870101D8EE \n",
+ " 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18... \n",
+ " FP \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " Evidence \n",
+ " 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 4... \n",
+ " E6870101D8EE \n",
+ " 110 111 112 113 114 115 116 117 118 119 120 12... \n",
+ " FP \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " class predictionstring_pred id \\\n",
+ "0 Lead 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18... E6870101D8EE \n",
+ "1 Lead 65 66 67 68 69 70 71 72 73 74 75 76 77 78 E6870101D8EE \n",
+ "2 Lead 83 84 E6870101D8EE \n",
+ "3 Evidence 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 4... E6870101D8EE \n",
+ "4 Evidence 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 4... E6870101D8EE \n",
+ "\n",
+ " predictionstring_true score \n",
+ "0 NaN FP \n",
+ "1 NaN FP \n",
+ "2 NaN FP \n",
+ "3 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18... FP \n",
+ "4 110 111 112 113 114 115 116 117 118 119 120 12... FP "
+ ]
+ },
+ "execution_count": 66,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Merge true_df and pred_df to get all possible matches\n",
+ "merged_df = pred_df.merge(true_df,how = 'outer',on=['id','class'],suffixes=('_pred','_true'))\n",
+ "merged_df['score'] = scoring_vect(merged_df.predictionstring_true,\n",
+ " merged_df.predictionstring_pred) # apply scoring func to determine for each\n",
+ " # pred whether it is FP, FN or TP\n",
+ "merged_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "id": "88eed805",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-12T11:52:49.416950Z",
+ "start_time": "2022-02-12T11:52:49.106575Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "F1 Macro Score = 44.92%\n",
+ "F1 Micro Score = 39.31%\n",
+ "F1 Weighted Score = 42.86%\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " TP \n",
+ " FP \n",
+ " FN \n",
+ " F1 \n",
+ " Support \n",
+ " F1_Weighted \n",
+ " \n",
+ " \n",
+ " class \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " Claim \n",
+ " 1570 \n",
+ " 7897 \n",
+ " 49 \n",
+ " 28.32% \n",
+ " 3412 \n",
+ " 8.06% \n",
+ " \n",
+ " \n",
+ " Concluding_Statement \n",
+ " 1106 \n",
+ " 605 \n",
+ " 27 \n",
+ " 77.78% \n",
+ " 1346 \n",
+ " 8.73% \n",
+ " \n",
+ " \n",
+ " Counterclaim \n",
+ " 141 \n",
+ " 573 \n",
+ " 202 \n",
+ " 26.68% \n",
+ " 542 \n",
+ " 1.21% \n",
+ " \n",
+ " \n",
+ " Evidence \n",
+ " 2400 \n",
+ " 9860 \n",
+ " 0 \n",
+ " 32.74% \n",
+ " 3792 \n",
+ " 10.35% \n",
+ " \n",
+ " \n",
+ " Lead \n",
+ " 774 \n",
+ " 618 \n",
+ " 50 \n",
+ " 69.86% \n",
+ " 963 \n",
+ " 5.61% \n",
+ " \n",
+ " \n",
+ " Position \n",
+ " 955 \n",
+ " 909 \n",
+ " 68 \n",
+ " 66.16% \n",
+ " 1535 \n",
+ " 8.47% \n",
+ " \n",
+ " \n",
+ " Rebuttal \n",
+ " 57 \n",
+ " 571 \n",
+ " 199 \n",
+ " 12.90% \n",
+ " 402 \n",
+ " 0.43% \n",
+ " \n",
+ " \n",
+ " Total \n",
+ " 7003 \n",
+ " 21033 \n",
+ " 595 \n",
+ " 44.92% \n",
+ " 11992 \n",
+ " 42.86% \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " TP FP FN F1 Support F1_Weighted\n",
+ "class \n",
+ "Claim 1570 7897 49 28.32% 3412 8.06%\n",
+ "Concluding_Statement 1106 605 27 77.78% 1346 8.73%\n",
+ "Counterclaim 141 573 202 26.68% 542 1.21%\n",
+ "Evidence 2400 9860 0 32.74% 3792 10.35%\n",
+ "Lead 774 618 50 69.86% 963 5.61%\n",
+ "Position 955 909 68 66.16% 1535 8.47%\n",
+ "Rebuttal 57 571 199 12.90% 402 0.43%\n",
+ "Total 7003 21033 595 44.92% 11992 42.86%"
+ ]
+ },
+ "execution_count": 67,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "f1_report = scores_df(merged_df)\n",
+ "f1_report"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2fe56a67",
+ "metadata": {},
+ "source": [
+ "## Confusion matrix"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 284,
+ "id": "d8ddb586",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-08T09:42:21.575452Z",
+ "start_time": "2022-02-08T09:42:21.571645Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from sklearn.metrics import confusion_matrix"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 420,
+ "id": "0edffa7e",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-08T10:57:41.466079Z",
+ "start_time": "2022-02-08T10:57:41.456484Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def show_confusion_matrix(y_true,y_pred):\n",
+ " \"\"\"\n",
+ " Plots a confusion matrix.\n",
+ "\n",
+ " Args:\n",
+ " y_true (np.array): targets of shape (None, 1024)\n",
+ " y_pred (np.array): targets predicted of shape (None, 1024)\n",
+ " \"\"\"\n",
+ "\n",
+ " reversed_mapping = {\n",
+ " 0: 'Lead',\n",
+ " 1: 'Position',\n",
+ " 2: 'Evidence',\n",
+ " 3: 'Claim',\n",
+ " 4: 'Concluding_Statement',\n",
+ " 5: 'Counterclaim',\n",
+ " 6: 'Rebuttal',\n",
+ " 7: 'Lead',\n",
+ " 8: 'Position',\n",
+ " 9: 'Evidence',\n",
+ " 10: 'Claim',\n",
+ " 11: 'Concluding_Statement',\n",
+ " 12: 'Counterclaim',\n",
+ " 13: 'Rebuttal',\n",
+ " 14: 'O',\n",
+ " 15: 'PAD'}\n",
+ " \n",
+ " y_true_flat = [reversed_mapping[y] for y in y_true.flatten()]\n",
+ " y_pred_flat = [reversed_mapping[y] for y in y_pred.flatten()]\n",
+ " \n",
+ " LABELS = ['Lead','Position','Claim','Counterclaim','Rebuttal','Evidence','Concluding_Statement','O','PAD']\n",
+ "\n",
+ " cfn = confusion_matrix(y_true_flat,y_pred_flat,labels=LABELS)\n",
+ " \n",
+ " fig,ax = plt.subplots(1,1,figsize=(10,10))\n",
+ " plt.title('Confusion Matrix',size=18,pad=20)\n",
+ " sns.heatmap(cfn/np.sum(cfn,axis=0)*100,cmap='Blues',annot = True,fmt='.2f',annot_kws={'size':10},ax=ax);\n",
+ " plt.xticks(np.arange(len(LABELS))+.5,LABELS,rotation = 90,size=12);\n",
+ " plt.yticks(np.arange(len(LABELS))+.5,LABELS,rotation = 0,size=12);\n",
+ " plt.xlabel('PREDICTED',size=16);\n",
+ " plt.ylabel('ACTUAL',size=16);\n",
+ " for t in ax.texts: t.set_text(t.get_text() + \" %\")\n",
+ " \n",
+ " plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 421,
+ "id": "55ecfde3",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-08T10:57:50.337916Z",
+ "start_time": "2022-02-08T10:57:44.304107Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "show_confusion_matrix(y_true,y_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b3e3e48c",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.12"
+ },
+ "toc": {
+ "base_numbering": "1",
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": true,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {
+ "height": "calc(100% - 180px)",
+ "left": "10px",
+ "top": "150px",
+ "width": "268px"
+ },
+ "toc_section_display": true,
+ "toc_window_display": true
+ },
+ "varInspector": {
+ "cols": {
+ "lenName": 16,
+ "lenType": 16,
+ "lenVar": 40
+ },
+ "kernels_config": {
+ "python": {
+ "delete_cmd_postfix": "",
+ "delete_cmd_prefix": "del ",
+ "library": "var_list.py",
+ "varRefreshCmd": "print(var_dic_list())"
+ },
+ "r": {
+ "delete_cmd_postfix": ") ",
+ "delete_cmd_prefix": "rm(",
+ "library": "var_list.r",
+ "varRefreshCmd": "cat(var_dic_list()) "
+ }
+ },
+ "position": {
+ "height": "505.844px",
+ "left": "1063px",
+ "right": "20px",
+ "top": "120px",
+ "width": "357px"
+ },
+ "types_to_exclude": [
+ "module",
+ "function",
+ "builtin_function_or_method",
+ "instance",
+ "_Feature"
+ ],
+ "window_display": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/arthur/preprocessing.ipynb b/notebooks/arthur/preprocessing.ipynb
new file mode 100644
index 0000000..0391fb0
--- /dev/null
+++ b/notebooks/arthur/preprocessing.ipynb
@@ -0,0 +1,1071 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "ff304fae",
+ "metadata": {},
+ "source": [
+ "# Imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "774a69c4",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T14:47:57.059503Z",
+ "start_time": "2022-02-09T14:47:56.945646Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import os \n",
+ "import csv\n",
+ "import pickle\n",
+ "\n",
+ "from tqdm.notebook import tqdm\n",
+ "\n",
+ "import pandas as pd \n",
+ "import numpy as np \n",
+ "import matplotlib.pyplot as plt \n",
+ "\n",
+ "from transformers import AutoTokenizer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "e834accd",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T14:47:57.806090Z",
+ "start_time": "2022-02-09T14:47:57.795868Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "## Utilities variables\n",
+ "\n",
+ "#Sample mode \n",
+ "SAMPLE_MODE = None\n",
+ "\n",
+ "#Max len of essay\n",
+ "SEQ_LEN = 1024\n",
+ "\n",
+ "#path\n",
+ "PATH_RAW_DATA='/Users/arthurcollard/code/arthurcol/feedback_prize/raw_data/'\n",
+ "\n",
+ "VERSION = 3\n",
+ "NAME_OUTPUT_FILE = f'preprocessed_v{VERSION}.csv'\n",
+ "NAME_TEST_FILE = f'test_preprocessed_v{VERSION}.csv'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a17de51a",
+ "metadata": {},
+ "source": [
+ "# Data loading and preparation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "91aae15f",
+ "metadata": {},
+ "source": [
+ "## Loading training set"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "76f1bd5e",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T14:48:01.136989Z",
+ "start_time": "2022-02-09T14:47:59.717075Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "#load data from csv file \n",
+ "df = pd.read_csv(PATH_RAW_DATA+'train.csv',nrows=SAMPLE_MODE)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5cc7bdb6",
+ "metadata": {},
+ "source": [
+ "## Preparation of the training data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "4867ca17",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T14:48:01.758156Z",
+ "start_time": "2022-02-09T14:48:01.745441Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "## Def a function for labelling discourses per word\n",
+ "\n",
+ "def labelizer(label,len_,flag):\n",
+ " \"\"\"Repeat the label according to the length of the sentence. Makes use of B/I notation according to the position of the word within the sentence and the sentence within the essay.\n",
+ "\n",
+ " Args:\n",
+ " label (str): NER label of the sentence.\n",
+ " len_ (int): Length of the sentence (n° of words).\n",
+ " flag (int): 1 if the sentence follows a sentence with the same label. 0 otherwise.\n",
+ "\n",
+ " Returns:\n",
+ " str: Returns a string of length (n° of words) len_ with B/I-label repeated len_ times.\n",
+ " \"\"\"\n",
+ " if flag==0:\n",
+ " label_first = f'B-{label} '\n",
+ " else:\n",
+ " label_first = f'I-{label} '\n",
+ " \n",
+ " return (label_first + f'I-{label} '*(len_-1)).strip()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "b65d4e94",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T14:48:02.820733Z",
+ "start_time": "2022-02-09T14:48:02.343098Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "## Creating features for labeling needs : \n",
+ "\n",
+ " #Flag if the discourse is the same as the previous one\n",
+ "df['previous_discourse_flag']=np.where(df['discourse_type'].shift(1)==df['discourse_type'],1,0)\n",
+ "\n",
+ " #Get length of predictionstring\n",
+ "df['predictionstring_len'] = df['predictionstring'].apply(lambda txt:len(txt.split()))\n",
+ "\n",
+ " # Remove spaces in labels\n",
+ "df['discourse_type']=df['discourse_type'].str.replace('Concluding Statement','Concluding_Statement')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "73c807ed",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T14:48:23.387677Z",
+ "start_time": "2022-02-09T14:48:03.517512Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " discourse_id \n",
+ " discourse_start \n",
+ " discourse_end \n",
+ " discourse_text \n",
+ " discourse_type \n",
+ " discourse_type_num \n",
+ " predictionstring \n",
+ " previous_discourse_flag \n",
+ " predictionstring_len \n",
+ " label \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 423A1CA112E2 \n",
+ " 1.622628e+12 \n",
+ " 8.0 \n",
+ " 229.0 \n",
+ " Modern humans today are always on their phone.... \n",
+ " Lead \n",
+ " Lead 1 \n",
+ " 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1... \n",
+ " 0 \n",
+ " 44 \n",
+ " B-Lead I-Lead I-Lead I-Lead I-Lead I-Lead I-Le... \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id discourse_id discourse_start discourse_end \\\n",
+ "0 423A1CA112E2 1.622628e+12 8.0 229.0 \n",
+ "\n",
+ " discourse_text discourse_type \\\n",
+ "0 Modern humans today are always on their phone.... Lead \n",
+ "\n",
+ " discourse_type_num predictionstring \\\n",
+ "0 Lead 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1... \n",
+ "\n",
+ " previous_discourse_flag predictionstring_len \\\n",
+ "0 0 44 \n",
+ "\n",
+ " label \n",
+ "0 B-Lead I-Lead I-Lead I-Lead I-Lead I-Lead I-Le... "
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# vectorize labelizer func and apply to our df \n",
+ "labelizer_vect = np.vectorize(labelizer)\n",
+ "df['label']=labelizer_vect(df['discourse_type'],df['predictionstring_len'],df['previous_discourse_flag'])\n",
+ "df.head(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "58b4fc6d",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T14:48:26.232046Z",
+ "start_time": "2022-02-09T14:48:24.060013Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " discourse_id \n",
+ " discourse_start \n",
+ " discourse_end \n",
+ " discourse_text \n",
+ " discourse_type \n",
+ " discourse_type_num \n",
+ " predictionstring \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 423A1CA112E2 \n",
+ " 1.622628e+12 \n",
+ " 8.0 \n",
+ " 229.0 \n",
+ " Modern humans today are always on their phone.... \n",
+ " Lead \n",
+ " Lead 1 \n",
+ " 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1... \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 423A1CA112E2 \n",
+ " 1.622628e+12 \n",
+ " 230.0 \n",
+ " 312.0 \n",
+ " They are some really bad consequences when stu... \n",
+ " Position \n",
+ " Position 1 \n",
+ " 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 423A1CA112E2 \n",
+ " 1.622628e+12 \n",
+ " 313.0 \n",
+ " 401.0 \n",
+ " Some certain areas in the United States ban ph... \n",
+ " Evidence \n",
+ " Evidence 1 \n",
+ " 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id discourse_id discourse_start discourse_end \\\n",
+ "0 423A1CA112E2 1.622628e+12 8.0 229.0 \n",
+ "1 423A1CA112E2 1.622628e+12 230.0 312.0 \n",
+ "2 423A1CA112E2 1.622628e+12 313.0 401.0 \n",
+ "\n",
+ " discourse_text discourse_type \\\n",
+ "0 Modern humans today are always on their phone.... Lead \n",
+ "1 They are some really bad consequences when stu... Position \n",
+ "2 Some certain areas in the United States ban ph... Evidence \n",
+ "\n",
+ " discourse_type_num predictionstring \n",
+ "0 Lead 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1... \n",
+ "1 Position 1 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 \n",
+ "2 Evidence 1 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " predictionstring \n",
+ " label \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 0000D23A521A \n",
+ " [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... \n",
+ " [B-Position, I-Position, I-Position, I-Positio... \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 00066EA9880D \n",
+ " [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... \n",
+ " [B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea... \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 000E6DE9E817 \n",
+ " [2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, ... \n",
+ " [B-Position, I-Position, I-Position, I-Positio... \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id predictionstring \\\n",
+ "0 0000D23A521A [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... \n",
+ "1 00066EA9880D [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... \n",
+ "2 000E6DE9E817 [2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, ... \n",
+ "\n",
+ " label \n",
+ "0 [B-Position, I-Position, I-Position, I-Positio... \n",
+ "1 [B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea... \n",
+ "2 [B-Position, I-Position, I-Position, I-Positio... "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "## Groupby ID to get predictionstrings and labels as a unique string\n",
+ "\n",
+ "df_essays = df.groupby('id').agg({'predictionstring':' '.join,'label':' '.join})\n",
+ "\n",
+ "## Transform into lists\n",
+ "\n",
+ "df_essays['label'] = df_essays['label'].apply(lambda txt : txt.split())\n",
+ "df_essays['predictionstring'] = df_essays['predictionstring'].apply(lambda txt : txt.split())\n",
+ "\n",
+ "#remove utilities columns created in the original df\n",
+ "df.drop(['previous_discourse_flag','predictionstring_len','label'],axis=1,inplace=True)\n",
+ "\n",
+ "#reset index\n",
+ "df_essays.reset_index(inplace=True)\n",
+ "\n",
+ "display(df.head(3),df_essays.head(3))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "592ef22e",
+ "metadata": {},
+ "source": [
+ "## Create dataframe for the test set"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "7a898c54",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T14:48:28.767641Z",
+ "start_time": "2022-02-09T14:48:28.753338Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " DF920E0A7337 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 0FB0700DAF44 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " D46BCB48440A \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 18409261F5C2 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " D72CB1C11673 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id\n",
+ "0 DF920E0A7337\n",
+ "1 0FB0700DAF44\n",
+ "2 D46BCB48440A\n",
+ "3 18409261F5C2\n",
+ "4 D72CB1C11673"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ids = [t.split('.')[0] for t in os.listdir(os.path.join(PATH_RAW_DATA,'test'))]\n",
+ "df_test = pd.DataFrame(ids,columns=['id'])\n",
+ "df_test"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ac4a7ee9",
+ "metadata": {},
+ "source": [
+ "# Retrieve full text properly"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "fbed858f",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T14:48:29.456781Z",
+ "start_time": "2022-02-09T14:48:29.447066Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "#Function\n",
+ "\n",
+ "def get_essay(id_,mode='train'):\n",
+ " \"\"\"Function to get the full text of an essay from the .txt file.\n",
+ "\n",
+ " Args:\n",
+ " id_ (str): id of the essay\n",
+ " mode (str, optional): determines whether to access *train* or *test* texts. \\\n",
+ " Defaults to 'train'.\n",
+ "\n",
+ " Returns:\n",
+ " str: Returns the full text of the id\n",
+ " \"\"\"\n",
+ " with open(os.path.join(PATH_RAW_DATA,mode,f'{id_}.txt'),'r') as file:\n",
+ " txt = file.read()\n",
+ " return txt.strip()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c47b29dc",
+ "metadata": {},
+ "source": [
+ "# Tokenizer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "8d0c7d8a",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T14:48:30.490257Z",
+ "start_time": "2022-02-09T14:48:30.472338Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "#Function\n",
+ "\n",
+ "def tokenize_labelize(essay,tokenizer,predictionstring=None,labels=None,max_len=SEQ_LEN):\n",
+ " \"\"\"Tokenize an essay and match each token with the corresponding label.\n",
+ "\n",
+ " Args:\n",
+ " essay (str): Text to tokenize\n",
+ " tokenizer (tokenizer): Tokenizer from HF.\n",
+ " predictionstring (pandas.Series | numpy.array, optional): As a unique string, list of index position of words with a label. Must be provided with labels. Defaults to None.\n",
+ " labels (pandas.Series | numpy.array, optional): As a unique string, list of labels of each word. Must be provided with predictionstring. Defaults to None.\n",
+ " max_len (int): Maximum sequence length for padding/truncating.\n",
+ " \n",
+ "\n",
+ " Returns:\n",
+ " dict : Returns a dictionary with input_ids, attention_mask, predictionstring and, if labels are passed, labels.\n",
+ " \"\"\"\n",
+ " \n",
+ " tokens = tokenizer(essay,\n",
+ " return_attention_mask = True,\n",
+ " return_token_type_ids = False,\n",
+ " padding = 'max_length',\n",
+ " max_length = SEQ_LEN,\n",
+ " truncation = True,\n",
+ " return_tensors='np'\n",
+ " )\n",
+ " \n",
+ " word_ids=tokens.word_ids()\n",
+ " \n",
+ " labels_mapping = {\n",
+ " 'B-Lead' : 0,\n",
+ " 'B-Position' : 1,\n",
+ " 'B-Evidence' : 2,\n",
+ " 'B-Claim' : 3,\n",
+ " 'B-Concluding_Statement' : 4,\n",
+ " 'B-Counterclaim' : 5,\n",
+ " 'B-Rebuttal' : 6,\n",
+ " 'I-Lead' : 7,\n",
+ " 'I-Position' : 8,\n",
+ " 'I-Evidence' : 9,\n",
+ " 'I-Claim' : 10,\n",
+ " 'I-Concluding_Statement' : 11,\n",
+ " 'I-Counterclaim' : 12,\n",
+ " 'I-Rebuttal': 13\n",
+ " }\n",
+ " \n",
+ " if labels:\n",
+ " match = {p:labels_mapping[l] for p,l in zip(predictionstring,labels)}\n",
+ " labels_matched = [15 if (w==None or w==word_ids[i-1]) \\\n",
+ " else match.get(str(w),14) \\\n",
+ " for i,w in enumerate(word_ids)]\n",
+ " \n",
+ " \n",
+ " return {\n",
+ " 'input_ids' : tokens['input_ids'][0],\n",
+ " 'attention_mask' : tokens['attention_mask'][0],\n",
+ " 'labels': np.array(labels_matched), \n",
+ " 'predictionstring':np.array(word_ids)\n",
+ " }\n",
+ " \n",
+ " return {\n",
+ " 'input_ids' : tokens['input_ids'][0],\n",
+ " 'attention_mask' : tokens['attention_mask'][0],\n",
+ " 'predictionstring':np.array(word_ids)\n",
+ " }"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7d43342b",
+ "metadata": {},
+ "source": [
+ "# Create preprocessed data\n",
+ "\n",
+ "Working in batches to limit RAM usage."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "id": "bd001f31",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T14:49:41.762812Z",
+ "start_time": "2022-02-09T14:49:41.609070Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "## Instantiate tokenizer from HF\n",
+ "tokenizer = AutoTokenizer.from_pretrained('backbone')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "id": "6f321835",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T14:49:44.576833Z",
+ "start_time": "2022-02-09T14:49:44.573401Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "## vectorize the tokenize_labelize function defined above\n",
+ "tokenize_labelize_vect = np.vectorize(tokenize_labelize,excluded=['SEQ_LEN'],otypes=['object'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "05a661ad",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T14:50:37.492415Z",
+ "start_time": "2022-02-09T14:49:48.077341Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "2b6550187b734dab98348745e3db5030",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Processing...: 0%| | 0/313 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Write the essays (id, predictionstring, label, essay text) to a CSV file, batch by batch.\n",
+ "# The tokenizer outputs (input_ids, attention_mask, predictionstring, labels) are accumulated in the tokens array.\n",
+ "\n",
+ "batch_size = 50\n",
+ "nbatch = int(len(df_essays)/batch_size)+1\n",
+ "\n",
+ "fieldnames = ['id','predictionstring','label','essays']\n",
+ "tokens = np.array([])\n",
+ "\n",
+ "with open(PATH_RAW_DATA+NAME_OUTPUT_FILE,'w') as file :\n",
+ " writer = csv.DictWriter(file,fieldnames = fieldnames)\n",
+ " writer.writeheader()\n",
+ "\n",
+ "for i in tqdm(range(nbatch+1),desc='Processing...'):\n",
+ " df_ = df_essays.loc[i*batch_size:(i+1)*batch_size-1].copy()\n",
+ " df_['essays'] = df_['id'].apply(get_essay)\n",
+ " tokens = np.append(tokens,tokenize_labelize_vect(df_.essays,tokenizer,\n",
+ " df_.predictionstring, df_.label ,max_len=SEQ_LEN))\n",
+ " df_.to_csv(PATH_RAW_DATA+NAME_OUTPUT_FILE,mode='a',header=False)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "id": "bfeff648",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T14:51:15.947759Z",
+ "start_time": "2022-02-09T14:51:13.383076Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "## sanity check\n",
+ "result = pd.read_csv(PATH_RAW_DATA+NAME_OUTPUT_FILE)\n",
+ "assert(tokens.shape[0]==result.shape[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "4455a2dc",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T14:51:16.676568Z",
+ "start_time": "2022-02-09T14:51:16.639286Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "##create tokens_test array\n",
+ "\n",
+ "df_test['essays'] = df_test['id'].apply(get_essay,mode='test')\n",
+ "tokens_test = tokenize_labelize_vect(df_test.essays,tokenizer,max_len=SEQ_LEN)\n",
+ "\n",
+ "## saving as csv\n",
+ "df_test.to_csv(PATH_RAW_DATA+f'preprocessed_inf_v{VERSION}.csv')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "170b3d19",
+ "metadata": {},
+ "source": [
+ "# Build dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "id": "3de728d9",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T14:51:17.818262Z",
+ "start_time": "2022-02-09T14:51:17.804759Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def dataset_creator(tokens):\n",
+ " \"\"\"\n",
+ " Creates a dictionary with tokens attributes as a numpy array.\n",
+ "\n",
+ " Args:\n",
+ " tokens (list): list of dictionaries, outputs from the tokenizer\n",
+ "\n",
+ " Returns:\n",
+ " tuple: (inputs, labels_ohe, predictionstring) if the tokens carry labels, otherwise (inputs, predictionstring). inputs is a dict holding the input_ids and attention_mask arrays.\n",
+ " \"\"\"\n",
+ " \n",
+ " keys = tokens[0].keys()\n",
+ "\n",
+ " inputs = {\n",
+ " 'input_ids':[],\n",
+ " 'attention_mask':[]\n",
+ " }\n",
+ " predictionstring = []\n",
+ " labels = []\n",
+ " \n",
+ " for t in tqdm(tokens,desc='Aggregating dataset'):\n",
+ " inputs['input_ids'].append(t['input_ids'])\n",
+ " inputs['attention_mask'].append(t['attention_mask'])\n",
+ " predictionstring.append(t['predictionstring'])\n",
+ " if 'labels' in keys:\n",
+ " labels.append(t['labels'])\n",
+ "\n",
+ " \n",
+ " inputs['input_ids'] = np.array(inputs['input_ids'])\n",
+ " inputs['attention_mask'] = np.array(inputs['attention_mask'])\n",
+ " predictionstring = np.array(predictionstring)\n",
+ " labels = np.array(labels)\n",
+ " \n",
+ " if 'labels' in tokens[0].keys():\n",
+ " \n",
+ " #OHE labels\n",
+ " labels_ohe = np.zeros((len(labels),SEQ_LEN,16))\n",
+ " \n",
+ " dim1 = np.arange(len(labels))\n",
+ " dim2 = np.arange(SEQ_LEN)\n",
+ " \n",
+ " labels_ohe[dim1[:,None,None],dim2[None,:,None],labels[:,:,None]] = 1\n",
+ " \n",
+ " return inputs, labels_ohe, predictionstring\n",
+ " \n",
+ " return inputs, predictionstring"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "id": "9465810d",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T14:51:22.975169Z",
+ "start_time": "2022-02-09T14:51:19.285701Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "6e57bccdfaab41979d186cab4d8881dd",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Aggregating dataset: 0%| | 0/15594 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "#training dataset\n",
+ "if 'labels' in tokens[0].keys():\n",
+ " inputs,labels,predictionstrings = dataset_creator(tokens)\n",
+ "else:\n",
+ " inputs,predictionstrings = dataset_creator(tokens)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "id": "5a6d73d2",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T14:51:23.707341Z",
+ "start_time": "2022-02-09T14:51:23.675222Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "1837e1233a0049df9578b850c79ab579",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Aggregating dataset: 0%| | 0/5 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "## creating test dataset\n",
+ "inputs_test,ps_test = dataset_creator(tokens_test)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5d7f188d",
+ "metadata": {},
+ "source": [
+ "# Save datasets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "id": "4d6e94b9",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T14:51:25.873361Z",
+ "start_time": "2022-02-09T14:51:24.901074Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "## Store all objects in a single dictionary for training\n",
+ "\n",
+ "if 'labels' in tokens[0].keys():\n",
+ " dataset = {\n",
+ " 'inputs':inputs,\n",
+ " 'labels':labels,\n",
+ " 'predictionstrings':predictionstrings\n",
+ " }\n",
+ "else:\n",
+ " dataset = {\n",
+ " 'inputs':inputs,\n",
+ " 'predictionstrings':predictionstrings\n",
+ " }"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "id": "d09bf01c",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T14:51:26.522434Z",
+ "start_time": "2022-02-09T14:51:26.519539Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "## store test objects in a dict\n",
+ "\n",
+ "dataset_test = {\n",
+ " 'inputs':inputs_test,\n",
+ " 'predictionstrings':ps_test\n",
+ " }"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "id": "321206c8",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T14:51:35.630267Z",
+ "start_time": "2022-02-09T14:51:27.083694Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "## dump the dataset dictionaries as pickle files\n",
+ "\n",
+ "with open(f'../raw_data/dataset_v{VERSION}.pickle','wb') as file : \n",
+ " pickle.dump(dataset,file)\n",
+ " \n",
+ "with open(f'../raw_data/dataset_test_v{VERSION}.pickle','wb') as file : \n",
+ " pickle.dump(dataset_test,file)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "7fefe1cc",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-08T13:23:34.140576Z",
+ "start_time": "2022-02-08T13:23:34.137608Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "### the end ###"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.12"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {
+ "height": "calc(100% - 180px)",
+ "left": "10px",
+ "top": "150px",
+ "width": "288px"
+ },
+ "toc_section_display": true,
+ "toc_window_display": true
+ },
+ "varInspector": {
+ "cols": {
+ "lenName": 16,
+ "lenType": 16,
+ "lenVar": 40
+ },
+ "kernels_config": {
+ "python": {
+ "delete_cmd_postfix": "",
+ "delete_cmd_prefix": "del ",
+ "library": "var_list.py",
+ "varRefreshCmd": "print(var_dic_list())"
+ },
+ "r": {
+ "delete_cmd_postfix": ") ",
+ "delete_cmd_prefix": "rm(",
+ "library": "var_list.r",
+ "varRefreshCmd": "cat(var_dic_list()) "
+ }
+ },
+ "types_to_exclude": [
+ "module",
+ "function",
+ "builtin_function_or_method",
+ "instance",
+ "_Feature"
+ ],
+ "window_display": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/arthur/training_v2.ipynb b/notebooks/arthur/training_v2.ipynb
new file mode 100644
index 0000000..7879a47
--- /dev/null
+++ b/notebooks/arthur/training_v2.ipynb
@@ -0,0 +1,1563 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "5a81d29d",
+ "metadata": {
+ "id": "5a81d29d",
+ "toc": true
+ },
+ "source": [
+ "Table of Contents \n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6e66f60a",
+ "metadata": {
+ "id": "6e66f60a"
+ },
+ "source": [
+ "This notebook is used for the training of our model and its evaluation on the test split. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7af21fa2",
+ "metadata": {
+ "id": "7af21fa2"
+ },
+ "source": [
+ "# Imports & Variables"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "aaafcdc1",
+ "metadata": {
+ "id": "aaafcdc1"
+ },
+ "source": [
+ "## Colab"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "9eb34847",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T15:19:54.049278Z",
+ "start_time": "2022-02-09T15:19:54.041429Z"
+ },
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "9eb34847",
+ "outputId": "98e6dcb8-22ba-454f-c5e4-5b82a5226033"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Running the notebook on \u001b[34myour machine\u001b[0m\n"
+ ]
+ }
+ ],
+ "source": [
+ "from termcolor import colored\n",
+ "\n",
+ "try:\n",
+ " from google.colab import drive\n",
+ " COLAB = True\n",
+ " print('Running the notebook on',colored('Colab','yellow'))\n",
+ "except:\n",
+ " COLAB = False\n",
+ " print('Running the notebook on',colored('your machine','blue'))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6443d60f",
+ "metadata": {
+ "id": "6443d60f"
+ },
+ "source": [
+ "## Imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "4c0d1bee",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T15:20:05.161147Z",
+ "start_time": "2022-02-09T15:19:55.526591Z"
+ },
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "4c0d1bee",
+ "outputId": "b87ae6e0-1133-427e-c9a5-70eb095ffd25"
+ },
+ "outputs": [],
+ "source": [
+ "## utilities\n",
+ "import os \n",
+ "import pickle\n",
+ "from datetime import datetime\n",
+ "\n",
+ "## classics \n",
+ "import numpy as np \n",
+ "import pandas as pd \n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns \n",
+ "\n",
+ "## deep\n",
+ "import tensorflow as tf\n",
+ "\n",
+ "if COLAB:\n",
+ " !pip install --quiet transformers\n",
+ "\n",
+ "from transformers import AutoTokenizer, TFAutoModel, AutoConfig"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "460a7dec",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T15:20:05.320773Z",
+ "start_time": "2022-02-09T15:20:05.175602Z"
+ },
+ "id": "460a7dec"
+ },
+ "outputs": [],
+ "source": [
+ "#evaluation metrics\n",
+ "from sklearn.metrics import confusion_matrix"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "387e4f08",
+ "metadata": {
+ "id": "387e4f08"
+ },
+ "source": [
+ "## Variables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "4acaafab",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T15:20:05.337097Z",
+ "start_time": "2022-02-09T15:20:05.333617Z"
+ },
+ "id": "4acaafab"
+ },
+ "outputs": [],
+ "source": [
+ "#Max len of essay \n",
+ "SEQ_LEN = 1024 ## THIS SHOULD NOT BE CHANGED without appropriate changes in the preprocessing \n",
+ "\n",
+ "#Train, val, test split proportion\n",
+ "VAL_SPLIT = 0.8\n",
+ "TEST_SPLIT = 0.9\n",
+ "\n",
+ "#Batch size\n",
+ "BATCH_SIZE = 16\n",
+ "\n",
+ "#Data version\n",
+ "VERSION=3\n",
+ "\n",
+ "# Load weights of trained model\n",
+ "MODEL_NAME = 'mymodel'\n",
+ "LOAD_MODEL = True\n",
+ "LOAD_BACKBONE_FROM = '/content/drive/MyDrive/feedback-prize/backbone'\n",
+ "LOAD_MODEL_WEIGHTS_FROM = '/content/drive/MyDrive/feedback-prize/mymodel/mymodel'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "7efa402a",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T15:20:11.117941Z",
+ "start_time": "2022-02-09T15:20:11.114592Z"
+ },
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "7efa402a",
+ "outputId": "d4aeaada-14d5-4089-a0fb-a959ffc80b02"
+ },
+ "outputs": [],
+ "source": [
+ "## Paths\n",
+ "\n",
+ "## if running in colab, mount drive\n",
+ "if COLAB:\n",
+ " drive.mount('/content/drive')\n",
+ " PATH='/content/drive/MyDrive/feedback-prize/'\n",
+ "else:\n",
+ " PATH='../'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "178d6319",
+ "metadata": {
+ "id": "178d6319"
+ },
+ "source": [
+ "# Load data\n",
+ "\n",
+ "The data is already preprocessed in another notebook.\n",
+ "\n",
+ "The preprocessed data is loaded and split into `train`, `val`, `test`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "98e9bc4d",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T15:20:21.289653Z",
+ "start_time": "2022-02-09T15:20:18.512222Z"
+ },
+ "id": "98e9bc4d"
+ },
+ "outputs": [],
+ "source": [
+ "#Load train and test data\n",
+ "with open(os.path.join(PATH,'raw_data',f'dataset_v{VERSION}.pickle'),'rb') as file:\n",
+ " dataset = pickle.load(file)\n",
+ " \n",
+ "with open(os.path.join(PATH,'raw_data',f'dataset_test_v{VERSION}.pickle'),'rb') as file:\n",
+ " dataset_inf = pickle.load(file)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "c01c00ca",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T15:20:24.039041Z",
+ "start_time": "2022-02-09T15:20:22.704665Z"
+ },
+ "id": "c01c00ca"
+ },
+ "outputs": [],
+ "source": [
+ "#Load preprocessed.csv file as it will be needed to retrieve predictions\n",
+ "df_essays = pd.read_csv(os.path.join(PATH,'raw_data',f'preprocessed_v{VERSION}.csv'))\n",
+ "df_inf = pd.read_csv(os.path.join(PATH,'raw_data',f'preprocessed_inf_v{VERSION}.csv'),index_col=0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "06633885",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T15:20:24.658232Z",
+ "start_time": "2022-02-09T15:20:24.653276Z"
+ },
+ "id": "06633885"
+ },
+ "outputs": [],
+ "source": [
+ "## Creating splits indexes\n",
+ "\n",
+ "LEN=len(dataset['labels'])\n",
+ "\n",
+ "idx_val=int(LEN*VAL_SPLIT)\n",
+ "idx_test=int(LEN*TEST_SPLIT)\n",
+ "\n",
+ "idx_train=list(range(0,idx_val))\n",
+ "idx_val=list(range(idx_val,idx_test))\n",
+ "idx_test=list(range(idx_test,LEN))\n",
+ "\n",
+ "assert(len(idx_test)+len(idx_train)+len(idx_val)==LEN)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "d5694515",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T15:20:29.000487Z",
+ "start_time": "2022-02-09T15:20:25.580342Z"
+ },
+ "id": "d5694515"
+ },
+ "outputs": [],
+ "source": [
+ "## Splitting dataset\n",
+ "\n",
+ "#train\n",
+ "X_train = {\n",
+ " 'input_ids' : dataset['inputs']['input_ids'][idx_train],\n",
+ " 'attention_mask' : dataset['inputs']['attention_mask'][idx_train]\n",
+ "}\n",
+ "\n",
+ "y_train = dataset['labels'][idx_train]\n",
+ "ps_train = dataset['predictionstrings'][idx_train]\n",
+ "\n",
+ "#val\n",
+ "X_val = {\n",
+ " 'input_ids' : dataset['inputs']['input_ids'][idx_val],\n",
+ " 'attention_mask' : dataset['inputs']['attention_mask'][idx_val]\n",
+ "}\n",
+ "\n",
+ "y_val = dataset['labels'][idx_val]\n",
+ "ps_val = dataset['predictionstrings'][idx_val]\n",
+ "\n",
+ "\n",
+ "#test\n",
+ "X_test = {\n",
+ " 'input_ids' : dataset['inputs']['input_ids'][idx_test],\n",
+ " 'attention_mask' : dataset['inputs']['attention_mask'][idx_test]\n",
+ "}\n",
+ "\n",
+ "y_test = dataset['labels'][idx_test]\n",
+ "ps_test = dataset['predictionstrings'][idx_test]\n",
+ "\n",
+ "#inference\n",
+ "X_inf = dataset_inf['inputs']\n",
+ "ps_inf = dataset_inf['predictionstrings']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "06895f0e",
+ "metadata": {
+ "id": "06895f0e"
+ },
+ "source": [
+ "# Modeling"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ce61fc80",
+ "metadata": {
+ "id": "ce61fc80"
+ },
+ "source": [
+ "## Model architecture"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "dbf3d6ae",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-08T15:05:09.982796Z",
+ "start_time": "2022-02-08T15:04:51.874202Z"
+ },
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "dbf3d6ae",
+ "outputId": "0ba8c2c1-1edd-4c6c-d10a-e17735c0811d",
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "All model checkpoint layers were used when initializing TFLongformerModel.\n",
+ "\n",
+ "All the layers of TFLongformerModel were initialized from the model checkpoint at /content/drive/MyDrive/feedback-prize/backbone/tf_model.h5.\n",
+ "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFLongformerModel for predictions without further training.\n"
+ ]
+ }
+ ],
+ "source": [
+ "## Instantiate model Longformer to be used as backbone\n",
+ "config = AutoConfig.from_pretrained(os.path.join(LOAD_BACKBONE_FROM,'config.json'))\n",
+ "backbone = TFAutoModel.from_pretrained(os.path.join(LOAD_BACKBONE_FROM,'tf_model.h5'),config=config)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "4b31d749",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-08T15:05:19.028832Z",
+ "start_time": "2022-02-08T15:05:19.014958Z"
+ },
+ "id": "4b31d749"
+ },
+ "outputs": [],
+ "source": [
+ "## TODO : retrieve the attention mask from the backbone and pass it to the two LSTMs ; currently the attention mask is very likely lost\n",
+ "\n",
+ "# init model\n",
+ "\n",
+ "def init_model():\n",
+ " input_ids = tf.keras.layers.Input(shape=(SEQ_LEN,),dtype='int32')\n",
+ " attention_mask = tf.keras.layers.Input(shape=(SEQ_LEN,),dtype='int32')\n",
+ " \n",
+ " x = backbone({'input_ids':input_ids,\n",
+ " 'attention_mask':attention_mask})[0]\n",
+ "\n",
+ " backbone.trainable = False\n",
+ "\n",
+ "\n",
+ " x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units = 512,\n",
+ " activation = 'tanh',\n",
+ " dropout=.2,\n",
+ " return_sequences=True))(x)\n",
+ " x_res = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units = 512,\n",
+ " activation = 'tanh',\n",
+ " dropout=.2,\n",
+ " return_sequences=True))(x)\n",
+ " \n",
+ "\n",
+ " x = tf.keras.layers.add([x,x_res])\n",
+ " output = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(16,activation = 'softmax'))(x)\n",
+ "\n",
+ " model = tf.keras.models.Model(inputs={'input_ids':input_ids,\n",
+ " 'attention_mask':attention_mask},outputs=output)\n",
+ " \n",
+ " \n",
+ " return model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "805fd880",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-08T15:08:40.730243Z",
+ "start_time": "2022-02-08T15:08:32.179279Z"
+ },
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 360
+ },
+ "id": "805fd880",
+ "outputId": "c4bd4814-003e-4232-876f-94e18ef4a459"
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "## instantiate the model, plot the graph\n",
+ "model = init_model()\n",
+ "\n",
+ "tf.keras.utils.plot_model(model,show_shapes=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ba134f3f",
+ "metadata": {
+ "id": "ba134f3f"
+ },
+ "source": [
+ "## Loss, Optimizer, Metrics"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "2e741083",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-08T15:19:20.793041Z",
+ "start_time": "2022-02-08T15:19:20.726503Z"
+ },
+ "id": "2e741083"
+ },
+ "outputs": [],
+ "source": [
+ "# creating homemade metric\n",
+ "\n",
+ "def accuracy_masked_func(y_true,y_pred):\n",
+ " y_pred = tf.cast(tf.argmax(y_pred,axis=-1),'int32')\n",
+ " y_true = tf.cast(y_true,'int32')\n",
+ " y_true = tf.cast(tf.argmax(y_true,axis=-1),'int32') #for y_pred and y_true to match\n",
+ " mask = tf.cast(y_true != 15,'int32') #create a mask\n",
+ " matches = tf.cast(tf.equal(y_true,y_pred),'int32')*mask #calculate the matches ignoring the masking\n",
+ " accuracy = tf.math.reduce_sum(matches,axis=-1)/tf.maximum(tf.math.reduce_sum(mask,axis=-1),1)\n",
+ " \n",
+ " return accuracy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "39f32227",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-08T15:20:32.714566Z",
+ "start_time": "2022-02-08T15:20:32.434987Z"
+ },
+ "id": "39f32227"
+ },
+ "outputs": [],
+ "source": [
+ "# define loss and metrics \n",
+ "loss = tf.keras.losses.CategoricalCrossentropy(name='categorical_crossentropy')\n",
+ "cat_accuracy = tf.keras.metrics.CategoricalAccuracy()\n",
+ "masked_accuracy = tf.keras.metrics.MeanMetricWrapper(fn=accuracy_masked_func)\n",
+ "\n",
+ "# RMSProp optimizer with clip value and small lr to avoid exploding gradients \n",
+ "opt = tf.keras.optimizers.RMSprop(clipvalue=.5,learning_rate=0.0001)\n",
+ "\n",
+ "#compile\n",
+ "model.compile(optimizer=opt,loss=loss,metrics=[cat_accuracy,masked_accuracy])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "af59f074",
+ "metadata": {
+ "id": "af59f074"
+ },
+ "source": [
+ "## Model training"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "8de8c166",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-08T15:35:00.705395Z",
+ "start_time": "2022-02-08T15:35:00.630108Z"
+ },
+ "id": "8de8c166"
+ },
+ "outputs": [],
+ "source": [
+ "#### CALLBACKS\n",
+ "\n",
+ "timestamp = datetime.today().__format__('%d%m_%Hh%M')\n",
+ "\n",
+ "checkpoints_path = f'{PATH}{MODEL_NAME}/{MODEL_NAME}_{timestamp}.ckpt'\n",
+ "logdir = '/content/drive/MyDrive/feedback-prize/logs/'\n",
+ "\n",
+ "\n",
+ "#early stopping\n",
+ "es = tf.keras.callbacks.EarlyStopping(patience=1,restore_best_weights=True)\n",
+ "\n",
+ "#save weights only when val_categorical_accuracy improves (save_best_only=True)\n",
+ "checkpoint_saver = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoints_path,\n",
+ " save_weights_only=True,\n",
+ " save_best_only = True,\n",
+ " monitor = 'val_categorical_accuracy',\n",
+ " mode = 'max',\n",
+ " verbose = 1)\n",
+ "\n",
+ "#logs for tensorboard\n",
+ "tensorboard = tf.keras.callbacks.TensorBoard(log_dir=logdir)\n",
+ "\n",
+ "# list callbacks\n",
+ "\n",
+ "callbacks=[es,checkpoint_saver,tensorboard]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "d14a118f",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-08T15:36:39.914302Z",
+ "start_time": "2022-02-08T15:36:39.910221Z"
+ },
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "d14a118f",
+ "outputId": "85f537a3-c0a1-45cd-9946-b0f60a430f3d"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loading model weights below\n"
+ ]
+ }
+ ],
+ "source": [
+ "## TRAINING\n",
+ "if not LOAD_MODEL:\n",
+ " history = model.fit(X_train,y_train,\n",
+ " validation_data= (X_val,y_val),\n",
+ " epochs=30,callbacks=callbacks,batch_size=BATCH_SIZE)\n",
+ "else:\n",
+ " print('Loading model weights below')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "da39c25d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## saving\n",
+ "if not LOAD_MODEL:\n",
+ " os.mkdir(MODEL_NAME)\n",
+ " model.save_weights(MODEL_NAME)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5c9b34e7",
+ "metadata": {
+ "id": "5c9b34e7"
+ },
+ "source": [
+ "## Model evaluation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "e4b0466a",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "e4b0466a",
+ "outputId": "ee35de22-350b-4c42-b124-69e1246ed7d4",
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "98/98 [==============================] - 193s 2s/step - loss: 0.3133 - categorical_accuracy: 0.8977 - mean_metric_wrapper: 0.7678\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "[0.3132975399494171, 0.8977440595626831, 0.7678290605545044]"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "## model eval on val set \n",
+ "model.evaluate(X_val,y_val,batch_size=BATCH_SIZE)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9153f4d8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#predict on test split\n",
+ "y_pred = model.predict(X_test,batch_size=BATCH_SIZE)\n",
+ "\n",
+ "#dump pickle\n",
+ "with open(os.path.join(PATH,'raw_data','preds_on_testsplit.pickle'),'wb') as file:\n",
+ " pickle.dump(y_pred,file)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b2bb7321",
+ "metadata": {},
+ "source": [
+ "> Evaluation on a separate notebook"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9a0d1923",
+ "metadata": {
+ "heading_collapsed": true,
+ "id": "9a0d1923"
+ },
+ "source": [
+ "## Model evaluation\n",
+ "\n",
+ "No longer accurate "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fb0aa2f6",
+ "metadata": {
+ "hidden": true,
+ "id": "fb0aa2f6"
+ },
+ "source": [
+ "### Get predictions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "9LPEEQCzC3LI",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T15:20:45.557610Z",
+ "start_time": "2022-02-09T15:20:45.532672Z"
+ },
+ "hidden": true,
+ "id": "9LPEEQCzC3LI"
+ },
+ "outputs": [],
+ "source": [
+ "labels_mapping = {'B-Lead' : 0,\n",
+ " 'B-Position' : 1,\n",
+ " 'B-Evidence' : 2,\n",
+ " 'B-Claim' : 3,\n",
+ " 'B-Concluding_Statement' : 4,\n",
+ " 'B-Counterclaim' : 5,\n",
+ " 'B-Rebuttal' : 6,\n",
+ " 'I-Lead' : 7,\n",
+ " 'I-Position' : 8,\n",
+ " 'I-Evidence' : 9,\n",
+ " 'I-Claim' : 10,\n",
+ " 'I-Concluding_Statement' : 11,\n",
+ " 'I-Counterclaim' : 12,\n",
+ " 'I-Rebuttal': 13,\n",
+ " 'O':14,\n",
+ " 'PAD':15}\n",
+ "\n",
+ "reversed_mapping = {v:(k[2:] if v<14 else k) for k,v in labels_mapping.items()}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "4586e676",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T15:20:45.632079Z",
+ "start_time": "2022-02-09T15:20:45.600034Z"
+ },
+ "hidden": true,
+ "id": "4586e676"
+ },
+ "outputs": [],
+ "source": [
+ "def get_preds(y_pred,ps):\n",
+ " \"\"\"\n",
+ " Generate readable predictions from the output of the model.\n",
+ "\n",
+ " Args:\n",
+ " y_pred (ndarray): output of the model\n",
+ " ps (ndarray): predictionstring referring to the token predicted\n",
+ "\n",
+ " Returns:\n",
+ " DataFrame : DataFrame with class and predictionstrings\n",
+ " \"\"\"\n",
+ "\n",
+ "\n",
+ " labels = []\n",
+ " predictionstrings = []\n",
+ " counts = []\n",
+ " \n",
+ " counter=dict()\n",
+ " \n",
+ " for tok,pos in zip(y_pred,ps):\n",
+ " \n",
+ " if tok <= 13:\n",
+ " lab = reversed_mapping[tok]\n",
+ " labels.append(lab)\n",
+ " predictionstrings.append(pos)\n",
+ " if len(labels)<2:\n",
+ " counts.append(str(1))\n",
+ " counter.setdefault(lab,1)\n",
+ " continue\n",
+ " if lab == labels[-2]:\n",
+ " counts.append(str(counter[lab]))\n",
+ " else: \n",
+ " try:\n",
+ " counter[lab]+=1\n",
+ " except KeyError:\n",
+ " counter.setdefault(lab,1)\n",
+ " counts.append(str(counter[lab]))\n",
+ " \n",
+ " preds = pd.DataFrame([labels,counts,predictionstrings],index=['class','count','predictionstring']).T\n",
+ " preds['class'] += ' ' + preds['count'].astype(str)\n",
+ " preds = preds.groupby('class',sort=False).agg({'predictionstring':list}).reset_index()\n",
+ " preds['class']=preds['class'].apply(lambda txt : txt.split()[0])\n",
+ " preds['predictionstring']=preds['predictionstring'].apply(lambda l_ : [str(l) for l in l_])\n",
+ " preds['predictionstring']=preds['predictionstring'].apply(lambda l_ : ' '.join(l_))\n",
+ " \n",
+ " return preds"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dbfbd22c",
+ "metadata": {
+ "hidden": true,
+ "id": "dbfbd22c"
+ },
+ "source": [
+ "### Evaluate test split"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "74d63b23",
+ "metadata": {
+ "hidden": true,
+ "id": "74d63b23"
+ },
+ "outputs": [],
+ "source": [
+ "#predict on test split\n",
+ "y_pred = model.predict(X_test,batch_size=BATCH_SIZE)\n",
+ "\n",
+ "#dump pickle\n",
+ "with open(os.path.join(PATH,'raw_data','preds_on_testsplit.pickle'),'wb') as file:\n",
+ " pickle.dump(y_pred,file)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "234d1115",
+ "metadata": {
+ "hidden": true,
+ "id": "234d1115"
+ },
+ "source": [
+ "#### F1 Report"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "87454dba",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T15:23:57.539068Z",
+ "start_time": "2022-02-09T15:23:08.212220Z"
+ },
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 381
+ },
+ "hidden": true,
+ "id": "87454dba",
+ "outputId": "e76feccd-2868-47ab-eb6b-a2936fcaa7c5"
+ },
+ "outputs": [],
+ "source": [
+ "## Create two DF with preds and ground truth\n",
+ "\n",
+ "y_true = np.argmax(y_test,axis=-1)\n",
+ "y_pred = np.argmax(y_pred_,axis=-1)\n",
+ "\n",
+ "ps = ps_test\n",
+ "\n",
+ "true_df = pd.DataFrame()\n",
+ "pred_df = pd.DataFrame()\n",
+ "\n",
+ "for i,idx in enumerate(idx_test): ## CHANGE idx_test\n",
+ " \n",
+ " true_ = get_preds(y_true[i],ps[i])\n",
+ " pred_ = get_preds(y_pred[i],ps[i])\n",
+ " \n",
+ " true_['id']=df_essays.iloc[idx]['id']\n",
+ " pred_['id']=df_essays.iloc[idx]['id']\n",
+ " \n",
+ " true_df = true_df.append(true_)\n",
+ " pred_df = pred_df.append(pred_)\n",
+ " \n",
+ "true_df['unique_id'] = pd.util.hash_pandas_object(true_df,hash_key='1234567890123456')\n",
+ "pred_df['unique_id'] = pd.util.hash_pandas_object(pred_df,hash_key='azerty1234567890')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "d6d45514",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T15:24:11.107286Z",
+ "start_time": "2022-02-09T15:24:11.101012Z"
+ },
+ "hidden": true,
+ "id": "d6d45514"
+ },
+ "outputs": [],
+ "source": [
+ "## Determine whether a prediction is a true positive or not\n",
+ "\n",
+ "def true_positive(predictionstring_true,predictionstring_pred):\n",
+ " ps_true = set(predictionstring_true.split(' '))\n",
+ " ps_pred = set(predictionstring_pred.split(' '))\n",
+ " \n",
+ " inter = ps_pred.intersection(ps_true)\n",
+ " overlap_1 = len(inter)/len(ps_true)\n",
+ " overlap_2 = len(inter)/len(ps_pred)\n",
+ " \n",
+ " if overlap_1 >= .5 and overlap_2 >= .5:\n",
+ " return 1\n",
+ " return 0\n",
+ "\n",
+ "## vectorize the function\n",
+ "true_positive_vect = np.vectorize(true_positive)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "e811bd53",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T15:24:13.910117Z",
+ "start_time": "2022-02-09T15:24:13.487097Z"
+ },
+ "hidden": true,
+ "id": "e811bd53"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " class \n",
+ " predictionstring_pred \n",
+ " id \n",
+ " unique_id_pred \n",
+ " predictionstring_true \n",
+ " unique_id_true \n",
+ " FP \n",
+ " FN \n",
+ " TP \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " Lead \n",
+ " 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18... \n",
+ " E6870101D8EE \n",
+ " 1.172241e+19 \n",
+ " \n",
+ " NaN \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " Lead \n",
+ " 65 66 67 68 69 70 71 72 73 74 75 76 77 78 \n",
+ " E6870101D8EE \n",
+ " 9.178975e+18 \n",
+ " \n",
+ " NaN \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " Lead \n",
+ " 83 84 \n",
+ " E6870101D8EE \n",
+ " 2.577067e+18 \n",
+ " \n",
+ " NaN \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " Evidence \n",
+ " 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 4... \n",
+ " E6870101D8EE \n",
+ " 5.320668e+18 \n",
+ " 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18... \n",
+ " 1.447722e+19 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " Evidence \n",
+ " 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 4... \n",
+ " E6870101D8EE \n",
+ " 5.320668e+18 \n",
+ " 110 111 112 113 114 115 116 117 118 119 120 12... \n",
+ " 1.562738e+19 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " class predictionstring_pred id \\\n",
+ "0 Lead 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18... E6870101D8EE \n",
+ "1 Lead 65 66 67 68 69 70 71 72 73 74 75 76 77 78 E6870101D8EE \n",
+ "2 Lead 83 84 E6870101D8EE \n",
+ "3 Evidence 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 4... E6870101D8EE \n",
+ "4 Evidence 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 4... E6870101D8EE \n",
+ "\n",
+ " unique_id_pred predictionstring_true \\\n",
+ "0 1.172241e+19 \n",
+ "1 9.178975e+18 \n",
+ "2 2.577067e+18 \n",
+ "3 5.320668e+18 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18... \n",
+ "4 5.320668e+18 110 111 112 113 114 115 116 117 118 119 120 12... \n",
+ "\n",
+ " unique_id_true FP FN TP \n",
+ "0 NaN 1 0 0 \n",
+ "1 NaN 1 0 0 \n",
+ "2 NaN 1 0 0 \n",
+ "3 1.447722e+19 0 0 0 \n",
+ "4 1.562738e+19 0 0 0 "
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "## Merge the 2 DF to compute F1 \n",
+ "\n",
+ "merged_df = pred_df.merge(true_df,how = 'outer',on=['id','class'],suffixes=('_pred','_true'))\n",
+ "\n",
+ "## if a pred is not matched it is a FP ; if the truth is not matched it is a FN\n",
+ "\n",
+ "#creating separate columns \n",
+ "merged_df['FP'] = np.where(merged_df.predictionstring_true.isna(), 1, 0)\n",
+ "merged_df['FN'] = np.where(merged_df.predictionstring_pred.isna(), 1, 0)\n",
+ "\n",
+ "\n",
+ "#cleaning nan for the true positive function\n",
+ "merged_df['predictionstring_pred'].fillna('',inplace=True)\n",
+ "merged_df['predictionstring_true'].fillna('',inplace=True)\n",
+ "\n",
+ "\n",
+ "merged_df['TP'] = true_positive_vect(merged_df['predictionstring_true'],merged_df['predictionstring_pred'])\n",
+ "\n",
+ "merged_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "b518be6c",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T15:24:16.473582Z",
+ "start_time": "2022-02-09T15:24:16.470122Z"
+ },
+ "hidden": true,
+ "id": "b518be6c"
+ },
+ "outputs": [],
+ "source": [
+ "## creating f1 function\n",
+ "def f1_score(fp,fn,tp):\n",
+ " return tp/(tp+.5*(fp+fn))*100"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "id": "62ad9adb",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T15:26:19.380276Z",
+ "start_time": "2022-02-09T15:26:19.352593Z"
+ },
+ "hidden": true,
+ "id": "62ad9adb"
+ },
+ "outputs": [],
+ "source": [
+ "## group by class for F1 score calculation\n",
+ "f1_df = merged_df.groupby('class').sum()\n",
+ "f1_df.drop(['unique_id_pred','unique_id_true'],axis=1,inplace=True)\n",
+ "\n",
+ "#apply to the df\n",
+ "f1_df['f1']=f1_score(f1_df.FP,f1_df.FN,f1_df.TP)\n",
+ "\n",
+ "#Create a total row\n",
+ "f1_df.loc['Total']=f1_df.mean()\n",
+ "\n",
+ "## weighted average\n",
+ "\n",
+ "f1_df['support'] = true_df.groupby('class').count()['id']\n",
+ "\n",
+ "f1_df['f1_weighted']=f1_df['f1']*(f1_df['support']/f1_df['support'].sum())\n",
+ "f1_df.loc['Total','support']=f1_df.support.sum()\n",
+ "f1_df.loc['Total','f1_weighted']=f1_df.f1_weighted.sum()\n",
+ "\n",
+ "# impute correct values for the Total row for TP FN FP \n",
+ "\n",
+ "f1_df.loc['Total','FP'] = f1_df.loc[:'Rebuttal','FP'].sum()\n",
+ "f1_df.loc['Total','FN'] = f1_df.loc[:'Rebuttal','FN'].sum()\n",
+ "f1_df.loc['Total','TP'] = f1_df.loc[:'Rebuttal','TP'].sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "d296aa49",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T15:26:19.534016Z",
+ "start_time": "2022-02-09T15:26:19.530003Z"
+ },
+ "hidden": true,
+ "id": "d296aa49"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "F1 Macro Score = 76.36%\n",
+ "F1 Micro Score = 90.47%\n",
+ "F1 Weighted Score = 90.70%\n"
+ ]
+ }
+ ],
+ "source": [
+ "## PRINT REPORT\n",
+ "\n",
+ "print(f\"F1 Macro Score = {f1_df.loc['Total','f1']:.2f}%\")\n",
+ "\n",
+ "\n",
+ "print(f\"F1 Micro Score = {f1_score(f1_df.loc['Total','FP'],f1_df.loc['Total','FN'],f1_df.loc['Total','TP']):.2f}%\")\n",
+ "\n",
+ "print(f\"F1 Weighted Score = {f1_df.loc['Total','f1_weighted']:.2f}%\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "e3cd3b9d",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T15:26:19.827938Z",
+ "start_time": "2022-02-09T15:26:19.681316Z"
+ },
+ "hidden": true,
+ "id": "e3cd3b9d"
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "sns.barplot(y=f1_df.reset_index().loc[:6,'f1'],x=f1_df.index[:-1],palette='Set2')\n",
+ "plt.xticks(rotation=90)\n",
+ "locs,_=plt.xticks()\n",
+ "plt.plot([locs[0]-.5,locs[-1]+.5],[f1_df.loc['Total','f1'],f1_df.loc['Total','f1']],c='r');"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "id": "ee4dd298",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T15:26:20.595497Z",
+ "start_time": "2022-02-09T15:26:20.579822Z"
+ },
+ "hidden": true,
+ "id": "ee4dd298"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " FP \n",
+ " FN \n",
+ " TP \n",
+ " f1 \n",
+ " support \n",
+ " f1_weighted \n",
+ " \n",
+ " \n",
+ " class \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " Claim \n",
+ " 79 \n",
+ " 49 \n",
+ " 1570 \n",
+ " 96.08% \n",
+ " 3412 \n",
+ " 27.34% \n",
+ " \n",
+ " \n",
+ " Concluding_Statement \n",
+ " 214 \n",
+ " 27 \n",
+ " 1106 \n",
+ " 90.18% \n",
+ " 1346 \n",
+ " 10.12% \n",
+ " \n",
+ " \n",
+ " Counterclaim \n",
+ " 162 \n",
+ " 202 \n",
+ " 141 \n",
+ " 43.65% \n",
+ " 542 \n",
+ " 1.97% \n",
+ " \n",
+ " \n",
+ " Evidence \n",
+ " 25 \n",
+ " 0 \n",
+ " 2400 \n",
+ " 99.48% \n",
+ " 3792 \n",
+ " 31.46% \n",
+ " \n",
+ " \n",
+ " Lead \n",
+ " 249 \n",
+ " 50 \n",
+ " 774 \n",
+ " 83.81% \n",
+ " 963 \n",
+ " 6.73% \n",
+ " \n",
+ " \n",
+ " Position \n",
+ " 23 \n",
+ " 68 \n",
+ " 955 \n",
+ " 95.45% \n",
+ " 1535 \n",
+ " 12.22% \n",
+ " \n",
+ " \n",
+ " Rebuttal \n",
+ " 128 \n",
+ " 199 \n",
+ " 57 \n",
+ " 25.85% \n",
+ " 402 \n",
+ " 0.87% \n",
+ " \n",
+ " \n",
+ " Total \n",
+ " 880 \n",
+ " 595 \n",
+ " 7003 \n",
+ " 76.36% \n",
+ " 11992 \n",
+ " 90.70% \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " FP FN TP f1 support f1_weighted\n",
+ "class \n",
+ "Claim 79 49 1570 96.08% 3412 27.34%\n",
+ "Concluding_Statement 214 27 1106 90.18% 1346 10.12%\n",
+ "Counterclaim 162 202 141 43.65% 542 1.97%\n",
+ "Evidence 25 0 2400 99.48% 3792 31.46%\n",
+ "Lead 249 50 774 83.81% 963 6.73%\n",
+ "Position 23 68 955 95.45% 1535 12.22%\n",
+ "Rebuttal 128 199 57 25.85% 402 0.87%\n",
+ "Total 880 595 7003 76.36% 11992 90.70%"
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Full report\n",
+ "f1_df[['FP','FN','TP','support']]=f1_df[['FP','FN','TP','support']].applymap('{:.0f}'.format)\n",
+ "f1_df['f1_weighted']=f1_df['f1_weighted'].map('{:.2f}%'.format)\n",
+ "f1_df['f1']=f1_df['f1'].map('{:.2f}%'.format)\n",
+ "\n",
+ "f1_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f0514f5e",
+ "metadata": {
+ "hidden": true,
+ "id": "f0514f5e"
+ },
+ "source": [
+ "#### Confusion Matrix"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "id": "c3a81d7f",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T15:26:26.334535Z",
+ "start_time": "2022-02-09T15:26:26.322643Z"
+ },
+ "hidden": true,
+ "id": "c3a81d7f"
+ },
+ "outputs": [],
+ "source": [
+ "def show_confusion_matrix(y_true,y_pred):\n",
+ "\n",
+ " reversed_mapping = {\n",
+ " 0: 'Lead',\n",
+ " 1: 'Position',\n",
+ " 2: 'Evidence',\n",
+ " 3: 'Claim',\n",
+ " 4: 'Concluding_Statement',\n",
+ " 5: 'Counterclaim',\n",
+ " 6: 'Rebuttal',\n",
+ " 7: 'Lead',\n",
+ " 8: 'Position',\n",
+ " 9: 'Evidence',\n",
+ " 10: 'Claim',\n",
+ " 11: 'Concluding_Statement',\n",
+ " 12: 'Counterclaim',\n",
+ " 13: 'Rebuttal',\n",
+ " 14: 'O',\n",
+ " 15: 'PAD'}\n",
+ " \n",
+ " y_true_flat = [reversed_mapping[y] for y in y_true.flatten()]\n",
+ " y_pred_flat = [reversed_mapping[y] for y in y_pred.flatten()]\n",
+ " \n",
+ " LABELS = ['Lead','Position','Claim','Counterclaim','Rebuttal','Evidence','Concluding_Statement','O','PAD']\n",
+ "\n",
+ " cfn = confusion_matrix(y_true_flat,y_pred_flat,labels=LABELS)\n",
+ " \n",
+ " fig,ax = plt.subplots(1,1,figsize=(10,10))\n",
+ " plt.title('Confusion Matrix',size=18,pad=20)\n",
+ " sns.heatmap(cfn/np.sum(cfn,axis=0)*100,cmap='Blues',annot = True,fmt='.2f',annot_kws={'size':10},ax=ax);\n",
+ " plt.xticks(np.arange(len(LABELS))+.5,LABELS,rotation = 90,size=12);\n",
+ " plt.yticks(np.arange(len(LABELS))+.5,LABELS,rotation = 0,size=12);\n",
+ " plt.xlabel('PREDICTED',size=16);\n",
+ " plt.ylabel('ACTUAL',size=16);\n",
+ " for t in ax.texts: t.set_text(t.get_text() + \" %\")\n",
+ " \n",
+ " plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "80189c2f",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-09T15:26:32.922434Z",
+ "start_time": "2022-02-09T15:26:27.324606Z"
+ },
+ "hidden": true,
+ "id": "80189c2f"
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "show_confusion_matrix(y_true,y_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f03e95e4",
+ "metadata": {
+ "hidden": true
+ },
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "machine_shape": "hm",
+ "name": "training_v2.ipynb",
+ "provenance": [],
+ "toc_visible": true
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.12"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": true,
+ "toc_position": {
+ "height": "calc(100% - 180px)",
+ "left": "10px",
+ "top": "150px",
+ "width": "288px"
+ },
+ "toc_section_display": true,
+ "toc_window_display": true
+ },
+ "varInspector": {
+ "cols": {
+ "lenName": 16,
+ "lenType": 16,
+ "lenVar": 40
+ },
+ "kernels_config": {
+ "python": {
+ "delete_cmd_postfix": "",
+ "delete_cmd_prefix": "del ",
+ "library": "var_list.py",
+ "varRefreshCmd": "print(var_dic_list())"
+ },
+ "r": {
+ "delete_cmd_postfix": ") ",
+ "delete_cmd_prefix": "rm(",
+ "library": "var_list.r",
+ "varRefreshCmd": "cat(var_dic_list()) "
+ }
+ },
+ "types_to_exclude": [
+ "module",
+ "function",
+ "builtin_function_or_method",
+ "instance",
+ "_Feature"
+ ],
+ "window_display": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/arthur/utils.ipynb b/notebooks/arthur/utils.ipynb
new file mode 100644
index 0000000..69551cd
--- /dev/null
+++ b/notebooks/arthur/utils.ipynb
@@ -0,0 +1,373 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "9b1e0525",
+ "metadata": {},
+ "source": [
+ "# Imports and data loading"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "d09b1154",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-14T17:47:42.080464Z",
+ "start_time": "2022-02-14T17:47:39.447048Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "\n",
+ "import pandas as pd\n",
+ "import numpy as np \n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "\n",
+ "from IPython.display import HTML"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "36a2c14a",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-14T17:48:09.482598Z",
+ "start_time": "2022-02-14T17:48:09.468757Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# loading only a sample for testing purposes\n",
+ "df = pd.read_csv('../raw_data/train.csv',nrows=300)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ee8eeff1",
+ "metadata": {},
+ "source": [
+ "# Showing ground truth and prediction in text"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 112,
+ "id": "4a3f06ab",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T14:28:01.286117Z",
+ "start_time": "2022-02-15T14:28:01.183939Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ ""
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 112,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#import custom.css into this notebook\n",
+ "\n",
+ "def css():\n",
+ " styles = open(\"./styles/custom.css\", \"r\").read()\n",
+ " return HTML('')\n",
+ "css()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 104,
+ "id": "703baf88",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-14T20:22:58.368090Z",
+ "start_time": "2022-02-14T20:22:58.289671Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def render_html(df):\n",
+ " return \"<{0} style='padding: 2px'>{1} [{0}] {0}>\".format(df['discourse_type'],df['discourse_text'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 106,
+ "id": "eafb22ad",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-14T20:24:18.459300Z",
+ "start_time": "2022-02-14T20:24:18.452825Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def comparison_text(prediction=pred, ground_truth=true):\n",
+ " html = f\"\"\"\n",
+ " \n",
+ "
Legend --> \n",
+ "
Lead \n",
+ "
Position \n",
+ "
Claim \n",
+ "
Counterclaim \n",
+ "
Rebuttal \n",
+ "
Evidence \n",
+ "
Concluding_Statement \n",
+ "
\n",
+ "\n",
+ " \n",
+ "
\n",
+ "
Prediction \n",
+ "
{pred}
\n",
+ "
\n",
+ "
\n",
+ "
Ground Truth \n",
+ "
{true}
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \"\"\"\n",
+ " \n",
+ " return HTML(html)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 102,
+ "id": "b513750b",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-14T18:15:43.568970Z",
+ "start_time": "2022-02-14T18:15:43.554821Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Stand-in data to preview the display: two annotated essays from the loaded sample play the roles of ground truth and post-processed prediction\n",
+ "\n",
+ "df['html'] = df.apply(render_html, axis=1) # render each row's discourse_text as an HTML snippet tagged with its discourse_type\n",
+ "df_essays = df.groupby('id').agg({'html':' '.join,}).reset_index() # join the snippets of each essay, grouped by essay id\n",
+ "\n",
+ "true = df_essays.loc[7,'html'].replace('Concluding Statement','Concluding_Statement')\n",
+ "pred = df_essays.loc[15,'html'].replace('Concluding Statement','Concluding_Statement')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 115,
+ "id": "756fcffb",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-02-15T18:32:31.663131Z",
+ "start_time": "2022-02-15T18:32:31.657046Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ "
Legend --> \n",
+ "
Lead \n",
+ "
Position \n",
+ "
Claim \n",
+ "
Counterclaim \n",
+ "
Rebuttal \n",
+ "
Evidence \n",
+ "
Concluding_Statement \n",
+ "
\n",
+ "\n",
+ " \n",
+ "
\n",
+ "
Prediction \n",
+ "
The ability to stay connected to people we know despite distance was originally brought to fruition by the use of letters. This system was found to be rather slow and new pathways were searched for until the invention of the telegram; the people thought it was an invention of the millennia, but after time it too was thought of as slow until the invention of the telephone. Today, a telephone is in the hand or pocket of a majority of the seven billion people on planet earth [Lead] However, this device is taken to areas that it is irresponsible and dangerous. [Position] Within a vehicle capable of traveling upwards of one hundred miles per hour any possible distraction can become fatal spontaneously [Claim] The most common of these distractions is a cell phone, with its capabilities to connect us to anyone also in ownership of one, it is easy to pick it up whenever it sounds. In that split second of reaching over for a phone, eyes no longer on the road, it is impossible to no an exact location of anything, making an extremely dangerous action. For the myriad of possibilities that lead to serious injury cell phones should stay in the current state they are in regards of the law, but taken as a more serious offense. [Evidence] Conversely people may believe that laws in present need to change, becoming less restrictive. People have the right to communicate with whom they wish, when or wherever they may choose to do so. [Counterclaim] The problem becomes apparent that this is a selfish process of thought; people aren't thinking of those they share the road with. Laws currently in place are not to punish people making poor choices, they are an attempt to keep people safe. [Rebuttal] The creation of telecommunication devices was to keep connected to others without regard to the obsession that would encompass the human mind that was bound to follow. The safety of people is top priority without exemption. [Concluding_Statement]
\n",
+ "
\n",
+ "
\n",
+ "
Ground Truth \n",
+ "
Drivers should absolutely never be able to use a cell phone while driving. [Position] This is evident as in recent years there has been a big problem in our society with drivers using their cell phones while driving. It has gotten so bad that it has become illegal because driving while using a cell phone is more likely to be the cause of a fatal accident, than when a person drives under the influence of drugs or alcohol.\n",
+ " [Evidence] Use a hands free device at all times while driving. I say this because a cell phone will take your attention off of the task at hand. A driver's focus should be maintained on only driving to ensure safety [Evidence] It has been proven that driving while using a cell phone is the cause of more vehicular accidents resulting in a fatality than those caused by drivers who are under the influence of drugs or alcohol. [Claim] In two thousand and thirteen there were three thousand one hundred fifty-four people killed in distracted driver related crashes. In just one year's time, there are three hundred and ninety thousand people injured in accidents caused by texting and driving. I personally do not use my cell phone when I drive, because it distracts me very easily. [Evidence] I suggest that drivers should be able to use their cell phone for an emergency purpose only. Even in a crisis situation, the driver should pull over to the side of the road before making a call. [Claim] I also encourage that the cell phone companies should be made to have the cell phone come from the factory to where if your cellphone is moving, it will not be allowed to ring or give you any notifications. [Claim] In order to get your notifications, you will have to manually tell your cellphone to read that you are not driving for it to allow you to open the phone. It will then give you the notifications and or missed calls you have missed. In addition, it will not allow you to access absolutely any of your social media platforms or let you get social media notifications while you are driving. [Evidence] In conclusion, my opinion on whether drivers should be using their cell phones while driving is that they should refrain from using their cell phone while driving. This is because driving is a task that requires a lot of focus and a cell phone is very distracting.\n",
+ "\n",
+ "Driving while using a cell phone is now illegal because of how dangerous it is. [Concluding_Statement]
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 115,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "comparison_text(pred,true)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7eeb7b82",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.12"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {},
+ "toc_section_display": true,
+ "toc_window_display": false
+ },
+ "varInspector": {
+ "cols": {
+ "lenName": 16,
+ "lenType": 16,
+ "lenVar": 40
+ },
+ "kernels_config": {
+ "python": {
+ "delete_cmd_postfix": "",
+ "delete_cmd_prefix": "del ",
+ "library": "var_list.py",
+ "varRefreshCmd": "print(var_dic_list())"
+ },
+ "r": {
+ "delete_cmd_postfix": ") ",
+ "delete_cmd_prefix": "rm(",
+ "library": "var_list.r",
+ "varRefreshCmd": "cat(var_dic_list()) "
+ }
+ },
+ "types_to_exclude": [
+ "module",
+ "function",
+ "builtin_function_or_method",
+ "instance",
+ "_Feature"
+ ],
+ "window_display": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}