From e0af302f4ad4669bca0f70ac17f4412313beebc8 Mon Sep 17 00:00:00 2001
From: Jan Heinrich Merker <heinrich@merker.id>
Date: Mon, 25 Nov 2024 12:53:26 +0100
Subject: [PATCH] Fix and improve stopwords tutorial

---
 tutorials/tutorial-stopword-lists.ipynb | 1302 ++++++++++++-----------
 1 file changed, 659 insertions(+), 643 deletions(-)

diff --git a/tutorials/tutorial-stopword-lists.ipynb b/tutorials/tutorial-stopword-lists.ipynb
index 804a4d5..c47cb38 100644
--- a/tutorials/tutorial-stopword-lists.ipynb
+++ b/tutorials/tutorial-stopword-lists.ipynb
@@ -1,672 +1,688 @@
 {
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "LCoilJP00Boh"
-      },
-      "source": [
-        "# IR Lab Tutorial: Stopword Lists\n",
-        "\n",
-        "This tutorial shows how to configure and use custom stopword lists in PyTerrier.\n",
-        "\n",
-        "**Attention:** The scenario below is cherry-picked to explain the concept of stopword lists with a minimal example.\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "Z5O2-TXL0Vrl"
-      },
-      "source": [
-        "## Preparation: Install dependencies"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 1,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "_8E377GCgB_q",
-        "outputId": "bda4bbf4-13a4-430b-f236-9b841cc9b69a"
-      },
-      "outputs": [],
-      "source": [
-        "# This is only needed in Google Colab, in a dev container, everything should be installed already\n",
-        "!pip3 install python-terrier"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "O-3NyXcP0ZGu"
-      },
-      "source": [
-        "## Our Scenario\n",
-        "\n",
-        "We want to build a search engine to support web developers working with CSS.\n",
-        "\n",
-        "Our search engine has the following 3 documents:\n",
-        "\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 1,
-      "metadata": {
-        "id": "zfGKcxfcgrF6"
-      },
-      "outputs": [],
-      "source": [
-        "documents = [\n",
-        "    {'docno': 'd1', 'text': 'In CSS, ::before creates a pseudo-element that is the first child of the selected element.'},\n",
-        "    {'docno': 'd2', 'text': 'In CSS, ::after creates a pseudo-element that is the last child of the selected element.'},\n",
-        "    {'docno': 'd3', 'text': 'The ::first-line CSS pseudo-element applies styles to the first line of a block-level element.'}\n",
-        "]"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "PePGP6ll2WNd"
-      },
-      "source": [
-        "We create an index containing our three documents and use BM25 as retrieval model:"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 2,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "Sfh3QpYFhDsc",
-        "outputId": "e256317a-70f8-4095-9d6e-6f7a65e79db9"
-      },
-      "outputs": [
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8\n",
-            "\n",
-            "No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.\n"
-          ]
-        }
-      ],
-      "source": [
-        "import pyterrier as pt\n",
-        "if not pt.started():\n",
-        "    pt.init()\n",
-        "\n",
-        "def create_index(documents):\n",
-        "    indexer = pt.IterDictIndexer(\"/tmp/index\", overwrite=True, meta={'docno': 100, 'text': 20480})\n",
-        "    index_ref = indexer.index(documents)\n",
-        "    return pt.IndexFactory.of(index_ref)\n",
-        "\n",
-        "index = create_index(documents)"
-      ]
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "LCoilJP00Boh"
+   },
+   "source": [
+    "# IR Lab Tutorial: Stopword Lists\n",
+    "\n",
+    "This tutorial shows how to configure and use custom stopword lists in PyTerrier.\n",
+    "\n",
+    "**Attention:** The scenario below is cherry-picked to explain the concept of stopword lists with a minimal example.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Z5O2-TXL0Vrl"
+   },
+   "source": [
+    "## Preparation: Install dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
-    {
-      "cell_type": "code",
-      "execution_count": 3,
-      "metadata": {
-        "id": "zhecz8fj42BX"
-      },
-      "outputs": [],
-      "source": [
-        "bm25 = pt.BatchRetrieve(index, wmodel=\"BM25\")"
-      ]
+    "id": "_8E377GCgB_q",
+    "outputId": "bda4bbf4-13a4-430b-f236-9b841cc9b69a"
+   },
+   "outputs": [],
+   "source": [
+    "# This is only needed in Google Colab, in a dev container, everything should be installed already\n",
+    "!pip3 install python-terrier"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "O-3NyXcP0ZGu"
+   },
+   "source": [
+    "## Our Scenario\n",
+    "\n",
+    "We want to build a search engine to support web developers working with CSS.\n",
+    "\n",
+    "Our search engine has the following 3 documents:\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "id": "zfGKcxfcgrF6"
+   },
+   "outputs": [],
+   "source": [
+    "documents = [\n",
+    "    {'docno': 'd1', 'text': 'In CSS, ::before creates a pseudo-element that is the first child of the selected element.'},\n",
+    "    {'docno': 'd2', 'text': 'In CSS, ::after creates a pseudo-element that is the last child of the selected element.'},\n",
+    "    {'docno': 'd3', 'text': 'The ::first-line CSS pseudo-element applies styles to the first line of a block-level element.'}\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "PePGP6ll2WNd"
+   },
+   "source": [
+    "We create an index containing our three documents and use BM25 as retrieval model:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "Sfh3QpYFhDsc",
+    "outputId": "e256317a-70f8-4095-9d6e-6f7a65e79db9"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "TozJqJ_73A92"
-      },
-      "source": [
-        "## The Problem\n",
-        "\n",
-        "Our search engine is now in operation for a while and we already received some positive feedback.\n",
-        "Still, we received some mails that complained that our search engine retrieves no relevant results for queries like \"before\", \"css before\", \"after\", and \"CSS after\".\n",
-        "\n",
-        "We wonder why this is the case, because our index contains relevant documents for all four mentioned queries: (1) document `d1` is relevant for queries like \"before\" and \"css before\", and (2) document `d2` is relevant for queries like \"after\", and \"CSS after\".\n",
-        "\n",
-        "Lets look into the problem:"
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8\n",
+      "\n",
+      "No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pyterrier as pt\n",
+    "if not pt.started():\n",
+    "    pt.init()\n",
+    "\n",
+    "def create_index(documents):\n",
+    "    indexer = pt.IterDictIndexer(\"/tmp/index\", overwrite=True, meta={'docno': 100, 'text': 20480})\n",
+    "    index_ref = indexer.index(documents)\n",
+    "    return pt.IndexFactory.of(index_ref)\n",
+    "\n",
+    "index = create_index(documents)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "id": "zhecz8fj42BX"
+   },
+   "outputs": [],
+   "source": [
+    "bm25 = pt.BatchRetrieve(index, wmodel=\"BM25\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "TozJqJ_73A92"
+   },
+   "source": [
+    "## The Problem\n",
+    "\n",
+    "Our search engine is now in operation for a while and we already received some positive feedback.\n",
+    "Still, we received some mails that complained that our search engine retrieves no relevant results for queries like \"before\", \"css before\", \"after\", and \"CSS after\".\n",
+    "\n",
+    "We wonder why this is the case, because our index contains relevant documents for all four mentioned queries: (1) document `d1` is relevant for queries like \"before\" and \"css before\", and (2) document `d2` is relevant for queries like \"after\", and \"CSS after\".\n",
+    "\n",
+    "Lets look into the problem:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 53
     },
+    "id": "I_Rb-3022o0Y",
+    "outputId": "dc04b69e-b51a-4328-a55f-3efd25077a5a"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 4,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 53
-        },
-        "id": "I_Rb-3022o0Y",
-        "outputId": "dc04b69e-b51a-4328-a55f-3efd25077a5a"
-      },
-      "outputs": [
-        {
-          "data": {
-            "text/html": [
-              "<div>\n",
-              "<style scoped>\n",
-              "    .dataframe tbody tr th:only-of-type {\n",
-              "        vertical-align: middle;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe tbody tr th {\n",
-              "        vertical-align: top;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe thead th {\n",
-              "        text-align: right;\n",
-              "    }\n",
-              "</style>\n",
-              "<table border=\"1\" class=\"dataframe\">\n",
-              "  <thead>\n",
-              "    <tr style=\"text-align: right;\">\n",
-              "      <th></th>\n",
-              "      <th>docid</th>\n",
-              "      <th>docno</th>\n",
-              "      <th>rank</th>\n",
-              "      <th>score</th>\n",
-              "      <th>qid</th>\n",
-              "      <th>query</th>\n",
-              "    </tr>\n",
-              "  </thead>\n",
-              "  <tbody>\n",
-              "  </tbody>\n",
-              "</table>\n",
-              "</div>"
-            ],
-            "text/plain": [
-              "Empty DataFrame\n",
-              "Columns: [docid, docno, rank, score, qid, query]\n",
-              "Index: []"
-            ]
-          },
-          "execution_count": 4,
-          "metadata": {},
-          "output_type": "execute_result"
-        }
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>docid</th>\n",
+       "      <th>docno</th>\n",
+       "      <th>rank</th>\n",
+       "      <th>score</th>\n",
+       "      <th>qid</th>\n",
+       "      <th>query</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
       ],
-      "source": [
-        "# searching for before returns no results\n",
-        "bm25.search(\"before\")"
+      "text/plain": [
+       "Empty DataFrame\n",
+       "Columns: [docid, docno, rank, score, qid, query]\n",
+       "Index: []"
       ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# searching for before returns no results\n",
+    "bm25.search(\"before\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 53
     },
+    "id": "BilCBjOShNKK",
+    "outputId": "a04ec653-437d-432f-f449-ad7d9d8d1b34"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 5,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 53
-        },
-        "id": "BilCBjOShNKK",
-        "outputId": "a04ec653-437d-432f-f449-ad7d9d8d1b34"
-      },
-      "outputs": [
-        {
-          "data": {
-            "text/html": [
-              "<div>\n",
-              "<style scoped>\n",
-              "    .dataframe tbody tr th:only-of-type {\n",
-              "        vertical-align: middle;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe tbody tr th {\n",
-              "        vertical-align: top;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe thead th {\n",
-              "        text-align: right;\n",
-              "    }\n",
-              "</style>\n",
-              "<table border=\"1\" class=\"dataframe\">\n",
-              "  <thead>\n",
-              "    <tr style=\"text-align: right;\">\n",
-              "      <th></th>\n",
-              "      <th>docid</th>\n",
-              "      <th>docno</th>\n",
-              "      <th>rank</th>\n",
-              "      <th>score</th>\n",
-              "      <th>qid</th>\n",
-              "      <th>query</th>\n",
-              "    </tr>\n",
-              "  </thead>\n",
-              "  <tbody>\n",
-              "  </tbody>\n",
-              "</table>\n",
-              "</div>"
-            ],
-            "text/plain": [
-              "Empty DataFrame\n",
-              "Columns: [docid, docno, rank, score, qid, query]\n",
-              "Index: []"
-            ]
-          },
-          "execution_count": 5,
-          "metadata": {},
-          "output_type": "execute_result"
-        }
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>docid</th>\n",
+       "      <th>docno</th>\n",
+       "      <th>rank</th>\n",
+       "      <th>score</th>\n",
+       "      <th>qid</th>\n",
+       "      <th>query</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
       ],
-      "source": [
-        "# searching for after returns no results\n",
-        "bm25.search(\"after\")"
+      "text/plain": [
+       "Empty DataFrame\n",
+       "Columns: [docid, docno, rank, score, qid, query]\n",
+       "Index: []"
       ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# searching for after returns no results\n",
+    "bm25.search(\"after\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 144
     },
+    "id": "YrYrFsq55D3S",
+    "outputId": "f7c0cd39-5ac6-4a36-bdc6-3dcb42d561df"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 6,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 144
-        },
-        "id": "YrYrFsq55D3S",
-        "outputId": "f7c0cd39-5ac6-4a36-bdc6-3dcb42d561df"
-      },
-      "outputs": [
-        {
-          "data": {
-            "text/html": [
-              "<div>\n",
-              "<style scoped>\n",
-              "    .dataframe tbody tr th:only-of-type {\n",
-              "        vertical-align: middle;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe tbody tr th {\n",
-              "        vertical-align: top;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe thead th {\n",
-              "        text-align: right;\n",
-              "    }\n",
-              "</style>\n",
-              "<table border=\"1\" class=\"dataframe\">\n",
-              "  <thead>\n",
-              "    <tr style=\"text-align: right;\">\n",
-              "      <th></th>\n",
-              "      <th>qid</th>\n",
-              "      <th>docid</th>\n",
-              "      <th>docno</th>\n",
-              "      <th>rank</th>\n",
-              "      <th>score</th>\n",
-              "      <th>query</th>\n",
-              "    </tr>\n",
-              "  </thead>\n",
-              "  <tbody>\n",
-              "    <tr>\n",
-              "      <th>0</th>\n",
-              "      <td>1</td>\n",
-              "      <td>2</td>\n",
-              "      <td>d3</td>\n",
-              "      <td>0</td>\n",
-              "      <td>-2.513562</td>\n",
-              "      <td>css after</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>1</th>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>d1</td>\n",
-              "      <td>1</td>\n",
-              "      <td>-2.981605</td>\n",
-              "      <td>css after</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>2</th>\n",
-              "      <td>1</td>\n",
-              "      <td>1</td>\n",
-              "      <td>d2</td>\n",
-              "      <td>2</td>\n",
-              "      <td>-2.981605</td>\n",
-              "      <td>css after</td>\n",
-              "    </tr>\n",
-              "  </tbody>\n",
-              "</table>\n",
-              "</div>"
-            ],
-            "text/plain": [
-              "  qid  docid docno  rank     score      query\n",
-              "0   1      2    d3     0 -2.513562  css after\n",
-              "1   1      0    d1     1 -2.981605  css after\n",
-              "2   1      1    d2     2 -2.981605  css after"
-            ]
-          },
-          "execution_count": 6,
-          "metadata": {},
-          "output_type": "execute_result"
-        }
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>qid</th>\n",
+       "      <th>docid</th>\n",
+       "      <th>docno</th>\n",
+       "      <th>rank</th>\n",
+       "      <th>score</th>\n",
+       "      <th>query</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>2</td>\n",
+       "      <td>d3</td>\n",
+       "      <td>0</td>\n",
+       "      <td>-2.513562</td>\n",
+       "      <td>css after</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>d1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>-2.981605</td>\n",
+       "      <td>css after</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>d2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>-2.981605</td>\n",
+       "      <td>css after</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
       ],
-      "source": [
-        "# searching for css before returns results, but the relevant document d2 is only on the last position\n",
-        "bm25.search(\"css after\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "TD0gHVIiz_Pf"
-      },
-      "source": [
-        "## The Solution\n",
-        "\n",
-        "After some more debugging, we found out that [after](https://github.com/terrier-org/terrier-desktop/blob/master/share/stopword-list.txt#L206) and [before](https://github.com/terrier-org/terrier-desktop/blob/master/share/stopword-list.txt#L332) are both on the default stopword list and therefore removed from the documents and index.\n",
-        "\n",
-        "To address this problem systematically, we create a small Cranfield-Style collection to measure if our bugfixes to the stopword list improve the retrieval:"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 7,
-      "metadata": {
-        "id": "GZSMnePh7EBd"
-      },
-      "outputs": [],
-      "source": [
-        "# The information needs that we want to test\n",
-        "import pandas as pd\n",
-        "\n",
-        "topics = pd.DataFrame([\n",
-        "    {'qid': '1', 'query': 'before'},\n",
-        "    {'qid': '2', 'query': 'css before'},\n",
-        "    {'qid': '3', 'query': 'after'},\n",
-        "    {'qid': '4', 'query': 'CSS after'},\n",
-        "])\n",
-        "\n",
-        "qrels = pd.DataFrame([\n",
-        "    {'qid': '1', 'docno': 'd1', 'relevance': 1}, #d1 is the only relevant document for query 1\n",
-        "    {'qid': '2', 'docno': 'd1', 'relevance': 1}, #d1 is the only relevant document for query 2\n",
-        "    {'qid': '3', 'docno': 'd2', 'relevance': 1}, #d2 is the only relevant document for query 3\n",
-        "    {'qid': '3', 'docno': 'd2', 'relevance': 1}, #d2 is the only relevant document for query 3\n",
-        "])"
+      "text/plain": [
+       "  qid  docid docno  rank     score      query\n",
+       "0   1      2    d3     0 -2.513562  css after\n",
+       "1   1      0    d1     1 -2.981605  css after\n",
+       "2   1      1    d2     2 -2.981605  css after"
       ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# searching for css before returns results, but the relevant document d2 is only on the last position\n",
+    "bm25.search(\"css after\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "TD0gHVIiz_Pf"
+   },
+   "source": [
+    "## The Solution\n",
+    "\n",
+    "After some more debugging, we found out that [after](https://github.com/terrier-org/terrier-desktop/blob/master/share/stopword-list.txt#L206) and [before](https://github.com/terrier-org/terrier-desktop/blob/master/share/stopword-list.txt#L332) are both on the default stopword list and therefore removed from the documents and index.\n",
+    "\n",
+    "To address this problem systematically, we create a small Cranfield-Style collection to measure if our bugfixes to the stopword list improve the retrieval:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "id": "GZSMnePh7EBd"
+   },
+   "outputs": [],
+   "source": [
+    "# The information needs that we want to test\n",
+    "import pandas as pd\n",
+    "\n",
+    "topics = pd.DataFrame([\n",
+    "    {'qid': '1', 'query': 'before'},\n",
+    "    {'qid': '2', 'query': 'css before'},\n",
+    "    {'qid': '3', 'query': 'after'},\n",
+    "    {'qid': '4', 'query': 'CSS after'},\n",
+    "])\n",
+    "\n",
+    "qrels = pd.DataFrame([\n",
+    "    {'qid': '1', 'docno': 'd1', 'relevance': 1}, #d1 is the only relevant document for query 1\n",
+    "    {'qid': '2', 'docno': 'd1', 'relevance': 1}, #d1 is the only relevant document for query 2\n",
+    "    {'qid': '3', 'docno': 'd2', 'relevance': 1}, #d2 is the only relevant document for query 3\n",
+    "    {'qid': '4', 'docno': 'd2', 'relevance': 1}, #d2 is the only relevant document for query 4\n",
+    "])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 81
     },
+    "id": "bamLeFLH8GdT",
+    "outputId": "95ef2571-4398-4ae4-f1fb-4bc5559eb593"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 8,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 81
-        },
-        "id": "bamLeFLH8GdT",
-        "outputId": "95ef2571-4398-4ae4-f1fb-4bc5559eb593"
-      },
-      "outputs": [
-        {
-          "data": {
-            "text/html": [
-              "<div>\n",
-              "<style scoped>\n",
-              "    .dataframe tbody tr th:only-of-type {\n",
-              "        vertical-align: middle;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe tbody tr th {\n",
-              "        vertical-align: top;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe thead th {\n",
-              "        text-align: right;\n",
-              "    }\n",
-              "</style>\n",
-              "<table border=\"1\" class=\"dataframe\">\n",
-              "  <thead>\n",
-              "    <tr style=\"text-align: right;\">\n",
-              "      <th></th>\n",
-              "      <th>name</th>\n",
-              "      <th>ndcg_cut_3</th>\n",
-              "      <th>P_1</th>\n",
-              "    </tr>\n",
-              "  </thead>\n",
-              "  <tbody>\n",
-              "    <tr>\n",
-              "      <th>0</th>\n",
-              "      <td>BR(BM25)</td>\n",
-              "      <td>0.166667</td>\n",
-              "      <td>0.0</td>\n",
-              "    </tr>\n",
-              "  </tbody>\n",
-              "</table>\n",
-              "</div>"
-            ],
-            "text/plain": [
-              "       name  ndcg_cut_3  P_1\n",
-              "0  BR(BM25)    0.166667  0.0"
-            ]
-          },
-          "execution_count": 8,
-          "metadata": {},
-          "output_type": "execute_result"
-        }
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name</th>\n",
+       "      <th>ndcg_cut_3</th>\n",
+       "      <th>P_1</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>BR(BM25)</td>\n",
+       "      <td>0.166667</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
       ],
-      "source": [
-        "pt.Experiment([bm25], topics, qrels, eval_metrics=['ndcg_cut_3', 'P_1'])"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "zrxIUNaQ8BmI"
-      },
-      "source": [
-        "Alright, now that we can measure the effectiveness, lets try to improve the effectiveness.\n",
-        "\n",
-        "By thinking about the problem, we came to the conclusion that our stopword list should contain terms like \"the\", but also \"css\" should be a stopword because all our documents are on CSS.\n",
-        "\n",
-        "To implement this, we store the terms the and css in a file called custom-stopwords.txt and we configure the `stopwords.filename` property of pyterrier so that our new stopword list is used."
+      "text/plain": [
+       "       name  ndcg_cut_3  P_1\n",
+       "0  BR(BM25)    0.166667  0.0"
       ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pt.Experiment([bm25], topics, qrels, eval_metrics=['ndcg_cut_3', 'P_1'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "zrxIUNaQ8BmI"
+   },
+   "source": [
+    "Alright, now that we can measure the effectiveness, lets try to improve the effectiveness.\n",
+    "\n",
+    "By thinking about the problem, we came to the conclusion that our stopword list should contain terms like \"the\", but also \"css\" should be a stopword because all our documents are on CSS.\n",
+    "\n",
+    "To implement this, we store the terms the and css in a file called custom-stopwords.txt and we configure the `stopwords.filename` property of pyterrier so that our new stopword list is used."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
-    {
-      "cell_type": "code",
-      "execution_count": 9,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "0jq3M34A9ODA",
-        "outputId": "e9a247b0-6ad9-42e3-8d86-757596d1510f"
-      },
-      "outputs": [],
-      "source": [
-        "def create_index(documents, stopwords):\n",
-        "    indexer = pt.IterDictIndexer(\"/tmp/index\", overwrite=True, meta={'docno': 100, 'text': 20480}, stopwords=stopwords)\n",
-        "    index_ref = indexer.index(documents)\n",
-        "    return pt.IndexFactory.of(index_ref)\n",
-        "\n",
-        "index = create_index(documents, stopwords=['the', 'css'])"
-      ]
+    "id": "0jq3M34A9ODA",
+    "outputId": "e9a247b0-6ad9-42e3-8d86-757596d1510f"
+   },
+   "outputs": [],
+   "source": [
+    "def create_index(documents, stopwords):\n",
+    "    indexer = pt.IterDictIndexer(\"/tmp/index-custom\", overwrite=True, meta={'docno': 100, 'text': 20480}, stopwords=stopwords)\n",
+    "    index_ref = indexer.index(documents)\n",
+    "    return pt.IndexFactory.of(index_ref)\n",
+    "\n",
+    "index_custom = create_index(documents, stopwords=['the', 'css'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
-    {
-      "cell_type": "code",
-      "execution_count": 10,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "uzVKjLPzg7LE",
-        "outputId": "7adaed24-2d64-48c8-eba7-4630ecbda4bd"
-      },
-      "outputs": [],
-      "source": [
-        "bm25 = pt.BatchRetrieve(index, wmodel=\"BM25\")"
-      ]
+    "id": "uzVKjLPzg7LE",
+    "outputId": "7adaed24-2d64-48c8-eba7-4630ecbda4bd"
+   },
+   "outputs": [],
+   "source": [
+    "bm25_custom = pt.BatchRetrieve(index_custom, wmodel=\"BM25\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 81
     },
+    "id": "dzA-J__zlMzL",
+    "outputId": "0e08bf36-71ee-477a-a98d-6a02c4ef234e"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 11,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 81
-        },
-        "id": "dzA-J__zlMzL",
-        "outputId": "0e08bf36-71ee-477a-a98d-6a02c4ef234e"
-      },
-      "outputs": [
-        {
-          "data": {
-            "text/html": [
-              "<div>\n",
-              "<style scoped>\n",
-              "    .dataframe tbody tr th:only-of-type {\n",
-              "        vertical-align: middle;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe tbody tr th {\n",
-              "        vertical-align: top;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe thead th {\n",
-              "        text-align: right;\n",
-              "    }\n",
-              "</style>\n",
-              "<table border=\"1\" class=\"dataframe\">\n",
-              "  <thead>\n",
-              "    <tr style=\"text-align: right;\">\n",
-              "      <th></th>\n",
-              "      <th>name</th>\n",
-              "      <th>ndcg_cut_3</th>\n",
-              "      <th>P_1</th>\n",
-              "    </tr>\n",
-              "  </thead>\n",
-              "  <tbody>\n",
-              "    <tr>\n",
-              "      <th>0</th>\n",
-              "      <td>BR(BM25)</td>\n",
-              "      <td>1.0</td>\n",
-              "      <td>1.0</td>\n",
-              "    </tr>\n",
-              "  </tbody>\n",
-              "</table>\n",
-              "</div>"
-            ],
-            "text/plain": [
-              "       name  ndcg_cut_3  P_1\n",
-              "0  BR(BM25)         1.0  1.0"
-            ]
-          },
-          "execution_count": 11,
-          "metadata": {},
-          "output_type": "execute_result"
-        }
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name</th>\n",
+       "      <th>ndcg_cut_3</th>\n",
+       "      <th>P_1</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>BR(BM25)</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
       ],
-      "source": [
-        "pt.Experiment([bm25], topics, qrels, eval_metrics=['ndcg_cut_3', 'P_1'])"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "Lbsw3jYz-v-k"
-      },
-      "source": [
-        "# Summary\n",
-        "\n",
-        "We made quite some improvement with adjusting our stopword list (nDCG@3 improved from 0.167 to 1.0 and Precision@1 improved from 0.0 to 1.0).\n",
-        "\n",
-        "To summarize everything, please answer the following three questions:\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "### Question 1:\n",
-        "\n",
-        "Is Stopword-removal a Precision-Oriented or a Recall-Oriented technique?\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "### TODO: Add your Solution"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "### Question 2:\n",
-        "\n",
-        "There are many Different Stopword lists out there, please find 3 stopword lists and skim over them, do you spot obvious differences or surprising terms?\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "### TODO: Add your Solution"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "\n",
-        "### Question 3:\n",
-        "\n",
-        "Do you know famous phrases that are challenging to retrieve when we apply stopword removal?\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "### TODO: Add your Solution"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "### Question 4:\n",
-        "\n",
-        "Given the task of building a domain-specific search engine, do you think ChatGPT would be useful to construct an effective stopword list? What would be the benefits, what would be the caveats? How would you use ChatGPT for that?\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "### TODO: Add your Solution"
+      "text/plain": [
+       "       name  ndcg_cut_3  P_1\n",
+       "0  BR(BM25)         1.0  1.0"
       ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
     }
-  ],
-  "metadata": {
-    "colab": {
-      "collapsed_sections": [
-        "Z5O2-TXL0Vrl"
-      ],
-      "provenance": []
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    },
-    "language_info": {
-      "codemirror_mode": {
-        "name": "ipython",
-        "version": 3
-      },
-      "file_extension": ".py",
-      "mimetype": "text/x-python",
-      "name": "python",
-      "nbconvert_exporter": "python",
-      "pygments_lexer": "ipython3",
-      "version": "3.10.12"
-    }
+   ],
+   "source": [
+    "pt.Experiment([bm25_custom], topics, qrels, eval_metrics=['ndcg_cut_3', 'P_1'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "With PyTerrier, we can also directly compare the two options (default stopwords and custom stopwords)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pt.Experiment([bm25, bm25_custom], topics, qrels, eval_metrics=['ndcg_cut_3', 'P_1'], names=[\"default stopwords\", \"custom stopwords\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Lbsw3jYz-v-k"
+   },
+   "source": [
+    "# Summary\n",
+    "\n",
+    "We made quite some improvement with adjusting our stopword list (nDCG@3 improved from 0.167 to 1.0 and Precision@1 improved from 0.0 to 1.0).\n",
+    "\n",
+    "To summarize everything, please answer the following three questions:\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Question 1:\n",
+    "\n",
+    "Is Stopword-removal a Precision-oriented or a Recall-oriented technique (i.e., does removing certain stopwords rather improve Precision or Recall)?\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### TODO: Add your solution."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Question 2:\n",
+    "\n",
+    "There are many different stopword lists out there, please find three stopword lists and skim over them, do you spot obvious differences or surprising terms?\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### TODO: Add your solution."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "### Question 3:\n",
+    "\n",
+    "Do you know famous phrases that are challenging to retrieve when we apply stopword removal?\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### TODO: Add your solution."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Question 4:\n",
+    "\n",
+    "Given the task of building a domain-specific search engine, do you think ChatGPT would be useful to construct an effective stopword list? What would be the benefits, what would be the caveats? How would you use ChatGPT for that?\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### TODO: Add your solution."
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "collapsed_sections": [
+    "Z5O2-TXL0Vrl"
+   ],
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
   },
-  "nbformat": 4,
-  "nbformat_minor": 0
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
 }