-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Christian A
committed
Jun 25, 2024
1 parent
710eadd
commit 5f6a2c8
Showing
1 changed file
with
252 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,252 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8\n", | ||
"\n", | ||
"No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Imports\n", | ||
"from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run\n", | ||
"from tira.rest_api_client import Client\n", | ||
"\n", | ||
"import pyterrier as pt\n", | ||
"\n", | ||
"# Make sure PyTerrier is loaded, then create a REST client to the TIRA platform for retrieving the pre-indexed data.\n", | ||
"ensure_pyterrier_is_loaded()\n", | ||
"tira = Client()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# The dataset: the union of the IR Anthology and the ACL Anthology\n", | ||
"# This line creates an IRDSDataset object and registers it under the name provided as an argument.\n", | ||
"dataset = 'antique-test-20230107-training'\n", | ||
"\n", | ||
"pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')\n", | ||
"\n", | ||
"# A (pre-built) PyTerrier index loaded from TIRA\n", | ||
"index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', pt_dataset)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"bm25 = pt.BatchRetrieve(index, wmodel=\"BM25\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Stopwords" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def create_index(documents, stopwords):\n", | ||
" indexer = pt.IterDictIndexer(\"/tmp/index\", overwrite=True, meta={'docno': 100, 'text': 20480}, stopwords=customStopwords)\n", | ||
" index_ref = indexer.index(documents)\n", | ||
" return pt.IndexFactory.of(index_ref)\n", | ||
"\n", | ||
"customStopwords =[\n", | ||
" 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', \n", | ||
" 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', \n", | ||
" 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', \n", | ||
" 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', \n", | ||
" 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', \n", | ||
" 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', \n", | ||
" 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', \n", | ||
" 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', \n", | ||
" 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', \n", | ||
" 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', \n", | ||
" 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', \n", | ||
" 'will', 'just', 'don', 'should', 'now'\n", | ||
"]\n", | ||
"\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 71%|███████ | 90005/126958 [00:22<00:08, 4212.71it/s]" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"19:23:10.931 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer - Adding an empty document to the index (2020.mir_conference-2020.1) - further warnings are suppressed\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:31<00:00, 4002.84it/s]\n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"19:23:24.124 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 3 empty documents\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"index = create_index(pt_dataset.get_corpus_iter(), customStopwords)\n", | ||
"\n", | ||
"bm25_stopwords = pt.BatchRetrieve(index, wmodel=\"BM25\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Evaluation" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/html": [ | ||
"<div>\n", | ||
"<style scoped>\n", | ||
" .dataframe tbody tr th:only-of-type {\n", | ||
" vertical-align: middle;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe tbody tr th {\n", | ||
" vertical-align: top;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe thead th {\n", | ||
" text-align: right;\n", | ||
" }\n", | ||
"</style>\n", | ||
"<table border=\"1\" class=\"dataframe\">\n", | ||
" <thead>\n", | ||
" <tr style=\"text-align: right;\">\n", | ||
" <th></th>\n", | ||
" <th>name</th>\n", | ||
" <th>recall_1000</th>\n", | ||
" <th>ndcg_cut_5</th>\n", | ||
" <th>ndcg_cut.10</th>\n", | ||
" <th>recip_rank</th>\n", | ||
" </tr>\n", | ||
" </thead>\n", | ||
" <tbody>\n", | ||
" <tr>\n", | ||
" <th>0</th>\n", | ||
" <td>BM25_Stopwords</td>\n", | ||
" <td>0.83441</td>\n", | ||
" <td>0.382805</td>\n", | ||
" <td>0.367656</td>\n", | ||
" <td>0.581239</td>\n", | ||
" </tr>\n", | ||
" </tbody>\n", | ||
"</table>\n", | ||
"</div>" | ||
], | ||
"text/plain": [ | ||
" name recall_1000 ndcg_cut_5 ndcg_cut.10 recip_rank\n", | ||
"0 BM25_Stopwords 0.83441 0.382805 0.367656 0.581239" | ||
] | ||
}, | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"pt.Experiment(\n", | ||
" retr_systems=[bm25_stopwords],\n", | ||
" topics=pt_dataset.get_topics('text'),\n", | ||
" qrels=pt_dataset.get_qrels(),\n", | ||
" names=['BM25_Stopwords'],\n", | ||
" eval_metrics=['recall_1000', 'ndcg_cut_5', 'ndcg_cut.10', 'recip_rank']\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"run = bm25_stopwords(pt_dataset.get_topics('text'))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"The run file is normalized outside the TIRA sandbox, I will store it at \"../runs\".\n", | ||
"Done. run file is stored under \"../runs/run.txt\".\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"persist_and_normalize_run(run, system_name='bm25-baseline', default_output='../runs')" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.12" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |