-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
New Try for Porter2Stemmer & CleanedUp old Files
- Loading branch information
Christian A
committed
Jun 22, 2024
1 parent
b73b2a7
commit 30ceb2e
Showing
9 changed files
with
67,510 additions
and
64,117 deletions.
There are no files selected for viewing
File renamed without changes.
File renamed without changes.
311 changes: 311 additions & 0 deletions
311
Group-8-Retrieval-System/DontWORK/Porter2Stemmer copy.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,311 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 18, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Requirement already satisfied: tira in /usr/local/lib/python3.10/dist-packages (0.0.132)\n", | ||
"Requirement already satisfied: ir-datasets in /usr/local/lib/python3.10/dist-packages (0.5.5)\n", | ||
"Requirement already satisfied: python-terrier in /usr/local/lib/python3.10/dist-packages (0.10.0)\n", | ||
"Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (3.8.1)\n", | ||
"Requirement already satisfied: docker==6.*,>=6.0.0 in /usr/local/lib/python3.10/dist-packages (from tira) (6.1.3)\n", | ||
"Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from tira) (4.66.1)\n", | ||
"Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from tira) (2.1.3)\n", | ||
"Requirement already satisfied: requests==2.*,>=2.26 in /usr/local/lib/python3.10/dist-packages (from tira) (2.31.0)\n", | ||
"Requirement already satisfied: websocket-client>=0.32.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (1.7.0)\n", | ||
"Requirement already satisfied: packaging>=14.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (23.2)\n", | ||
"Requirement already satisfied: urllib3>=1.26.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (2.1.0)\n", | ||
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests==2.*,>=2.26->tira) (3.6)\n", | ||
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests==2.*,>=2.26->tira) (3.3.2)\n", | ||
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests==2.*,>=2.26->tira) (2023.11.17)\n", | ||
"Requirement already satisfied: beautifulsoup4>=4.4.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (4.12.2)\n", | ||
"Requirement already satisfied: lxml>=4.5.2 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (4.9.3)\n", | ||
"Requirement already satisfied: inscriptis>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (2.3.2)\n", | ||
"Requirement already satisfied: pyyaml>=5.3.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (6.0.1)\n", | ||
"Requirement already satisfied: lz4>=3.1.10 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (4.3.2)\n", | ||
"Requirement already satisfied: pyautocorpus>=0.1.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.1.12)\n", | ||
"Requirement already satisfied: unlzw3>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.2.2)\n", | ||
"Requirement already satisfied: trec-car-tools>=2.5.4 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (2.6)\n", | ||
"Requirement already satisfied: numpy>=1.18.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (1.26.2)\n", | ||
"Requirement already satisfied: warc3-wet-clueweb09>=0.2.5 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.2.5)\n", | ||
"Requirement already satisfied: zlib-state>=0.1.3 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.1.6)\n", | ||
"Requirement already satisfied: warc3-wet>=0.2.3 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.2.3)\n", | ||
"Requirement already satisfied: ijson>=3.1.3 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (3.2.3)\n", | ||
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (3.1.2)\n", | ||
"Requirement already satisfied: nptyping==1.4.4 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.4.4)\n", | ||
"Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.3.2)\n", | ||
"Requirement already satisfied: wget in /usr/local/lib/python3.10/dist-packages (from python-terrier) (3.2)\n", | ||
"Requirement already satisfied: pytrec-eval-terrier>=0.5.3 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.5.6)\n", | ||
"Requirement already satisfied: matchpy in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.5.5)\n", | ||
"Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.3.2)\n", | ||
"Requirement already satisfied: ir-measures>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.3.3)\n", | ||
"Requirement already satisfied: chest in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.2.3)\n", | ||
"Requirement already satisfied: deprecated in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.2.14)\n", | ||
"Requirement already satisfied: pyjnius>=1.4.2 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.6.1)\n", | ||
"Requirement already satisfied: dill in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.3.7)\n", | ||
"Requirement already satisfied: statsmodels in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.14.0)\n", | ||
"Requirement already satisfied: more-itertools in /usr/local/lib/python3.10/dist-packages (from python-terrier) (10.1.0)\n", | ||
"Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.11.4)\n", | ||
"Requirement already satisfied: typish>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from nptyping==1.4.4->python-terrier) (1.9.3)\n", | ||
"Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk) (2024.5.15)\n", | ||
"Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk) (8.1.7)\n", | ||
"Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4>=4.4.1->ir-datasets) (2.5)\n", | ||
"Requirement already satisfied: cwl-eval>=1.0.10 in /usr/local/lib/python3.10/dist-packages (from ir-measures>=0.3.1->python-terrier) (1.0.12)\n", | ||
"Requirement already satisfied: cbor>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from trec-car-tools>=2.5.4->ir-datasets) (1.0.0)\n", | ||
"Requirement already satisfied: heapdict in /usr/local/lib/python3.10/dist-packages (from chest->python-terrier) (1.0.1)\n", | ||
"Requirement already satisfied: wrapt<2,>=1.10 in /usr/local/lib/python3.10/dist-packages (from deprecated->python-terrier) (1.16.0)\n", | ||
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->python-terrier) (2.1.3)\n", | ||
"Requirement already satisfied: multiset<3.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from matchpy->python-terrier) (2.1.1)\n", | ||
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->tira) (2023.3.post1)\n", | ||
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->tira) (2.8.2)\n", | ||
"Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas->tira) (2023.3)\n", | ||
"Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->python-terrier) (3.2.0)\n", | ||
"Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.10/dist-packages (from statsmodels->python-terrier) (0.5.4)\n", | ||
"Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from patsy>=0.5.2->statsmodels->python-terrier) (1.16.0)\n", | ||
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", | ||
"\u001b[0m" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Only needed on GoLab: install the dependencies into the kernel's own environment.\n", | ||
"# %pip (unlike !pip3) always targets the interpreter this notebook runs on; the pins\n", | ||
"# are the versions this notebook was last run with.\n", | ||
"%pip install -q tira==0.0.132 ir-datasets==0.5.5 python-terrier==0.10.0 nltk==3.8.1\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 19, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Imports\n", | ||
"from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run\n", | ||
"from tira.rest_api_client import Client\n", | ||
"from nltk.stem import SnowballStemmer\n", | ||
"\n", | ||
"import pyterrier as pt\n", | ||
"import pandas as pd\n", | ||
"\n", | ||
"\n", | ||
"# Initialise PyTerrier through the TIRA helper and create a TIRA client\n", | ||
"ensure_pyterrier_is_loaded()\n", | ||
"tira = Client()\n", | ||
"\n", | ||
"# Start PyTerrier if it has not been started yet\n", | ||
"if not pt.started():\n", | ||
" pt.init()\n", | ||
"\n", | ||
"# Pandas display settings\n", | ||
"pd.set_option('display.max_colwidth', 0)\n", | ||
"\n", | ||
"# Fetch the dataset from the TIRA platform\n", | ||
"pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 20, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Custom SnowballStemmer (Porter2)\n", | ||
"class NLTKStemmerWrapper:\n", | ||
"    \"\"\"Callable that stems whitespace-separated text with NLTK's English SnowballStemmer.\"\"\"\n", | ||
"\n", | ||
"    def __init__(self):\n", | ||
"        # One stemmer instance is created up front and reused for every call.\n", | ||
"        self.stemmer = SnowballStemmer(\"english\")\n", | ||
"\n", | ||
"    def __call__(self, text):\n", | ||
"        \"\"\"Return `text` with every whitespace-separated token stemmed, joined by single spaces.\"\"\"\n", | ||
"        stem_word = self.stemmer.stem\n", | ||
"        return \" \".join(map(stem_word, text.split()))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 21, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Instantiate the custom stemmer\n", | ||
"custom_stemmer = NLTKStemmerWrapper()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Preprocessing: apply the SnowballStemmer to the text\n", | ||
"def stem_text(text):\n", | ||
"    \"\"\"Return `text` after passing it through the module-level `custom_stemmer`.\"\"\"\n", | ||
"    stemmed = custom_stemmer(text)\n", | ||
"    return stemmed" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 22, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Build the index with a PyTerrier pipeline\n", | ||
"def create_index(corpus, index_path=\"/tmp/index2\"):\n", | ||
"    \"\"\"Index `corpus` with Snowball-stemmed text and return the index reference.\n", | ||
"\n", | ||
"    Args:\n", | ||
"        corpus: iterable of dicts with at least the keys 'docno' and 'text'.\n", | ||
"        index_path: directory the index is written to; an existing index there is overwritten.\n", | ||
"\n", | ||
"    Returns:\n", | ||
"        Whatever `IterDictIndexer.index` returns (the reference passed to `pt.IndexFactory.of`).\n", | ||
"    \"\"\"\n", | ||
"    # The text is already stemmed by stem_text, so Terrier's default Porter stemmer\n", | ||
"    # must be switched off; otherwise every token would be stemmed twice.\n", | ||
"    indexer = pt.IterDictIndexer(index_path, overwrite=True, stemmer=None)\n", | ||
"\n", | ||
"    # Stem lazily while iterating so the whole corpus is never held in memory at once.\n", | ||
"    processed_corpus = (\n", | ||
"        {'docno': doc['docno'], 'text': stem_text(doc['text'])}\n", | ||
"        for doc in corpus\n", | ||
"    )\n", | ||
"\n", | ||
"    return indexer.index(processed_corpus)\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 23, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 66%|██████▋ | 84161/126958 [02:10<01:02, 684.46it/s] " | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Build the index\n", | ||
"index_ref = create_index(pt_dataset.get_corpus_iter())\n", | ||
"\n", | ||
"# Load the index\n", | ||
"index = pt.IndexFactory.of(index_ref)\n", | ||
"\n", | ||
"# Initialise the BM25 retrieval model\n", | ||
"bm25 = pt.BatchRetrieve(index, wmodel=\"BM25\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Load the topics (queries) that are to be stemmed\n", | ||
"topics = pt_dataset.get_topics()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Apply the stemming function to the queries\n", | ||
"def apply_stem_to_queries(df, columns=('text', 'title', 'query')):\n", | ||
"    \"\"\"Return a copy of `df` in which the given text columns are stemmed.\n", | ||
"\n", | ||
"    The input frame is left untouched, so the raw topics stay available and\n", | ||
"    re-running the calling cell never stems already stemmed text again.\n", | ||
"\n", | ||
"    Args:\n", | ||
"        df: topics DataFrame.\n", | ||
"        columns: names of the columns to stem; names missing from `df` are skipped.\n", | ||
"\n", | ||
"    Returns:\n", | ||
"        A new DataFrame with the same columns as `df`.\n", | ||
"    \"\"\"\n", | ||
"    stemmed = df.copy()\n", | ||
"    for column in columns:\n", | ||
"        if column in stemmed.columns:\n", | ||
"            stemmed[column] = stemmed[column].apply(stem_text)\n", | ||
"    return stemmed" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Stem the queries\n", | ||
"stemmed_topics = apply_stem_to_queries(topics)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Run the experiment\n", | ||
"experiment = pt.Experiment(\n", | ||
" [bm25], \n", | ||
" stemmed_topics, # Use the stemmed topics here\n", | ||
" pt_dataset.get_qrels(), \n", | ||
" eval_metrics=['P_1000', 'map', 'recip_rank'],\n", | ||
" names=['BM25'],\n", | ||
" baseline=0\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Show the results of the experiment\n", | ||
"print(\"Ergebnisse des Experiments:\")\n", | ||
"print(experiment)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Write the run file for the deployment on TIRA.\n", | ||
"# `experiment` only holds the evaluation measures; the run itself is the ranking that\n", | ||
"# bm25 retrieves for the stemmed topics, so that ranking is what gets persisted.\n", | ||
"run = bm25(stemmed_topics)\n", | ||
"persist_and_normalize_run(run, system_name='bm25-custom-stemmer', default_output='../../runs')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Direct lookup with a stemmed query\n", | ||
"query = \"retrieval system improving effectiveness\"\n", | ||
"stemmed_query = stem_text(query)\n", | ||
"print(f\"Original Query: {query}\")\n", | ||
"print(f\"Stemmed Query: {stemmed_query}\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Run the stemmed query against the index\n", | ||
"manual_run = bm25.search(stemmed_query)\n", | ||
"print(\"Ergebnisse der manuellen Abfrage:\")\n", | ||
"print(manual_run.head(10))" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.12" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.