Skip to content

Commit

Permalink
New Try for Porter2Stemmer & CleanedUp old Files
Browse files Browse the repository at this point in the history
  • Loading branch information
Christian A committed Jun 22, 2024
1 parent b73b2a7 commit 30ceb2e
Show file tree
Hide file tree
Showing 9 changed files with 67,510 additions and 64,117 deletions.
311 changes: 311 additions & 0 deletions Group-8-Retrieval-System/DontWORK/Porter2Stemmer copy.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,311 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: tira in /usr/local/lib/python3.10/dist-packages (0.0.132)\n",
"Requirement already satisfied: ir-datasets in /usr/local/lib/python3.10/dist-packages (0.5.5)\n",
"Requirement already satisfied: python-terrier in /usr/local/lib/python3.10/dist-packages (0.10.0)\n",
"Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (3.8.1)\n",
"Requirement already satisfied: docker==6.*,>=6.0.0 in /usr/local/lib/python3.10/dist-packages (from tira) (6.1.3)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from tira) (4.66.1)\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from tira) (2.1.3)\n",
"Requirement already satisfied: requests==2.*,>=2.26 in /usr/local/lib/python3.10/dist-packages (from tira) (2.31.0)\n",
"Requirement already satisfied: websocket-client>=0.32.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (1.7.0)\n",
"Requirement already satisfied: packaging>=14.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (23.2)\n",
"Requirement already satisfied: urllib3>=1.26.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (2.1.0)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests==2.*,>=2.26->tira) (3.6)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests==2.*,>=2.26->tira) (3.3.2)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests==2.*,>=2.26->tira) (2023.11.17)\n",
"Requirement already satisfied: beautifulsoup4>=4.4.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (4.12.2)\n",
"Requirement already satisfied: lxml>=4.5.2 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (4.9.3)\n",
"Requirement already satisfied: inscriptis>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (2.3.2)\n",
"Requirement already satisfied: pyyaml>=5.3.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (6.0.1)\n",
"Requirement already satisfied: lz4>=3.1.10 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (4.3.2)\n",
"Requirement already satisfied: pyautocorpus>=0.1.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.1.12)\n",
"Requirement already satisfied: unlzw3>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.2.2)\n",
"Requirement already satisfied: trec-car-tools>=2.5.4 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (2.6)\n",
"Requirement already satisfied: numpy>=1.18.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (1.26.2)\n",
"Requirement already satisfied: warc3-wet-clueweb09>=0.2.5 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.2.5)\n",
"Requirement already satisfied: zlib-state>=0.1.3 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.1.6)\n",
"Requirement already satisfied: warc3-wet>=0.2.3 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.2.3)\n",
"Requirement already satisfied: ijson>=3.1.3 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (3.2.3)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (3.1.2)\n",
"Requirement already satisfied: nptyping==1.4.4 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.4.4)\n",
"Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.3.2)\n",
"Requirement already satisfied: wget in /usr/local/lib/python3.10/dist-packages (from python-terrier) (3.2)\n",
"Requirement already satisfied: pytrec-eval-terrier>=0.5.3 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.5.6)\n",
"Requirement already satisfied: matchpy in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.5.5)\n",
"Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.3.2)\n",
"Requirement already satisfied: ir-measures>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.3.3)\n",
"Requirement already satisfied: chest in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.2.3)\n",
"Requirement already satisfied: deprecated in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.2.14)\n",
"Requirement already satisfied: pyjnius>=1.4.2 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.6.1)\n",
"Requirement already satisfied: dill in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.3.7)\n",
"Requirement already satisfied: statsmodels in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.14.0)\n",
"Requirement already satisfied: more-itertools in /usr/local/lib/python3.10/dist-packages (from python-terrier) (10.1.0)\n",
"Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.11.4)\n",
"Requirement already satisfied: typish>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from nptyping==1.4.4->python-terrier) (1.9.3)\n",
"Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk) (2024.5.15)\n",
"Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk) (8.1.7)\n",
"Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4>=4.4.1->ir-datasets) (2.5)\n",
"Requirement already satisfied: cwl-eval>=1.0.10 in /usr/local/lib/python3.10/dist-packages (from ir-measures>=0.3.1->python-terrier) (1.0.12)\n",
"Requirement already satisfied: cbor>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from trec-car-tools>=2.5.4->ir-datasets) (1.0.0)\n",
"Requirement already satisfied: heapdict in /usr/local/lib/python3.10/dist-packages (from chest->python-terrier) (1.0.1)\n",
"Requirement already satisfied: wrapt<2,>=1.10 in /usr/local/lib/python3.10/dist-packages (from deprecated->python-terrier) (1.16.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->python-terrier) (2.1.3)\n",
"Requirement already satisfied: multiset<3.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from matchpy->python-terrier) (2.1.1)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->tira) (2023.3.post1)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->tira) (2.8.2)\n",
"Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas->tira) (2023.3)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->python-terrier) (3.2.0)\n",
"Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.10/dist-packages (from statsmodels->python-terrier) (0.5.4)\n",
"Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from patsy>=0.5.2->statsmodels->python-terrier) (1.16.0)\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
"\u001b[0m"
]
}
],
"source": [
"# Nur benötigt für GoLab\n",
"!pip3 install tira ir-datasets python-terrier nltk\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Imports\n",
"from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run\n",
"from tira.rest_api_client import Client\n",
"from nltk.stem import SnowballStemmer\n",
"\n",
"import pyterrier as pt\n",
"import pandas as pd\n",
"\n",
"\n",
"# PyTerrier initialisieren\n",
"ensure_pyterrier_is_loaded()\n",
"tira = Client()\n",
"\n",
"# PyTerrier starten\n",
"if not pt.started():\n",
" pt.init()\n",
"\n",
"# Pandas Display-Einstellungen\n",
"pd.set_option('display.max_colwidth', 0)\n",
"\n",
"# Dataset von der TIRA Plattform abrufen\n",
"pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# Benutzerdefinierter SnowballStemmer (Porter2)\n",
"class NLTKStemmerWrapper:\n",
" def __init__(self):\n",
" self.stemmer = SnowballStemmer(\"english\")\n",
"\n",
" def __call__(self, text):\n",
" return \" \".join([self.stemmer.stem(word) for word in text.split()])"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# Benutzerdefinierten Stemmer instanziieren\n",
"custom_stemmer = NLTKStemmerWrapper()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Vorverarbeitung: SnowballStemmer auf den Text anwenden\n",
"def stem_text(text):\n",
" return custom_stemmer(text)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# Index erstellen mit einer PyTerrier-Pipeline\n",
"def create_index(corpus):\n",
" # Indexer erstellen\n",
" indexer = pt.IterDictIndexer(\"/tmp/index2\", overwrite=True)\n",
" \n",
" # Corpus durchlaufen und Stemmer anwenden\n",
" processed_corpus = [\n",
" {'docno': doc['docno'], 'text': stem_text(doc['text'])}\n",
" for doc in corpus\n",
" ]\n",
"\n",
" # Indexieren\n",
" index_ref = indexer.index(processed_corpus)\n",
" return index_ref\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 66%|██████▋ | 84161/126958 [02:10<01:02, 684.46it/s] "
]
}
],
"source": [
"# Index erstellen\n",
"index_ref = create_index(pt_dataset.get_corpus_iter())\n",
"\n",
"# Index laden\n",
"index = pt.IndexFactory.of(index_ref)\n",
"\n",
"# BM25 Retrieval Modell initialisieren\n",
"bm25 = pt.BatchRetrieve(index, wmodel=\"BM25\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Gestemmte Queries vorbereiten\n",
"topics = pt_dataset.get_topics()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Übertragen der Stemming-Funktion auf die Queries\n",
"def apply_stem_to_queries(df):\n",
" df['text'] = df['text'].apply(stem_text)\n",
" df['title'] = df['title'].apply(stem_text)\n",
" df['query'] = df['query'].apply(stem_text)\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Gestemmte Queries anwenden\n",
"stemmed_topics = apply_stem_to_queries(topics)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Experiment durchführen\n",
"experiment = pt.Experiment(\n",
" [bm25], \n",
" stemmed_topics, # Verwende die gestemmten Topics hier\n",
" pt_dataset.get_qrels(), \n",
" eval_metrics=['P_1000', 'map', 'recip_rank'],\n",
" names=['BM25'],\n",
" baseline=0\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Ergebnisse des Experiments anzeigen\n",
"print(\"Ergebnisse des Experiments:\")\n",
"print(experiment)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Output runfile für das Deployment auf TIRA\n",
"persist_and_normalize_run(experiment, system_name='bm25-custom-stemmer', default_output='../../runs')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Direkte Abfrage mit gestemmter Query\n",
"query = \"retrieval system improving effectiveness\"\n",
"stemmed_query = stem_text(query)\n",
"print(f\"Original Query: {query}\")\n",
"print(f\"Stemmed Query: {stemmed_query}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Abfrage auf dem Index mit gestemmter Query\n",
"manual_run = bm25.search(stemmed_query)\n",
"print(\"Ergebnisse der manuellen Abfrage:\")\n",
"print(manual_run.head(10))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 30ceb2e

Please sign in to comment.