-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex_with_bm25.py
30 lines (25 loc) · 1.2 KB
/
index_with_bm25.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# Uses PyTerrier, pyterrier-xlang (tokenization) and pyterrier-pisa (PisaIndex) to build an index for BM25 retrieval over an ir_datasets corpus
import argparse
import pyterrier as pt
import logging
from pyterrier_xlang.preprocess import anserini_tokenizer
from pyterrier_pisa import PisaIndex
def parse_args(argv=None):
    """Parse the command-line options of the indexing script.

    Args:
        argv: List of argument strings to parse. ``None`` lets argparse
            fall back to ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with the attributes ``language``,
        ``dataset``, ``index`` and ``threads``.
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument("--language", type=str, help="language of the dataset", required=True)
    argparser.add_argument("--dataset", type=str, help="dataset to index", required=True)
    argparser.add_argument("--index", type=str, help="index path", required=True)
    # Previously hard-coded to 64 in the PisaIndex call; the default keeps
    # existing invocations behaving the same.
    argparser.add_argument("--threads", type=int, default=64,
                           help="number of indexing threads (default: %(default)s)")
    return argparser.parse_args(argv)


def main(argv=None):
    """Index an ir_datasets corpus into a PISA index after xlang tokenization.

    The corpus named by ``--dataset`` is loaded through PyTerrier's ``irds:``
    prefix, passed through the ``anserini_tokenizer`` pipeline for
    ``--language`` and written to the index at ``--index``.

    Args:
        argv: Optional list of argument strings; ``None`` uses ``sys.argv[1:]``.
    """
    logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
    args = parse_args(argv)

    # Load xlang preprocessing pipeline
    preproc = anserini_tokenizer(args.language)
    logging.info(f"Preprocessing pipeline loaded for {args.language}: {preproc}")

    # Load dataset (kept under its own name so the CLI string is not shadowed)
    corpus = pt.get_dataset(f"irds:{args.dataset}")

    # Index dataset
    # NOTE(review): stemmer="none" presumably because the xlang pipeline already
    # normalizes the text; overwrite=True is passed unconditionally, so an
    # existing index at this path is presumably replaced - confirm against the
    # pyterrier_pisa documentation.
    idx = PisaIndex(args.index, stemmer="none", text_field="text", overwrite=True, threads=args.threads)
    # Alternative for languages that need tokenization (e.g. Chinese):
    # (preproc >> idx.toks_indexer(scale=1)).index(corpus.get_corpus_iter())
    (preproc >> idx).index(corpus.get_corpus_iter())
    logging.info(f"Indexing complete for {args.dataset}")


if __name__ == "__main__":
    main()