Skip to content

Commit

Permalink
Merge branch 'main' of github.com:webis-de/ir_axioms
Browse files Browse the repository at this point in the history
  • Loading branch information
janheinrichmerker committed Oct 14, 2024
2 parents 4fdb6cc + 2e33e42 commit a9b3cd6
Show file tree
Hide file tree
Showing 10 changed files with 71 additions and 24 deletions.
2 changes: 1 addition & 1 deletion ir_axioms/axiom/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def _key(
document2: RankedDocument
) -> str:
return (
f"{self.axiom!r},{context!r},"
f"{self.axiom!r},{context},"
f"{query.title},{document1.id},{document2.id}"
)

Expand Down
4 changes: 2 additions & 2 deletions ir_axioms/axiom/preconditions.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ def approximately_same_length(
margin_fraction: float,
) -> bool:
return approximately_equal(
len(context.terms(document1)),
len(context.terms(document2)),
context.document_length(document1),
context.document_length(document2),
margin_fraction=margin_fraction
)

Expand Down
12 changes: 8 additions & 4 deletions ir_axioms/axiom/query_aspects.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,10 +197,14 @@ def preference(
document2: RankedDocument
):
query_terms = context.term_set(query)
document1_terms = context.term_set(document1)
document2_terms = context.term_set(document2)
s1 = query_terms.issubset(document1_terms)
s2 = query_terms.issubset(document2_terms)
s1, s2 = set(), set()

for query_term in query_terms:
if context.term_frequency(document1, query_term) > 0:
s1.add(query_term)
if context.term_frequency(document2, query_term) > 0:
s2.add(query_term)

return strictly_greater(s1, s2)


Expand Down
5 changes: 3 additions & 2 deletions ir_axioms/axiom/term_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ def preference(
document1: RankedDocument,
document2: RankedDocument
):
query_terms = context.term_set(query)
document1_terms = context.term_set(document1)
document2_terms = context.term_set(document2)
query_terms = context.term_set(query)

return strictly_greater(
self.average_similarity(document1_terms, query_terms),
Expand Down Expand Up @@ -63,10 +63,11 @@ def preference(
is non-deterministic if there are multiple equally most similar pairs.
"""

query_terms = context.term_set(query)
document1_terms = context.term_set(document1)
document2_terms = context.term_set(document2)
document_terms = document1_terms | document2_terms
query_terms = context.term_set(query)

non_query_terms = document_terms - query_terms

most_similar_terms = self.most_similar_pair(
Expand Down
3 changes: 3 additions & 0 deletions ir_axioms/backend/pyterrier/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ def _index(self) -> Index:
f"Cannot load index from location {self.index_location}."
)

def __str__(self):
    # Stable, human-readable identifier used as part of cache keys:
    # the bare index name, i.e. the last path component of the index
    # location with any space-separated suffix dropped.
    index_name = str(self.index_location).split("/")[-1].split(" ")[0]
    return f"TerrierIndexContext({index_name})"

@cached_property
def _meta_index(self) -> MetaIndex:
meta_index = self._index.getMetaIndex()
Expand Down
29 changes: 17 additions & 12 deletions ir_axioms/backend/pyterrier/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,10 @@ def transform(self, topics_or_res: DataFrame) -> DataFrame:


class AxiomTransformer(PerGroupTransformer, ABC):
index: Union[Index, IndexRef, Path, str]
index: Optional[Union[Index, IndexRef, Path, str]] = None
dataset: Optional[Union[Dataset, str, IRDSDataset]] = None
contents_accessor: Optional[ContentsAccessor] = "text"
context: Optional[IndexContext] = None
tokeniser: Optional[Tokeniser] = None
cache_dir: Optional[Path] = None
verbose: bool = False
Expand All @@ -80,15 +81,17 @@ class AxiomTransformer(PerGroupTransformer, ABC):
optional_group_columns = {"qid", "name"}
unit = "query"

@cached_property
@property
def _context(self) -> IndexContext:
return TerrierIndexContext(
index_location=self.index,
dataset=self.dataset,
contents_accessor=self.contents_accessor,
tokeniser=self.tokeniser,
cache_dir=self.cache_dir,
)
if not self.context:
self.context = TerrierIndexContext(
index_location=self.index,
dataset=self.dataset,
contents_accessor=self.contents_accessor,
tokeniser=self.tokeniser,
cache_dir=self.cache_dir,
)
return self.context

@final
def transform_group(self, topics_or_res: DataFrame) -> DataFrame:
Expand Down Expand Up @@ -124,8 +127,9 @@ class KwikSortReranker(AxiomTransformer):
description = "Reranking query axiomatically"

axiom: AxiomLike
index: Union[Index, IndexRef, Path, str]
index: Optional[Union[Index, IndexRef, Path, str]] = None
dataset: Optional[Union[Dataset, str, IRDSDataset]] = None
context: Optional[IndexContext] = None
contents_accessor: Optional[ContentsAccessor] = "text"
pivot_selection: PivotSelection = RandomPivotSelection()
tokeniser: Optional[Tokeniser] = None
Expand Down Expand Up @@ -170,8 +174,8 @@ class AggregatedAxiomaticPreferences(AxiomTransformer):
description = "Aggregating query axiom preferences"

axioms: Sequence[AxiomLike]
index: Union[Index, IndexRef, Path, str]
aggregations: Sequence[Callable[[Sequence[float]], float]]
index: Optional[Union[Index, IndexRef, Path, str]] = None
dataset: Optional[Union[Dataset, str, IRDSDataset]] = None
contents_accessor: Optional[ContentsAccessor] = "text"
filter_pairs: Optional[Callable[
Expand Down Expand Up @@ -233,7 +237,8 @@ class AxiomaticPreferences(AxiomTransformer):
description = "Computing query axiom preferences"

axioms: Sequence[AxiomLike]
index: Union[Index, IndexRef, Path, str]
index: Optional[Union[Index, IndexRef, Path, str]] = None
context: Optional[IndexContext] = None
axiom_names: Optional[Sequence[str]] = None
dataset: Optional[Union[Dataset, str, IRDSDataset]] = None
contents_accessor: Optional[ContentsAccessor] = "text"
Expand Down
5 changes: 4 additions & 1 deletion ir_axioms/model/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class IndexContext(ABC):
def cache(self) -> Optional[Cache]:
if self.cache_dir is None:
return None
return Cache(str(self.cache_dir.absolute()))
return Cache(str(self.cache_dir.absolute()), eviction_policy='none')

@property
@abstractmethod
Expand Down Expand Up @@ -64,6 +64,9 @@ def term_set(
) -> FrozenSet[str]:
return frozenset(self.terms(query_or_document))

def document_length(self, document: Document) -> int:
    """Return the number of term tokens in ``document``.

    Unlike ``term_set`` (which deduplicates via ``frozenset``), this
    counts duplicate tokens — it is the document's length after
    tokenisation, not its vocabulary size.
    """
    return len(self.terms(document))

@lru_cache(None)
def term_frequency(
self,
Expand Down
3 changes: 2 additions & 1 deletion ir_axioms/modules/similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,4 +199,5 @@ def similarity(self, term1: str, term2: str):


class FastTextWikiNewsTermSimilarityMixin(MagnitudeTermSimilarityMixin):
    # Pre-trained fastText wiki-news embeddings in pymagnitude format.
    embeddings_path: Final[str] = "fasttext/medium/wiki-news-300d-1M.magnitude"
    # wget via: https://files.webis.de/data-in-production/data-research/ir-axioms/wiki-news-300d-1M.magnitude
    # NOTE(review): hard-coded absolute dev-container path — breaks outside
    # the /workspaces environment; consider an env var or config option.
    # Also note this re-assigns the `Final` attribute declared above
    # (flagged by type checkers) — presumably diff residue; confirm which
    # value is intended.
    embeddings_path: Final[str] = "/workspaces/ecir25-gpt-axioms/wiki-news-300d-1M.magnitude"
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ dependencies = [
"joblib~=1.0",
"lz4~=4.0",
"nltk~=3.6",
"numpy~=1.21",
"numpy>=1.21,<3.0",
"pandas>=1.3,<3.0",
"pymagnitude~=0.1.143",
"requests~=2.26",
Expand Down
30 changes: 30 additions & 0 deletions tests/unit/test_string_representations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import unittest

import pyterrier as pt
if not pt.started():
pt.init()

from ir_axioms.backend.pyterrier import TerrierIndexContext


class TestStringRepresentations(unittest.TestCase):
    """The string form of ``TerrierIndexContext`` must be stable, since
    it is used to build cache keys: whatever the index location looks
    like, it must reduce to the bare index name."""

    def _assert_renders_as_index_name(self, location):
        # Shared check: path prefixes and space-separated suffixes in the
        # location must be stripped from the rendered name.
        self.assertEqual(
            'TerrierIndexContext(index_location)',
            str(TerrierIndexContext(location)),
        )

    def test_string_representation_of_terrier_index_context_01(self):
        # Needed for caching
        self._assert_renders_as_index_name('index_location')

    def test_string_representation_of_terrier_index_context_02(self):
        # Needed for caching
        self._assert_renders_as_index_name('ignore/absolute/path/index_location')

    def test_string_representation_of_terrier_index_context_03(self):
        # Needed for caching
        self._assert_renders_as_index_name('ignore/absolute/path/index_location ignore suffix')

0 comments on commit a9b3cd6

Please sign in to comment.