Skip to content

Commit

Permalink
Merge branch 'main' of github.com:webis-de/ir_axioms
Browse files Browse the repository at this point in the history
  • Loading branch information
janheinrichmerker committed Oct 14, 2024
2 parents 4fdb6cc + 2e33e42 commit a9b3cd6
Show file tree
Hide file tree
Showing 10 changed files with 71 additions and 24 deletions.
2 changes: 1 addition & 1 deletion ir_axioms/axiom/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def _key(
document2: RankedDocument
) -> str:
return (
f"{self.axiom!r},{context!r},"
f"{self.axiom!r},{context},"
f"{query.title},{document1.id},{document2.id}"
)

Expand Down
4 changes: 2 additions & 2 deletions ir_axioms/axiom/preconditions.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ def approximately_same_length(
margin_fraction: float,
) -> bool:
return approximately_equal(
len(context.terms(document1)),
len(context.terms(document2)),
context.document_length(document1),
context.document_length(document2),
margin_fraction=margin_fraction
)

Expand Down
12 changes: 8 additions & 4 deletions ir_axioms/axiom/query_aspects.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,10 +197,14 @@ def preference(
document2: RankedDocument
):
query_terms = context.term_set(query)
document1_terms = context.term_set(document1)
document2_terms = context.term_set(document2)
s1 = query_terms.issubset(document1_terms)
s2 = query_terms.issubset(document2_terms)
s1, s2 = set(), set()

for query_term in query_terms:
if context.term_frequency(document1, query_term) > 0:
s1.add(query_term)
if context.term_frequency(document2, query_term) > 0:
s2.add(query_term)

return strictly_greater(s1, s2)


Expand Down
5 changes: 3 additions & 2 deletions ir_axioms/axiom/term_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ def preference(
document1: RankedDocument,
document2: RankedDocument
):
query_terms = context.term_set(query)
document1_terms = context.term_set(document1)
document2_terms = context.term_set(document2)
query_terms = context.term_set(query)

return strictly_greater(
self.average_similarity(document1_terms, query_terms),
Expand Down Expand Up @@ -63,10 +63,11 @@ def preference(
is non-deterministic if there are multiple equally most similar pairs.
"""

query_terms = context.term_set(query)
document1_terms = context.term_set(document1)
document2_terms = context.term_set(document2)
document_terms = document1_terms | document2_terms
query_terms = context.term_set(query)

non_query_terms = document_terms - query_terms

most_similar_terms = self.most_similar_pair(
Expand Down
3 changes: 3 additions & 0 deletions ir_axioms/backend/pyterrier/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ def _index(self) -> Index:
f"Cannot load index from location {self.index_location}."
)

def __str__(self):
    # Stable, human-readable identifier used as part of cache keys:
    # the bare index name, i.e. the last path component of the index
    # location with any space-separated suffix dropped.
    index_name = str(self.index_location).split("/")[-1].split(" ")[0]
    return f"TerrierIndexContext({index_name})"

@cached_property
def _meta_index(self) -> MetaIndex:
meta_index = self._index.getMetaIndex()
Expand Down
29 changes: 17 additions & 12 deletions ir_axioms/backend/pyterrier/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,10 @@ def transform(self, topics_or_res: DataFrame) -> DataFrame:


class AxiomTransformer(PerGroupTransformer, ABC):
index: Union[Index, IndexRef, Path, str]
index: Optional[Union[Index, IndexRef, Path, str]] = None
dataset: Optional[Union[Dataset, str, IRDSDataset]] = None
contents_accessor: Optional[ContentsAccessor] = "text"
context: Optional[IndexContext] = None
tokeniser: Optional[Tokeniser] = None
cache_dir: Optional[Path] = None
verbose: bool = False
Expand All @@ -80,15 +81,17 @@ class AxiomTransformer(PerGroupTransformer, ABC):
optional_group_columns = {"qid", "name"}
unit = "query"

@cached_property
@property
def _context(self) -> IndexContext:
return TerrierIndexContext(
index_location=self.index,
dataset=self.dataset,
contents_accessor=self.contents_accessor,
tokeniser=self.tokeniser,
cache_dir=self.cache_dir,
)
if not self.context:
self.context = TerrierIndexContext(
index_location=self.index,
dataset=self.dataset,
contents_accessor=self.contents_accessor,
tokeniser=self.tokeniser,
cache_dir=self.cache_dir,
)
return self.context

@final
def transform_group(self, topics_or_res: DataFrame) -> DataFrame:
Expand Down Expand Up @@ -124,8 +127,9 @@ class KwikSortReranker(AxiomTransformer):
description = "Reranking query axiomatically"

axiom: AxiomLike
index: Union[Index, IndexRef, Path, str]
index: Optional[Union[Index, IndexRef, Path, str]] = None
dataset: Optional[Union[Dataset, str, IRDSDataset]] = None
context: Optional[IndexContext] = None
contents_accessor: Optional[ContentsAccessor] = "text"
pivot_selection: PivotSelection = RandomPivotSelection()
tokeniser: Optional[Tokeniser] = None
Expand Down Expand Up @@ -170,8 +174,8 @@ class AggregatedAxiomaticPreferences(AxiomTransformer):
description = "Aggregating query axiom preferences"

axioms: Sequence[AxiomLike]
index: Union[Index, IndexRef, Path, str]
aggregations: Sequence[Callable[[Sequence[float]], float]]
index: Optional[Union[Index, IndexRef, Path, str]] = None
dataset: Optional[Union[Dataset, str, IRDSDataset]] = None
contents_accessor: Optional[ContentsAccessor] = "text"
filter_pairs: Optional[Callable[
Expand Down Expand Up @@ -233,7 +237,8 @@ class AxiomaticPreferences(AxiomTransformer):
description = "Computing query axiom preferences"

axioms: Sequence[AxiomLike]
index: Union[Index, IndexRef, Path, str]
index: Optional[Union[Index, IndexRef, Path, str]] = None
context: Optional[IndexContext] = None
axiom_names: Optional[Sequence[str]] = None
dataset: Optional[Union[Dataset, str, IRDSDataset]] = None
contents_accessor: Optional[ContentsAccessor] = "text"
Expand Down
5 changes: 4 additions & 1 deletion ir_axioms/model/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class IndexContext(ABC):
def cache(self) -> Optional[Cache]:
if self.cache_dir is None:
return None
return Cache(str(self.cache_dir.absolute()))
return Cache(str(self.cache_dir.absolute()), eviction_policy='none')

@property
@abstractmethod
Expand Down Expand Up @@ -64,6 +64,9 @@ def term_set(
) -> FrozenSet[str]:
return frozenset(self.terms(query_or_document))

def document_length(self, document: Document) -> int:
    """Return the number of term tokens in ``document``.

    Unlike ``term_set`` (which deduplicates via ``frozenset``), this
    counts duplicate tokens — it is the document's length after
    tokenisation, not its vocabulary size.
    """
    return len(self.terms(document))

@lru_cache(None)
def term_frequency(
self,
Expand Down
3 changes: 2 additions & 1 deletion ir_axioms/modules/similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,4 +199,5 @@ def similarity(self, term1: str, term2: str):


class FastTextWikiNewsTermSimilarityMixin(MagnitudeTermSimilarityMixin):
    # Pre-trained fastText wiki-news embeddings in pymagnitude format.
    embeddings_path: Final[str] = "fasttext/medium/wiki-news-300d-1M.magnitude"
    # wget via: https://files.webis.de/data-in-production/data-research/ir-axioms/wiki-news-300d-1M.magnitude
    # NOTE(review): hard-coded absolute dev-container path — breaks outside
    # the /workspaces environment; consider an env var or config option.
    # Also note this re-assigns the `Final` attribute declared above
    # (flagged by type checkers) — presumably diff residue; confirm which
    # value is intended.
    embeddings_path: Final[str] = "/workspaces/ecir25-gpt-axioms/wiki-news-300d-1M.magnitude"
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ dependencies = [
"joblib~=1.0",
"lz4~=4.0",
"nltk~=3.6",
"numpy~=1.21",
"numpy>=1.21,<3.0",
"pandas>=1.3,<3.0",
"pymagnitude~=0.1.143",
"requests~=2.26",
Expand Down
30 changes: 30 additions & 0 deletions tests/unit/test_string_representations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import unittest

import pyterrier as pt
if not pt.started():
pt.init()

from ir_axioms.backend.pyterrier import TerrierIndexContext


class TestStringRepresentations(unittest.TestCase):
    """The string form of ``TerrierIndexContext`` must be stable, since
    it is used to build cache keys: whatever the index location looks
    like, it must reduce to the bare index name."""

    def _assert_renders_as_index_name(self, location):
        # Shared check: path prefixes and space-separated suffixes in the
        # location must be stripped from the rendered name.
        self.assertEqual(
            'TerrierIndexContext(index_location)',
            str(TerrierIndexContext(location)),
        )

    def test_string_representation_of_terrier_index_context_01(self):
        # Needed for caching
        self._assert_renders_as_index_name('index_location')

    def test_string_representation_of_terrier_index_context_02(self):
        # Needed for caching
        self._assert_renders_as_index_name('ignore/absolute/path/index_location')

    def test_string_representation_of_terrier_index_context_03(self):
        # Needed for caching
        self._assert_renders_as_index_name('ignore/absolute/path/index_location ignore suffix')

0 comments on commit a9b3cd6

Please sign in to comment.