Skip to content

Commit

Permalink
Merge pull request #178 from opensanctions/memory-index-docs
Browse files Browse the repository at this point in the history
Remove unused IDF, and add bits of docs useful for understanding xref
  • Loading branch information
pudo authored Nov 18, 2024
2 parents 7a1bc1a + d201717 commit 5fff4bc
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 15 deletions.
14 changes: 6 additions & 8 deletions nomenklatura/index/entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ class Entry(object):
__slots__ = "idf", "entities"

def __init__(self) -> None:
self.idf: float = 0.0
self.entities: Dict[Identifier, int] = dict()

def add(self, entity_id: Identifier) -> None:
Expand All @@ -21,13 +20,15 @@ def add(self, entity_id: Identifier) -> None:
except KeyError:
self.entities[entity_id] = 1

def compute(self, field: "Field") -> None:
"""Compute weighted term frequency for scoring."""
self.idf = math.log(field.len / len(self.entities))

def frequencies(
    self, field: "Field"
) -> Generator[Tuple[Identifier, float], None, None]:
    """
    Yield the Term Frequency (TF) of this token for every entity in the entry.

    TF is the number of mentions of this token in the entity divided by the
    total number of tokens of that entity (scoped to this field).
    """
    for ident, mention_count in self.entities.items():
        # Clamp to 1 so an empty/unseen field length never divides by zero.
        token_total = max(1, field.entities[ident])
        yield ident, mention_count / token_total
Expand Down Expand Up @@ -69,9 +70,6 @@ def compute(self) -> None:
self.len = max(1, len(self.entities))
self.avg_len = sum(self.entities.values()) / self.len

for entry in self.tokens.values():
entry.compute(self)

def to_dict(self) -> Dict[str, Any]:
return {
"tokens": {t: e.to_dict() for t, e in self.tokens.items()},
Expand Down
24 changes: 17 additions & 7 deletions nomenklatura/index/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,12 @@


class Index(BaseIndex[DS, CE]):
"""An in-memory search index to match entities against a given dataset."""
"""
An in-memory search index to match entities against a given dataset.
    For each field in the dataset, the index stores the IDs of the entities
    which contain each token, along with the absolute frequency of each token
    in the entity.
"""

name = "memory"

Expand Down Expand Up @@ -73,9 +78,16 @@ def commit(self) -> None:
field.compute()

def pairs(self, max_pairs: int = BaseIndex.MAX_PAIRS) -> List[Tuple[Pair, float]]:
"""A second method of doing xref: summing up the pairwise match value
for all entities lineraly. This uses a lot of memory but is really
fast."""
"""
A second method of doing xref: summing up the pairwise match value
for all entities linearly. This uses a lot of memory but is really
fast.
    The score of each pair is the sum of the products of the term frequencies
    for each co-occurring token in each field of the pair.
    We skip tokens that occur in only a single entity or in more than 100 entities.
"""
pairs: Dict[Pair, float] = {}
log.info("Building index blocking pairs...")
for field_name, field in self.fields.items():
Expand All @@ -86,9 +98,7 @@ def pairs(self, max_pairs: int = BaseIndex.MAX_PAIRS) -> List[Tuple[Pair, float]

if len(entry.entities) == 1 or len(entry.entities) > 100:
continue
entities = sorted(
entry.frequencies(field), key=lambda f: f[1], reverse=True
)
entities = entry.frequencies(field)
for (left, lw), (right, rw) in combinations(entities, 2):
if lw == 0.0 or rw == 0.0:
continue
Expand Down

0 comments on commit 5fff4bc

Please sign in to comment.