Skip to content

Commit

Permalink
Merge pull request #178 from opensanctions/memory-index-docs
Browse files Browse the repository at this point in the history
Remove unused IDF, and add bits of docs useful for understanding xref
  • Loading branch information
pudo authored Nov 18, 2024
2 parents 7a1bc1a + d201717 commit 5fff4bc
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 15 deletions.
14 changes: 6 additions & 8 deletions nomenklatura/index/entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ class Entry(object):
__slots__ = "idf", "entities"

def __init__(self) -> None:
self.idf: float = 0.0
self.entities: Dict[Identifier, int] = dict()

def add(self, entity_id: Identifier) -> None:
Expand All @@ -21,13 +20,15 @@ def add(self, entity_id: Identifier) -> None:
except KeyError:
self.entities[entity_id] = 1

def compute(self, field: "Field") -> None:
"""Compute weighted term frequency for scoring."""
self.idf = math.log(field.len / len(self.entities))

def frequencies(
    self, field: "Field"
) -> Generator[Tuple[Identifier, float], None, None]:
    """
    Yield the Term Frequency (TF) of this token for every entity in the entry.

    TF is the number of mentions of this token in the entity divided by the
    total number of tokens of that entity (scoped to this field).
    """
    for ident, mention_count in self.entities.items():
        # Clamp to 1 so an empty/unseen field length never divides by zero.
        token_total = max(1, field.entities[ident])
        yield ident, mention_count / token_total
Expand Down Expand Up @@ -69,9 +70,6 @@ def compute(self) -> None:
self.len = max(1, len(self.entities))
self.avg_len = sum(self.entities.values()) / self.len

for entry in self.tokens.values():
entry.compute(self)

def to_dict(self) -> Dict[str, Any]:
return {
"tokens": {t: e.to_dict() for t, e in self.tokens.items()},
Expand Down
24 changes: 17 additions & 7 deletions nomenklatura/index/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,12 @@


class Index(BaseIndex[DS, CE]):
"""An in-memory search index to match entities against a given dataset."""
"""
An in-memory search index to match entities against a given dataset.
    For each field in the dataset, the index stores the IDs of the entities
    which contain each token, along with the absolute frequency of each token
    in the entity.
"""

name = "memory"

Expand Down Expand Up @@ -73,9 +78,16 @@ def commit(self) -> None:
field.compute()

def pairs(self, max_pairs: int = BaseIndex.MAX_PAIRS) -> List[Tuple[Pair, float]]:
"""A second method of doing xref: summing up the pairwise match value
for all entities lineraly. This uses a lot of memory but is really
fast."""
"""
A second method of doing xref: summing up the pairwise match value
for all entities linearly. This uses a lot of memory but is really
fast.
    The score of each pair is the sum of the products of the term frequencies
    for each co-occurring token in each field of the pair.
    We skip tokens that occur in only a single entity or in more than 100 entities.
"""
pairs: Dict[Pair, float] = {}
log.info("Building index blocking pairs...")
for field_name, field in self.fields.items():
Expand All @@ -86,9 +98,7 @@ def pairs(self, max_pairs: int = BaseIndex.MAX_PAIRS) -> List[Tuple[Pair, float]

if len(entry.entities) == 1 or len(entry.entities) > 100:
continue
entities = sorted(
entry.frequencies(field), key=lambda f: f[1], reverse=True
)
entities = entry.frequencies(field)
for (left, lw), (right, rw) in combinations(entities, 2):
if lw == 0.0 or rw == 0.0:
continue
Expand Down

0 comments on commit 5fff4bc

Please sign in to comment.