Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove unnecessary returns #273

Open
wants to merge 3 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/source/getting-started/complete-tour.rst
Original file line number Diff line number Diff line change
Expand Up @@ -834,7 +834,7 @@ the Ranker method.

.. code-block:: python

myranker.mentions_to_wikidata = myranker.load_resources()
myranker.load_resources()

.. note::

Expand Down Expand Up @@ -1056,7 +1056,7 @@ of the Linker method.

.. code-block:: python

mylinker.linking_resources = mylinker.load_resources()
mylinker.load_resources()

.. note::

Expand Down
4 changes: 2 additions & 2 deletions examples/train_use_deezy_model_1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@
"outputs": [],
"source": [
"# Load the resources:\n",
"myranker.mentions_to_wikidata = myranker.load_resources()"
"myranker.load_resources()"
]
},
{
Expand Down Expand Up @@ -177,7 +177,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
"version": "3.9.17"
},
"orig_nbformat": 4
},
Expand Down
4 changes: 2 additions & 2 deletions examples/train_use_deezy_model_2.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@
"outputs": [],
"source": [
"# Load the resources:\n",
"myranker.mentions_to_wikidata = myranker.load_resources()"
"myranker.load_resources()"
]
},
{
Expand Down Expand Up @@ -173,7 +173,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
"version": "3.9.17"
},
"orig_nbformat": 4
},
Expand Down
4 changes: 2 additions & 2 deletions examples/train_use_deezy_model_3.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@
"outputs": [],
"source": [
"# Load the resources:\n",
"myranker.mentions_to_wikidata = myranker.load_resources()"
"myranker.load_resources()"
]
},
{
Expand Down Expand Up @@ -168,7 +168,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
"version": "3.9.17"
},
"orig_nbformat": 4
},
Expand Down
5 changes: 1 addition & 4 deletions experiments/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,10 +224,7 @@ def prepare_data(self) -> dict:
# Obtain candidates per sentence:
for sentence_id in tqdm(dMentionsPred):
pred_mentions_sent = dMentionsPred[sentence_id]
(
wk_cands,
self.myranker.already_collected_cands,
) = self.myranker.find_candidates(pred_mentions_sent)
wk_cands = self.myranker.find_candidates(pred_mentions_sent)
dCandidates[sentence_id] = wk_cands

# -------------------------------------------
Expand Down
4 changes: 2 additions & 2 deletions experiments/toponym_resolution.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,14 +166,14 @@
# -----------------------------------------
# Ranker loading resources and training a model:
# Load the resources:
myranker.mentions_to_wikidata = myranker.load_resources()
myranker.load_resources()
# Train a DeezyMatch model if needed:
myranker.train()

# -----------------------------------------
# Linker loading resources:
# Load linking resources:
mylinker.linking_resources = mylinker.load_resources()
mylinker.load_resources()

# -----------------------------------------
# Prepare experiment:
Expand Down
12 changes: 4 additions & 8 deletions geoparser/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def __init__(
# Ranker loading resources and training a model:

# Load the resources:
self.myranker.mentions_to_wikidata = self.myranker.load_resources()
self.myranker.load_resources()

# Train a DeezyMatch model if needed:
self.myranker.train()
Expand All @@ -127,7 +127,7 @@ def __init__(
# Linker loading resources:

# Load linking resources:
self.mylinker.linking_resources = self.mylinker.load_resources()
self.mylinker.load_resources()

# Train a linking model if needed (it requires myranker to generate
# potential candidates to the training set):
Expand Down Expand Up @@ -223,9 +223,7 @@ def run_sentence(
rmentions = [{"mention": y["mention"]} for y in mentions]

# Perform candidate ranking:
wk_cands, self.myranker.already_collected_cands = self.myranker.find_candidates(
rmentions
)
wk_cands = self.myranker.find_candidates(rmentions)

mentions_dataset = dict()
mentions_dataset["linking"] = []
Expand Down Expand Up @@ -685,9 +683,7 @@ def run_candidate_selection(self, document_dataset: List[dict]) -> dict:
mentions = [{"mention": m} for m in mentions]

# Perform candidate ranking:
wk_cands, self.myranker.already_collected_cands = self.myranker.find_candidates(
mentions
)
wk_cands = self.myranker.find_candidates(mentions)
return wk_cands

def run_disambiguation(
Expand Down
48 changes: 24 additions & 24 deletions geoparser/ranking.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,18 +50,18 @@ class Ranker:
)

>>> # Load resources
>>> ranker.mentions_to_wikidata = ranker.load_resources()
>>> ranker.load_resources()

>>> # Train the ranker (if applicable)
>>> ranker.train()

>>> # Perform candidate selection
>>> queries = ['London', 'Paraguay']
>>> candidates, already_collected = ranker.run(queries)
>>> candidates = ranker.run(queries)

>>> # Find candidates for mentions
>>> mentions = [{'mention': 'London'}, {'mention': 'Paraguay'}]
>>> mention_candidates, mention_already_collected = ranker.find_candidates(mentions)
>>> mention_candidates = ranker.find_candidates(mentions)

>>> # Print the results
>>> print("Candidate Selection Results:")
Expand Down Expand Up @@ -136,7 +136,7 @@ def __init__(
"overwrite_training": False,
"do_test": False,
},
already_collected_cands: Optional[dict] = dict(),
already_collected_cands: Optional[dict] = None,
):
"""
Initialize a Ranker object.
Expand All @@ -147,7 +147,11 @@ def __init__(
self.wikidata_to_mentions = wikidata_to_mentions
self.strvar_parameters = strvar_parameters
self.deezy_parameters = deezy_parameters
self.already_collected_cands = already_collected_cands

if already_collected_cands:
self.already_collected_cands = already_collected_cands
else:
self.already_collected_cands = dict()

def __str__(self) -> str:
"""
Expand All @@ -173,18 +177,10 @@ def __str__(self) -> str:

return s

def load_resources(self) -> dict:
def load_resources(self):
"""
Load the ranker resources.

Returns:
dict:
The loaded mentions-to-wikidata dictionary, which maps a
mention (e.g. ``"London"``) to the Wikidata entities that are
referred to by this mention on Wikipedia (e.g. ``Q84``,
``Q2477346``). The data also includes, for each entity, their
normalized "relevance", i.e. number of in-links across Wikipedia.

Note:
This method loads the mentions-to-wikidata and
wikidata-to-mentions dictionaries from the resources directory,
Expand All @@ -195,6 +191,12 @@ def load_resources(self) -> dict:
It filters the dictionaries to remove noise and updates the class
attributes accordingly.

The loaded mentions-to-wikidata dictionary, which maps a mention
(e.g. ``"London"``) to the Wikidata entities that are
referred to by this mention on Wikipedia (e.g. ``Q84``,
``Q2477346``). The data also includes, for each entity, their
normalized "relevance", i.e. number of in-links across Wikipedia.

The method also initialises ``pandarallel`` if needed by the
candidate ranking method (if the ``method`` set in the initialiser
of the ``Ranker`` was set to "partialmatch" or "levenshtein").
Expand Down Expand Up @@ -254,8 +256,6 @@ def load_resources(self) -> dict:
pandarallel.initialize(nb_workers=10)
os.environ["TOKENIZERS_PARALLELISM"] = "true"

return self.mentions_to_wikidata

def train(self) -> None:
"""
Training a DeezyMatch model. The training will be skipped if the model
Expand Down Expand Up @@ -466,7 +466,7 @@ def partial_match(self, queries: List[str], damlev: bool) -> Tuple[dict, dict]:

self.already_collected_cands[query] = mention_df

return candidates, self.already_collected_cands
return candidates

def deezy_on_the_fly(self, queries: List[str]) -> dict:
"""
Expand All @@ -490,7 +490,7 @@ def deezy_on_the_fly(self, queries: List[str]) -> Tuple[dict, dict]:

Example:
>>> ranker = Ranker(...)
>>> ranker.mentions_to_wikidata = ranker.load_resources()
>>> ranker.load_resources()
>>> queries = ['London', 'Shefrield']
>>> candidates = ranker.deezy_on_the_fly(queries)
>>> print(candidates)
Expand All @@ -515,7 +515,7 @@ def deezy_on_the_fly(self, queries: List[str]) -> Tuple[dict, dict]:
dm_output = self.deezy_parameters["dm_output"]

# first we fill in the perfect matches and already collected queries
cands_dict, self.already_collected_cands = self.perfect_match(queries)
cands_dict = self.perfect_match(queries)

# the rest go through
remainers = [x for x, y in cands_dict.items() if len(y) == 0]
Expand Down Expand Up @@ -565,7 +565,7 @@ def deezy_on_the_fly(self, queries: List[str]) -> Tuple[dict, dict]:

self.already_collected_cands[row["query"]] = returned_cands

return cands_dict, self.already_collected_cands
return cands_dict

def run(self, queries: List[str]) -> dict:
"""
Expand All @@ -583,9 +583,9 @@ def run(self, queries: List[str]) -> Tuple[dict, dict]:

Example:
>>> myranker = Ranker(method="perfectmatch", ...)
>>> myranker.mentions_to_wikidata = myranker.load_resources()
>>> myranker.load_resources()
>>> queries = ['London', 'Barcelona', 'Bologna']
>>> candidates, already_collected = myranker.run(queries)
>>> candidates = myranker.run(queries)
>>> print(candidates)
{'London': {'London': 1.0}, 'Barcelona': {'Barcelona': 1.0}, 'Bologna': {'Bologna': 1.0}}
>>> print(already_collected)
Expand Down Expand Up @@ -674,7 +674,7 @@ def find_candidates(self, mentions: List[dict]) -> Tuple[dict, dict]:
queries = list(set([mention["mention"] for mention in mentions]))

# Pass the mentions to :py:meth:`geoparser.ranking.Ranker.run`
cands, self.already_collected_cands = self.run(queries)
cands = self.run(queries)

# Get Wikidata candidates
wk_cands = dict()
Expand Down Expand Up @@ -702,4 +702,4 @@ def find_candidates(self, mentions: List[dict]) -> Tuple[dict, dict]:
"Candidates": found_cands,
}

return wk_cands, self.already_collected_cands
return wk_cands
8 changes: 4 additions & 4 deletions tests/test_disambiguation.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,14 +134,14 @@ def test_train():
# -----------------------------------------
# Ranker loading resources and training a model:
# Load the resources:
myranker.mentions_to_wikidata = myranker.load_resources()
myranker.load_resources()
# Train a DeezyMatch model if needed:
myranker.train()

# -----------------------------------------
# Linker loading resources:
# Load linking resources:
mylinker.linking_resources = mylinker.load_resources()
mylinker.load_resources()
# Train a linking model if needed (it requires myranker to generate potential
# candidates to the training set):
mylinker.rel_params["ed_model"] = mylinker.train_load_model(myranker)
Expand Down Expand Up @@ -236,14 +236,14 @@ def test_load_eval_model():
# -----------------------------------------
# Ranker loading resources and training a model:
# Load the resources:
myranker.mentions_to_wikidata = myranker.load_resources()
myranker.load_resources()
# Train a DeezyMatch model if needed:
myranker.train()

# -----------------------------------------
# Linker loading resources:
# Load linking resources:
mylinker.linking_resources = mylinker.load_resources()
mylinker.load_resources()
# Train a linking model if needed (it requires myranker to generate potential
# candidates to the training set):
mylinker.rel_params["ed_model"] = mylinker.train_load_model(myranker)
Expand Down
8 changes: 4 additions & 4 deletions tests/test_experiments.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,10 @@ def test_load_data():
myner.train()
myner.pipe = myner.create_pipeline()

myranker.mentions_to_wikidata = myranker.load_resources()
myranker.load_resources()
myranker.train()

mylinker.linking_resources = mylinker.load_resources()
mylinker.load_resources()

# --------------------------------------
# Instantiate the experiment:
Expand Down Expand Up @@ -172,10 +172,10 @@ def test_apply():
myner.train()
myner.pipe = myner.create_pipeline()

myranker.mentions_to_wikidata = myranker.load_resources()
myranker.load_resources()
myranker.train()

mylinker.linking_resources = mylinker.load_resources()
mylinker.load_resources()

# --------------------------------------
# Instantiate the experiment:
Expand Down
Loading