diff --git a/docs/source/getting-started/complete-tour.rst b/docs/source/getting-started/complete-tour.rst index bee8401a..785b2cdf 100644 --- a/docs/source/getting-started/complete-tour.rst +++ b/docs/source/getting-started/complete-tour.rst @@ -834,7 +834,7 @@ the Ranker method. .. code-block:: python - myranker.mentions_to_wikidata = myranker.load_resources() + myranker.load_resources() .. note:: @@ -1056,7 +1056,7 @@ of the Linker method. .. code-block:: python - mylinker.linking_resources = mylinker.load_resources() + mylinker.load_resources() .. note:: diff --git a/examples/train_use_deezy_model_1.ipynb b/examples/train_use_deezy_model_1.ipynb index 95ffb5e4..ed5c726d 100644 --- a/examples/train_use_deezy_model_1.ipynb +++ b/examples/train_use_deezy_model_1.ipynb @@ -120,7 +120,7 @@ "outputs": [], "source": [ "# Load the resources:\n", - "myranker.mentions_to_wikidata = myranker.load_resources()" + "myranker.load_resources()" ] }, { @@ -177,7 +177,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.17" }, "orig_nbformat": 4 }, diff --git a/examples/train_use_deezy_model_2.ipynb b/examples/train_use_deezy_model_2.ipynb index 1e49cc9f..6a10f511 100644 --- a/examples/train_use_deezy_model_2.ipynb +++ b/examples/train_use_deezy_model_2.ipynb @@ -109,7 +109,7 @@ "outputs": [], "source": [ "# Load the resources:\n", - "myranker.mentions_to_wikidata = myranker.load_resources()" + "myranker.load_resources()" ] }, { @@ -173,7 +173,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.17" }, "orig_nbformat": 4 }, diff --git a/examples/train_use_deezy_model_3.ipynb b/examples/train_use_deezy_model_3.ipynb index f2b2f226..53925ea4 100644 --- a/examples/train_use_deezy_model_3.ipynb +++ b/examples/train_use_deezy_model_3.ipynb @@ -111,7 +111,7 @@ "outputs": [], "source": [ "# Load the resources:\n", - "myranker.mentions_to_wikidata 
= myranker.load_resources()" + "myranker.load_resources()" ] }, { @@ -168,7 +168,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.17" }, "orig_nbformat": 4 }, diff --git a/experiments/experiment.py b/experiments/experiment.py index 2e758586..617af0dc 100644 --- a/experiments/experiment.py +++ b/experiments/experiment.py @@ -224,10 +224,7 @@ def prepare_data(self) -> dict: # Obtain candidates per sentence: for sentence_id in tqdm(dMentionsPred): pred_mentions_sent = dMentionsPred[sentence_id] - ( - wk_cands, - self.myranker.already_collected_cands, - ) = self.myranker.find_candidates(pred_mentions_sent) + wk_cands = self.myranker.find_candidates(pred_mentions_sent) dCandidates[sentence_id] = wk_cands # ------------------------------------------- diff --git a/experiments/toponym_resolution.py b/experiments/toponym_resolution.py index 0fb55bd8..de2f1dfe 100644 --- a/experiments/toponym_resolution.py +++ b/experiments/toponym_resolution.py @@ -166,14 +166,14 @@ # ----------------------------------------- # Ranker loading resources and training a model: # Load the resources: - myranker.mentions_to_wikidata = myranker.load_resources() + myranker.load_resources() # Train a DeezyMatch model if needed: myranker.train() # ----------------------------------------- # Linker loading resources: # Load linking resources: - mylinker.linking_resources = mylinker.load_resources() + mylinker.load_resources() # ----------------------------------------- # Prepare experiment: diff --git a/geoparser/pipeline.py b/geoparser/pipeline.py index dc0d095d..ac994df4 100644 --- a/geoparser/pipeline.py +++ b/geoparser/pipeline.py @@ -118,7 +118,7 @@ def __init__( # Ranker loading resources and training a model: # Load the resources: - self.myranker.mentions_to_wikidata = self.myranker.load_resources() + self.myranker.load_resources() # Train a DeezyMatch model if needed: self.myranker.train() @@ -127,7 +127,7 @@ def 
__init__( # Linker loading resources: # Load linking resources: - self.mylinker.linking_resources = self.mylinker.load_resources() + self.mylinker.load_resources() # Train a linking model if needed (it requires myranker to generate # potential candidates to the training set): @@ -223,9 +223,7 @@ def run_sentence( rmentions = [{"mention": y["mention"]} for y in mentions] # Perform candidate ranking: - wk_cands, self.myranker.already_collected_cands = self.myranker.find_candidates( - rmentions - ) + wk_cands = self.myranker.find_candidates(rmentions) mentions_dataset = dict() mentions_dataset["linking"] = [] @@ -685,9 +683,7 @@ def run_candidate_selection(self, document_dataset: List[dict]) -> dict: mentions = [{"mention": m} for m in mentions] # Perform candidate ranking: - wk_cands, self.myranker.already_collected_cands = self.myranker.find_candidates( - mentions - ) + wk_cands = self.myranker.find_candidates(mentions) return wk_cands def run_disambiguation( diff --git a/geoparser/ranking.py b/geoparser/ranking.py index 63bdd3b6..15a46298 100644 --- a/geoparser/ranking.py +++ b/geoparser/ranking.py @@ -50,18 +50,18 @@ class Ranker: ) >>> # Load resources - >>> ranker.mentions_to_wikidata = ranker.load_resources() + >>> ranker.load_resources() >>> # Train the ranker (if applicable) >>> ranker.train() >>> # Perform candidate selection >>> queries = ['London', 'Paraguay'] - >>> candidates, already_collected = ranker.run(queries) + >>> candidates = ranker.run(queries) >>> # Find candidates for mentions >>> mentions = [{'mention': 'London'}, {'mention': 'Paraguay'}] - >>> mention_candidates, mention_already_collected = ranker.find_candidates(mentions) + >>> mention_candidates = ranker.find_candidates(mentions) >>> # Print the results >>> print("Candidate Selection Results:") @@ -136,7 +136,7 @@ def __init__( "overwrite_training": False, "do_test": False, }, - already_collected_cands: Optional[dict] = dict(), + already_collected_cands: Optional[dict] = None, ): """
Initialize a Ranker object. @@ -147,7 +147,11 @@ def __init__( self.wikidata_to_mentions = wikidata_to_mentions self.strvar_parameters = strvar_parameters self.deezy_parameters = deezy_parameters - self.already_collected_cands = already_collected_cands + + if already_collected_cands: + self.already_collected_cands = already_collected_cands + else: + self.already_collected_cands = dict() def __str__(self) -> str: """ @@ -173,18 +177,10 @@ def __str__(self) -> str: return s - def load_resources(self) -> dict: + def load_resources(self): """ Load the ranker resources. - Returns: - dict: - The loaded mentions-to-wikidata dictionary, which maps a - mention (e.g. ``"London"``) to the Wikidata entities that are - referred to by this mention on Wikipedia (e.g. ``Q84``, - ``Q2477346``). The data also includes, for each entity, their - normalized "relevance", i.e. number of in-links across Wikipedia. - Note: This method loads the mentions-to-wikidata and wikidata-to-mentions dictionaries from the resources directory, @@ -195,6 +191,12 @@ def load_resources(self) -> dict: It filters the dictionaries to remove noise and updates the class attributes accordingly. + The loaded mentions-to-wikidata dictionary, which maps a mention + (e.g. ``"London"``) to the Wikidata entities that are + referred to by this mention on Wikipedia (e.g. ``Q84``, + ``Q2477346``). The data also includes, for each entity, their + normalized "relevance", i.e. number of in-links across Wikipedia. + The method also initialises ``pandarallel`` if needed by the candidate ranking method (if the ``method`` set in the initialiser of the ``Ranker`` was set to "partialmatch" or "levenshtein"). @@ -254,8 +256,6 @@ def load_resources(self) -> dict: pandarallel.initialize(nb_workers=10) os.environ["TOKENIZERS_PARALLELISM"] = "true" - return self.mentions_to_wikidata - def train(self) -> None: """ Training a DeezyMatch model. 
The training will be skipped if the model @@ -466,7 +466,7 @@ def partial_match(self, queries: List[str], damlev: bool) -> Tuple[dict, dict]: self.already_collected_cands[query] = mention_df - return candidates, self.already_collected_cands + return candidates def deezy_on_the_fly(self, queries: List[str]) -> Tuple[dict, dict]: """ @@ -490,7 +490,7 @@ def deezy_on_the_fly(self, queries: List[str]) -> Tuple[dict, dict]: Example: >>> ranker = Ranker(...) - >>> ranker.mentions_to_wikidata = ranker.load_resources() + >>> ranker.load_resources() >>> queries = ['London', 'Shefrield'] >>> candidates, already_collected = ranker.deezy_on_the_fly(queries) >>> print(candidates) @@ -515,7 +515,7 @@ def deezy_on_the_fly(self, queries: List[str]) -> Tuple[dict, dict]: dm_output = self.deezy_parameters["dm_output"] # first we fill in the perfect matches and already collected queries - cands_dict, self.already_collected_cands = self.perfect_match(queries) + cands_dict = self.perfect_match(queries) # the rest go through remainers = [x for x, y in cands_dict.items() if len(y) == 0] @@ -565,7 +565,7 @@ def deezy_on_the_fly(self, queries: List[str]) -> Tuple[dict, dict]: self.already_collected_cands[row["query"]] = returned_cands - return cands_dict, self.already_collected_cands + return cands_dict def run(self, queries: List[str]) -> Tuple[dict, dict]: """ @@ -583,9 +583,9 @@ def run(self, queries: List[str]) -> Tuple[dict, dict]: Example: >>> myranker = Ranker(method="perfectmatch", ...) 
- >>> myranker.mentions_to_wikidata = myranker.load_resources() + >>> myranker.load_resources() >>> queries = ['London', 'Barcelona', 'Bologna'] - >>> candidates, already_collected = myranker.run(queries) + >>> candidates = myranker.run(queries) >>> print(candidates) {'London': {'London': 1.0}, 'Barcelona': {'Barcelona': 1.0}, 'Bologna': {'Bologna': 1.0}} >>> print(already_collected) @@ -674,7 +674,7 @@ def find_candidates(self, mentions: List[dict]) -> Tuple[dict, dict]: queries = list(set([mention["mention"] for mention in mentions])) # Pass the mentions to :py:meth:`geoparser.ranking.Ranker.run` - cands, self.already_collected_cands = self.run(queries) + cands = self.run(queries) # Get Wikidata candidates wk_cands = dict() @@ -702,4 +702,4 @@ def find_candidates(self, mentions: List[dict]) -> Tuple[dict, dict]: "Candidates": found_cands, } - return wk_cands, self.already_collected_cands + return wk_cands diff --git a/tests/test_disambiguation.py b/tests/test_disambiguation.py index 1a206f2f..b1740d74 100644 --- a/tests/test_disambiguation.py +++ b/tests/test_disambiguation.py @@ -134,14 +134,14 @@ def test_train(): # ----------------------------------------- # Ranker loading resources and training a model: # Load the resources: - myranker.mentions_to_wikidata = myranker.load_resources() + myranker.load_resources() # Train a DeezyMatch model if needed: myranker.train() # ----------------------------------------- # Linker loading resources: # Load linking resources: - mylinker.linking_resources = mylinker.load_resources() + mylinker.load_resources() # Train a linking model if needed (it requires myranker to generate potential # candidates to the training set): mylinker.rel_params["ed_model"] = mylinker.train_load_model(myranker) @@ -236,14 +236,14 @@ def test_load_eval_model(): # ----------------------------------------- # Ranker loading resources and training a model: # Load the resources: - myranker.mentions_to_wikidata = myranker.load_resources() + 
myranker.load_resources() # Train a DeezyMatch model if needed: myranker.train() # ----------------------------------------- # Linker loading resources: # Load linking resources: - mylinker.linking_resources = mylinker.load_resources() + mylinker.load_resources() # Train a linking model if needed (it requires myranker to generate potential # candidates to the training set): mylinker.rel_params["ed_model"] = mylinker.train_load_model(myranker) diff --git a/tests/test_experiments.py b/tests/test_experiments.py index 825e29db..f8e11738 100644 --- a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -69,10 +69,10 @@ def test_load_data(): myner.train() myner.pipe = myner.create_pipeline() - myranker.mentions_to_wikidata = myranker.load_resources() + myranker.load_resources() myranker.train() - mylinker.linking_resources = mylinker.load_resources() + mylinker.load_resources() # -------------------------------------- # Instantiate the experiment: @@ -172,10 +172,10 @@ def test_apply(): myner.train() myner.pipe = myner.create_pipeline() - myranker.mentions_to_wikidata = myranker.load_resources() + myranker.load_resources() myranker.train() - mylinker.linking_resources = mylinker.load_resources() + mylinker.load_resources() # -------------------------------------- # Instantiate the experiment: diff --git a/tests/test_ranking.py b/tests/test_ranking.py index 5f3dae5e..998092e3 100644 --- a/tests/test_ranking.py +++ b/tests/test_ranking.py @@ -33,14 +33,14 @@ def test_perfect_match(): method="perfectmatch", resources_path="resources/wikidata/", ) - myranker.mentions_to_wikidata = myranker.load_resources() - candidates, already_collected_cands = myranker.perfect_match(["London"]) + myranker.load_resources() + candidates = myranker.perfect_match(["London"]) assert candidates["London"]["London"] == 1.0 - candidates, already_collected_cands = myranker.perfect_match(["Lvndon"]) + candidates = myranker.perfect_match(["Lvndon"]) assert candidates["Lvndon"] == {} - 
candidates, already_collected_cands = myranker.perfect_match(["Paperopoli"]) + candidates = myranker.perfect_match(["Paperopoli"]) assert candidates["Paperopoli"] == {} @@ -99,11 +99,11 @@ def test_partial_match(): wikidata_to_mentions=dict(), ) - myranker.mentions_to_wikidata = myranker.load_resources() + myranker.load_resources() # Test that perfect_match acts before partial match myranker.mentions_to_wikidata = {"London": "Q84"} - candidates, already_collected_cands = myranker.partial_match( + candidates = myranker.partial_match( ["London"], damlev=False ) assert candidates["London"]["London"] == 1.0 @@ -111,7 +111,7 @@ def test_partial_match(): # Test that damlev works myranker.already_collected_cands = {} - candidates, already_collected_cands = myranker.partial_match( + candidates = myranker.partial_match( ["Lvndvn"], damlev=True ) assert candidates["Lvndvn"]["London"] == 0.6666666567325592 @@ -120,7 +120,7 @@ def test_partial_match(): myranker.mentions_to_wikidata = {"New York City": "Q60"} myranker.already_collected_cands = {} - candidates, already_collected_cands = myranker.partial_match( + candidates = myranker.partial_match( ["New York"], damlev=False ) assert candidates["New York"]["New York City"] == 0.6153846153846154 @@ -128,14 +128,14 @@ def test_partial_match(): myranker.mentions_to_wikidata = {"New York City": "Q60"} myranker.already_collected_cands = {} - candidates, already_collected_cands = myranker.partial_match( + candidates = myranker.partial_match( ["Lvndvn"], damlev=False ) assert candidates["Lvndvn"] == {} myranker.already_collected_cands = {} - candidates, already_collected_cands = myranker.partial_match( + candidates = myranker.partial_match( ["asdasd"], damlev=True ) assert candidates["asdasd"] == {"New York City": 0.0} @@ -176,14 +176,14 @@ def test_deezy_on_the_fly(): ) # Test that perfect_match acts before deezy - myranker.mentions_to_wikidata = myranker.load_resources() - candidates, already_collected_cands = 
myranker.deezy_on_the_fly(["London"]) + myranker.load_resources() + candidates = myranker.deezy_on_the_fly(["London"]) assert candidates["London"]["London"] == 1.0 # Test that deezy works myranker.already_collected_cands = {} - candidates, already_collected_cands = myranker.deezy_on_the_fly( + candidates = myranker.deezy_on_the_fly( ["Ashton-cnderLyne"] ) assert ( @@ -227,19 +227,15 @@ def test_find_candidates(): ) # Test that perfect_match acts before deezy - myranker.mentions_to_wikidata = myranker.load_resources() - candidates, already_collected_cands = myranker.find_candidates( - [{"mention": "London"}] - ) + myranker.load_resources() + candidates = myranker.find_candidates([{"mention": "London"}]) assert candidates["London"]["London"]["Score"] == 1.0 assert "Q84" in candidates["London"]["London"]["Candidates"] # Test that deezy works myranker.already_collected_cands = {} - candidates, already_collected_cands = myranker.find_candidates( - [{"mention": "Sheftield"}] - ) + candidates = myranker.find_candidates([{"mention": "Sheftield"}]) assert ( candidates["Sheftield"]["Sheffield"]["Score"] > 0.0 and candidates["Sheftield"]["Sheffield"]["Score"] < 1.0 @@ -250,56 +246,44 @@ def test_find_candidates(): myranker.method = "perfectmatch" # Test that perfect_match acts before deezy - myranker.mentions_to_wikidata = myranker.load_resources() - candidates, already_collected_cands = myranker.find_candidates( - [{"mention": "Sheffield"}] - ) + myranker.load_resources() + candidates = myranker.find_candidates([{"mention": "Sheffield"}]) assert candidates["Sheffield"]["Sheffield"]["Score"] == 1.0 assert "Q42448" in candidates["Sheffield"]["Sheffield"]["Candidates"] myranker.already_collected_cands = {} - candidates, already_collected_cands = myranker.find_candidates( - [{"mention": "Sheftield"}] - ) + candidates = myranker.find_candidates([{"mention": "Sheftield"}]) assert candidates["Sheftield"] == {} # Test that check if contained works myranker.method = "partialmatch" # 
Test that perfect_match acts before partialmatch - myranker.mentions_to_wikidata = myranker.load_resources() + myranker.load_resources() - candidates, already_collected_cands = myranker.find_candidates( - [{"mention": "Sheffield"}] - ) + candidates = myranker.find_candidates([{"mention": "Sheffield"}]) assert candidates["Sheffield"]["Sheffield"]["Score"] == 1.0 assert "Q42448" in candidates["Sheffield"]["Sheffield"]["Candidates"] myranker.already_collected_cands = {} - candidates, already_collected_cands = myranker.find_candidates( - [{"mention": "Sheftield"}] - ) + candidates = myranker.find_candidates([{"mention": "Sheftield"}]) assert "Sheffield" not in candidates["Sheftield"] # Test that levenshtein works myranker.method = "levenshtein" # Test that perfect_match acts before partialmatch - myranker.mentions_to_wikidata = myranker.load_resources() + myranker.load_resources() - candidates, already_collected_cands = myranker.find_candidates( - [{"mention": "Sheffield"}] - ) + candidates = myranker.find_candidates([{"mention": "Sheffield"}]) assert candidates["Sheffield"]["Sheffield"]["Score"] == 1.0 assert "Q42448" in candidates["Sheffield"]["Sheffield"]["Candidates"] myranker.already_collected_cands = {} - candidates, already_collected_cands = myranker.find_candidates( - [{"mention": "Sheftield"}] - ) + candidates = myranker.find_candidates([{"mention": "Sheftield"}]) assert ( candidates["Sheftield"]["Sheffield"]["Score"] > 0.0 and candidates["Sheftield"]["Sheffield"]["Score"] < 1.0 diff --git a/utils/rel_utils.py b/utils/rel_utils.py index 7589924f..e19c915e 100644 --- a/utils/rel_utils.py +++ b/utils/rel_utils.py @@ -323,7 +323,7 @@ def prepare_rel_trainset( # Format the mentions are required by the ranker: all_mentions = [{"mention": mention} for mention in all_mentions] # Use the ranker to find candidates: - wk_cands, myranker.already_collected_cands = myranker.find_candidates(all_mentions) + wk_cands = myranker.find_candidates(all_mentions) # Rank the 
candidates: rel_json = rank_candidates( rel_json,