From 98e9c25a4f73eac9aa66716a6493c8cd9d0e5476 Mon Sep 17 00:00:00 2001 From: Tim Hobson Date: Fri, 10 Jan 2025 15:10:24 +0000 Subject: [PATCH] Move coords field into WikidataLinks dataclass --- t_res/geoparser/linking.py | 60 ++++++++++++++++++-------------------- t_res/utils/dataclasses.py | 4 +-- tests/test_dataclasses.py | 9 +++++- 3 files changed, 38 insertions(+), 35 deletions(-) diff --git a/t_res/geoparser/linking.py b/t_res/geoparser/linking.py index 87c28115..604bff54 100644 --- a/t_res/geoparser/linking.py +++ b/t_res/geoparser/linking.py @@ -96,6 +96,16 @@ def wkdt_class(self, wqid: str) -> Optional[str]: """ return self.resources["entity2class"].get(wqid, None) + def wkdt_coords(self, wqid: str) -> Optional[Tuple[float, float]]: + """ + Returns the lat-lon coordinates for the given Wikidata entry, if available. + + Returns: + Latitude and longitude coordinates for the given Wikidata entry, if + available. + """ + return self.resources["wqid_to_coords"].get(wqid, None) + def empty_candidates(self, mention: Mention, ranking_method: str, @@ -144,6 +154,22 @@ def load(self): ) as f: self.resources["entity2class"] = json.load(f) + print(" > Loading gazetteer.") + gaz = pd.read_csv( + os.path.join(self.resources_path, "wikidata/wikidata_gazetteer.csv"), + usecols=["wikidata_id", "latitude", "longitude"], + ) + gaz["latitude"] = gaz["latitude"].astype(float) + gaz["longitude"] = gaz["longitude"].astype(float) + gaz["coords"] = gaz[["latitude", "longitude"]].to_numpy().tolist() + wqid_to_coords = dict(zip(gaz.wikidata_id, gaz.coords)) + self.resources["wqid_to_coords"] = wqid_to_coords + gaz_ids = set(gaz["wikidata_id"].tolist()) + # Keep only wikipedia entities in the gazetteer: + self.resources["wikidata_locs"] = gaz_ids + gaz_ids = "" + gaz = "" + print("*** Linking resources loaded!\n") def run( @@ -284,6 +310,7 @@ def wikidata_links( links = [MostPopularLink( wqid=wqid, wkdt_class=self.wkdt_class(wqid), + coords=self.wkdt_coords(wqid), freq=self.resources["mentions_to_wikidata"][match.variation][wqid]) for wqid in match.wqid_links] return links @@ -320,38 +347,6 @@ class ByDistanceLinker(Linker): # Override the method_name class attribute. method_name: str = "bydistance" - def load(self): - """ - Loads the linking resources and assigns them to instance variables. - """ - super().load() - - print(" > Loading gazetteer.") - gaz = pd.read_csv( - os.path.join(self.resources_path, "wikidata/wikidata_gazetteer.csv"), - usecols=["wikidata_id", "latitude", "longitude"], - ) - gaz["latitude"] = gaz["latitude"].astype(float) - gaz["longitude"] = gaz["longitude"].astype(float) - gaz["coords"] = gaz[["latitude", "longitude"]].to_numpy().tolist() - wqid_to_coords = dict(zip(gaz.wikidata_id, gaz.coords)) - self.resources["wqid_to_coords"] = wqid_to_coords - gaz_ids = set(gaz["wikidata_id"].tolist()) - # Keep only wikipedia entities in the gazetteer: - self.resources["wikidata_locs"] = gaz_ids - gaz_ids = "" - gaz = "" - - def wkdt_coords(self, wqid: str) -> Optional[Tuple[float, float]]: - """ - Returns the lat-lon coordinates for the given Wikidata entry, if available. - - Returns: - Latitude and longitude coordinates for the given Wikidata entry, if - available. - """ - return self.resources["wqid_to_coords"].get(wqid, None) - def wikidata_links( self, match: StringMatchLinks, @@ -622,6 +617,7 @@ def wikidata_links( links = [RelDisambLink( wqid=wqid, wkdt_class=self.wkdt_class(wqid), + coords=self.wkdt_coords(wqid), freq=self.resources["mentions_to_wikidata"][match.variation][wqid], normalized_score=self.resources["mentions_to_wikidata_normalized"][match.variation][ wqid diff --git a/t_res/utils/dataclasses.py b/t_res/utils/dataclasses.py index cadaa362..9a7711b5 100644 --- a/t_res/utils/dataclasses.py +++ b/t_res/utils/dataclasses.py @@ -300,9 +300,11 @@ class WikidataLink: Attributes: wqid (str): The Wikidata ID. wkdt_class (Optional[str]): The Wikidata class of this Wikidata entry (if available). + coords (Optional[Tuple[float, float]]): The lat-lon coordinates of the link in Wikidata. """ wqid: str wkdt_class: Optional[str] + coords: Optional[Tuple[float, float]] # For API deserialisation. def from_dict(data: dict) -> 'WikidataLink': @@ -333,12 +335,10 @@ class ByDistanceLink(WikidataLink): Wikidata under the `bydistance` linking method. Attributes: - coords (Optional[Tuple[float, float]]): The lat-lon coordinates of the link in Wikidata. place_of_pub_coords (Optional[Tuple[float, float]]): The lat-lon coordinates of the place of publication. geodist (Optional[float]): The geodesic distance between the wqid and the origin wqid. normalized_score (float): The normalized score from resource `mentions_to_wikidata_normalized.json`. """ - coords: Optional[Tuple[float, float]] place_of_pub_coords: Optional[Tuple[float, float]] geodist: Optional[float] normalized_score: float diff --git a/tests/test_dataclasses.py b/tests/test_dataclasses.py index d2c101b2..017dee28 100644 --- a/tests/test_dataclasses.py +++ b/tests/test_dataclasses.py @@ -86,9 +86,10 @@ def test_candidate_matches(): def test_wikidata_links(): - wikidata_link = MostPopularLink('Q619055', wkdt_class='Q1076486', freq=22) + wikidata_link = MostPopularLink('Q619055', wkdt_class='Q1076486', coords=(55.76, -2.01583), freq=22) assert wikidata_link.wqid == 'Q619055' assert wikidata_link.freq == 22 + assert wikidata_link.coords == (55.76, -2.01583) wikidata_link = ByDistanceLink( 'Q619055', @@ -107,6 +108,7 @@ def test_wikidata_links(): wikidata_link = RelDisambLink( 'Q619055', wkdt_class='Q1076486', + coords=(55.76, -2.01583), freq=22, normalized_score=0.03571428571428571, ) @@ -122,12 +124,14 @@ def test_candidate_links(): RelDisambLink( 'Q619055', wkdt_class='Q1076486', + coords=(55.76, -2.01583), freq=5, normalized_score=0.03571428571428571, ), RelDisambLink( 'Q5953687', wkdt_class='Q23764314', + coords=(55.76, -2.01583), freq=33, normalized_score=0.22857142857142856, ), @@ -146,11 +150,13 @@ def test_predicted_links(): MostPopularLink( 'Q619055', wkdt_class='Q1076486', + coords=(55.76, -2.01583), freq=5 ), MostPopularLink( 'Q5953687', wkdt_class='Q23764314', + coords=(55.76, -2.01583), freq=33, ), ] @@ -173,6 +179,7 @@ def test_predicted_links(): assert predicted_links.best_wikidata_link() == MostPopularLink( 'Q5953687', wkdt_class='Q23764314', + coords=(55.76, -2.01583), freq=33 )