Skip to content

Commit

Permalink
Move coords field into WikidataLinks dataclass
Browse files Browse the repository at this point in the history
  • Loading branch information
thobson88 committed Jan 10, 2025
1 parent ce14a48 commit 98e9c25
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 35 deletions.
60 changes: 28 additions & 32 deletions t_res/geoparser/linking.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,16 @@ def wkdt_class(self, wqid: str) -> Optional[str]:
"""
return self.resources["entity2class"].get(wqid, None)

def wkdt_coords(self, wqid: str) -> Optional[Tuple[float, float]]:
    """
    Returns the lat-lon coordinates for the given Wikidata entry, if available.

    Arguments:
        wqid (str): The Wikidata ID to look up.

    Returns:
        Latitude and longitude coordinates for the given Wikidata entry as
        a ``(latitude, longitude)`` tuple, or ``None`` if the ID is not
        present in the ``wqid_to_coords`` resource.
    """
    coords = self.resources["wqid_to_coords"].get(wqid)
    if coords is None:
        return None
    # The resource is built from `.tolist()`, so each entry may be a
    # two-element list. Normalise to a tuple so the value matches the
    # declared return type and compares equal to tuple coordinates.
    return tuple(coords)

def empty_candidates(self,
mention: Mention,
ranking_method: str,
Expand Down Expand Up @@ -144,6 +154,22 @@ def load(self):
) as f:
self.resources["entity2class"] = json.load(f)

print(" > Loading gazetteer.")
gaz = pd.read_csv(
os.path.join(self.resources_path, "wikidata/wikidata_gazetteer.csv"),
usecols=["wikidata_id", "latitude", "longitude"],
)
gaz["latitude"] = gaz["latitude"].astype(float)
gaz["longitude"] = gaz["longitude"].astype(float)
gaz["coords"] = gaz[["latitude", "longitude"]].to_numpy().tolist()
wqid_to_coords = dict(zip(gaz.wikidata_id, gaz.coords))
self.resources["wqid_to_coords"] = wqid_to_coords
gaz_ids = set(gaz["wikidata_id"].tolist())
# Keep only wikipedia entities in the gazetteer:
self.resources["wikidata_locs"] = gaz_ids
gaz_ids = ""
gaz = ""

print("*** Linking resources loaded!\n")

def run(
Expand Down Expand Up @@ -284,6 +310,7 @@ def wikidata_links(
links = [MostPopularLink(
wqid=wqid,
wkdt_class=self.wkdt_class(wqid),
coords=self.wkdt_coords(wqid),
freq=self.resources["mentions_to_wikidata"][match.variation][wqid])
for wqid in match.wqid_links]
return links
Expand Down Expand Up @@ -320,38 +347,6 @@ class ByDistanceLinker(Linker):
# Override the method_name class attribute.
method_name: str = "bydistance"

def load(self):
    """
    Loads the linking resources and assigns them to instance variables.

    Calls the parent ``load`` first, then reads the Wikidata gazetteer CSV
    found under ``self.resources_path`` and stores two resources:

    * ``wqid_to_coords``: maps each Wikidata ID to a
      ``(latitude, longitude)`` tuple of floats.
    * ``wikidata_locs``: the set of Wikidata IDs present in the gazetteer.
    """
    super().load()

    print(" > Loading gazetteer.")
    gaz = pd.read_csv(
        os.path.join(self.resources_path, "wikidata/wikidata_gazetteer.csv"),
        usecols=["wikidata_id", "latitude", "longitude"],
    )
    wqids = gaz["wikidata_id"].tolist()
    latitudes = gaz["latitude"].astype(float).tolist()
    longitudes = gaz["longitude"].astype(float).tolist()

    # Store each coordinate pair as a tuple (not a list) so the values
    # match the Optional[Tuple[float, float]] type exposed by wkdt_coords.
    self.resources["wqid_to_coords"] = dict(zip(wqids, zip(latitudes, longitudes)))
    # The set of Wikidata IDs that have an entry in the gazetteer.
    self.resources["wikidata_locs"] = set(wqids)
    # No manual clean-up of the DataFrame is needed: the local names go
    # out of scope when this method returns.

def wkdt_coords(self, wqid: str) -> Optional[Tuple[float, float]]:
    """
    Returns the lat-lon coordinates for the given Wikidata entry, if available.

    Arguments:
        wqid (str): The Wikidata ID to look up.

    Returns:
        Latitude and longitude coordinates for the given Wikidata entry as
        a ``(latitude, longitude)`` tuple, or ``None`` if the ID is not
        present in the ``wqid_to_coords`` resource.
    """
    coords = self.resources["wqid_to_coords"].get(wqid)
    if coords is None:
        return None
    # The resource is built from `.tolist()`, so each entry may be a
    # two-element list. Normalise to a tuple so the value matches the
    # declared return type and compares equal to tuple coordinates.
    return tuple(coords)

def wikidata_links(
self,
match: StringMatchLinks,
Expand Down Expand Up @@ -622,6 +617,7 @@ def wikidata_links(
links = [RelDisambLink(
wqid=wqid,
wkdt_class=self.wkdt_class(wqid),
coords=self.wkdt_coords(wqid),
freq=self.resources["mentions_to_wikidata"][match.variation][wqid],
normalized_score=self.resources["mentions_to_wikidata_normalized"][match.variation][
wqid
Expand Down
4 changes: 2 additions & 2 deletions t_res/utils/dataclasses.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,9 +300,11 @@ class WikidataLink:
Attributes:
wqid (str): The Wikidata ID.
wkdt_class (Optional[str]): The Wikidata class of this Wikidata entry (if available).
coords (Optional[Tuple[float, float]]): The lat-lon coordinates of the link in Wikidata.
"""
wqid: str
wkdt_class: Optional[str]
coords: Optional[Tuple[float, float]]

# For API deserialisation.
def from_dict(data: dict) -> 'WikidataLink':
Expand Down Expand Up @@ -333,12 +335,10 @@ class ByDistanceLink(WikidataLink):
Wikidata under the `bydistance` linking method.
Attributes:
coords (Optional[Tuple[float, float]]): The lat-lon coordinates of the link in Wikidata.
place_of_pub_coords (Optional[Tuple[float, float]]): The lat-lon coordinates of the place of publication.
geodist (Optional[float]): The geodesic distance between the wqid and the origin wqid.
normalized_score (float): The normalized score from resource `mentions_to_wikidata_normalized.json`.
"""
coords: Optional[Tuple[float, float]]
place_of_pub_coords: Optional[Tuple[float, float]]
geodist: Optional[float]
normalized_score: float
Expand Down
9 changes: 8 additions & 1 deletion tests/test_dataclasses.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,10 @@ def test_candidate_matches():

def test_wikidata_links():

wikidata_link = MostPopularLink('Q619055', wkdt_class='Q1076486', freq=22)
wikidata_link = MostPopularLink('Q619055', wkdt_class='Q1076486', coords=(55.76, -2.01583), freq=22)
assert wikidata_link.wqid == 'Q619055'
assert wikidata_link.freq == 22
assert wikidata_link.coords == (55.76, -2.01583)

wikidata_link = ByDistanceLink(
'Q619055',
Expand All @@ -107,6 +108,7 @@ def test_wikidata_links():
wikidata_link = RelDisambLink(
'Q619055',
wkdt_class='Q1076486',
coords=(55.76, -2.01583),
freq=22,
normalized_score=0.03571428571428571,
)
Expand All @@ -122,12 +124,14 @@ def test_candidate_links():
RelDisambLink(
'Q619055',
wkdt_class='Q1076486',
coords=(55.76, -2.01583),
freq=5,
normalized_score=0.03571428571428571,
),
RelDisambLink(
'Q5953687',
wkdt_class='Q23764314',
coords=(55.76, -2.01583),
freq=33,
normalized_score=0.22857142857142856,
),
Expand All @@ -146,11 +150,13 @@ def test_predicted_links():
MostPopularLink(
'Q619055',
wkdt_class='Q1076486',
coords=(55.76, -2.01583),
freq=5
),
MostPopularLink(
'Q5953687',
wkdt_class='Q23764314',
coords=(55.76, -2.01583),
freq=33,
),
]
Expand All @@ -173,6 +179,7 @@ def test_predicted_links():
assert predicted_links.best_wikidata_link() == MostPopularLink(
'Q5953687',
wkdt_class='Q23764314',
coords=(55.76, -2.01583),
freq=33
)

Expand Down

0 comments on commit 98e9c25

Please sign in to comment.