Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Automate location extraction and English translation #642

Merged
merged 24 commits into from
Aug 5, 2024
Merged
Changes from 1 commit
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
feat: added location extraction
cka-y committed Jul 30, 2024
commit e2fee2b9f5159682bc526d5f98ccfe56c9f8c2c7
54 changes: 45 additions & 9 deletions functions-python/extract_location/src/location_extractor.py
Original file line number Diff line number Diff line change
@@ -90,6 +90,13 @@ def reverse_coords(
subdivision_name,
municipality,
) = reverse_coord(lat, lon, include_lang_header)
logging.info(
f"Reverse geocoding result for point lat={lat}, lon={lon}: "
f"country_code={country_code}, "
f"country={country}, "
f"subdivision={subdivision_name}, "
f"municipality={municipality}"
)
if country_code is not None:
municipalities.append(municipality) if municipality else None
subdivisions.append(subdivision_name) if subdivision_name else None
@@ -118,17 +125,23 @@ def reverse_coords(
most_common_subdivision, subdivision_count = Counter(subdivisions).most_common(
1
)[0]
logging.info(
f"Most common municipality: {most_common_municipality} with count {municipality_count}"
)
logging.info(
f"Most common subdivision: {most_common_subdivision} with count {subdivision_count}"
)

# Apply decision threshold to determine final values
if municipality_count / len(points) < decision_threshold:
if municipality_count / len(results) < decision_threshold:
most_common_municipality = None

if subdivision_count / len(points) < decision_threshold:
if subdivision_count / len(results) < decision_threshold:
most_common_subdivision = None

return LocationInfo(
country_codes=country_codes,
countries=countries,
country_codes=list(set(country_codes)),
countries=list(set(countries)),
most_common_subdivision_name=most_common_subdivision,
most_common_municipality=most_common_municipality,
)
@@ -151,18 +164,41 @@ def update_location(location_info: LocationInfo, dataset_id: str, session: Sessi
raise Exception(f"Dataset {dataset_id} does not exist in the database.")
locations = []
for i in range(len(location_info.country_codes)):
location = Location(
country_code=location_info.country_codes[i],
country=location_info.countries[i],
subdivision_name=location_info.most_common_subdivision_name,
municipality=location_info.most_common_municipality,
logging.info(
f"[{dataset_id}] Extracted location: "
f"country={location_info.countries[i]}, "
f"country_code={location_info.country_codes[i]}, "
f"subdivision={location_info.most_common_subdivision_name}, "
f"municipality={location_info.most_common_municipality}"
)
# Check if location already exists
location_id = (
f"{location_info.country_codes[i] or ''}-"
f"{location_info.most_common_subdivision_name or ''}-"
f"{location_info.most_common_municipality or ''}"
).replace(" ", "_")
location = (
session.query(Location).filter(Location.id == location_id).one_or_none()
)
if location is not None:
logging.info(f"[{dataset_id}] Location already exists: {location_id}")
else:
logging.info(f"[{dataset_id}] Creating new location: {location_id}")
location = Location(
id=location_id,
)
location.country = location_info.countries[i]
location.country_code = location_info.country_codes[i]
location.subdivision = location_info.most_common_subdivision_name
location.municipality = location_info.most_common_municipality
locations.append(location)
if len(locations) == 0:
raise Exception("No locations found for the dataset.")
dataset.locations.clear()
dataset.locations = locations

# Update the location of the related feed as well
dataset.feed.locations.clear()
dataset.feed.locations = locations

session.add(dataset)
28 changes: 26 additions & 2 deletions functions-python/extract_location/tests/test_extract_location.py
Original file line number Diff line number Diff line change
@@ -77,11 +77,35 @@ def test_reverse_coords(self, mock_get):
points = [(34.0522, -118.2437), (37.7749, -122.4194)]
location_info = reverse_coords(points)

self.assertEqual(location_info.country_codes, ["US", "US"])
self.assertEqual(location_info.countries, ["United States", "United States"])
self.assertEqual(location_info.country_codes, ["US"])
self.assertEqual(location_info.countries, ["United States"])
self.assertEqual(location_info.most_common_subdivision_name, "California")
self.assertEqual(location_info.most_common_municipality, "Los Angeles")

@patch("extract_location.src.location_extractor.reverse_coord")
def test_reverse_coords_decision(self, mock_reverse_coord):
    """Exercise reverse_coords with a municipality that sits on the threshold.

    The patched reverse_coord reports the same country and subdivision for
    all four points, but "San Francisco" for only two of them (a 0.5 share).
    With decision_threshold=0.5 the test expects San Francisco to be
    reported as the most common municipality, and the country lists to
    hold a single entry each.
    """
    # Each sample pairs a (lat, lon) point with the municipality that the
    # mocked reverse geocoder hands back for it, in call order.
    samples = [
        ((34.0522, -118.2437), "Los Angeles"),
        ((37.7749, -122.4194), "San Francisco"),
        ((32.7157, -117.1611), "San Diego"),
        ((37.7749, -122.4194), "San Francisco"),  # repeated to test counting
    ]
    mock_reverse_coord.side_effect = [
        ("us", "United States", "California", city) for _, city in samples
    ]
    points = [coords for coords, _ in samples]

    result = reverse_coords(points, decision_threshold=0.5)

    self.assertEqual(result.country_codes, ["us"])
    self.assertEqual(result.countries, ["United States"])
    self.assertEqual(result.most_common_subdivision_name, "California")
    self.assertEqual(result.most_common_municipality, "San Francisco")

def test_update_location(self):
# Setup mock database session and models
mock_session = MagicMock(spec=Session)