Skip to content

Commit

Permalink
fix(pipeline): replace char special by oe
Browse files Browse the repository at this point in the history
  • Loading branch information
hlecuyer authored and vmttn committed Jul 9, 2024
1 parent c59e78f commit 5a6a661
Showing 1 changed file with 7 additions and 4 deletions.
11 changes: 7 additions & 4 deletions pipeline/dags/dag_utils/sources/monenfant.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,14 @@
logger = logging.getLogger(__name__)


def normalize(text: str) -> str:
    """Return *text* with accents stripped and the "oe" ligature expanded.

    The string is decomposed with Unicode NFKD so that accented letters
    become a base letter followed by combining marks; the combining marks
    are then dropped. The ligatures "œ" and "Œ" have no NFKD decomposition,
    so they are replaced explicitly by "oe" and "OE".
    """
    # Decompose the unicode string into its base and combining characters
    nfkd_form = unicodedata.normalize("NFKD", text)

    # Filter out the combining characters (like accents)
    normalized_text = "".join(c for c in nfkd_form if not unicodedata.combining(c))

    # Replace the oe ligature in both cases; handling only the lowercase
    # form would leave names that start with the ligature (e.g. "Œuilly")
    # untouched.
    return normalized_text.replace("œ", "oe").replace("Œ", "OE")


# Backward-compatible alias for the function's previous name.
unaccent = normalize


def get_location(city_code: str, commune: str, region: str) -> str:
Expand All @@ -33,8 +36,8 @@ def get_location(city_code: str, commune: str, region: str) -> str:
The location string is formatted as "Xeme Arrondissement Paris" for Paris.
For other cities, it is formatted like "Lille Nord".
"""
commune = unaccent(commune)
region = unaccent(region)
commune = normalize(commune)
region = normalize(region)

if "Arrondissement" in commune:
commune = commune.split()[0]
Expand Down

0 comments on commit 5a6a661

Please sign in to comment.