Skip to content

Commit

Permalink
feat: add slovenian packager codes (#10124)
Browse files Browse the repository at this point in the history
* feat_add_si_packager_codes

* applies suggestions
  • Loading branch information
benbenben2 authored Apr 30, 2024
1 parent a724c12 commit 9577c03
Show file tree
Hide file tree
Showing 7 changed files with 808 additions and 0 deletions.
9 changes: 9 additions & 0 deletions lib/ProductOpener/Display.pm
Original file line number Diff line number Diff line change
Expand Up @@ -3915,6 +3915,15 @@ HTML
;
}

if ($packager_codes{$canon_tagid}{cc} eq 'si') {
$description .= <<HTML
<p>$packager_codes{$canon_tagid}{name}<br>
$packager_codes{$canon_tagid}{address} (Slovenija)
</p>
HTML
;
}

if ($packager_codes{$canon_tagid}{cc} eq 'uk') {

my $district = '';
Expand Down
1 change: 1 addition & 0 deletions lib/ProductOpener/PackagerCodes.pm
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ my %local_ec = (
NL => "EG",
PL => "WE",
PT => "CE",
SI => "ES",
UK => "EC",
);

Expand Down
541 changes: 541 additions & 0 deletions packager-codes/SI-merge-UTF-8.csv

Large diffs are not rendered by default.

Binary file modified packager-codes/geocode_addresses.sto
Binary file not shown.
Binary file modified packager-codes/packager_codes.sto
Binary file not shown.
255 changes: 255 additions & 0 deletions scripts/packager-codes/si-packagers-refresh.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,255 @@
'''
This file is part of Product Opener.
Product Opener
Copyright (C) 2011-2023 Association Open Food Facts
Contact: [email protected]
Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France
Product Opener is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
# PREREQUISITES
python3
apikey for geocode.maps.co (free account)
# INSTALLATION
## install virtual environment
sudo apt install python3.11-venv
python3 -m venv venv
source venv/bin/activate
## install needed packages
pip install polars
pip install requests
# FETCH INPUT FILE
# - from the Slovenian government website,
# download the current list of "Seznam odobrenih živilskih obratov"
# (List of approved food establishments):
# https://www.gov.si/zbirke/storitve/odobritev-zivilskega-obrata/
# https://www.gov.si/assets/organi-v-sestavi/UVHVVR/Varna-hrana/Odobritev-obrata/Obrati-Zivila-O_ang.pdf
# - download the latest version of tabula-java: https://github.com/tabulapdf/tabula-java/releases
# - convert the pdf file into csv (update release number and file name):
# $ java -jar tabula-1.0.5-jar-with-dependencies.jar Obrati-Zivila-O_ang.pdf \
# --lattice --format CSV --pages all > slovenian_packaging_raw.csv
# RUN
set the api_key variable at the top of this script, then run:
python3 si-packagers-refresh.py
# POSTPROCESSING
- deactivate the virtual environment:
deactivate
- delete the cache.db file
'''

import polars as pl
import re
import requests
import sys
from time import sleep
import dbm
import json



# CSV produced by tabula from the government PDF; read by main().
file_name = "slovenian_packaging_raw.csv"
# geocode.maps.co API key, interpolated into every geocoding URL.
# main() exits immediately while this is empty.
api_key = "" # TODO remove
# Semicolon-separated CSV written by main() with the cleaned codes,
# names, addresses and coordinates.
output_file_name = 'SI-merge-UTF-8.csv'

def clean_code(input_code: str) -> str:
    """Normalize a raw approval number from the source list.

    The raw list is inconsistent; the rewrites below bring values such as
    ``SI H - 728``, ``SI-907``, ``SI1194``, ``SI M1106`` or
    ``SI H-1015 ES`` to the forms ``SI 907``, ``SI H-728`` and
    ``SI M-1106``.

    Args:
        input_code: Raw code cell, expected to start with ``SI``.

    Returns:
        The normalized code, without any trailing ``ES``/``SI`` suffix.
    """
    # collapse runs of spaces so the literal replacements below can match
    input_code = re.sub(r' {2,}', ' ', input_code)

    # SI H-1015 ES -> drop the trailing suffix only (not every "ES")
    if input_code.endswith('ES'):
        input_code = input_code[:-len('ES')].strip()

    # SI M-1035 SI
    if input_code.endswith('SI'):
        input_code = re.sub(r"\b(SI|ES)$", "", input_code).strip()

    # SI H-731, SI 731 -> keep the second code; tolerate a missing space
    # after the comma and a dangling trailing comma
    if ',' in input_code:
        candidates = [part.strip() for part in input_code.split(',') if part.strip()]
        if len(candidates) > 1:
            input_code = candidates[1]
        elif candidates:
            input_code = candidates[0]

    # SI H - 728, SI H 728, SI H-728, also with M
    for letter in ('H', 'M'):
        input_code = input_code.replace(f'{letter} - ', f'{letter}-')
        input_code = input_code.replace(f'{letter} ', f'{letter}-')

    # SI - 907 -> SI 907
    input_code = input_code.replace(' - ', ' ')
    input_code = input_code.replace(' -', ' ')
    input_code = input_code.replace('SI-', 'SI ')

    # SI1194
    if 'SI ' not in input_code:
        input_code = input_code.replace('SI', 'SI ')
    # SI M1106
    if 'M-' not in input_code:
        input_code = input_code.replace('M', 'M-')

    return input_code


def clean_address(input_address: str) -> str:
    """Extract ``"Street 12A, 1234 City"`` from a raw address cell.

    Two patterns are tried in turn, both anchored at the end of the cell:
    first an address with a house number, then one without. The result is
    title-cased. When neither pattern matches, a "Match problem" line is
    printed and the cell is returned with carriage returns replaced by
    ``<>``.

    Args:
        input_address: Raw address cell; may contain ``\\r`` line breaks.

    Returns:
        ``"<street>, <postal code> <city>"`` in title case, or the
        unmatched input as described above.
    """
    # special marker because
    #  sometimes a new line separates 2 addresses
    #  sometimes the line for a single address is split in 2
    input_address = "<>".join(input_address.split('\r'))

    # fetch last occurrence
    #  words 123A, place, 4567 city name
    #  Á found in a city name (PROSENJAKOVCI -PÁRTOSFALVA)
    pattern = r'(([a-zčćžđšA-ZČĆŽĐŠŽ\s.-]+\d+[ABCDEFGIJ]?),(?:[a-zčćžđšA-ZČĆŽĐŠŽ\s\<\>.-]+,\s*)?[\<\>]*(\s*\d{4}[a-zčćžđšA-ZČĆŽĐŠŽÁ\s\<\>.-]+)$)'
    # SI M-316 - should be Fužinska Ulica 1, 4220 Škofja Loka - not Kidričeva Cesta 63A, 4220 Škofja Loka

    match = re.search(pattern, input_address)
    if match:
        street = match.group(2)
        post_and_city = match.group(3)
    else:
        # MOŠNJE , MOŠNJE, 4240 RADOVLJICA -> no street number (also DIJAŠKA ULICA , 5220 TOLMIN)
        # instead, fetch "something, postal_code city"
        pattern_2 = r'(([a-zčćžđšA-ZČĆŽĐŠŽ\s\-\.]+),(\s*\d{4})([a-zčćžđšA-ZČĆŽĐŠŽÁ\s\-\.\<\>]+)$)'
        match_2 = re.search(pattern_2, input_address)
        if not match_2:
            print("Match problem", input_address)
            return input_address
        street = match_2.group(2)
        # group 3 is only the postal code; the city is captured in group 4
        post_and_city = match_2.group(3) + match_2.group(4)

    return f"{street.strip().title()}, {post_and_city.replace('<>', ' ').strip().title()}"


def cached_get(url: str, cache) -> list:
    """Return the decoded JSON response for *url*, using *cache* when possible.

    On a cache miss the URL is requested; an empty list response is retried
    up to 3 more times (1 second apart) to make sure it is not a transient
    API-side issue. The final response is stored in *cache* as a JSON string.

    Args:
        url: Full request URL; also used as the cache key.
        cache: Mapping-like store (the script passes an open ``dbm``
            database) supporting ``in``, item get and item set.

    Returns:
        The decoded JSON payload, or ``[]`` when the request fails or the
        body is not valid JSON (such failures are not cached).
    """
    # Check if the URL is already in the cache
    if url in cache:
        print(" from cache")
        return json.loads(cache[url])

    max_restarts = 3
    data = []
    for attempt in range(max_restarts + 1):
        try:
            # a timeout keeps one stalled connection from hanging the run
            response = requests.get(url, timeout=30)
            # a non-JSON body (e.g. an HTML error page) raises ValueError
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as error:
            print(" request failed: ", error)
            return []

        if data != [] or attempt == max_restarts:
            break

        print(" restart ", attempt + 1)
        sleep(1)

    # Store the JSON response in the cache
    cache[url] = json.dumps(data)

    return data


def convert_address_to_lat_lng(address_to_convert: str) -> str:
    """Geocode an address through geocode.maps.co and return ``"lat,lon"``.

    Up to three progressively less precise queries are tried: the full
    street, the street without its last token (the house number), then town
    and postal code only. Responses go through ``cached_get`` with the
    ``cache`` dbm database. The process exits with status 1 when every
    query returns an empty list.

    Args:
        address_to_convert: Address shaped like ``"Street 1, 1234 Town"``.
            The text after the last comma is read as postal code + town.

    Returns:
        Latitude and longitude of the first result, joined by a comma.
    """
    # free plan: 1 request per second
    sleep(1)

    print("address_to_convert: ", address_to_convert)

    # Split on the last comma only, so extra or missing commas no longer
    # break the tuple unpacking.
    pieces = address_to_convert.rsplit(',', 1)
    street = pieces[0]
    post_and_town = pieces[1].split() if len(pieces) > 1 else []
    postalcode = post_and_town[0] if post_and_town else ''
    town = " ".join(post_and_town[1:])

    queries = [
        (None,
         f"https://geocode.maps.co/search?street={street}&town={town}&postalcode={postalcode}&country=Slovenia&country_code=si&api_key={api_key}"),
        # drop housenumber (example: Vrhpolje 1D, 5271 Vipava)
        (" try remove house number",
         f"https://geocode.maps.co/search?street={' '.join(street.split()[:-1])}&town={town}&postalcode={postalcode}&country=Slovenia&country_code=si&api_key={api_key}"),
        # drop street (example: Gabrovlje 14, 3214 Zreče)
        (" try remove street",
         f"https://geocode.maps.co/search?town={town}&postalcode={postalcode}&country=Slovenia&country_code=si&api_key={api_key}"),
    ]

    with dbm.open('cache', 'c') as cache:
        for attempt, (message, url) in enumerate(queries):
            if attempt:
                # stay under the rate limit before each fallback query
                sleep(1)
                print(message)

            data = cached_get(url, cache)
            if data != []:
                return f"{data[0]['lat']},{data[0]['lon']}"

    print(f'Empty response for: {address_to_convert}')
    sys.exit(1)


def main():
    """Build the Slovenian packager-codes CSV from the tabula export.

    Reads ``file_name``, keeps the first three columns as code / name /
    address, cleans each of them, removes duplicate rows, geocodes every
    address and writes the result to ``output_file_name`` using ``;`` as
    separator. Exits with status 1 when ``api_key`` is empty.
    """
    if api_key == "":
        print("missing API key")
        sys.exit(1)

    df = pl.read_csv(file_name, separator=',')

    # keep only needed columns and give them stable names
    df = df[:, [0, 1, 2]]
    df = df.rename(dict(zip(df.columns, ['code', 'name', 'address'])))

    # ignore rows if first column is "Approval No.", or if missing SI (sometimes just number or H + number)
    df = df.filter(df['code'].str.starts_with("SI"))
    df = df.with_columns(pl.col('code').map_elements(clean_code, return_dtype=str))

    # name cell: keep only the first line, up to the first comma
    # (second line tells about the business, meat processing, for example)
    df = df.with_columns(
        pl.col('name').map_elements(
            lambda raw_name: raw_name.split('\r')[0].split(',')[0].title(),
            return_dtype=str,
        )
    )

    df = df.with_columns(pl.col('address').map_elements(clean_address, return_dtype=str))

    # rm duplicates; keep input order so regenerated files diff cleanly
    df = df.unique(maintain_order=True)

    df = df.with_columns(
        pl.col('address')
        .map_elements(convert_address_to_lat_lng, return_dtype=str)
        .alias('lat_lng')
    )

    # split "lat,lon" into two columns, then drop the combined one
    lat_lng_parts = pl.col('lat_lng').str.split(',')
    df = df.with_columns(
        lat_lng_parts.list.get(0).alias('lat'),
        lat_lng_parts.list.get(1).alias('lng'),
    ).drop(['lat_lng'])

    df.write_csv(output_file_name, separator=';')


if __name__ == "__main__":
    main()
2 changes: 2 additions & 0 deletions scripts/update_packager_codes.pl
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ sub normalize_code {
"$code EC" when 'rs';
"SE $code EC" when 'se';
"SK $code EC" when 'sk';
"$code ES" when 'si';
"UK $code EC" when 'uk';
join q{ }, uc($cc), $code, 'EC';
}
Expand Down Expand Up @@ -175,6 +176,7 @@ sub normalize_local_authority {
pl => 'code',
rs => 'approval_number',
se => 'nr',
si => 'code',
sk => 'schvaľovacie_čislo',
uk => 'approval_number',
);
Expand Down

0 comments on commit 9577c03

Please sign in to comment.