From bf10476294457b718f8afd7912218c31fd3813ce Mon Sep 17 00:00:00 2001 From: Emilio Mariscal Date: Tue, 21 Jan 2025 15:35:34 -0300 Subject: [PATCH 1/5] + Post-processing GeoJSON datasets adding stats and transliterations --- src/app.py | 39 ++++- src/post_processing/__init__.py | 0 src/post_processing/geojson_stats.py | 61 ++++++++ src/post_processing/processor.py | 123 +++++++++++++++ src/post_processing/stats_building_tpl.html | 165 ++++++++++++++++++++ src/post_processing/stats_highway_tpl.html | 164 +++++++++++++++++++ src/post_processing/stats_tpl.html | 161 +++++++++++++++++++ src/post_processing/stats_waterway_tpl.html | 165 ++++++++++++++++++++ src/post_processing/transliterator.py | 34 ++++ src/validation/models.py | 29 +++- 10 files changed, 939 insertions(+), 2 deletions(-) create mode 100644 src/post_processing/__init__.py create mode 100644 src/post_processing/geojson_stats.py create mode 100644 src/post_processing/processor.py create mode 100644 src/post_processing/stats_building_tpl.html create mode 100644 src/post_processing/stats_highway_tpl.html create mode 100644 src/post_processing/stats_tpl.html create mode 100644 src/post_processing/stats_waterway_tpl.html create mode 100644 src/post_processing/transliterator.py diff --git a/src/app.py b/src/app.py index 9aef25c8..dfa9245c 100644 --- a/src/app.py +++ b/src/app.py @@ -33,6 +33,7 @@ from datetime import datetime, timedelta, timezone from json import dumps from json import loads as json_loads +from .post_processing.processor import PostProcessor # Third party imports import boto3 @@ -947,7 +948,10 @@ def upload(self, file_path, file_name, file_suffix=None): start_time = time.time() try: - self.s_3.upload_file(str(file_path), BUCKET_NAME, str(file_name)) + if file_path[-5:] == ".html": + self.s_3.upload_file(str(file_path), BUCKET_NAME, str(file_name), ExtraArgs={'ContentType': 'text/html'}) + else: + self.s_3.upload_file(str(file_path), BUCKET_NAME, str(file_name)) except Exception as ex: 
logging.error(ex) raise ex @@ -1385,6 +1389,12 @@ def zip_to_s3(self, resources): temp_zip_path = resource["url"] resource["url"] = self.upload_resources(resource_path=temp_zip_path) os.remove(temp_zip_path) + + if resource.get("stats_html"): + temp_stats_html_path = resource["stats_html"] + resource["stats_html"] = self.upload_resources(resource_path=temp_stats_html_path) + os.remove(temp_stats_html_path) + return resources def file_to_zip(self, working_dir, zip_path): @@ -1491,6 +1501,26 @@ def process_export_format(export_format): ) run_ogr2ogr_cmd(ogr2ogr_cmd) + # Post-processing GeoJSON files + # Adds: stats, HTML stats summary and transliterations + if export_format.driver_name == "GeoJSON" and ( + self.params.include_stats or self.params.include_translit + ): + post_processor = PostProcessor( + { + "include_stats": self.params.include_stats, + "include_translit": self.params.include_translit, + "include_stats_html": self.params.include_stats_html, + } + ) + post_processor.init() + post_processor.custom( + category_name=category_name, + export_format_path=export_format_path, + export_filename=export_filename, + file_export_path=file_export_path, + ) + zip_file_path = os.path.join(file_export_path, f"{export_filename}.zip") zip_path = self.file_to_zip(export_format_path, zip_file_path) @@ -1500,6 +1530,9 @@ def process_export_format(export_format): resource["format"] = export_format.suffix resource["description"] = export_format.driver_name resource["size"] = os.path.getsize(zip_path) + if self.params.include_stats_html and export_format.driver_name == "GeoJSON": + resource["stats_html"] = f"{file_export_path}/stats-summary.html" + # resource["last_modified"] = datetime.now().isoformat() logging.info( "Done %s:%s in %s", @@ -1895,6 +1928,10 @@ def add_resource(self, resource_meta): resource_obj.mark_data_updated() self.dataset.add_update_resource(resource_obj) + # Add customviz if available + if resource_meta.get("stats_html"): + 
self.dataset.update({"customviz": [{"url": resource_meta["stats_html"]}]}) + def upload_dataset(self, dump_config_to_s3=False): """ Uploads the dataset to HDX. diff --git a/src/post_processing/__init__.py b/src/post_processing/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/post_processing/geojson_stats.py b/src/post_processing/geojson_stats.py new file mode 100644 index 00000000..9ca74d68 --- /dev/null +++ b/src/post_processing/geojson_stats.py @@ -0,0 +1,61 @@ +from geojson_stats.stats import Stats +from geojson_stats.html import Html + +CONFIG_AREA = ["building"] +CONFIG_LENGTH = ["highway", "waterway"] + + +class GeoJSONStats(Stats): + """Used for collecting stats while processing GeoJSON files line by line""" + + def __init__(self, filters, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.config.clean = True + self.config.properties_prop = "properties.tags" + + if filters and filters.tags: + for tag in CONFIG_AREA: + if self.check_filter(filters.tags, tag): + self.config.keys.append(tag) + self.config.value_keys.append(tag) + self.config.area = True + + for tag in CONFIG_LENGTH: + if self.check_filter(filters.tags, tag): + self.config.keys.append(tag) + self.config.value_keys.append(tag) + self.config.length = True + + def check_filter(self, tags, tag): + """ + Check if a tag is present in tag filters + """ + + if tags.all_geometry: + if tags.all_geometry.join_or and tag in tags.all_geometry.join_or: + return True + if tags.all_geometry.join_and and tag in tags.all_geometry.join_and: + return True + if tags.polygon: + if tags.polygon.join_or and tag in tags.polygon.join_or: + return True + if tags.polygon.join_and and tag in tags.polygon.join_and: + return True + if tags.line: + if tags.line.join_or and tag in tags.line.join_or: + return True + if tags.line.join_and and tag in tags.line.join_and: + return True + + def raw_data_line_stats(self, json_object: dict): + """ + Process a GeoJSON line (for getting stats) and 
return that line + """ + self.get_object_stats(json_object) + + def html(self, tpl): + """ + Returns stats Html object, generated from stats data using a template + """ + return Html(tpl, self) diff --git a/src/post_processing/processor.py b/src/post_processing/processor.py new file mode 100644 index 00000000..c27e3d8c --- /dev/null +++ b/src/post_processing/processor.py @@ -0,0 +1,123 @@ +import json +from .transliterator import Transliterator +from .geojson_stats import GeoJSONStats +import os +import pathlib + + +class PostProcessor: + """Used for posst-process data while processing GeoJSON files line by line""" + + options = {} + filters = {} + functions = [] + + def __init__(self, options, *args, **kwargs): + self.options = options + + def post_process_line(self, line: str): + """ + Parses line, run functions over it and returns it + """ + + line_object = json.loads(line) + + for fn in self.functions: + fn(line_object) + + return json.dumps(line_object) + + def custom(self, category_name, export_format_path, export_filename, file_export_path): + """ + Post-process custom exports + """ + self.geoJSONStats.config.properties_prop = "properties" + + category_tag = "" + if category_name == "roads": + category_tag = "highway" + self.geoJSONStats.config.length = True + elif category_name == "buildings": + category_tag = "building" + self.geoJSONStats.config.area = True + elif category_name == "waterways": + category_tag = "waterway" + self.geoJSONStats.config.length = True + elif category_name == "railways": + category_tag = "railway" + self.geoJSONStats.config.length = True + + if self.options["include_stats"]: + if category_tag: + self.geoJSONStats.config.keys.append(category_tag) + self.geoJSONStats.config.value_keys.append(category_tag) + + path_input = os.path.join( + export_format_path, f"{export_filename}.geojson" + ) + path_output = os.path.join( + export_format_path, f"{export_filename}-post.geojson" + ) + + with open(path_input, "r") as input_file, open( + 
path_output, "w" + ) as output_file: + for line in input_file: + comma = False + if line.startswith('{ "type": "Feature"'): + json_string = "" + if line[-2:-1] == ",": + json_string = line[:-2] + comma = True + else: + json_string = line + line = self.post_process_line(json_string) + if self.options["include_translit"]: + if comma: + output_file.write(line + ",") + else: + output_file.write(line) + + if self.options.get("include_translit"): + os.remove(path_input) + os.rename(path_output, path_input) + else: + os.remove(path_output) + + geojson_stats_json = json.dumps(self.geoJSONStats.dict()) + with open( + os.path.join(file_export_path, "stats.json"), + "w", + ) as f: + f.write(geojson_stats_json) + + if self.options.get("include_stats_html"): + tpl = ( + "stats_{category_tag}".format(category_tag=category_tag) + if category_tag + else "stats" + ) + project_root = pathlib.Path(__file__).resolve().parent + tpl_path = os.path.join( + project_root, + "{tpl}_tpl.html".format(tpl=tpl), + ) + geojson_stats_html = self.geoJSONStats.html(tpl_path).build() + upload_html_path = os.path.join( + file_export_path, "stats-summary.html" + ) + with open(upload_html_path, "w") as f: + f.write(geojson_stats_html) + + def init(self): + """ + Initialize post-processor + """ + + if self.options.get("include_stats"): + self.geoJSONStats = GeoJSONStats(self.filters) + self.functions.append(self.geoJSONStats.raw_data_line_stats) + + if self.options.get("include_translit"): + self.transliterator = Transliterator() + self.functions.append(self.transliterator.translit) diff --git a/src/post_processing/stats_building_tpl.html b/src/post_processing/stats_building_tpl.html new file mode 100644 index 00000000..97c3f659 --- /dev/null +++ b/src/post_processing/stats_building_tpl.html @@ -0,0 +1,165 @@ + + + + + + + + + + + HOT Export Stats + + + +
+
+ +
+

${count}

+

Number of Features

+

Elements identified as distinct

+
+
+

${languages_count}

+

Languages Available

+

Including local language and English

+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
KeyCount%
Total features${count}100%
${key_0}${key_0_count}${key_0_percent}
${key_1}${key_1_count}${key_1_percent}
${key_2}${key_2_count}${key_2_percent}
${key_3}${key_3_count}${key_3_percent}
${key_4}${key_4_count}${key_4_percent}
${key_5}${key_5_count}${key_5_percent}
+
+
+ + diff --git a/src/post_processing/stats_highway_tpl.html b/src/post_processing/stats_highway_tpl.html new file mode 100644 index 00000000..be09c355 --- /dev/null +++ b/src/post_processing/stats_highway_tpl.html @@ -0,0 +1,164 @@ + + + + + + + + + + HOT Export Stats + + + +
+
+ +
+

${count}

+

Number of Features

+

Elements identified as distinct

+
+
+

${languages_count}

+

Languages Available

+

Including local language and English

+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
KeyCount%
Total features${count}100%
${key_0}${key_0_count}${key_0_percent}
${key_1}${key_1_count}${key_1_percent}
${key_2}${key_2_count}${key_2_percent}
${key_3}${key_3_count}${key_3_percent}
${key_4}${key_4_count}${key_4_percent}
${key_5}${key_5_count}${key_5_percent}
+
+
+ + diff --git a/src/post_processing/stats_tpl.html b/src/post_processing/stats_tpl.html new file mode 100644 index 00000000..7359ad13 --- /dev/null +++ b/src/post_processing/stats_tpl.html @@ -0,0 +1,161 @@ + + + + + + + + + + + HOT Export Stats + + + +
+
+
+

${count}

+

Number of Features

+

Elements identified as distinct

+
+
+

${languages_count}

+

Languages Available

+

Including local language and English

+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
KeyCount%
Total features${count}100%
${key_0}${key_0_count}${key_0_percent}
${key_1}${key_1_count}${key_1_percent}
${key_2}${key_2_count}${key_2_percent}
${key_3}${key_3_count}${key_3_percent}
${key_4}${key_4_count}${key_4_percent}
${key_5}${key_5_count}${key_5_percent}
+
+
+ + diff --git a/src/post_processing/stats_waterway_tpl.html b/src/post_processing/stats_waterway_tpl.html new file mode 100644 index 00000000..af11d163 --- /dev/null +++ b/src/post_processing/stats_waterway_tpl.html @@ -0,0 +1,165 @@ + + + + + + + + + + + HOT Export Stats + + + +
+
+ +
+

${count}

+

Number of Features

+

Elements identified as distinct

+
+
+

${languages_count}

+

Languages Available

+

Including local language and English

+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
KeyCount%
Total features${count}100%
${key_0}${key_0_count}${key_0_percent}
${key_1}${key_1_count}${key_1_percent}
${key_2}${key_2_count}${key_2_percent}
${key_3}${key_3_count}${key_3_percent}
${key_4}${key_4_count}${key_4_percent}
${key_5}${key_5_count}${key_5_percent}
+
+
+ + diff --git a/src/post_processing/transliterator.py b/src/post_processing/transliterator.py new file mode 100644 index 00000000..ddc16bb7 --- /dev/null +++ b/src/post_processing/transliterator.py @@ -0,0 +1,34 @@ +from transliterate import translit, get_available_language_codes + + +class Transliterator: + """Used for transliterate names while processing GeoJSON files line by line""" + + props = "properties" + + def __init__(self): + self.available_language_codes = get_available_language_codes() + self.name_tags = [f"name:{x}" for x in self.available_language_codes] + + def translit(self, line): + """ + Transliterate names and add a new tag suffixed with -translit + """ + for code in self.available_language_codes: + tag = "name:{code}".format(code=code) + prop = ( + line["properties"]["tags"] + if self.props == "properties.tags" + else line["properties"] + ) + if tag in prop: + translit_tag = "{tag}-translit".format(tag=tag) + if not translit_tag in prop: + if self.props == "properties.tags": + line["properties"]["tags"][translit_tag] = translit( + prop[tag], code, reversed=True + ) + else: + line["properties"][translit_tag] = translit( + prop[tag], code, reversed=True + ) diff --git a/src/validation/models.py b/src/validation/models.py index 1b1b0e92..bd04e43c 100644 --- a/src/validation/models.py +++ b/src/validation/models.py @@ -175,6 +175,14 @@ class RawDataCurrentParamsBase(BaseModel, GeometryValidatorMixin): }, description="Filter for point,line,polygon/ all geometry for both select and where clause, All geometry filter means : It will apply the same filter to all the geometry type", ) + include_stats_html: Optional[bool] = Field( + default=False, + description="Includes detailed stats about the polygon passed such as buildings count , road count along with summary about data completeness in the area", + ) + include_translit: Optional[bool] = Field( + default=False, + description="Includes transliterations", + ) geometry: Union[ Polygon, MultiPolygon, @@ 
-540,7 +548,11 @@ class DatasetConfig(BaseModel): description="Default base folder for the exports", example="ISO3", ) - + customviz: Optional[List[dict[str, str]]] | None = Field( + default=[], + description="List of objects for custom visualization", + example="[{'url': 'https://something.org/datasetviz.html'}]", + ) @validator("update_frequency") def validate_frequency(cls, value): """Validates frequency @@ -610,6 +622,9 @@ class DynamicCategoriesModel(CategoriesBase, GeometryValidatorMixin): Fields: - iso3 (Optional[str]): ISO3 Country Code. + - include_stats (bool): Include a JSON file with stats. Available for GeoJSON exports only. + - include_stats_html (bool): Include a HTML file with a stats summary. Available for GeoJSON exports only. + - include_translit (bool): Add transliterations. Available for GeoJSON exports only. - dataset (Optional[DatasetConfig]): Dataset Configurations for HDX Upload. - meta (bool): Dumps Meta db in parquet format & HDX config JSON to S3. - hdx_upload (bool): Enable/Disable uploading the dataset to HDX. @@ -624,6 +639,18 @@ class DynamicCategoriesModel(CategoriesBase, GeometryValidatorMixin): max_length=3, example="USA", ) + include_stats: Optional[bool] = Field( + default=False, + description="Include a JSON file with stats. Available for GeoJSON exports only.", + ) + include_stats_html: Optional[bool] = Field( + default=False, + description="Include a HTML file with a stats summary. Available for GeoJSON exports only.", + ) + include_translit: Optional[bool] = Field( + default=False, + description="Add transliterations. 
Available for GeoJSON exports only.", + ) geometry: Optional[ Union[Polygon, MultiPolygon, Feature, FeatureCollection] ] = Field( From 2a4a362f3e1641bbe14e4d9fbbac9104e080a986 Mon Sep 17 00:00:00 2001 From: Emilio Mariscal Date: Tue, 21 Jan 2025 15:43:01 -0300 Subject: [PATCH 2/5] Add missing railways stats html template --- src/post_processing/stats_railway_tpl.html | 165 +++++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 src/post_processing/stats_railway_tpl.html diff --git a/src/post_processing/stats_railway_tpl.html b/src/post_processing/stats_railway_tpl.html new file mode 100644 index 00000000..b71cd6b7 --- /dev/null +++ b/src/post_processing/stats_railway_tpl.html @@ -0,0 +1,165 @@ + + + + + + + + + + + HOT Export Stats + + + +
+
+ +
+

${count}

+

Number of Features

+

Elements identified as distinct

+
+
+

${languages_count}

+

Languages Available

+

Including local language and English

+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
KeyCount%
Total features${count}100%
${key_0}${key_0_count}${key_0_percent}
${key_1}${key_1_count}${key_1_percent}
${key_2}${key_2_count}${key_2_percent}
${key_3}${key_3_count}${key_3_percent}
${key_4}${key_4_count}${key_4_percent}
${key_5}${key_5_count}${key_5_percent}
+
+
+ + From 60790dfa474a1c6b766862572767d0b2a8f15789 Mon Sep 17 00:00:00 2001 From: Emilio Mariscal Date: Tue, 21 Jan 2025 19:29:16 -0300 Subject: [PATCH 3/5] Add geojsonstats req --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index c295c164..904bdf11 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,6 +27,7 @@ humanize==4.9.0 python-slugify==8.0.1 geomet==1.1.0 PyYAML==6.0.1 +geojson-stats==0.2.5 ## documentation # mkdocs-material==8.5.11 # mkdocs-jupyter==0.22.0 From f60349b792727b17d8d17147b63342019e319b58 Mon Sep 17 00:00:00 2001 From: Emilio Mariscal Date: Wed, 22 Jan 2025 09:11:05 -0300 Subject: [PATCH 4/5] Black reformatting --- src/.DS_Store | Bin 0 -> 6148 bytes src/app.py | 20 ++++++++++++++++---- src/post_processing/processor.py | 12 +++++------- src/validation/models.py | 3 ++- 4 files changed, 23 insertions(+), 12 deletions(-) create mode 100644 src/.DS_Store diff --git a/src/.DS_Store b/src/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..cc2546c895a4cade5f2590bdf407f0f2b52f89c6 GIT binary patch literal 6148 zcmeHKOG*P#5UkcL0xrzb<-3A67(zTj4iG^l2r?syx}KHi@@T34APf`1jT@pi9x)6)QK{r-3htN<+Oj`;L2H-GLvvx~|YkcePCb(s(EtDd literal 0 HcmV?d00001 diff --git a/src/app.py b/src/app.py index dfa9245c..8117953d 100644 --- a/src/app.py +++ b/src/app.py @@ -949,7 +949,12 @@ def upload(self, file_path, file_name, file_suffix=None): try: if file_path[-5:] == ".html": - self.s_3.upload_file(str(file_path), BUCKET_NAME, str(file_name), ExtraArgs={'ContentType': 'text/html'}) + self.s_3.upload_file( + str(file_path), + BUCKET_NAME, + str(file_name), + ExtraArgs={"ContentType": "text/html"}, + ) else: self.s_3.upload_file(str(file_path), BUCKET_NAME, str(file_name)) except Exception as ex: @@ -1392,7 +1397,9 @@ def zip_to_s3(self, resources): if resource.get("stats_html"): temp_stats_html_path = resource["stats_html"] - resource["stats_html"] = 
self.upload_resources(resource_path=temp_stats_html_path) + resource["stats_html"] = self.upload_resources( + resource_path=temp_stats_html_path + ) os.remove(temp_stats_html_path) return resources @@ -1530,7 +1537,10 @@ def process_export_format(export_format): resource["format"] = export_format.suffix resource["description"] = export_format.driver_name resource["size"] = os.path.getsize(zip_path) - if self.params.include_stats_html and export_format.driver_name == "GeoJSON": + if ( + self.params.include_stats_html + and export_format.driver_name == "GeoJSON" + ): resource["stats_html"] = f"{file_export_path}/stats-summary.html" # resource["last_modified"] = datetime.now().isoformat() @@ -1930,7 +1940,9 @@ def add_resource(self, resource_meta): # Add customviz if available if resource_meta.get("stats_html"): - self.dataset.update({"customviz": [{"url": resource_meta["stats_html"]}]}) + self.dataset.update( + {"customviz": [{"url": resource_meta["stats_html"]}]} + ) def upload_dataset(self, dump_config_to_s3=False): """ diff --git a/src/post_processing/processor.py b/src/post_processing/processor.py index c27e3d8c..c7b416f2 100644 --- a/src/post_processing/processor.py +++ b/src/post_processing/processor.py @@ -27,7 +27,9 @@ def post_process_line(self, line: str): return json.dumps(line_object) - def custom(self, category_name, export_format_path, export_filename, file_export_path): + def custom( + self, category_name, export_format_path, export_filename, file_export_path + ): """ Post-process custom exports """ @@ -52,9 +54,7 @@ def custom(self, category_name, export_format_path, export_filename, file_export self.geoJSONStats.config.keys.append(category_tag) self.geoJSONStats.config.value_keys.append(category_tag) - path_input = os.path.join( - export_format_path, f"{export_filename}.geojson" - ) + path_input = os.path.join(export_format_path, f"{export_filename}.geojson") path_output = os.path.join( export_format_path, f"{export_filename}-post.geojson" ) @@ 
-103,9 +103,7 @@ def custom(self, category_name, export_format_path, export_filename, file_export "{tpl}_tpl.html".format(tpl=tpl), ) geojson_stats_html = self.geoJSONStats.html(tpl_path).build() - upload_html_path = os.path.join( - file_export_path, "stats-summary.html" - ) + upload_html_path = os.path.join(file_export_path, "stats-summary.html") with open(upload_html_path, "w") as f: f.write(geojson_stats_html) diff --git a/src/validation/models.py b/src/validation/models.py index bd04e43c..e43b6c75 100644 --- a/src/validation/models.py +++ b/src/validation/models.py @@ -182,7 +182,7 @@ class RawDataCurrentParamsBase(BaseModel, GeometryValidatorMixin): include_translit: Optional[bool] = Field( default=False, description="Includes transliterations", - ) + ) geometry: Union[ Polygon, MultiPolygon, @@ -553,6 +553,7 @@ class DatasetConfig(BaseModel): description="List of objects for custom visualization", example="[{'url': 'https://something.org/datasetviz.html'}]", ) + @validator("update_frequency") def validate_frequency(cls, value): """Validates frequency From e3066b6b4c5401cf1390a0ee723373ed4cf6ce5b Mon Sep 17 00:00:00 2001 From: Emilio Mariscal Date: Wed, 22 Jan 2025 09:14:51 -0300 Subject: [PATCH 5/5] Missing requirements --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 904bdf11..e1930f50 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,6 +28,8 @@ python-slugify==8.0.1 geomet==1.1.0 PyYAML==6.0.1 geojson-stats==0.2.5 +transliterate==1.10.2 + ## documentation # mkdocs-material==8.5.11 # mkdocs-jupyter==0.22.0