diff --git a/requirements.txt b/requirements.txt index c295c164..e1930f50 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,6 +27,9 @@ humanize==4.9.0 python-slugify==8.0.1 geomet==1.1.0 PyYAML==6.0.1 +geojson-stats==0.2.5 +transliterate==1.10.2 + ## documentation # mkdocs-material==8.5.11 # mkdocs-jupyter==0.22.0 diff --git a/src/.DS_Store b/src/.DS_Store new file mode 100644 index 00000000..cc2546c8 Binary files /dev/null and b/src/.DS_Store differ diff --git a/src/app.py b/src/app.py index 9aef25c8..8117953d 100644 --- a/src/app.py +++ b/src/app.py @@ -33,6 +33,7 @@ from datetime import datetime, timedelta, timezone from json import dumps from json import loads as json_loads +from .post_processing.processor import PostProcessor # Third party imports import boto3 @@ -947,7 +948,15 @@ def upload(self, file_path, file_name, file_suffix=None): start_time = time.time() try: - self.s_3.upload_file(str(file_path), BUCKET_NAME, str(file_name)) + if file_path[-5:] == ".html": + self.s_3.upload_file( + str(file_path), + BUCKET_NAME, + str(file_name), + ExtraArgs={"ContentType": "text/html"}, + ) + else: + self.s_3.upload_file(str(file_path), BUCKET_NAME, str(file_name)) except Exception as ex: logging.error(ex) raise ex @@ -1385,6 +1394,14 @@ def zip_to_s3(self, resources): temp_zip_path = resource["url"] resource["url"] = self.upload_resources(resource_path=temp_zip_path) os.remove(temp_zip_path) + + if resource.get("stats_html"): + temp_stats_html_path = resource["stats_html"] + resource["stats_html"] = self.upload_resources( + resource_path=temp_stats_html_path + ) + os.remove(temp_stats_html_path) + return resources def file_to_zip(self, working_dir, zip_path): @@ -1491,6 +1508,26 @@ def process_export_format(export_format): ) run_ogr2ogr_cmd(ogr2ogr_cmd) + # Post-processing GeoJSON files + # Adds: stats, HTML stats summary and transliterations + if export_format.driver_name == "GeoJSON" and ( + self.params.include_stats or self.params.include_translit + ): 
+ post_processor = PostProcessor( + { + "include_stats": self.params.include_stats, + "include_translit": self.params.include_translit, + "include_stats_html": self.params.include_stats_html, + } + ) + post_processor.init() + post_processor.custom( + category_name=category_name, + export_format_path=export_format_path, + export_filename=export_filename, + file_export_path=file_export_path, + ) + zip_file_path = os.path.join(file_export_path, f"{export_filename}.zip") zip_path = self.file_to_zip(export_format_path, zip_file_path) @@ -1500,6 +1537,12 @@ def process_export_format(export_format): resource["format"] = export_format.suffix resource["description"] = export_format.driver_name resource["size"] = os.path.getsize(zip_path) + if ( + self.params.include_stats_html + and export_format.driver_name == "GeoJSON" + ): + resource["stats_html"] = f"{file_export_path}/stats-summary.html" + # resource["last_modified"] = datetime.now().isoformat() logging.info( "Done %s:%s in %s", @@ -1895,6 +1938,12 @@ def add_resource(self, resource_meta): resource_obj.mark_data_updated() self.dataset.add_update_resource(resource_obj) + # Add customviz if available + if resource_meta.get("stats_html"): + self.dataset.update( + {"customviz": [{"url": resource_meta["stats_html"]}]} + ) + def upload_dataset(self, dump_config_to_s3=False): """ Uploads the dataset to HDX. 
# ---- src/post_processing/__init__.py (new, empty package marker) ----

# ---- src/post_processing/geojson_stats.py ----
from geojson_stats.html import Html
from geojson_stats.stats import Stats

# Tags that switch on area / length measurement when present in the filters.
CONFIG_AREA = ["building"]
CONFIG_LENGTH = ["highway", "waterway"]

# Geometry groups of a tag filter inspected by ``GeoJSONStats.check_filter``.
_FILTER_GROUPS = ("all_geometry", "polygon", "line")


def _track_tag(config, tag):
    """Register *tag* in both ``config.keys`` and ``config.value_keys``."""
    config.keys.append(tag)
    config.value_keys.append(tag)


class GeoJSONStats(Stats):
    """Collect stats while GeoJSON features are processed one by one."""

    def __init__(self, filters, *args, **kwargs):
        """
        Configure the collector from optional tag filters.

        ``filters`` may be falsy (no filters); otherwise its ``tags``
        attribute is checked for the tags in CONFIG_AREA / CONFIG_LENGTH.
        Remaining arguments are forwarded to ``Stats``.
        """
        super().__init__(*args, **kwargs)

        self.config.clean = True
        self.config.properties_prop = "properties.tags"

        if filters and filters.tags:
            for tag in CONFIG_AREA:
                if self.check_filter(filters.tags, tag):
                    _track_tag(self.config, tag)
                    self.config.area = True

            for tag in CONFIG_LENGTH:
                if self.check_filter(filters.tags, tag):
                    _track_tag(self.config, tag)
                    self.config.length = True

    def check_filter(self, tags, tag):
        """
        Return True if *tag* appears in the ``join_or`` or ``join_and`` list
        of any geometry group (all_geometry, polygon, line) of *tags*,
        otherwise False.
        """
        for group_name in _FILTER_GROUPS:
            group = getattr(tags, group_name)
            if not group:
                continue
            if group.join_or and tag in group.join_or:
                return True
            if group.join_and and tag in group.join_and:
                return True
        # Explicit result instead of falling off the end with None.
        return False

    def raw_data_line_stats(self, json_object: dict):
        """
        Feed one parsed GeoJSON feature into the stats collector.

        Nothing is returned; the stats are accumulated on this object.
        """
        self.get_object_stats(json_object)

    def html(self, tpl):
        """
        Return the stats Html object built from this collector and the
        template path *tpl*.
        """
        return Html(tpl, self)


# ---- src/post_processing/processor.py ----
import contextlib
import json
import os
import pathlib

from .geojson_stats import GeoJSONStats
from .transliterator import Transliterator

# category name -> (tag collected in the stats, measurement flag set on config)
_CATEGORY_CONFIG = {
    "roads": ("highway", "length"),
    "buildings": ("building", "area"),
    "waterways": ("waterway", "length"),
    "railways": ("railway", "length"),
}

# Lines starting with this prefix are treated as one serialized feature each.
# NOTE(review): presumably the layout ogr2ogr writes -- confirm.
_FEATURE_PREFIX = '{ "type": "Feature"'

_GENERIC_TEMPLATE = "stats_tpl.html"


class PostProcessor:
    """Post-process GeoJSON exports line by line (stats, transliteration)."""

    # Class-level defaults kept for backward compatibility. ``functions`` is
    # re-created per instance so processors never share registered callbacks.
    options = {}
    filters = {}
    functions = []

    def __init__(self, options, *args, **kwargs):
        """
        Store *options*, a mapping with the optional boolean keys
        ``include_stats``, ``include_stats_html`` and ``include_translit``.
        """
        self.options = options if options is not None else {}
        self.functions = []
        self.geoJSONStats = None
        self.transliterator = None

    def init(self):
        """
        Initialize post-processor.

        Builds the per-feature callbacks requested by the options. Calling it
        again rebuilds them from scratch instead of registering duplicates.
        HTML output needs collected stats, so ``include_stats_html`` enables
        the stats collector as well.
        """
        self.functions = []

        if self.options.get("include_stats") or self.options.get(
            "include_stats_html"
        ):
            self.geoJSONStats = GeoJSONStats(self.filters)
            self.functions.append(self.geoJSONStats.raw_data_line_stats)

        if self.options.get("include_translit"):
            self.transliterator = Transliterator()
            self.functions.append(self.transliterator.translit)

    def post_process_line(self, line: str):
        """
        Parse *line* as JSON, run every registered function over the parsed
        object (functions may mutate it in place) and return it re-serialized.
        """
        line_object = json.loads(line)

        for fn in self.functions:
            fn(line_object)

        return json.dumps(line_object)

    def custom(
        self, category_name, export_format_path, export_filename, file_export_path
    ):
        """
        Post-process custom exports.

        Reads ``<export_format_path>/<export_filename>.geojson`` line by line.
        With ``include_translit`` the file is replaced by its processed
        version; with ``include_stats`` a ``stats.json`` and with
        ``include_stats_html`` a ``stats-summary.html`` are written into
        *file_export_path*. Does nothing when no option is enabled.
        """
        want_stats = bool(self.options.get("include_stats"))
        want_html = bool(self.options.get("include_stats_html"))
        want_translit = bool(self.options.get("include_translit"))
        if not (want_stats or want_html or want_translit):
            return

        # Tolerate callers that forgot to call init() first.
        if not self.functions:
            self.init()

        category_tag, measure = _CATEGORY_CONFIG.get(category_name, ("", None))

        stats = self.geoJSONStats
        if stats is not None:
            stats.config.properties_prop = "properties"
            if measure is not None:
                setattr(stats.config, measure, True)
            if category_tag:
                # Avoid duplicate keys when custom() runs more than once.
                if category_tag not in stats.config.keys:
                    stats.config.keys.append(category_tag)
                if category_tag not in stats.config.value_keys:
                    stats.config.value_keys.append(category_tag)

        path_input = os.path.join(export_format_path, f"{export_filename}.geojson")
        # The rewritten copy is only needed when the features are modified.
        path_output = (
            os.path.join(export_format_path, f"{export_filename}-post.geojson")
            if want_translit
            else None
        )

        try:
            self._process_file(path_input, path_output)
        except Exception:
            # Do not leave a half-written copy next to the export.
            if path_output is not None:
                with contextlib.suppress(OSError):
                    os.remove(path_output)
            raise

        if path_output is not None:
            os.replace(path_output, path_input)

        if stats is None:
            return

        if want_stats:
            geojson_stats_json = json.dumps(stats.dict())
            with open(
                os.path.join(file_export_path, "stats.json"),
                "w",
                encoding="utf-8",
            ) as f:
                f.write(geojson_stats_json)

        if want_html:
            tpl_dir = pathlib.Path(__file__).resolve().parent
            tpl_name = (
                f"stats_{category_tag}_tpl.html" if category_tag else _GENERIC_TEMPLATE
            )
            tpl_path = tpl_dir / tpl_name
            if not tpl_path.exists():
                # No category-specific template available: use the generic one.
                tpl_path = tpl_dir / _GENERIC_TEMPLATE
            geojson_stats_html = stats.html(str(tpl_path)).build()
            upload_html_path = os.path.join(file_export_path, "stats-summary.html")
            with open(upload_html_path, "w", encoding="utf-8") as f:
                f.write(geojson_stats_html)

    def _process_file(self, path_input, path_output=None):
        """Run every line of *path_input* through the processor, optionally
        writing the result to *path_output*."""
        with contextlib.ExitStack() as stack:
            source = stack.enter_context(open(path_input, "r", encoding="utf-8"))
            sink = None
            if path_output is not None:
                sink = stack.enter_context(
                    open(path_output, "w", encoding="utf-8")
                )
            for line in source:
                processed = self._process_feature_line(line)
                if sink is not None:
                    sink.write(processed)

    def _process_feature_line(self, line):
        """Return *line* post-processed if it is a feature line, else unchanged."""
        body = line.rstrip()
        if not body.startswith(_FEATURE_PREFIX):
            return line
        # Features inside the collection are separated by a trailing comma,
        # which is not part of the feature's own JSON.
        trailing_comma = body.endswith(",")
        if trailing_comma:
            body = body[:-1]
        processed = self.post_process_line(body)
        # Keep one feature per line so the file stays line-processable.
        return processed + ("," if trailing_comma else "") + "\n"


# ---- src/post_processing/stats_building_tpl.html (template content follows) ----
+ + + + + + +Elements identified as distinct
+Including local language and English
+Key | +Count | +% | +
---|---|---|
Total features | +${count} | +100% | +
${key_0} | +${key_0_count} | +${key_0_percent} | +
${key_1} | +${key_1_count} | +${key_1_percent} | +
${key_2} | +${key_2_count} | +${key_2_percent} | +
${key_3} | +${key_3_count} | +${key_3_percent} | +
${key_4} | +${key_4_count} | +${key_4_percent} | +
${key_5} | +${key_5_count} | +${key_5_percent} | +
Elements identified as distinct
+Including local language and English
+Key | +Count | +% | +
---|---|---|
Total features | +${count} | +100% | +
${key_0} | +${key_0_count} | +${key_0_percent} | +
${key_1} | +${key_1_count} | +${key_1_percent} | +
${key_2} | +${key_2_count} | +${key_2_percent} | +
${key_3} | +${key_3_count} | +${key_3_percent} | +
${key_4} | +${key_4_count} | +${key_4_percent} | +
${key_5} | +${key_5_count} | +${key_5_percent} | +
Elements identified as distinct
+Including local language and English
+Key | +Count | +% | +
---|---|---|
Total features | +${count} | +100% | +
${key_0} | +${key_0_count} | +${key_0_percent} | +
${key_1} | +${key_1_count} | +${key_1_percent} | +
${key_2} | +${key_2_count} | +${key_2_percent} | +
${key_3} | +${key_3_count} | +${key_3_percent} | +
${key_4} | +${key_4_count} | +${key_4_percent} | +
${key_5} | +${key_5_count} | +${key_5_percent} | +
Elements identified as distinct
+Including local language and English
+Key | +Count | +% | +
---|---|---|
Total features | +${count} | +100% | +
${key_0} | +${key_0_count} | +${key_0_percent} | +
${key_1} | +${key_1_count} | +${key_1_percent} | +
${key_2} | +${key_2_count} | +${key_2_percent} | +
${key_3} | +${key_3_count} | +${key_3_percent} | +
${key_4} | +${key_4_count} | +${key_4_percent} | +
${key_5} | +${key_5_count} | +${key_5_percent} | +
Elements identified as distinct
+Including local language and English
+Key | +Count | +% | +
---|---|---|
Total features | +${count} | +100% | +
${key_0} | +${key_0_count} | +${key_0_percent} | +
${key_1} | +${key_1_count} | +${key_1_percent} | +
${key_2} | +${key_2_count} | +${key_2_percent} | +
${key_3} | +${key_3_count} | +${key_3_percent} | +
${key_4} | +${key_4_count} | +${key_4_percent} | +
${key_5} | +${key_5_count} | +${key_5_percent} | +