Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds stats and transliterations for custom exports in GeoJSON format + HDX customviz #289

Merged
merged 5 commits into from
Jan 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ humanize==4.9.0
python-slugify==8.0.1
geomet==1.1.0
PyYAML==6.0.1
geojson-stats==0.2.5
transliterate==1.10.2

## documentation
# mkdocs-material==8.5.11
# mkdocs-jupyter==0.22.0
Expand Down
Binary file added src/.DS_Store
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No Need for this

Binary file not shown.
51 changes: 50 additions & 1 deletion src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from datetime import datetime, timedelta, timezone
from json import dumps
from json import loads as json_loads
from .post_processing.processor import PostProcessor

# Third party imports
import boto3
Expand Down Expand Up @@ -947,7 +948,15 @@ def upload(self, file_path, file_name, file_suffix=None):
start_time = time.time()

try:
self.s_3.upload_file(str(file_path), BUCKET_NAME, str(file_name))
if file_path[-5:] == ".html":
self.s_3.upload_file(
str(file_path),
BUCKET_NAME,
str(file_name),
ExtraArgs={"ContentType": "text/html"},
)
else:
self.s_3.upload_file(str(file_path), BUCKET_NAME, str(file_name))
except Exception as ex:
logging.error(ex)
raise ex
Expand Down Expand Up @@ -1385,6 +1394,14 @@ def zip_to_s3(self, resources):
temp_zip_path = resource["url"]
resource["url"] = self.upload_resources(resource_path=temp_zip_path)
os.remove(temp_zip_path)

if resource.get("stats_html"):
temp_stats_html_path = resource["stats_html"]
resource["stats_html"] = self.upload_resources(
resource_path=temp_stats_html_path
)
os.remove(temp_stats_html_path)

return resources

def file_to_zip(self, working_dir, zip_path):
Expand Down Expand Up @@ -1491,6 +1508,26 @@ def process_export_format(export_format):
)
run_ogr2ogr_cmd(ogr2ogr_cmd)

# Post-processing GeoJSON files
# Adds: stats, HTML stats summary and transliterations
if export_format.driver_name == "GeoJSON" and (
self.params.include_stats or self.params.include_translit
):
post_processor = PostProcessor(
{
"include_stats": self.params.include_stats,
"include_translit": self.params.include_translit,
"include_stats_html": self.params.include_stats_html,
}
)
post_processor.init()
post_processor.custom(
category_name=category_name,
export_format_path=export_format_path,
export_filename=export_filename,
file_export_path=file_export_path,
)

zip_file_path = os.path.join(file_export_path, f"{export_filename}.zip")
zip_path = self.file_to_zip(export_format_path, zip_file_path)

Expand All @@ -1500,6 +1537,12 @@ def process_export_format(export_format):
resource["format"] = export_format.suffix
resource["description"] = export_format.driver_name
resource["size"] = os.path.getsize(zip_path)
if (
self.params.include_stats_html
and export_format.driver_name == "GeoJSON"
):
resource["stats_html"] = f"{file_export_path}/stats-summary.html"

# resource["last_modified"] = datetime.now().isoformat()
logging.info(
"Done %s:%s in %s",
Expand Down Expand Up @@ -1895,6 +1938,12 @@ def add_resource(self, resource_meta):
resource_obj.mark_data_updated()
self.dataset.add_update_resource(resource_obj)

# Add customviz if available
if resource_meta.get("stats_html"):
self.dataset.update(
{"customviz": [{"url": resource_meta["stats_html"]}]}
)

def upload_dataset(self, dump_config_to_s3=False):
"""
Uploads the dataset to HDX.
Expand Down
Empty file added src/post_processing/__init__.py
Empty file.
61 changes: 61 additions & 0 deletions src/post_processing/geojson_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from geojson_stats.stats import Stats
from geojson_stats.html import Html

# Tags that, when present in the tag filters, are added as stats keys and
# switch on area calculation in GeoJSONStats.
CONFIG_AREA = ["building"]
# Tags that, when present in the tag filters, are added as stats keys and
# switch on length calculation in GeoJSONStats.
CONFIG_LENGTH = ["highway", "waterway"]


class GeoJSONStats(Stats):
    """Collect stats while GeoJSON features are processed one at a time.

    Thin specialisation of ``geojson_stats.stats.Stats`` that configures the
    collector from a set of tag filters and exposes an HTML renderer.
    """

    def __init__(self, filters, *args, **kwargs):
        """
        Configure the collector.

        Args:
            filters: Object with a ``tags`` attribute describing the tag
                filters of the export; any falsy value (``None``, ``{}``)
                means "no filters" and leaves keys/area/length untouched.
            *args, **kwargs: Forwarded unchanged to ``Stats.__init__``.
        """
        super().__init__(*args, **kwargs)

        self.config.clean = True
        self.config.properties_prop = "properties.tags"

        if not (filters and filters.tags):
            return

        # Area tags are handled before length tags, so keys are appended in
        # the same order as the CONFIG_* lists are declared.
        for tag_names, measure in ((CONFIG_AREA, "area"), (CONFIG_LENGTH, "length")):
            for tag in tag_names:
                if self.check_filter(filters.tags, tag):
                    self.config.keys.append(tag)
                    self.config.value_keys.append(tag)
                    setattr(self.config, measure, True)

    def check_filter(self, tags, tag):
        """
        Check if a tag is present in tag filters

        Looks at the ``all_geometry``, ``polygon`` and ``line`` filters of
        ``tags`` (in that order) and, for each one that is set, at its
        ``join_or`` and ``join_and`` collections.

        Returns:
            True if ``tag`` is found in any of them, False otherwise.
        """
        for geometry_name in ("all_geometry", "polygon", "line"):
            geometry_filter = getattr(tags, geometry_name)
            if not geometry_filter:
                continue
            if geometry_filter.join_or and tag in geometry_filter.join_or:
                return True
            if geometry_filter.join_and and tag in geometry_filter.join_and:
                return True
        return False

    def raw_data_line_stats(self, json_object: dict):
        """
        Feed one parsed GeoJSON feature into the stats collector.

        The feature is only passed to ``get_object_stats``; nothing is
        returned.
        """
        self.get_object_stats(json_object)

    def html(self, tpl):
        """
        Returns stats Html object, generated from stats data using a template

        Args:
            tpl: Template passed as the first argument to ``Html``.
        """
        return Html(tpl, self)
121 changes: 121 additions & 0 deletions src/post_processing/processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import json
from .transliterator import Transliterator
from .geojson_stats import GeoJSONStats
import os
import pathlib


class PostProcessor:
    """Post-process GeoJSON exports line by line (stats, transliteration).

    Usage: build an instance with an options dict, call ``init()`` to register
    the per-feature functions, then call ``custom()`` for an export.

    Option keys read by this class (all looked up with ``dict.get``):
    ``include_stats``, ``include_translit`` and ``include_stats_html``.
    """

    # Class-level defaults. ``functions`` is rebound to a fresh list for every
    # instance in ``__init__`` so callbacks never leak between instances.
    options = {}
    filters = {}
    functions = []

    # category name -> (tag used as stats key, stats config flag to enable)
    _CATEGORY_CONFIG = {
        "roads": ("highway", "length"),
        "buildings": ("building", "area"),
        "waterways": ("waterway", "length"),
        "railways": ("railway", "length"),
    }

    # Lines starting with this exact prefix are treated as features.
    _FEATURE_PREFIX = '{ "type": "Feature"'

    def __init__(self, options, *args, **kwargs):
        """
        Args:
            options: Dict of option flags (see class docstring).
            *args, **kwargs: Accepted for compatibility and ignored.
        """
        self.options = options
        # Per-instance state: appending to a class-level list would make
        # every PostProcessor run the callbacks of all earlier instances.
        self.functions = []
        self.geoJSONStats = None
        self.transliterator = None

    def post_process_line(self, line: str):
        """
        Parses line, run functions over it and returns it

        Each registered function receives the parsed object and may mutate it
        in place; return values of the functions are ignored.
        """
        line_object = json.loads(line)

        for fn in self.functions:
            fn(line_object)

        return json.dumps(line_object)

    def _rewrite_line(self, line: str) -> str:
        """Return ``line`` post-processed if it is a feature, else unchanged."""
        if not line.startswith(self._FEATURE_PREFIX):
            return line

        # Features are separated by a trailing comma that is not part of the
        # feature's own JSON; strip it before parsing and restore it after.
        payload = line.rstrip()
        has_comma = payload.endswith(",")
        if has_comma:
            payload = payload[:-1]

        separator = "," if has_comma else ""
        return self.post_process_line(payload) + separator + "\n"

    def custom(
        self, category_name, export_format_path, export_filename, file_export_path
    ):
        """
        Post-process custom exports

        Reads ``<export_format_path>/<export_filename>.geojson`` line by line
        and runs the registered functions over every feature line.

        * With ``include_translit`` the file is replaced by the rewritten
          version; otherwise it is left untouched.
        * When a stats collector exists (``init()`` creates one for
          ``include_stats``), ``stats.json`` is written to
          ``file_export_path`` and, with ``include_stats_html``,
          ``stats-summary.html`` as well. Without a collector no stats files
          are produced.
        """
        stats = getattr(self, "geoJSONStats", None)
        rewrite_file = bool(self.options.get("include_translit"))

        category_tag, measure = self._CATEGORY_CONFIG.get(category_name, ("", None))

        if stats is not None:
            stats.config.properties_prop = "properties"
            if measure:
                setattr(stats.config, measure, True)
            if self.options.get("include_stats") and category_tag:
                stats.config.keys.append(category_tag)
                stats.config.value_keys.append(category_tag)

        path_input = os.path.join(export_format_path, f"{export_filename}.geojson")
        path_output = os.path.join(
            export_format_path, f"{export_filename}-post.geojson"
        )

        if rewrite_file:
            with open(path_input, "r", encoding="utf-8") as input_file, open(
                path_output, "w", encoding="utf-8"
            ) as output_file:
                for line in input_file:
                    output_file.write(self._rewrite_line(line))
            os.replace(path_output, path_input)
        else:
            # Nothing is rewritten: still run the functions (e.g. stats) over
            # every feature, but do not create a temporary output file.
            with open(path_input, "r", encoding="utf-8") as input_file:
                for line in input_file:
                    self._rewrite_line(line)

        if stats is None:
            return

        with open(
            os.path.join(file_export_path, "stats.json"), "w", encoding="utf-8"
        ) as f:
            f.write(json.dumps(stats.dict()))

        if self.options.get("include_stats_html"):
            tpl = f"stats_{category_tag}" if category_tag else "stats"
            # Templates are looked up next to this module.
            module_dir = pathlib.Path(__file__).resolve().parent
            tpl_path = os.path.join(module_dir, f"{tpl}_tpl.html")
            geojson_stats_html = stats.html(tpl_path).build()
            upload_html_path = os.path.join(file_export_path, "stats-summary.html")
            with open(upload_html_path, "w", encoding="utf-8") as f:
                f.write(geojson_stats_html)

    def init(self):
        """
        Initialize post-processor

        Registers, on this instance only, the per-feature functions selected
        by the options: stats collection first, then transliteration.
        """
        if self.options.get("include_stats"):
            self.geoJSONStats = GeoJSONStats(self.filters)
            self.functions.append(self.geoJSONStats.raw_data_line_stats)

        if self.options.get("include_translit"):
            self.transliterator = Transliterator()
            self.functions.append(self.transliterator.translit)
Loading
Loading