From d7453f4daa639e73a3c0f2ea694c8ee1fc50a19b Mon Sep 17 00:00:00 2001 From: MarieMcLaurent <117629025+MarieMcLaurent@users.noreply.github.com> Date: Mon, 18 Nov 2024 12:24:15 +0100 Subject: [PATCH] feat : add new analysis output (#266) * feat : first analysis output tables * feat: add proportion & improvement outputs * fix : vehicle analysis & output folder * fix : coments & changelog * fix: correction with egt * fix: separate analysis from data output & update docs --------- Co-authored-by: Marie Laurent --- CHANGELOG.md | 1 + analysis/grid/comparison_flow_volume.py | 13 ++- analysis/synthesis/population.py | 127 ++++++++++++++++++++++++ docs/population.md | 8 ++ synthesis/output.py | 7 +- 5 files changed, 151 insertions(+), 5 deletions(-) create mode 100644 analysis/synthesis/population.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 37c79036..b583c4e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ **Under development** +- feat: add population analysis output - fix: avoid regenerating OSM when population changes - feat: add municipality information to households and activities - chore: update to `eqasim-java` commit `ece4932` diff --git a/analysis/grid/comparison_flow_volume.py b/analysis/grid/comparison_flow_volume.py index b2506ea1..963491dd 100644 --- a/analysis/grid/comparison_flow_volume.py +++ b/analysis/grid/comparison_flow_volume.py @@ -1,8 +1,9 @@ import pandas as pd import geopandas as gpd +import os import plotly.express as px - +ANALYSIS_FOLDER = "compare_flow_volume" SAMPLING_RATE = 0.05 @@ -84,6 +85,12 @@ def execute(context): df_grids = stat_grid(df_trips_comp,df_locations_comp,df_persons_comp,df_grid) point = df_grid.unary_union.centroid # a changé avec ploy_dep print("Printing grids...") + + # check output folder existence + analysis_output_path = os.path.join(context.config("output_path"), ANALYSIS_FOLDER) + if not os.path.exists(analysis_output_path): + os.mkdir(analysis_output_path) + for prefix, figure in figures.items(): df_select_age = df_stats[df_stats["age"].between(figure["min_age"],figure["max_age"])] df_select_age = df_select_age.dissolve(by=["id_carr_1km","following_purpose"],aggfunc="count").reset_index() @@ -103,7 +110,7 @@ def execute(context): df_select = df_select[df_select["count"] != 0] fig = px.choropleth_mapbox(df_select,geojson=df_select.geometry,locations=df_select.index,color="count", opacity= 0.7,color_continuous_scale='reds', mapbox_style = 'open-street-map',center=dict(lat= point.y,lon=point.x),title=f"Localisation flow distribution for {prefix} group with {purpose} purpose") - fig.write_html(f'{context.config("output_path")}/{context.config("output_prefix")}{prefix}_{purpose}.html') + fig.write_html(f'{analysis_output_path}/{context.config("output_prefix")}{prefix}_{purpose}.html') else : df_grids_select = gpd.sjoin(df_grids_select,df_grid,how='right',predicate="contains").fillna(0) df_select = gpd.sjoin(df_select,df_grids_select.drop(columns=[ 'index_left']),how='right',predicate="contains").rename(columns={"count_left":"volume_studied_simu","count_right":"volume_compared_simu"}).fillna(0) @@ -111,6 +118,6 @@ def execute(context): df_select = df_select[(df_select["volume_studied_simu"] != 0 )| (df_select["volume_compared_simu"] != 0)] df_select["pourcentage_vol"] = df_select["volume_difference"] / df_select["volume_compared_simu"] px.choropleth_mapbox(df_select,geojson=df_select.geometry,locations=df_select.index,color="volume_difference", opacity= 0.7,color_continuous_scale="picnic", color_continuous_midpoint= 0,hover_name="id_carr_1km_right", hover_data=["volume_studied_simu", "volume_compared_simu","pourcentage_vol"], - mapbox_style = 'open-street-map',center=dict(lat= point.y,lon=point.x),title=f"Comparison flow distribution with previous simulation for {prefix} group with {purpose} purpose").write_html(f'{context.config("output_path")}/{context.config("output_prefix")}{prefix}_{purpose}.html') + mapbox_style = 'open-street-map',center=dict(lat= point.y,lon=point.x),title=f"Comparison flow distribution with previous simulation for {prefix} group with {purpose} purpose").write_html(f'{analysis_output_path}/{context.config("output_prefix")}{prefix}_{purpose}.html') \ No newline at end of file diff --git a/analysis/synthesis/population.py b/analysis/synthesis/population.py new file mode 100644 index 00000000..7a09c782 --- /dev/null +++ b/analysis/synthesis/population.py @@ -0,0 +1,127 @@ + +import os +import numpy as np +import pandas as pd +import geopandas as gpd +from analysis.marginals import NUMBER_OF_VEHICLES_LABELS +from shapely import distance +AGE_CLASS = [0, 10, 14, 17, 25, 50, 65, np.inf] +NUMBER_OF_VEHICLES= [0,1,2,3,np.inf] +NAME_AGE_CLASS = ["0-10","11-14","15-17","18-25","26-50","51-65","65+"] +ANALYSIS_FOLDER = "analysis_population" +def configure(context): + + context.config("output_path") + context.config("output_prefix", "ile_de_france_") + context.config("sampling_rate") + + context.stage("synthesis.population.trips") + context.stage("synthesis.population.enriched") + context.stage("synthesis.population.spatial.locations") + + context.stage("data.census.filtered", alias = "census") + context.stage("data.hts.selected", alias = "hts") + +def execute(context): + + # check output folder existence + analysis_output_path = os.path.join(context.config("output_path"), ANALYSIS_FOLDER) + if not os.path.exists(analysis_output_path): + os.mkdir(analysis_output_path) + + prefix = context.config("output_prefix") + sampling_rate = context.config("sampling_rate") + df_person_eq = context.stage("synthesis.population.enriched") + df_trip_eq = context.stage("synthesis.population.trips") + df_location_eq = context.stage("synthesis.population.spatial.locations")[["person_id", "activity_index", "geometry"]] + + df_location_eq = df_location_eq.to_crs("EPSG:2154") + df_trip_eq["preceding_activity_index"] = df_trip_eq["trip_index"] + df_trip_eq["following_activity_index"] = df_trip_eq["trip_index"] + 1 + + df_census = context.stage("census") + df_hts_households, df_hts_person, df_hts_trip = context.stage("hts") + df_hts_person["person_weight"] *=df_census["weight"].sum()/df_hts_person["person_weight"].sum() + df_hts_households["household_weight"] *=df_census["weight"].sum()/df_hts_households["household_weight"].sum() + # get age class + df_person_eq["age_class"] = pd.cut(df_person_eq["age"],AGE_CLASS,include_lowest=True,labels=NAME_AGE_CLASS) + df_census["age_class"] = pd.cut(df_census["age"],AGE_CLASS,include_lowest=True,labels=NAME_AGE_CLASS) + df_hts_person["age_class"] = pd.cut(df_hts_person["age"],AGE_CLASS,include_lowest=True,labels=NAME_AGE_CLASS) + + # get vehicule class + df_person_eq["vehicles_class"] = pd.cut(df_person_eq["number_of_vehicles"],NUMBER_OF_VEHICLES,right=True,labels=NUMBER_OF_VEHICLES_LABELS) + df_hts_households["vehicles_class"] = pd.cut(df_hts_households["number_of_vehicles"],NUMBER_OF_VEHICLES,right=True,labels=NUMBER_OF_VEHICLES_LABELS) + + + df_eq_travel = pd.merge(df_trip_eq,df_person_eq[["person_id","age_class"]],on=["person_id"]) + df_hts_travel = pd.merge(df_hts_trip,df_hts_person[["person_id","age_class","person_weight"]],on=["person_id"]) + + print("Generate tables ...") + # Age purpose analysis + analysis_age_purpose = pd.pivot_table(df_eq_travel,"person_id",index="age_class",columns="following_purpose",aggfunc="count") + analysis_age_purpose = analysis_age_purpose/sampling_rate + analysis_age_purpose.to_csv(f"{analysis_output_path}/{prefix}age_purpose.csv") + + # Compare age volume + analysis_age_class = pd.concat([df_census.groupby("age_class")["weight"].sum(),df_person_eq.groupby("age_class")["person_id"].count()],axis=1).reset_index() + analysis_age_class.columns = ["Age class","INSEE","EQASIM"] + analysis_age_class["Proportion_INSEE"] = analysis_age_class["INSEE"] /df_census["weight"].sum() + analysis_age_class["Proportion_EQASIM"] = analysis_age_class["EQASIM"] /len(df_person_eq) + analysis_age_class["EQASIM"] = analysis_age_class["EQASIM"]/sampling_rate + analysis_age_class.to_csv(f"{analysis_output_path}/{prefix}age.csv") + + # Compare vehicle volume + analysis_vehicles_class = pd.concat([df_hts_households.groupby("vehicles_class")["household_weight"].sum(),df_person_eq.groupby("vehicles_class")["household_id"].nunique()],axis=1).reset_index() + analysis_vehicles_class.columns = ["Number of vehicles class","HTS","EQASIM"] + analysis_vehicles_class["Proportion_HTS"] = analysis_vehicles_class["HTS"] / df_hts_households["household_weight"].sum() + analysis_vehicles_class["Proportion_EQASIM"] = analysis_vehicles_class["EQASIM"] / df_person_eq["household_id"].nunique() + analysis_vehicles_class.to_csv(f"{analysis_output_path}/{prefix}nbr_vehicle.csv") + + # Compare license volume + analysis_license_class = pd.concat([df_hts_person.groupby("has_license")["person_weight"].sum(),df_person_eq.groupby("has_license")["person_id"].count()],axis=1).reset_index() + analysis_license_class.columns = ["Possession of license","HTS","EQASIM"] + analysis_license_class["Proportion_HTS"] = analysis_license_class["HTS"] /df_hts_person["person_weight"].sum() + analysis_license_class["Proportion_EQASIM"] = analysis_license_class["EQASIM"] /len(df_person_eq) + analysis_license_class["EQASIM"] = analysis_license_class["EQASIM"]/sampling_rate + analysis_license_class.to_csv(f"{analysis_output_path}/{prefix}license.csv") + + # Compare travel volume + analysis_travel = pd.concat([df_hts_travel.groupby("age_class")["person_weight"].sum(),df_eq_travel.groupby("age_class")["person_id"].count()],axis=1).reset_index() + analysis_travel.columns = ["Age class","HTS","EQASIM"] + analysis_travel["Proportion_HTS"] = analysis_travel["HTS"] /df_hts_travel["person_weight"].sum() + analysis_travel["Proportion_EQASIM"] = analysis_travel["EQASIM"] /len(df_eq_travel) + analysis_travel["EQASIM"] = analysis_travel["EQASIM"]/sampling_rate + analysis_travel.to_csv(f"{analysis_output_path}/{prefix}travel.csv") + + # Compare distance + df_hts_travel["routed_distance"] = df_hts_travel["routed_distance"]/1000 if "routed_distance" in df_hts_travel.columns else df_hts_travel["euclidean_distance"]/1000 + df_hts_travel["distance_class"] = pd.cut(df_hts_travel["routed_distance"],list(np.arange(100))+[np.inf]) + + df_spatial = pd.merge(df_trip_eq, df_location_eq.rename(columns = { + "activity_index": "preceding_activity_index", + "geometry": "preceding_geometry" + }), how = "left", on = ["person_id", "preceding_activity_index"]) + + df_spatial = pd.merge(df_spatial, df_location_eq.rename(columns = { + "activity_index": "following_activity_index", + "geometry": "following_geometry" + }), how = "left", on = ["person_id", "following_activity_index"]) + df_spatial["distance"] = df_spatial.apply(lambda x:distance( x["preceding_geometry"],x["following_geometry"])/1000,axis=1) + df_spatial["distance_class"] = pd.cut(df_spatial["distance"],list(np.arange(100))+[np.inf]) + + analysis_distance = pd.concat([df_hts_travel.groupby("distance_class")["person_weight"].sum(),df_spatial.groupby("distance_class")["person_id"].count()],axis=1).reset_index() + analysis_distance.columns = ["Distance class","HTS","EQASIM"] + analysis_distance["Proportion_HTS"] = analysis_distance["HTS"] / analysis_distance["HTS"].sum() + analysis_distance["Proportion_EQASIM"] = analysis_distance["EQASIM"] / len(df_spatial) + analysis_distance["EQASIM"] = analysis_distance["EQASIM"]/ sampling_rate + analysis_distance.to_csv(f"{analysis_output_path}/{prefix}distance.csv") + + + + + + + + + + diff --git a/docs/population.md b/docs/population.md index bb479bf3..4bf64326 100644 --- a/docs/population.md +++ b/docs/population.md @@ -450,3 +450,11 @@ folder as: `{output_prefix}_{age group}_{trip pupose}.html` Note: With `analysis_from_file` at False, the last synthetic population is studied by default. Also if `output_prefix` and `comparison_file_prefix` refer to the same outputs, or `comparison_file_prefix` is not specified, then only a volume visualisation of this particular population is produced. + + +### Comparaison population to source data + +Using the population pipeline in the Analysis directory, you can generate multiple tables comparing composition of synthetic population to source data. Right now the tables generated compare : population volume by age range, households volume by number of vehicles, population volume with a license and without, trip volume by age range and trip volume by length. +Complementary from the synthetic population only, a table of population volume by age range and trip purpose is also created. + +To be able to use this pipeline, you must already have create a synthetic population. Then you need to open the `config.yml` and add the `analysis.synthesis.population` stage in the `run` section. \ No newline at end of file diff --git a/synthesis/output.py b/synthesis/output.py index 84c52a36..a1561b48 100644 --- a/synthesis/output.py +++ b/synthesis/output.py @@ -6,8 +6,10 @@ import sqlite3 import math import numpy as np +from analysis.synthesis.population import ANALYSIS_FOLDER def configure(context): + context.stage("synthesis.population.enriched") context.stage("synthesis.population.activities") @@ -22,7 +24,8 @@ def configure(context): context.config("output_path") context.config("output_prefix", "ile_de_france_") context.config("output_formats", ["csv", "gpkg"]) - + context.config("sampling_rate") + if context.config("mode_choice", False): context.stage("matsim.simulation.prepare") @@ -270,4 +273,4 @@ def execute(context): clean_gpkg(path) if "geoparquet" in output_formats: path = "%s/%strips.geoparquet" % (output_path, output_prefix) - df_spatial.to_parquet(path) + df_spatial.to_parquet(path) \ No newline at end of file