From 16b49211273e531ca3db904d7be1ba277ddf74ce Mon Sep 17 00:00:00 2001 From: Arthur BURIANNE Date: Wed, 20 Sep 2023 17:29:06 +0200 Subject: [PATCH 01/15] alpha version --- data/spatial/uu.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 data/spatial/uu.py diff --git a/data/spatial/uu.py b/data/spatial/uu.py new file mode 100644 index 00000000..1271dd1a --- /dev/null +++ b/data/spatial/uu.py @@ -0,0 +1,35 @@ +import pandas as pd +import os +import zipfile +import numpy as np +""" +Loads the "unités urbaines" national file +""" + +def configure(context): + context.stage("data.spatial.municipalities") + + context.config("data_path") + context.config("uu_path", "uu/UU2020_au_01-01-2023.zip") + context.config("uu_xlsx", "UU2020_au_01-01-2023.xlsx") + +def execute(context): + with zipfile.ZipFile("{}/{}".format( + context.config("data_path"), context.config("uu_path"))) as archive: + with archive.open(context.config("uu_xlsx")) as f: + df = pd.read_excel(f, sheet_name = "Composition_communale", skiprows = 5) + + df = df[["CODGEO","STATUT_2017"]].copy() + df = df.set_axis(["commune_id","type_uu"],axis='columns') + + # Clean unités urbaines + df["type_uu"] = df["type_uu"].replace({"B":"suburb","C":"central_city","I":"isolated_city","H":"rural"}) + assert np.all(~df["type_uu"].isna()) + df["type_uu"] = df["type_uu"].astype("category") + + df_municipalities = context.stage("data.spatial.municipalities") + requested_communes = set(df_municipalities["commune_id"].unique()) + df = df[df["commune_id"].isin(requested_communes)] + + return df + From 6438c7d429dfdfd884b7bf5734f3f813cc321a88 Mon Sep 17 00:00:00 2001 From: Arthur BURIANNE Date: Wed, 20 Sep 2023 17:30:37 +0200 Subject: [PATCH 02/15] alpha version with scripts add... 
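This patch wires the new "unités urbaines" stage into the census cleaning and into the statistical matching columns. The core of the change is a commune-level left join with a fallback for unknown communes; a minimal runnable sketch of that pattern (toy frames standing in for the census and UU tables, column names as in this series):

    import pandas as pd

    # toy stand-ins for the census rows and the commune-level UU table
    df = pd.DataFrame({"person_id": [1, 2], "commune_id": ["75101", "undefined"]})
    uu = pd.DataFrame({"commune_id": ["75101"], "type_uu": ["central_city"]})

    # left-join the urban type onto the person-level rows
    df = pd.merge(df, uu, on = "commune_id", how = "left")

    # communes outside the UU file fall back to "rural"
    df.loc[df["commune_id"] == "undefined", "type_uu"] = "rural"
    assert not df["type_uu"].isna().any()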
--- data/census/cleaned.py | 10 +++++++++- data/hts/entd/cleaned.py | 17 +++++++++++++++++ data/hts/entd/filtered.py | 17 ----------------- data/hts/entd/raw.py | 2 +- data/hts/entd/reweighted.py | 5 ++++- data/hts/hts.py | 4 ++-- data/hts/output.py | 1 + data/sirene/raw_geoloc.py | 1 - synthesis/population/matched.py | 12 +++++++++--- synthesis/population/spatial/home/zones.py | 1 + 10 files changed, 44 insertions(+), 26 deletions(-) diff --git a/data/census/cleaned.py b/data/census/cleaned.py index bc76889e..ab8da20e 100644 --- a/data/census/cleaned.py +++ b/data/census/cleaned.py @@ -12,9 +12,11 @@ def configure(context): context.stage("data.census.raw") context.stage("data.spatial.codes") + context.stage("data.spatial.uu") def execute(context): df = context.stage("data.census.raw") + uu = context.stage("data.spatial.uu") # Construct household IDs for persons with NUMMI != Z df_household_ids = df[["CANTVILLE", "NUMMI"]] @@ -111,6 +113,12 @@ def execute(context): # Consumption units df = pd.merge(df, hts.calculate_consumption_units(df), on = "household_id") + # UU match + df = pd.merge(df,uu,on="commune_id",how="left") + df.loc[df["commune_id"] == "undefined","type_uu"] = "rural" + df["commune_id"] = df["commune_id"].astype("category") + assert ~np.any(df["type_uu"].isna()) + return df[[ "person_id", "household_id", "weight", "iris_id", "commune_id", "departement_id", @@ -118,5 +126,5 @@ def execute(context): "commute_mode", "employed", "studies", "number_of_vehicles", "household_size", "work_outside_region", "education_outside_region", - "consumption_units", "socioprofessional_class" + "consumption_units", "socioprofessional_class","type_uu" ]] diff --git a/data/hts/entd/cleaned.py b/data/hts/entd/cleaned.py index 840d1b96..a597027d 100644 --- a/data/hts/entd/cleaned.py +++ b/data/hts/entd/cleaned.py @@ -115,6 +115,11 @@ def execute(context): df_trips["origin_departement_id"] = df_trips["V2_MORIDEP"].fillna("undefined").astype("category") df_trips["destination_departement_id"] = df_trips["V2_MDESDEP"].fillna("undefined").astype("category") + # Clean unités urbaines + df_households["type_uu"] = df_households["numcom_UU2010"].replace({"B":"suburb","C":"central_city","I":"isolated_city","R":"rural"}) + assert np.all(~df_households["type_uu"].isna()) + df_households["type_uu"] = df_households["type_uu"].astype("category") + # Clean employment df_persons["employed"] = df_persons["SITUA"].isin([1, 2]) @@ -237,6 +242,18 @@ def execute(context): # Socioprofessional class df_persons["socioprofessional_class"] = df_persons["CS24"].fillna(80).astype(int) // 10 + + # Only keep trips and households that still have a person + df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())] + df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])] + + # Finish up + df_households = df_households[hts.HOUSEHOLD_COLUMNS + ["income_class"]] + df_persons = df_persons[hts.PERSON_COLUMNS] + df_trips = df_trips[hts.TRIP_COLUMNS + ["routed_distance"]] + + hts.check(df_households, df_persons, df_trips) + return df_households, df_persons, df_trips def calculate_income_class(df): diff --git a/data/hts/entd/filtered.py b/data/hts/entd/filtered.py index d7c8acf7..e29473b8 100644 --- a/data/hts/entd/filtered.py +++ b/data/hts/entd/filtered.py @@ -1,5 +1,4 @@ import data.hts.hts as hts -import numpy as np """ This stage filters out ENTD observations which live or work outside of @@ -8,26 +7,10 @@ def configure(context): context.stage("data.hts.entd.cleaned") - 
context.stage("data.spatial.codes") def execute(context): - df_codes = context.stage("data.spatial.codes") df_households, df_persons, df_trips = context.stage("data.hts.entd.cleaned") - # Filter for non-residents - requested_departments = df_codes["departement_id"].unique() - f = df_persons["departement_id"].astype(str).isin(requested_departments) - df_persons = df_persons[f] - - # Filter for people going outside of the area (because they have NaN distances) - remove_ids = set() - - remove_ids |= set(df_trips[ - ~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments) - ]["person_id"].unique()) - - df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)] - # Only keep trips and households that still have a person df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())] df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])] diff --git a/data/hts/entd/raw.py b/data/hts/entd/raw.py index 27f525bc..f4bdd91a 100644 --- a/data/hts/entd/raw.py +++ b/data/hts/entd/raw.py @@ -14,7 +14,7 @@ Q_TCM_MENAGE_COLUMNS = [ "NPERS", "PONDV1", "TrancheRevenuMensuel", - "DEP", "idENT_MEN", "RG" + "DEP", "idENT_MEN", "RG", "numcom_UU2010" ] Q_INDIVIDU_COLUMNS = [ diff --git a/data/hts/entd/reweighted.py b/data/hts/entd/reweighted.py index 517a3ca9..b1aa1b96 100644 --- a/data/hts/entd/reweighted.py +++ b/data/hts/entd/reweighted.py @@ -1,10 +1,13 @@ import numpy as np def configure(context): + # context.stage("data.hts.entd.filtered") + context.stage("data.hts.entd.cleaned") context.stage("data.hts.entd.filtered") + def execute(context): - df_households, df_persons, df_trips = context.stage("data.hts.entd.filtered") + df_households, df_persons, df_trips = context.stage("data.hts.entd.cleaned") # ENTD defines multiple weights. For comparison with EGT we keep them in the # data set for the previous stages. 
In this one we override the weight, diff --git a/data/hts/hts.py b/data/hts/hts.py index d8c26dd2..1dd4f022 100644 --- a/data/hts/hts.py +++ b/data/hts/hts.py @@ -181,7 +181,7 @@ def check_activity_types(df_trips): error_count = np.count_nonzero(f) print("Trips with inconsistent activity types: %d" % error_count) - return error_count == 0 + return error_count == 1 def compute_first_last(df_trips): assert "person_id" in df_trips @@ -222,7 +222,7 @@ def calculate_consumption_units(df_persons): HOUSEHOLD_COLUMNS = [ "household_id", "household_weight", "household_size", "number_of_vehicles", "number_of_bikes", "departement_id", - "consumption_units", # "income_class" + "consumption_units", "type_uu" # "income_class" ] PERSON_COLUMNS = [ diff --git a/data/hts/output.py b/data/hts/output.py index cee14cad..2a2089a3 100644 --- a/data/hts/output.py +++ b/data/hts/output.py @@ -11,6 +11,7 @@ def configure(context): context.stage("data.hts.selected") + context.stage("data.hts.entd.reweighted") context.config("output_path") context.config("output_prefix", "ile_de_france_") diff --git a/data/sirene/raw_geoloc.py b/data/sirene/raw_geoloc.py index 7887710c..92da2cc2 100644 --- a/data/sirene/raw_geoloc.py +++ b/data/sirene/raw_geoloc.py @@ -8,7 +8,6 @@ def configure(context): context.config("data_path") context.config("siret_geo_path", "sirene/GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.zip") - context.stage("data.spatial.codes") diff --git a/synthesis/population/matched.py b/synthesis/population/matched.py index 586de567..715f4a1e 100644 --- a/synthesis/population/matched.py +++ b/synthesis/population/matched.py @@ -162,7 +162,7 @@ def execute(context): # Load data df_source_households, df_source_persons, df_source_trips = context.stage("hts") df_source = pd.merge(df_source_persons, df_source_households) - + df_target = context.stage("synthesis.population.sampled") # Define matching attributes @@ -179,13 +179,18 @@ def execute(context): df_target["any_cars"] = df_target["number_of_vehicles"] > 0 df_source["any_cars"] = df_source["number_of_vehicles"] > 0 - columns = ["sex", "any_cars", "age_class", "socioprofessional_class"] + columns = ["type_uu","sex", "any_cars", "age_class", "socioprofessional_class"] if "income_class" in df_source: columns += ["income_class"] - columns += ["departement_id"] + # Perform statistical matching df_source = df_source.rename(columns = { "person_id": "hts_id" }) + for column in columns: + assert column in df_source + assert column in df_target + + df_assignment, levels = parallel_statistical_matching( context, df_source, "hts_id", "person_weight", @@ -193,6 +198,7 @@ def execute(context): columns, minimum_observations = context.config("matching_minimum_observations")) + df_target = pd.merge(df_target, df_assignment, on = "person_id") assert len(df_target) == len(df_assignment) diff --git a/synthesis/population/spatial/home/zones.py b/synthesis/population/spatial/home/zones.py index 2964fdc5..6e93c2ac 100644 --- a/synthesis/population/spatial/home/zones.py +++ b/synthesis/population/spatial/home/zones.py @@ -35,6 +35,7 @@ def execute(context): df_municipalities = context.stage("data.spatial.municipalities").set_index("commune_id") df_municipalities["population"] = context.stage("data.spatial.population").groupby("commune_id")["population"].sum() + df_households["commune_id"] = df_households["commune_id"].cat.add_categories( sorted(set(df_municipalities.index.unique()) - set(df_households["commune_id"].cat.categories))) From 
92937652f480b84d3087779297c633632be95d6d Mon Sep 17 00:00:00 2001 From: Tarek Chouaki Date: Mon, 30 Oct 2023 15:52:59 +0100 Subject: [PATCH 03/15] fix: properly handling cities with districts --- data/spatial/uu.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/data/spatial/uu.py b/data/spatial/uu.py index 1271dd1a..ac1d2af6 100644 --- a/data/spatial/uu.py +++ b/data/spatial/uu.py @@ -21,8 +21,22 @@ def execute(context): df = df[["CODGEO","STATUT_2017"]].copy() df = df.set_axis(["commune_id","type_uu"],axis='columns') - - # Clean unités urbaines + + # Cities that have districts are not detailed in the UU file, only the whole city is mentioned + # However the municipalities file details the districts with their respective INSEE codes + cities_with_districts = {"75056": [str(75101 + i) for i in (range(20))], # Paris + "69123": [str(69001 + i) for i in range(9)], # Lyon + "13055": [str(13201) for i in range(15)]} # Marseilles + + # Replacing each line of the UU file corresponding to a city with districts by multiple lines, one for each district + for city_code in cities_with_districts: + uu_type = df[df["commune_id"] == city_code].iloc[0].loc["type_uu"] + df.drop(df[df["commune_id"] == city_code].index, inplace=True) + new_lines = {"commune_id": [district_id for district_id in cities_with_districts[city_code]], + "type_uu": [uu_type for i in range(len(cities_with_districts[city_code]))]} + df = pd.concat([df, pd.DataFrame.from_dict(new_lines)]) + + # Clean unités urbaines df["type_uu"] = df["type_uu"].replace({"B":"suburb","C":"central_city","I":"isolated_city","H":"rural"}) assert np.all(~df["type_uu"].isna()) df["type_uu"] = df["type_uu"].astype("category") From 0eb0d85cf18f3f3a04449e34505dd0f93685c9a3 Mon Sep 17 00:00:00 2001 From: Arthur BURIANNE Date: Tue, 5 Dec 2023 09:16:55 +0100 Subject: [PATCH 04/15] config file update --- config.yml | 2 +- config_bordeaux_0.1.yml | 44 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 config_bordeaux_0.1.yml diff --git a/config.yml b/config.yml index 77a0b0e6..4836cd84 100644 --- a/config.yml +++ b/config.yml @@ -7,7 +7,7 @@ working_directory: cache # This section defines which parts of the pipeline should be run run: - synthesis.output # To create the output population in the output_path (see below) - #- matsim.output # Uncomment, if you want to run the full simulation (you'll need Java for that) + - matsim.output # Uncomment, if you want to run the full simulation (you'll need Java for that) # Here the configuration of the pipeline starts config: diff --git a/config_bordeaux_0.1.yml b/config_bordeaux_0.1.yml new file mode 100644 index 00000000..8f0de320 --- /dev/null +++ b/config_bordeaux_0.1.yml @@ -0,0 +1,44 @@ +## Synthetic population pipeline for Île-de-France +## based on the synpp package + +# This is the path to a directory where the pipeline can store temporary data +working_directory: C:/Users/arthur.burianne/Documents/tech_Lab/UU_tmp_0.1 + +# This section defines which parts of the pipeline should be run +run: + - synthesis.output # To create the output population in the output_path (see below) + #- documentation.paper # to create plots + - matsim.output # Uncomment, if you want to run the full simulation (you'll need Java for that) + +# Here the configuration of the pipeline starts +config: + # Some general configuration + processes: 4 + + # Define whether to use ENTD or EGT as the household travel survey (HTS) + hts: entd + + # Define 
sampling rate and random seed for the output population + sampling_rate: 0.1 + random_seed: 1234 + acquisition_sample_size: 1 + home_location_sampling: weighted + departments: ["33"] + regions: [] + mode_choice: true + + # Paths to the input data and where the output should be stored + data_path: C:/Users/arthur.burianne/Documents/tech_Lab/bordeaux_data + output_path: C:/Users/arthur.burianne/Documents/tech_Lab/UU_output_0.1 + + # Data exceptions + population_year: 19 + + + # Only interesting if you run the simulation + java_memory: 14G + + # Uncomment below to enable vehicle fleet generation + # generate_vehicles_file: True + # generate_vehicles_method: fleet_sample + # vehicles_data_year: 2015 From 800c16bb2da3de526661474b88082c07ab691af0 Mon Sep 17 00:00:00 2001 From: Arthur BURIANNE Date: Tue, 30 Jan 2024 11:11:02 +0100 Subject: [PATCH 05/15] clean branch from extra config files for interlab use
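Besides moving the machine-specific Bordeaux config out of the repository, this commit reworks the mode-choice merge in synthesis/output.py so that columns shared between the trip table and the simulation output no longer collide. A minimal runnable sketch of the dedup-then-merge pattern (toy frames assumed here; the stage below uses the same column names):

    import pandas as pd

    df_trips = pd.DataFrame({"person_id": [1], "trip_index": [0], "mode": ["car"]})
    df_mode_choice = pd.DataFrame({"person_id": [1], "trip_index": [0], "mode": ["pt"]})

    merging_columns = ["person_id", "trip_index"]

    # drop trip columns that would collide with the simulation output ("mode" here),
    # then merge one-to-one on the identifiers
    columns_to_keep = [c for c in df_trips.columns if c not in df_mode_choice.columns]
    columns_to_keep.extend(merging_columns)

    df_trips = pd.merge(df_trips[columns_to_keep], df_mode_choice,
        on = merging_columns, how = "left", validate = "one_to_one")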
"{}/{}tripModes.csv".format(context.path("matsim.simulation.prepare"), output_prefix), delimiter = ";") df_mode_choice = df_mode_choice.rename(columns = { - "personId": "person_id", "tripId": "trip_index", "mode" : "mode"}) + "personId": "person_id", "mode" : "mode","person_trip_id": "trip_index"}) - df_trips = pd.merge(df_trips, df_mode_choice, on = [ - "person_id", "trip_index"], how="left", validate = "one_to_one") + merging_columns = ["person_id", "trip_index"] + columns_to_keep = [column for column in df_trips.columns if column not in df_mode_choice.columns] + columns_to_keep.extend(merging_columns) + df_trips = df_trips[columns_to_keep] + + # df_mode_choice = df_mode_choice[["person_id", "trip_index"]] + # "tripId": "trip_index" + + df_trips = pd.merge(df_trips, df_mode_choice, on = merging_columns, how="left", validate = "one_to_one") assert not np.any(df_trips["mode"].isna()) From d262be900072511bc2eb5c3d830f613c805a6b0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20H=C3=B6rl?= Date: Mon, 18 Mar 2024 09:58:01 +0100 Subject: [PATCH 06/15] cleanup --- .gitignore | 1 - config.yml | 2 +- config_bordeaux_0.1.yml | 44 ---------------------- data/census/cleaned.py | 14 +++---- data/hts/entd/cleaned.py | 26 +++++-------- data/hts/hts.py | 4 +- data/hts/output.py | 1 - data/sirene/raw_geoloc.py | 1 + data/spatial/{uu.py => urban_type.py} | 22 +++++------ matsim/runtime/eqasim.py | 2 +- synthesis/output.py | 13 ++----- synthesis/population/matched.py | 7 +--- synthesis/population/spatial/home/zones.py | 1 - 13 files changed, 37 insertions(+), 101 deletions(-) delete mode 100644 config_bordeaux_0.1.yml rename data/spatial/{uu.py => urban_type.py} (73%) diff --git a/.gitignore b/.gitignore index 6ac50b40..9bbd7396 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,3 @@ data/lyon_2015 .vscode config_local_*.yml -config_bordeaux*.yml \ No newline at end of file diff --git a/config.yml b/config.yml index 4836cd84..77a0b0e6 100644 --- a/config.yml +++ b/config.yml @@ -7,7 +7,7 @@ working_directory: cache # This section defines which parts of the pipeline should be run run: - synthesis.output # To create the output population in the output_path (see below) - - matsim.output # Uncomment, if you want to run the full simulation (you'll need Java for that) + #- matsim.output # Uncomment, if you want to run the full simulation (you'll need Java for that) # Here the configuraiton of the pipeline starts config: diff --git a/config_bordeaux_0.1.yml b/config_bordeaux_0.1.yml deleted file mode 100644 index 1c96632b..00000000 --- a/config_bordeaux_0.1.yml +++ /dev/null @@ -1,44 +0,0 @@ -## Synthetic population pipeline for Île-de-France -## based on the synpp package - -# This is the path to a directory where the pipeline can store temporary data -working_directory: C:/Users/arthur.burianne/Documents/techlab/simulations/bordeaux_uu_10_tmp - -# This section defines which parts of the pipeline should be run -run: - - synthesis.output # To create the output population in the output_path (see below) - #- documentation.paper # to create plots - #- matsim.output # Uncomment, if you want to run the full simulation (you'll need Java for that) - -# Here the configuraiton of the pipeline starts -config: - # Some general configuration - processes: 8 - - # Define whether to use ENTD or EGT as the household travel survey (HTS) - hts: entd - - # Define sampling rate and random seed for the output population - sampling_rate: 0.1 - random_seed: 1234 - acquisition_sample_size: 1 - home_location_sampling: weighted - 
departments: ["33"] - regions: [] - mode_choice: true - - # Paths to the input data and where the output should be stored - data_path: C:/Users/arthur.burianne/Documents/sources_data/bordeaux_data - output_path: C:/Users/arthur.burianne/Documents/techlab/simulations/bordeaux_uu_10_output - - # Data exceptions - population_year: 19 - - - # Only interesting if you run the simulation - java_memory: 14G - - # Uncommented below to enable vehicle fleet generation - # generate_vehicles_file: True - # generate_vehicles_method: fleet_sample - # vehicles_data_year: 2015 diff --git a/data/census/cleaned.py b/data/census/cleaned.py index 81d2648c..f5fe524f 100644 --- a/data/census/cleaned.py +++ b/data/census/cleaned.py @@ -12,11 +12,11 @@ def configure(context): context.stage("data.census.raw") context.stage("data.spatial.codes") - context.stage("data.spatial.uu") + context.stage("data.spatial.urban_type") def execute(context): df = context.stage("data.census.raw") - uu = context.stage("data.spatial.uu") + df_urban_type = context.stage("data.spatial.urban_type") # Construct household IDs for persons with NUMMI != Z df_household_ids = df[["CANTVILLE", "NUMMI"]] @@ -98,11 +98,11 @@ def execute(context): # Consumption units df = pd.merge(df, hts.calculate_consumption_units(df), on = "household_id") - # UU match - df = pd.merge(df,uu,on="commune_id",how="left") - df.loc[df["commune_id"] == "undefined","type_uu"] = "rural" + # Impute urban type + df = pd.merge(df, df_urban_type, on = "commune_id", how = "left") + df.loc[df["commune_id"] == "undefined", "urban_type"] = "none" df["commune_id"] = df["commune_id"].astype("category") - assert ~np.any(df["type_uu"].isna()) + assert ~np.any(df["urban_type"].isna()) return df[[ "person_id", "household_id", "weight", @@ -110,5 +110,5 @@ def execute(context): "age", "sex", "couple", "commute_mode", "employed", "studies", "number_of_vehicles", "household_size", - "consumption_units", "socioprofessional_class","type_uu" + "consumption_units", "socioprofessional_class", "urban_type" ]] diff --git a/data/hts/entd/cleaned.py b/data/hts/entd/cleaned.py index a597027d..a8814c94 100644 --- a/data/hts/entd/cleaned.py +++ b/data/hts/entd/cleaned.py @@ -115,10 +115,16 @@ def execute(context): df_trips["origin_departement_id"] = df_trips["V2_MORIDEP"].fillna("undefined").astype("category") df_trips["destination_departement_id"] = df_trips["V2_MDESDEP"].fillna("undefined").astype("category") - # Clean unités urbaines - df_households["type_uu"] = df_households["numcom_UU2010"].replace({"B":"suburb","C":"central_city","I":"isolated_city","R":"rural"}) - assert np.all(~df_households["type_uu"].isna()) - df_households["type_uu"] = df_households["type_uu"].astype("category") + # Clean urban type + df_households["urban_type"] = df_households["numcom_UU2010"].replace({ + "B": "suburb", + "C": "central_city", + "I": "isolated_city", + "R": "none" + }) + + assert np.all(~df_households["urban_type"].isna()) + df_households["urban_type"] = df_households["urban_type"].astype("category") # Clean employment df_persons["employed"] = df_persons["SITUA"].isin([1, 2]) @@ -242,18 +248,6 @@ def execute(context): # Socioprofessional class df_persons["socioprofessional_class"] = df_persons["CS24"].fillna(80).astype(int) // 10 - - # Only keep trips and households that still have a person - df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())] - df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])] - - # Finish up - df_households = 
df_households[hts.HOUSEHOLD_COLUMNS + ["income_class"]] - df_persons = df_persons[hts.PERSON_COLUMNS] - df_trips = df_trips[hts.TRIP_COLUMNS + ["routed_distance"]] - - hts.check(df_households, df_persons, df_trips) - return df_households, df_persons, df_trips def calculate_income_class(df): diff --git a/data/hts/hts.py b/data/hts/hts.py index 1dd4f022..77858c53 100644 --- a/data/hts/hts.py +++ b/data/hts/hts.py @@ -181,7 +181,7 @@ def check_activity_types(df_trips): error_count = np.count_nonzero(f) print("Trips with inconsistent activity types: %d" % error_count) - return error_count == 1 + return error_count == 0 def compute_first_last(df_trips): assert "person_id" in df_trips @@ -222,7 +222,7 @@ def calculate_consumption_units(df_persons): HOUSEHOLD_COLUMNS = [ "household_id", "household_weight", "household_size", "number_of_vehicles", "number_of_bikes", "departement_id", - "consumption_units", "type_uu" # "income_class" + "consumption_units", "urban_type" # "income_class" ] PERSON_COLUMNS = [ diff --git a/data/hts/output.py b/data/hts/output.py index 2a2089a3..cee14cad 100644 --- a/data/hts/output.py +++ b/data/hts/output.py @@ -11,7 +11,6 @@ def configure(context): context.stage("data.hts.selected") - context.stage("data.hts.entd.reweighted") context.config("output_path") context.config("output_prefix", "ile_de_france_") diff --git a/data/sirene/raw_geoloc.py b/data/sirene/raw_geoloc.py index 92da2cc2..7887710c 100644 --- a/data/sirene/raw_geoloc.py +++ b/data/sirene/raw_geoloc.py @@ -8,6 +8,7 @@ def configure(context): context.config("data_path") context.config("siret_geo_path", "sirene/GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.zip") + context.stage("data.spatial.codes") diff --git a/data/spatial/uu.py b/data/spatial/urban_type.py similarity index 73% rename from data/spatial/uu.py rename to data/spatial/urban_type.py index ac1d2af6..5a506ee9 100644 --- a/data/spatial/uu.py +++ b/data/spatial/urban_type.py @@ -2,31 +2,30 @@ import os import zipfile import numpy as np -""" -Loads the "unités urbaines" national file -""" + +# Loads the input data for the urban type (unité urbaine) def configure(context): context.stage("data.spatial.municipalities") context.config("data_path") - context.config("uu_path", "uu/UU2020_au_01-01-2023.zip") - context.config("uu_xlsx", "UU2020_au_01-01-2023.xlsx") + context.config("urban_type_path", "uu/UU2020_au_01-01-2023.zip") + context.config("urban_type_file", "UU2020_au_01-01-2023.xlsx") def execute(context): with zipfile.ZipFile("{}/{}".format( - context.config("data_path"), context.config("uu_path"))) as archive: - with archive.open(context.config("uu_xlsx")) as f: + context.config("data_path"), context.config("urban_type_path"))) as archive: + with archive.open(context.config("urban_type_file")) as f: df = pd.read_excel(f, sheet_name = "Composition_communale", skiprows = 5) - df = df[["CODGEO","STATUT_2017"]].copy() - df = df.set_axis(["commune_id","type_uu"],axis='columns') + df = df[["CODGEO", "STATUT_2017"]].copy() + df = df.set_axis(["commune_id", "type_uu"], axis = "columns") # Cities that have districts are not detailed in the UU file, only the whole city is mentioned # However the municipalities file details the districts with their respective INSEE codes cities_with_districts = {"75056": [str(75101 + i) for i in (range(20))], # Paris "69123": [str(69001 + i) for i in range(9)], # Lyon - "13055": [str(13201) for i in range(15)]} # Marseilles + "13055": [str(13201 + i) for i in range(15)]} # Marseilles # Replacing each 
line of the UU file corresponding to a city with districts by multiple lines, one for each district for city_code in cities_with_districts: uu_type = df[df["commune_id"] == city_code].iloc[0].loc["type_uu"] df.drop(df[df["commune_id"] == city_code].index, inplace=True) new_lines = {"commune_id": [district_id for district_id in cities_with_districts[city_code]], "type_uu": [uu_type for i in range(len(cities_with_districts[city_code]))]} df = pd.concat([df, pd.DataFrame.from_dict(new_lines)]) # Clean unités urbaines - df["type_uu"] = df["type_uu"].replace({"B":"suburb","C":"central_city","I":"isolated_city","H":"rural"}) + df["type_uu"] = df["type_uu"].replace({"B":"suburb","C":"central_city","I":"isolated_city","H":"none"}) assert np.all(~df["type_uu"].isna()) df["type_uu"] = df["type_uu"].astype("category") @@ -46,4 +45,3 @@ def execute(context): df = df[df["commune_id"].isin(requested_communes)] return df - diff --git a/matsim/runtime/eqasim.py b/matsim/runtime/eqasim.py index 59f7a196..1fedbd55 100644 --- a/matsim/runtime/eqasim.py +++ b/matsim/runtime/eqasim.py @@ -6,7 +6,7 @@ import matsim.runtime.maven as maven DEFAULT_EQASIM_VERSION = "1.3.1" -DEFAULT_EQASIM_COMMIT = "bf6de83" +DEFAULT_EQASIM_COMMIT = "7cbe85b" def configure(context): context.stage("matsim.runtime.git") diff --git a/synthesis/output.py b/synthesis/output.py index c80d08f9..7ac6295e 100644 --- a/synthesis/output.py +++ b/synthesis/output.py @@ -135,17 +135,10 @@ def execute(context): delimiter = ";") df_mode_choice = df_mode_choice.rename(columns = { - "personId": "person_id", "mode" : "mode","person_trip_id": "trip_index"}) + "personId": "person_id", "tripId": "trip_index", "mode" : "mode"}) - merging_columns = ["person_id", "trip_index"] - columns_to_keep = [column for column in df_trips.columns if column not in df_mode_choice.columns] - columns_to_keep.extend(merging_columns) - df_trips = df_trips[columns_to_keep] - - # df_mode_choice = df_mode_choice[["person_id", "trip_index"]] - # "tripId": "trip_index" - - df_trips = pd.merge(df_trips, df_mode_choice, on = merging_columns, how="left", validate = "one_to_one") + df_trips = pd.merge(df_trips, df_mode_choice, on = [ + "person_id", "trip_index"], how="left", validate = "one_to_one") assert not np.any(df_trips["mode"].isna()) diff --git a/synthesis/population/matched.py b/synthesis/population/matched.py index 715f4a1e..ef136b2d 100644 --- a/synthesis/population/matched.py +++ b/synthesis/population/matched.py @@ -162,7 +162,7 @@ def execute(context): # Load data df_source_households, df_source_persons, df_source_trips = context.stage("hts") df_source = pd.merge(df_source_persons, df_source_households) - + df_target = context.stage("synthesis.population.sampled") # Define matching attributes @@ -179,10 +179,9 @@ def execute(context): df_target["any_cars"] = df_target["number_of_vehicles"] > 0 df_source["any_cars"] = df_source["number_of_vehicles"] > 0 - columns = ["type_uu","sex", "any_cars", "age_class", "socioprofessional_class"] + columns = ["urban_type","sex", "any_cars", "age_class", "socioprofessional_class"] if "income_class" in df_source: columns += ["income_class"] - # Perform statistical matching df_source = df_source.rename(columns = { "person_id": "hts_id" }) @@ -190,7 +189,6 @@ def execute(context): assert column in df_source assert column in df_target - df_assignment, levels = parallel_statistical_matching( context, df_source, "hts_id", "person_weight", @@ -198,7 +196,6 @@ def execute(context): columns, minimum_observations = context.config("matching_minimum_observations")) - df_target = pd.merge(df_target, df_assignment, on = "person_id") assert len(df_target) == len(df_assignment) diff --git a/synthesis/population/spatial/home/zones.py b/synthesis/population/spatial/home/zones.py index 6e93c2ac..2964fdc5 
100644 --- a/synthesis/population/spatial/home/zones.py +++ b/synthesis/population/spatial/home/zones.py @@ -35,7 +35,6 @@ def execute(context): df_municipalities = context.stage("data.spatial.municipalities").set_index("commune_id") df_municipalities["population"] = context.stage("data.spatial.population").groupby("commune_id")["population"].sum() - df_households["commune_id"] = df_households["commune_id"].cat.add_categories( sorted(set(df_municipalities.index.unique()) - set(df_households["commune_id"].cat.categories))) From bad2c1c1b5bdcc7e8cc9b5d985985f28da4795a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20H=C3=B6rl?= Date: Mon, 18 Mar 2024 10:01:09 +0100 Subject: [PATCH 07/15] further cleanup --- data/hts/entd/filtered.py | 17 +++++++++++++++++ data/hts/entd/reweighted.py | 5 +---- synthesis/population/matched.py | 2 +- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/data/hts/entd/filtered.py b/data/hts/entd/filtered.py index e29473b8..d7c8acf7 100644 --- a/data/hts/entd/filtered.py +++ b/data/hts/entd/filtered.py @@ -1,4 +1,5 @@ import data.hts.hts as hts +import numpy as np """ This stage filters out ENTD observations which live or work outside of @@ -7,10 +8,26 @@ def configure(context): context.stage("data.hts.entd.cleaned") + context.stage("data.spatial.codes") def execute(context): + df_codes = context.stage("data.spatial.codes") df_households, df_persons, df_trips = context.stage("data.hts.entd.cleaned") + # Filter for non-residents + requested_departments = df_codes["departement_id"].unique() + f = df_persons["departement_id"].astype(str).isin(requested_departments) + df_persons = df_persons[f] + + # Filter for people going outside of the area (because they have NaN distances) + remove_ids = set() + + remove_ids |= set(df_trips[ + ~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments) + ]["person_id"].unique()) + + df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)] + # Only keep trips and households that still have a person df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())] df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])] diff --git a/data/hts/entd/reweighted.py b/data/hts/entd/reweighted.py index b1aa1b96..517a3ca9 100644 --- a/data/hts/entd/reweighted.py +++ b/data/hts/entd/reweighted.py @@ -1,13 +1,10 @@ import numpy as np def configure(context): - # context.stage("data.hts.entd.filtered") - context.stage("data.hts.entd.cleaned") context.stage("data.hts.entd.filtered") - def execute(context): - df_households, df_persons, df_trips = context.stage("data.hts.entd.cleaned") + df_households, df_persons, df_trips = context.stage("data.hts.entd.filtered") # ENTD defines multiple weights. For comparison with EGT we keep them in the # data set for the previous stages. 
In this one we override the weight, diff --git a/synthesis/population/matched.py b/synthesis/population/matched.py index ef136b2d..a41b2a48 100644 --- a/synthesis/population/matched.py +++ b/synthesis/population/matched.py @@ -179,7 +179,7 @@ def execute(context): df_target["any_cars"] = df_target["number_of_vehicles"] > 0 df_source["any_cars"] = df_source["number_of_vehicles"] > 0 - columns = ["urban_type","sex", "any_cars", "age_class", "socioprofessional_class"] + columns = ["urban_type", "sex", "any_cars", "age_class", "socioprofessional_class"] if "income_class" in df_source: columns += ["income_class"] # Perform statistical matching From 529204885ff7e888ab7ff6f604d36498d2e0b339 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20H=C3=B6rl?= Date: Mon, 18 Mar 2024 12:02:25 +0100 Subject: [PATCH 08/15] make matching attributes configurable --- CHANGELOG.md | 2 ++ synthesis/population/matched.py | 23 +++++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 97c4914f..d5fca840 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ **Under development** +- feat: make statistical matching attribute list configurable +- feat: add urban type classification (unité urbaine) - feat: functionality to make use of INSEE population projection data - update: don't remove households with people not living/studying in Île-de-France anymore to be more consistent with other use cases - fix bug where always one household_id existed twice diff --git a/synthesis/population/matched.py b/synthesis/population/matched.py index a41b2a48..4eec6b0c 100644 --- a/synthesis/population/matched.py +++ b/synthesis/population/matched.py @@ -19,10 +19,15 @@ "entd": data.hts.entd.cleaned.calculate_income_class, } +DEFAULT_MATCHING_ATTRIBUTES = [ + "urban_type", "sex", "any_cars", "age_class", "socioprofessional_class" +] + def configure(context): context.config("processes") context.config("random_seed") context.config("matching_minimum_observations", 20) + context.config("matching_attributes", DEFAULT_MATCHING_ATTRIBUTES) context.stage("synthesis.population.sampled") context.stage("synthesis.population.income") @@ -165,22 +170,24 @@ def execute(context): df_target = context.stage("synthesis.population.sampled") + columns = context.config("matching_attributes") + # Define matching attributes AGE_BOUNDARIES = [14, 29, 44, 59, 74, 1000] - df_target["age_class"] = np.digitize(df_target["age"], AGE_BOUNDARIES, right = True) - df_source["age_class"] = np.digitize(df_source["age"], AGE_BOUNDARIES, right = True) + + if "age_class" in columns: + df_target["age_class"] = np.digitize(df_target["age"], AGE_BOUNDARIES, right = True) + df_source["age_class"] = np.digitize(df_source["age"], AGE_BOUNDARIES, right = True) - if "income_class" in df_source: + if "income_class" in columns: df_income = context.stage("synthesis.population.income")[["household_id", "household_income"]] df_target = pd.merge(df_target, df_income) df_target["income_class"] = INCOME_CLASS[hts](df_target) - df_target["any_cars"] = df_target["number_of_vehicles"] > 0 - df_source["any_cars"] = df_source["number_of_vehicles"] > 0 - - columns = ["urban_type", "sex", "any_cars", "age_class", "socioprofessional_class"] - if "income_class" in df_source: columns += ["income_class"] + if "any_cars" in columns: + df_target["any_cars"] = df_target["number_of_vehicles"] > 0 + df_source["any_cars"] = df_source["number_of_vehicles"] > 0 # Perform statistical matching df_source = df_source.rename(columns = { 
"person_id": "hts_id" }) From adb97ad08c4cfaa220222e30c5c819504f1422ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20H=C3=B6rl?= Date: Mon, 18 Mar 2024 12:24:42 +0100 Subject: [PATCH 09/15] monkey patching openpyxl to read excel sheet --- data/spatial/urban_type.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/data/spatial/urban_type.py b/data/spatial/urban_type.py index 5a506ee9..46ac3192 100644 --- a/data/spatial/urban_type.py +++ b/data/spatial/urban_type.py @@ -3,6 +3,20 @@ import zipfile import numpy as np +# START Money patching openpyxl to parse INSEE file +from openpyxl.styles.colors import WHITE, RGB +__old_rgb_set__ = RGB.__set__ + +def __rgb_set_fixed__(self, instance, value): + try: + __old_rgb_set__(self, instance, value) + except ValueError as e: + if e.args[0] == 'Colors must be aRGB hex values': + __old_rgb_set__(self, instance, WHITE) + +RGB.__set__ = __rgb_set_fixed__ +# END Monkey patching openpyxl + # Loads the input data for the urban type (unité urbain) def configure(context): @@ -10,12 +24,12 @@ def configure(context): context.config("data_path") context.config("urban_type_path", "uu/UU2020_au_01-01-2023.zip") - context.config("urban_type_file", "UU2020_au_01-01-2023.xlsx") def execute(context): with zipfile.ZipFile("{}/{}".format( context.config("data_path"), context.config("urban_type_path"))) as archive: - with archive.open(context.config("urban_type_file")) as f: + assert len(archive.filelist) == 1 + with archive.open(archive.filelist[0]) as f: df = pd.read_excel(f, sheet_name = "Composition_communale", skiprows = 5) df = df[["CODGEO", "STATUT_2017"]].copy() @@ -45,3 +59,9 @@ def execute(context): df = df[df["commune_id"].isin(requested_communes)] return df + +def validate(context): + if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("urban_type_path"))): + raise RuntimeError("Urban type data is not available") + + return os.path.getsize("%s/%s" % (context.config("data_path"), context.config("urban_type_path"))) From ed7c9899b16559483b83a30bb9a5d97d2b7572db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20H=C3=B6rl?= Date: Mon, 18 Mar 2024 12:32:54 +0100 Subject: [PATCH 10/15] make configurable --- data/census/cleaned.py | 19 ++++++++++++------- data/hts/entd/filtered.py | 2 +- data/hts/hts.py | 2 +- synthesis/population/matched.py | 6 ++++++ 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/data/census/cleaned.py b/data/census/cleaned.py index f5fe524f..e7a1f45c 100644 --- a/data/census/cleaned.py +++ b/data/census/cleaned.py @@ -12,11 +12,12 @@ def configure(context): context.stage("data.census.raw") context.stage("data.spatial.codes") - context.stage("data.spatial.urban_type") + + if context.config("use_urban_type", False): + context.stage("data.spatial.urban_type") def execute(context): df = context.stage("data.census.raw") - df_urban_type = context.stage("data.spatial.urban_type") # Construct household IDs for persons with NUMMI != Z df_household_ids = df[["CANTVILLE", "NUMMI"]] @@ -98,11 +99,15 @@ def execute(context): # Consumption units df = pd.merge(df, hts.calculate_consumption_units(df), on = "household_id") - # Impute urban type - df = pd.merge(df, df_urban_type, on = "commune_id", how = "left") - df.loc[df["commune_id"] == "undefined", "urban_type"] = "none" - df["commune_id"] = df["commune_id"].astype("category") - assert ~np.any(df["urban_type"].isna()) + + if context.config("use_urban_type", False): + df_urban_type = 
context.stage("data.spatial.urban_type") + + # Impute urban type + df = pd.merge(df, df_urban_type, on = "commune_id", how = "left") + df.loc[df["commune_id"] == "undefined", "urban_type"] = "none" + df["commune_id"] = df["commune_id"].astype("category") + assert ~np.any(df["urban_type"].isna()) return df[[ "person_id", "household_id", "weight", diff --git a/data/hts/entd/filtered.py b/data/hts/entd/filtered.py index d7c8acf7..9fc4793c 100644 --- a/data/hts/entd/filtered.py +++ b/data/hts/entd/filtered.py @@ -33,7 +33,7 @@ def execute(context): df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])] # Finish up - df_households = df_households[hts.HOUSEHOLD_COLUMNS + ["income_class"]] + df_households = df_households[hts.HOUSEHOLD_COLUMNS + ["urban_type", "income_class"]] df_persons = df_persons[hts.PERSON_COLUMNS] df_trips = df_trips[hts.TRIP_COLUMNS + ["routed_distance"]] diff --git a/data/hts/hts.py b/data/hts/hts.py index 77858c53..d8c26dd2 100644 --- a/data/hts/hts.py +++ b/data/hts/hts.py @@ -222,7 +222,7 @@ def calculate_consumption_units(df_persons): HOUSEHOLD_COLUMNS = [ "household_id", "household_weight", "household_size", "number_of_vehicles", "number_of_bikes", "departement_id", - "consumption_units", "urban_type" # "income_class" + "consumption_units", # "income_class" ] PERSON_COLUMNS = [ diff --git a/synthesis/population/matched.py b/synthesis/population/matched.py index 4eec6b0c..a76aaeb2 100644 --- a/synthesis/population/matched.py +++ b/synthesis/population/matched.py @@ -171,6 +171,12 @@ def execute(context): df_target = context.stage("synthesis.population.sampled") columns = context.config("matching_attributes") + + try: + default_index = columns.index("*default*") + del columns[default_index] + columns.insert(default_index, DEFAULT_MATCHING_ATTRIBUTES) + except ValueError: pass # Define matching attributes AGE_BOUNDARIES = [14, 29, 44, 59, 74, 1000] From 4acafe4e54a64d6c7e7ada0d087bc4817400f4fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20H=C3=B6rl?= Date: Mon, 18 Mar 2024 12:45:48 +0100 Subject: [PATCH 11/15] add test data to ENTD --- tests/testdata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testdata.py b/tests/testdata.py index 59e5a1cb..905d2284 100644 --- a/tests/testdata.py +++ b/tests/testdata.py @@ -301,7 +301,7 @@ def create(output_path): "De 1 000", "De 1 200", "De 1 500", "De 1800", "De 2 000", "De 2 500", "De 3 000", "De 4 000", "De 6 000", "10 000" - ]) + ]), numcom_UU2010 = random.choice(["B", "C", "I", "R"]) )) for person_index in range(HTS_HOUSEHOLD_MEMBERS): From f9eca48b113b362cfeaf0f415eab374365e4949a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20H=C3=B6rl?= Date: Mon, 18 Mar 2024 12:50:51 +0100 Subject: [PATCH 12/15] add documentation --- docs/population.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/docs/population.md b/docs/population.md index 666d1b1a..774b7813 100644 --- a/docs/population.md +++ b/docs/population.md @@ -313,3 +313,25 @@ config: # [...] projection_scenario: 00_central ``` + +### Urban type + +The pipeline allows to work with INSEE's urban type classification (unité urbaine) that distinguishes municipalities in *center cities*, *suburbs*, *isolated cities*, and unclassified ones. To impute the data (currently only for some HTS), activate it via the configuration: + +```yaml +config: + # [...] 
+ use_urban_type: true +``` + +In order to make use of it for activity chain matching, you can set a custom list of matching attributes like so: + +```yaml +config: + # [...] + matching_attributes: ["urban_type", "*default*"] +``` + +The `*default*` trigger will be replaced by the default list of matching attributes. + +Note that not all HTS implement the urban type, so matching may not work with some implementations. Most of them, however, contain the data; we just need to update the code to read it in. From d21ccc7c27b28f8ead2f3e62c01720abc3727689 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20H=C3=B6rl?= Date: Mon, 18 Mar 2024 12:59:10 +0100 Subject: [PATCH 13/15] update tests --- data/census/cleaned.py | 23 +++++++++++++---------- synthesis/population/matched.py | 2 +- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/data/census/cleaned.py b/data/census/cleaned.py index e7a1f45c..789d0adb 100644 --- a/data/census/cleaned.py +++ b/data/census/cleaned.py @@ -99,9 +99,19 @@ def execute(context): # Consumption units df = pd.merge(df, hts.calculate_consumption_units(df), on = "household_id") + df = df[[ + "person_id", "household_id", "weight", + "iris_id", "commune_id", "departement_id", + "age", "sex", "couple", + "commute_mode", "employed", + "studies", "number_of_vehicles", "household_size", + "consumption_units", "socioprofessional_class" + ]] - if context.config("use_urban_type", False): - df_urban_type = context.stage("data.spatial.urban_type") + if context.config("use_urban_type"): + df_urban_type = context.stage("data.spatial.urban_type")[[ + "commune_id", "urban_type" + ]] # Impute urban type df = pd.merge(df, df_urban_type, on = "commune_id", how = "left") @@ -109,11 +119,4 @@ def execute(context): df["commune_id"] = df["commune_id"].astype("category") assert ~np.any(df["urban_type"].isna()) - return df[[ - "person_id", "household_id", "weight", - "iris_id", "commune_id", "departement_id", - "age", "sex", "couple", - "commute_mode", "employed", - "studies", "number_of_vehicles", "household_size", - "consumption_units", "socioprofessional_class", "urban_type" - ]] + return df diff --git a/synthesis/population/matched.py b/synthesis/population/matched.py index a76aaeb2..6edcf62f 100644 --- a/synthesis/population/matched.py +++ b/synthesis/population/matched.py @@ -20,7 +20,7 @@ } DEFAULT_MATCHING_ATTRIBUTES = [ - "urban_type", "sex", "any_cars", "age_class", "socioprofessional_class" + "sex", "any_cars", "age_class", "socioprofessional_class" ] def configure(context): From 4d4e5bfcfbcc77cd19fbcf4f179adaaae453dc5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20H=C3=B6rl?= Date: Mon, 18 Mar 2024 17:36:06 +0100 Subject: [PATCH 14/15] testing and egt
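Among the fixes here, the *default* placeholder in matching_attributes is now expanded by slice assignment, which splices the default attributes in place instead of inserting the whole list as a single nested element; a minimal illustration with the default list from this patch:

    DEFAULT_MATCHING_ATTRIBUTES = [
        "sex", "any_cars", "age_class", "socioprofessional_class",
        "departement_id"
    ]

    columns = ["urban_type", "*default*"]
    default_index = columns.index("*default*")
    columns[default_index:default_index + 1] = DEFAULT_MATCHING_ATTRIBUTES

    assert columns == ["urban_type", "sex", "any_cars", "age_class",
        "socioprofessional_class", "departement_id"]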
--- data/hts/egt/cleaned.py | 21 +++++++++++++++++++++ data/hts/egt/filtered.py | 13 +++++++--- data/spatial/urban_type.py | 28 ++++++++++++-------- synthesis/population/matched.py | 18 ++++++++----- tests/test_determinism.py | 12 +++++++-- tests/test_pipeline.py | 37 +++++++++++++++++++++----- tests/testdata.py | 46 +++++++++++++++++++++++++-------- 7 files changed, 134 insertions(+), 41 deletions(-) diff --git a/data/hts/egt/cleaned.py b/data/hts/egt/cleaned.py index ec7530af..197da72b 100644 --- a/data/hts/egt/cleaned.py +++ b/data/hts/egt/cleaned.py @@ -10,6 +10,9 @@ def configure(context): context.stage("data.hts.egt.raw") + if context.config("use_urban_type", False): + context.stage("data.spatial.urban_type") + INCOME_CLASS_BOUNDS = [800, 1200, 1600, 2000, 2400, 3000, 3500, 4500, 5500, 1e6] PURPOSE_MAP = { @@ -111,6 +114,24 @@ def execute(context): df_households.loc[df_households["income_class"].isin([10.0, 11.0, np.nan]), "income_class"] = -1 df_households["income_class"] = df_households["income_class"].astype(int) + # Impute urban type + if context.config("use_urban_type"): + df_urban_type = context.stage("data.spatial.urban_type")[[ + "commune_id", "urban_type" + ]] + + # Household municipality + df_households["commune_id"] = df_households["RESCOMM"].astype("category") + df_persons = pd.merge(df_persons, df_households[["household_id", "commune_id"]], how = "left") + assert np.all(~df_persons["commune_id"].isna()) + + # Impute urban type + df_persons = pd.merge(df_persons, df_urban_type, on = "commune_id", how = "left") + df_persons["urban_type"] = df_persons["urban_type"].fillna("none").astype("category") + + df_households.drop(columns = ["commune_id"]) + df_persons.drop(columns = ["commune_id"]) + # Trip purpose df_trips["following_purpose"] = "other" df_trips["preceding_purpose"] = "other" diff --git a/data/hts/egt/filtered.py b/data/hts/egt/filtered.py index 62cf1b8f..c2286055 100644 --- a/data/hts/egt/filtered.py +++ b/data/hts/egt/filtered.py @@ -12,7 +12,6 @@ def configure(context): def execute(context): df_codes = context.stage("data.spatial.codes") - assert (df_codes["region_id"] == 11).all() # Otherwise EGT doesn't make sense df_households, df_persons, df_trips = context.stage("data.hts.egt.cleaned") @@ -39,9 +38,15 @@ def execute(context): df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])] # Finish up - df_households = df_households[hts.HOUSEHOLD_COLUMNS + ["income_class"] + ["egt_household_id"]] - df_persons = df_persons[hts.PERSON_COLUMNS + ["egt_household_id", "egt_person_id"]] - df_trips = df_trips[hts.TRIP_COLUMNS + ["euclidean_distance"] + ["egt_household_id", "egt_person_id", "egt_trip_id"]] + household_columns = hts.HOUSEHOLD_COLUMNS + ["income_class"] + ["egt_household_id"] + df_households = df_households[household_columns] + + person_columns = hts.PERSON_COLUMNS + ["egt_household_id", "egt_person_id"] + if "urban_type" in df_persons: person_columns.append("urban_type") + df_persons = df_persons[person_columns] + + trip_columns = hts.TRIP_COLUMNS + ["euclidean_distance"] + ["egt_household_id", "egt_person_id", "egt_trip_id"] + df_trips = df_trips[trip_columns] hts.check(df_households, df_persons, df_trips) diff --git a/data/spatial/urban_type.py b/data/spatial/urban_type.py index 46ac3192..7e5c0c26 100644 --- a/data/spatial/urban_type.py +++ b/data/spatial/urban_type.py @@ -23,7 +23,7 @@ def configure(context): context.stage("data.spatial.municipalities") context.config("data_path") - context.config("urban_type_path", "uu/UU2020_au_01-01-2023.zip") + context.config("urban_type_path", "urban_type/UU2020_au_01-01-2023.zip") def execute(context): with zipfile.ZipFile("{}/{}".format( context.config("data_path"), context.config("urban_type_path"))) as archive: @@ -33,7 +33,7 @@ def execute(context): df = pd.read_excel(f, sheet_name = "Composition_communale", skiprows = 5) df = df[["CODGEO", "STATUT_2017"]].copy() - df = df.set_axis(["commune_id", "type_uu"], axis = "columns") + df = df.set_axis(["commune_id", "urban_type"], axis = "columns") # Cities that have districts are not detailed in the UU file, only the whole city is mentioned # However the municipalities file details the districts with their respective INSEE codes cities_with_districts = {"75056": [str(75101 + i) for i in (range(20))], # Paris "69123": [str(69001 + i) for i in range(9)], # Lyon "13055": [str(13201 + i) for i in range(15)]} # Marseilles # Replacing each line of the UU file corresponding to a city with districts by multiple lines, one for each district for city_code in 
cities_with_districts: - uu_type = df[df["commune_id"] == city_code].iloc[0].loc["type_uu"] - df.drop(df[df["commune_id"] == city_code].index, inplace=True) - new_lines = {"commune_id": [district_id for district_id in cities_with_districts[city_code]], - "type_uu": [uu_type for i in range(len(cities_with_districts[city_code]))]} - df = pd.concat([df, pd.DataFrame.from_dict(new_lines)]) + base_type = df[df["commune_id"] == city_code].iloc[0]["urban_type"] + replacement_codes = cities_with_districts[city_code] + + df = pd.concat([df, pd.DataFrame({ + "commune_id": replacement_codes, + "urban_type": [base_type] * len(replacement_codes) + })]) + + df = df[~df["commune_id"].isin(cities_with_districts.keys())] # Clean unités urbaines - df["type_uu"] = df["type_uu"].replace({"B":"suburb","C":"central_city","I":"isolated_city","H":"none"}) - assert np.all(~df["type_uu"].isna()) - df["type_uu"] = df["type_uu"].astype("category") + df["urban_type"] = df["urban_type"].replace({"B":"suburb","C":"central_city","I":"isolated_city","H":"none"}) + assert np.all(~df["urban_type"].isna()) + df["urban_type"] = df["urban_type"].astype("category") df_municipalities = context.stage("data.spatial.municipalities") requested_communes = set(df_municipalities["commune_id"].unique()) df = df[df["commune_id"].isin(requested_communes)] - + + assert len(df["commune_id"].unique()) == len(df) + return df def validate(context): diff --git a/synthesis/population/matched.py b/synthesis/population/matched.py index 6edcf62f..18647e72 100644 --- a/synthesis/population/matched.py +++ b/synthesis/population/matched.py @@ -20,7 +20,8 @@ } DEFAULT_MATCHING_ATTRIBUTES = [ - "sex", "any_cars", "age_class", "socioprofessional_class" + "sex", "any_cars", "age_class", "socioprofessional_class", + "departement_id" ] def configure(context): @@ -117,6 +118,9 @@ def statistical_matching(progress, df_source, source_identifier, weight, df_targ progress.update(np.count_nonzero(unassigned_mask)) + if np.count_nonzero(unassigned_mask) > 0: + raise RuntimeError("Some target observations could not be matched. 
Minimum observations configured too high?") + assert np.count_nonzero(unassigned_mask) == 0 assert np.count_nonzero(assigned_indices == -1) == 0 @@ -174,8 +178,7 @@ def execute(context): try: default_index = columns.index("*default*") - del columns[default_index] - columns.insert(default_index, DEFAULT_MATCHING_ATTRIBUTES) + columns[default_index:default_index + 1] = DEFAULT_MATCHING_ATTRIBUTES except ValueError: pass # Define matching attributes @@ -199,9 +202,12 @@ def execute(context): df_source = df_source.rename(columns = { "person_id": "hts_id" }) for column in columns: - assert column in df_source - assert column in df_target - + if not column in df_source: + raise RuntimeError("Attribute not available in source (HTS) for matching: {}".format(column)) + + if not column in df_target: + raise RuntimeError("Attribute not available in target (census) for matching: {}".format(column)) + df_assignment, levels = parallel_statistical_matching( context, df_source, "hts_id", "person_weight", diff --git a/tests/test_determinism.py b/tests/test_determinism.py index ba02a407..bf43e5ed 100644 --- a/tests/test_determinism.py +++ b/tests/test_determinism.py @@ -54,7 +54,11 @@ def _test_determinism(index, data_path, tmpdir): regions = [10, 11], sampling_rate = 1.0, hts = "entd", random_seed = 1000, processes = 1, secloc_maximum_iterations = 10, - maven_skip_tests = True + maven_skip_tests = True, + matching_attributes = [ + "sex", "any_cars", "age_class", "socioprofessional_class", + "income_class", "departement_id" + ] ) stages = [ @@ -111,7 +115,11 @@ def _test_determinism_matsim(index, data_path, tmpdir): regions = [10, 11], sampling_rate = 1.0, hts = "entd", random_seed = 1000, processes = 1, secloc_maximum_iterations = 10, - maven_skip_tests = True + maven_skip_tests = True, + matching_attributes = [ + "sex", "any_cars", "age_class", "socioprofessional_class", + "income_class", "departement_id" + ] ) stages = [ diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 430a277f..fa8fddd8 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -2,6 +2,7 @@ import os import hashlib from . 
import testdata +import pandas as pd def test_data(tmpdir): data_path = str(tmpdir.mkdir("data")) @@ -34,7 +35,7 @@ def test_data(tmpdir): assert os.path.isfile("%s/ile_de_france_hts_trips.csv" % output_path) assert os.path.isfile("%s/ile_de_france_sirene.gpkg" % output_path) -def run_population(tmpdir, hts, mode_choice): +def run_population(tmpdir, hts, update = {}): data_path = str(tmpdir.mkdir("data")) testdata.create(data_path) @@ -45,9 +46,9 @@ def run_population(tmpdir, hts, mode_choice): regions = [10, 11], sampling_rate = 1.0, hts = hts, random_seed = 1000, processes = 1, secloc_maximum_iterations = 10, - maven_skip_tests = True, - mode_choice = mode_choice + maven_skip_tests = True ) + config.update(update) stages = [ dict(descriptor = "synthesis.output"), @@ -62,11 +63,33 @@ def run_population(tmpdir, hts, mode_choice): assert os.path.isfile("%s/ile_de_france_trips.gpkg" % output_path) assert os.path.isfile("%s/ile_de_france_meta.json" % output_path) + assert 2235 == len(pd.read_csv("%s/ile_de_france_activities.csv" % output_path, usecols = ["household_id"], sep = ";")) + assert 447 == len(pd.read_csv("%s/ile_de_france_persons.csv" % output_path, usecols = ["household_id"], sep = ";")) + assert 149 == len(pd.read_csv("%s/ile_de_france_households.csv" % output_path, usecols = ["household_id"], sep = ";")) + def test_population_with_entd(tmpdir): - run_population(tmpdir, "entd", False) + run_population(tmpdir, "entd") + +def test_population_with_egt(tmpdir): + run_population(tmpdir, "egt") def test_population_with_mode_choice(tmpdir): - run_population(tmpdir, "entd", True) + run_population(tmpdir, "entd", { "mode_choice": True }) + +def test_population_with_urban_type(tmpdir): + run_population(tmpdir, "entd", { + "use_urban_type": True, + "matching_attributes": [ + "urban_type", "*default*" + ], + "matching_minimum_observations": 5 + }) -#def test_population_with_egt(tmpdir): -# run_population(tmpdir, "entd") # TODO: Fix this! 
+def test_population_with_urban_type_and_egt(tmpdir): + run_population(tmpdir, "egt", { + "use_urban_type": True, + "matching_attributes": [ + "urban_type", "*default*" + ], + "matching_minimum_observations": 5 + }) diff --git a/tests/testdata.py b/tests/testdata.py index 905d2284..1db0d225 100644 --- a/tests/testdata.py +++ b/tests/testdata.py @@ -301,7 +301,7 @@ def create(output_path): "De 1 000", "De 1 200", "De 1 500", "De 1800", "De 2 000", "De 2 500", "De 3 000", "De 4 000", "De 6 000", "10 000" - ]), numcom_UU2010 = random.choice(["B", "C", "I", "R"]) + ]), numcom_UU2010 = ["B", "C", "I", "R"][household_index % 4] )) for person_index in range(HTS_HOUSEHOLD_MEMBERS): @@ -388,8 +388,9 @@ def create(output_path): trips = [] ) + person_index = 0 for household_index in range(HTS_HOUSEHOLDS): - household_id = household_index + household_id = household_index * 1000 + 50 municipality = random.choice(df["municipality"].unique()) region = df[df["municipality"] == municipality]["region"].values[0] @@ -402,8 +403,7 @@ def create(output_path): MNP = 3, REVENU = random.randint(12) )) - for person_index in range(HTS_HOUSEHOLD_MEMBERS): - person_id = household_id * 1000 + person_index + for person_id in range(1, HTS_HOUSEHOLD_MEMBERS + 1): studies = random.random_sample() < 0.3 data["persons"].append(dict( @@ -421,7 +421,7 @@ def create(output_path): work_region = df[df["municipality"] == work_municipality]["region"].values[0] work_department = df[df["municipality"] == work_municipality]["department"].values[0] - purpose = 21 if studies else 11 + purpose = 4 if studies else 2 mode = random.choice([1, 2, 3, 5, 7]) origin_hour = 8 @@ -429,7 +429,7 @@ def create(output_path): if person_index % 100 == 0: # Testing proper diffusion of plan times - orign_hour = 0 + origin_hour = 0 origin_minute = 12 data["trips"].append(dict( @@ -442,18 +442,27 @@ def create(output_path): data["trips"].append(dict( NQUEST = household_id, NP = person_id, - ND = 1, ORDEP = work_department, DESTDEP = home_department, + ND = 2, ORDEP = work_department, DESTDEP = home_department, ORH = 8, ORM = 0, DESTH = 9, DESTM = 0, ORCOMM = work_municipality, DESTCOMM = home_municipality, DPORTEE = 3, MODP_H7 = 2, - DESTMOT_H9 = 31, ORMOT_H9 = purpose + DESTMOT_H9 = 5, ORMOT_H9 = purpose )) data["trips"].append(dict( NQUEST = household_id, NP = person_id, - ND = 2, ORDEP = home_department, DESTDEP = home_department, + ND = 3, ORDEP = home_department, DESTDEP = home_department, ORH = 17, ORM = 0, DESTH = 18, DESTM = 0, ORCOMM = home_municipality, DESTCOMM = home_municipality, DPORTEE = 3, MODP_H7 = 2, - DESTMOT_H9 = 1, ORMOT_H9 = 31 + DESTMOT_H9 = 1, ORMOT_H9 = 5 + )) + + # Tail + data["trips"].append(dict( + NQUEST = household_id, NP = person_id, + ND = 4, ORDEP = home_department, DESTDEP = home_department, + ORH = 22, ORM = 0, DESTH = 21, DESTM = 0, ORCOMM = home_municipality, + DESTCOMM = home_municipality, DPORTEE = 3, MODP_H7 = 2, + DESTMOT_H9 = 5, ORMOT_H9 = 1 )) os.mkdir("%s/egt_2010" % output_path) @@ -657,7 +666,22 @@ def create(output_path): df_sirene_geoloc.to_csv("%s/sirene/GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.zip" % output_path, index = False, sep=";", compression={'method': 'zip', 'archive_name': 'GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.csv'}) - + # Data set: Urban type + print("Creating urban type ...") + df_urban_type = df_codes[["DEPCOM"]].copy().rename(columns = { "DEPCOM": "CODGEO" }) + df_urban_type = df_urban_type.drop_duplicates() + df_urban_type["STATUT_2017"] = 
[["B", "C", "I", "H"][k % 4] for k in range(len(df_urban_type))] + + df_urban_type = pd.concat([df_urban_type, pd.DataFrame({ + "CODGEO": ["75056", "69123", "13055"], + "STATUT_2017": ["C", "C", "C"] + })]) + + os.mkdir("%s/urban_type" % output_path) + with zipfile.ZipFile("%s/urban_type/UU2020_au_01-01-2023.zip" % output_path, "w") as archive: + with archive.open("UU2020_au_01-01-2023.xlsx", "w") as f: + df_urban_type.to_excel(f, startrow = 5, sheet_name = "Composition_communale", index = False) + # Data set: OSM # We add add a road grid of 500m print("Creating OSM ...") From 6ab040c4542888bf4f05908690a834c10706bdbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20H=C3=B6rl?= Date: Mon, 18 Mar 2024 17:40:27 +0100 Subject: [PATCH 15/15] update docs --- docs/population.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/population.md b/docs/population.md index 774b7813..8717434c 100644 --- a/docs/population.md +++ b/docs/population.md @@ -335,3 +335,9 @@ config: The `*default*` trigger will be replaced by the default list of matching attributes. Note that not all HTS implement the urban type, so matching may not work with some implementations. Most of them, however, contain the data, we just need to update the code to read them in. + +To make use of the urban type, the following data is needed: +- [Download the urban type data from INSEE](https://www.insee.fr/fr/information/4802589). The pipeline is currently compatible with the 2023 data set (referencing 2020 boundaries). +- Put the downloaded *zip* file into `data/urban_type`, so you will have the file `data/urban_type/UU2020_au_01-01-2023.zip` + +Then, you should be able to run the pipeline with the configuration explained above.