diff --git a/analysis/grid/comparison_flow_volume.py b/analysis/grid/comparison_flow_volume.py new file mode 100644 index 00000000..b2506ea1 --- /dev/null +++ b/analysis/grid/comparison_flow_volume.py @@ -0,0 +1,116 @@ +import pandas as pd +import geopandas as gpd + +import plotly.express as px + + +SAMPLING_RATE = 0.05 + +def configure(context): + + if not context.config("analysis_from_file",False) : + context.stage("synthesis.population.trips") + context.stage("synthesis.population.spatial.locations") + context.stage("synthesis.population.enriched") + context.stage("data.spatial.departments") + + context.config("comparison_file_prefix",None) + context.config("output_prefix", "ile_de_france_") + context.config("output_formats", ["csv", "gpkg"]) + context.config("output_path") + context.config("data_path") + +def stat_grid(df_trips,df_locations,df_persons,df_grid): + + # Write spatial trips + df_spatial = pd.merge(df_trips, df_locations[[ + "person_id", "activity_index", "geometry" + ]].rename(columns = { + "activity_index": "following_activity_index", + }), how = "left", on = ["person_id", "following_activity_index"]) + df_spatial = pd.merge(df_spatial,df_persons,how = "left", on = ["person_id",]) + df_spatial = gpd.GeoDataFrame(df_spatial, crs = "EPSG:2154").to_crs("4326") + + df_stats = gpd.sjoin(df_grid,df_spatial,how="left") + return df_stats[['id_carr_1km', 'geometry','person_id', 'following_purpose', 'household_id', 'age']] +def execute(context): + + figures = { + "Yrs:0-10":{"min_age": 0, "max_age": 10,}, + "Yrs:11-14":{"min_age": 11, "max_age": 14,}, + "Yrs:15-18":{"min_age": 15, "max_age": 17,}, + "Yrs:18-25":{"min_age": 18, "max_age": 25,}, + "Yrs:25-50":{"min_age": 26, "max_age": 50,}, + "Yrs:50-65":{"min_age": 51, "max_age": 65,}, + "Yrs:65-75":{"min_age": 66, "max_age": 75,}, + "Yrs:75+":{"min_age": 76, "max_age": 110,},} + comparison_file = context.config("output_prefix") if context.config("comparison_file_prefix") is None else 
context.config("comparison_file_prefix") + + if not context.config("analysis_from_file"): + print("Récupération simu données ...") + # from simulation cache + df_trips = context.stage("synthesis.population.trips") + df_persons = context.stage("synthesis.population.enriched")[["person_id", "household_id","age"]] + df_locations = context.stage("synthesis.population.spatial.locations")[[ + "person_id", "activity_index", "geometry" + ]] + df_trips["preceding_activity_index"] = df_trips["trip_index"] + df_trips["following_activity_index"] = df_trips["trip_index"] + 1 + + else : + # from file trips, activites and person + print("Récupération données ...") + df_trips = pd.read_csv(f'{context.config("output_path")}/{context.config("output_prefix")}trips.csv',sep=';')[["person_id","trip_index" ,"following_activity_index","following_purpose"]] + df_locations = gpd.read_parquet(f'{context.config("output_path")}/{context.config("output_prefix")}activities.geoparquet') if "geoparquet" in context.config("output_formats") else gpd.read_file(f'{context.config("output_path")}/{context.config("output_prefix")}activities.gpkg') + df_persons = pd.read_csv(f'{context.config("output_path")}/{context.config("output_prefix")}persons.csv',sep=';')[["person_id", "household_id","age"]] + print("Récupération comp données ...") + df_trips_comp = pd.read_csv(f'{context.config("output_path")}/{comparison_file}trips.csv',sep=';')[["person_id","trip_index" ,"following_activity_index","following_purpose"]] + df_locations_comp = gpd.read_parquet(f'{context.config("output_path")}/{comparison_file}activities.geoparquet') if "geoparquet" in context.config("output_formats") else gpd.read_file(f'{context.config("output_path")}/{comparison_file}activities.gpkg') + df_persons_comp = pd.read_csv(f'{context.config("output_path")}/{comparison_file}persons.csv',sep=';')[["person_id", "household_id","age"]] + + list_purpose = list(df_trips["following_purpose"].unique()) + + # grid 1km of location data + 
df_departments = context.stage("data.spatial.departments") + poly_dep = df_departments.unary_union + df_grids = gpd.read_file( + f'{context.config("data_path")}/grid/grille200m_metropole.gpkg', + mask=poly_dep, + ) + df_grids = df_grids.to_crs("4326") + df_grid = df_grids[["id_carr_1km","geometry"]].dissolve(by="id_carr_1km").reset_index() + + df_stats = stat_grid(df_trips,df_locations,df_persons,df_grid) + df_grids = stat_grid(df_trips_comp,df_locations_comp,df_persons_comp,df_grid) + point = df_grid.unary_union.centroid # a changé avec ploy_dep + print("Printing grids...") + for prefix, figure in figures.items(): + df_select_age = df_stats[df_stats["age"].between(figure["min_age"],figure["max_age"])] + df_select_age = df_select_age.dissolve(by=["id_carr_1km","following_purpose"],aggfunc="count").reset_index() + df_select_age = df_select_age[~(df_select_age["geometry"].isna())] + df_select_age["following_purpose"] = df_select_age["following_purpose"].astype('str') + + df_grids_age = df_grids[df_grids["age"].between(figure["min_age"],figure["max_age"])] + df_grids_age = df_grids_age.dissolve(by=["id_carr_1km","following_purpose"],aggfunc="count").reset_index() + df_grids_age = df_grids_age[~(df_grids_age["geometry"].isna())] + df_grids_age["following_purpose"] = df_grids_age["following_purpose"].astype('str') + + for purpose in list_purpose : + df_select = df_select_age[df_select_age["following_purpose"]==purpose].rename(columns={"person_id":"count"}) + df_grids_select = df_grids_age[df_grids_age["following_purpose"]==purpose].rename(columns={"person_id":"count"}) + if context.config("output_prefix") == comparison_file : + df_select = gpd.sjoin(df_select,df_grid,how='right',predicate="contains").fillna(0) + df_select = df_select[df_select["count"] != 0] + fig = px.choropleth_mapbox(df_select,geojson=df_select.geometry,locations=df_select.index,color="count", opacity= 0.7,color_continuous_scale='reds', + mapbox_style = 'open-street-map',center=dict(lat= 
point.y,lon=point.x),title=f"Localisation flow distribution for {prefix} group with {purpose} purpose") + fig.write_html(f'{context.config("output_path")}/{context.config("output_prefix")}{prefix}_{purpose}.html') + else : + df_grids_select = gpd.sjoin(df_grids_select,df_grid,how='right',predicate="contains").fillna(0) + df_select = gpd.sjoin(df_select,df_grids_select.drop(columns=[ 'index_left']),how='right',predicate="contains").rename(columns={"count_left":"volume_studied_simu","count_right":"volume_compared_simu"}).fillna(0) + df_select["volume_difference"] = df_select["volume_studied_simu"] - df_select["volume_compared_simu"] + df_select = df_select[(df_select["volume_studied_simu"] != 0 )| (df_select["volume_compared_simu"] != 0)] + df_select["pourcentage_vol"] = df_select["volume_difference"] / df_select["volume_compared_simu"] + px.choropleth_mapbox(df_select,geojson=df_select.geometry,locations=df_select.index,color="volume_difference", opacity= 0.7,color_continuous_scale="picnic", color_continuous_midpoint= 0,hover_name="id_carr_1km_right", hover_data=["volume_studied_simu", "volume_compared_simu","pourcentage_vol"], + mapbox_style = 'open-street-map',center=dict(lat= point.y,lon=point.x),title=f"Comparison flow distribution with previous simulation for {prefix} group with {purpose} purpose").write_html(f'{context.config("output_path")}/{context.config("output_prefix")}{prefix}_{purpose}.html') + + \ No newline at end of file diff --git a/data/bpe/cleaned.py b/data/bpe/cleaned.py index e425e3b4..30e1cad3 100644 --- a/data/bpe/cleaned.py +++ b/data/bpe/cleaned.py @@ -57,6 +57,9 @@ def execute(context): df["activity_type"] = df["activity_type"].astype("category") + #Add + df = df.rename(columns={"TYPEQU":"education_type"}) + df["weight"] = 500 # Clean coordinates df["x"] = df["LAMBERT_X"].astype(str).str.replace(",", ".").astype(float) df["y"] = df["LAMBERT_Y"].astype(str).str.replace(",", ".").astype(float) @@ -134,7 +137,7 @@ def execute(context): 
df.loc[outside_indices, "imputed"] = True # Package up data set - df = df[["enterprise_id", "activity_type", "commune_id", "imputed", "x", "y"]] + df = df[["enterprise_id", "activity_type","education_type", "commune_id", "imputed", "x", "y","weight"]] df = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.x, df.y),crs="EPSG:2154") diff --git a/data/external/education.py b/data/external/education.py new file mode 100644 index 00000000..78950ce1 --- /dev/null +++ b/data/external/education.py @@ -0,0 +1,33 @@ +import shapely.geometry as geo +import numpy as np +import pandas as pd +import geopandas as gpd + +def configure(context): + context.stage("data.bpe.cleaned") + context.stage("data.spatial.municipalities") + + context.config("data_path") + context.config("education_file", "education/education_addresses.geojson") + +def execute(context): + df_locations = context.stage("data.bpe.cleaned")[[ + "activity_type", "education_type", "commune_id","weight", "geometry" + ]] + + df_locations = df_locations[df_locations["activity_type"] == "education"] + df_locations = df_locations[["activity_type","education_type", "commune_id", "geometry"]].copy() + df_locations["fake"] = False + + df_zones = context.stage("data.spatial.municipalities") + required_communes = set(df_zones["commune_id"].unique()) + + + df_education = gpd.read_file("{}/{}".format(context.config("data_path"), context.config("education_file")))[["education_type", "commune_id","weight", "geometry"]] + df_education["fake"] = False + df_education = df_education.to_crs("2154") + df_education["activity_type"] = "education" + list_type = set(df_education["education_type"].unique()) + df_locations = pd.concat([df_locations[~(df_locations["education_type"].str.startswith(tuple(list_type)))],df_education[df_education["commune_id"].isin(required_communes)]]) + + return df_locations diff --git a/data/od/cleaned.py b/data/od/cleaned.py index c8cf81a2..e13348f1 100644 --- a/data/od/cleaned.py +++ b/data/od/cleaned.py @@ 
-58,12 +58,22 @@ def execute(context): assert not np.any(df_work["commute_mode"].isna()) + # Clean age range for education + df_education["age_range"] = np.nan + df_education.loc[df_education["AGEREV10"] <= 6, "age_range"] = "primary_school" + df_education.loc[df_education["AGEREV10"] == 11, "age_range"] = "middle_school" + df_education.loc[df_education["AGEREV10"] == 15, "age_range"] = "high_school" + df_education.loc[df_education["AGEREV10"] >= 18, "age_range"] = "higher_education" + df_education["age_range"] = df_education["age_range"].astype("category") + + assert not np.any(df_education["age_range"].isna()) + # Aggregate the flows print("Aggregating work ...") df_work = df_work.groupby(["origin_id", "destination_id", "commute_mode"])["weight"].sum().reset_index() print("Aggregating education ...") - df_education = df_education.groupby(["origin_id", "destination_id"])["weight"].sum().reset_index() + df_education = df_education.groupby(["origin_id", "destination_id","age_range"])["weight"].sum().reset_index() df_work["weight"] = df_work["weight"].fillna(0.0) df_education["weight"] = df_education["weight"].fillna(0.0) diff --git a/data/od/raw.py b/data/od/raw.py index 0b1cad4d..41bc515b 100644 --- a/data/od/raw.py +++ b/data/od/raw.py @@ -57,7 +57,8 @@ def execute(context): "COMMUNE":"str", "ARM":"str", "IPONDI":"float", - "DCETUF":"str" + "DCETUF":"str", + "AGEREV10":"int" } with zipfile.ZipFile( diff --git a/data/od/weighted.py b/data/od/weighted.py index d0defebc..f50702f6 100644 --- a/data/od/weighted.py +++ b/data/od/weighted.py @@ -13,19 +13,23 @@ def configure(context): context.stage("data.od.cleaned") context.stage("data.spatial.codes") -def fix_origins(df, commune_ids, purpose): + context.config("education_location_source","bpe") + +def fix_origins(df, commune_ids, purpose,category): existing_ids = set(np.unique(df["origin_id"])) missing_ids = commune_ids - existing_ids + categories = set(np.unique(df[category])) rows = [] for origin_id in missing_ids: 
for destination_id in commune_ids: - rows.append((origin_id, destination_id, 1.0 if origin_id == destination_id else 0.0)) + for category_name in categories : + rows.append((origin_id, destination_id, category_name, 1.0 if origin_id == destination_id else 0.0)) print("Fixing %d origins for %s" % (len(missing_ids), purpose)) return pd.concat([df, pd.DataFrame.from_records( - rows, columns = ["origin_id", "destination_id", "weight"] + rows, columns = ["origin_id", "destination_id", category, "weight"] )]).sort_values(["origin_id", "destination_id"]) def execute(context): @@ -35,25 +39,29 @@ def execute(context): # Load data df_work, df_education = context.stage("data.od.cleaned") - # Aggregate work (we do not consider different modes at the moment) - df_work = df_work[["origin_id", "destination_id", "weight"]].groupby(["origin_id", "destination_id"]).sum().reset_index() - # Add missing origins - df_work = fix_origins(df_work, commune_ids, "work") - df_education = fix_origins(df_education, commune_ids, "education") + df_work = fix_origins(df_work, commune_ids, "work","commute_mode") + df_education = fix_origins(df_education, commune_ids, "education","age_range") + # Aggregate work (we do not consider different modes at the moment) + df_work = df_work[["origin_id", "destination_id", "weight"]].groupby(["origin_id", "destination_id"]).sum().reset_index() + # Compute totals df_total = df_work[["origin_id", "weight"]].groupby("origin_id").sum().reset_index().rename({ "weight" : "total" }, axis = 1) df_work = pd.merge(df_work, df_total, on = "origin_id") - df_total = df_education[["origin_id", "weight"]].groupby("origin_id").sum().reset_index().rename({ "weight" : "total" }, axis = 1) - df_education = pd.merge(df_education, df_total, on = "origin_id") - + df_total = df_education[["origin_id","age_range", "weight"]].groupby(["origin_id","age_range"]).sum().reset_index().rename({ "weight" : "total" }, axis = 1) + df_education = pd.merge(df_education, df_total, on = 
["origin_id","age_range"]) + + if context.config("education_location_source") == 'bpe': + # Aggregate education (we do not consider different age range with bpe source) + df_education = df_education[["origin_id", "destination_id", "weight","total"]].groupby(["origin_id", "destination_id"]).sum().reset_index() # Compute weight df_work["weight"] /= df_work["total"] df_education["weight"] /= df_education["total"] del df_work["total"] del df_education["total"] - + df_education = df_education.fillna(0.0) + return df_work, df_education diff --git a/data/sirene/cleaned.py b/data/sirene/cleaned.py index 65df8612..9bef6da5 100644 --- a/data/sirene/cleaned.py +++ b/data/sirene/cleaned.py @@ -9,6 +9,7 @@ def configure(context): context.stage("data.sirene.raw_siren", ephemeral = True) context.stage("data.sirene.raw_siret", ephemeral = True) context.stage("data.spatial.codes") + context.config("exclude_no_employee", False) def execute(context): df_sirene_establishments = context.stage("data.sirene.raw_siret") @@ -22,6 +23,13 @@ def execute(context): df_sirene = df_sirene[ df_sirene["etatAdministratifEtablissement"] == "A" ].copy() + + if context.config("exclude_no_employee"): + # exclude "NN", "00", and NaN + df_sirene = df_sirene[ + df_sirene["trancheEffectifsEtablissement"].notna() + & ~(df_sirene["trancheEffectifsEtablissement"].isin(["NN", "00"])) + ].copy() # Define work place weights by person under salary .... df_sirene["minimum_employees"] = 1 # Includes "NN", "00", and NaN diff --git a/data/tiles/raw.py b/data/tiles/raw.py new file mode 100644 index 00000000..b42a5d33 --- /dev/null +++ b/data/tiles/raw.py @@ -0,0 +1,65 @@ +import os +import geopandas as gpd +import py7zr +import zipfile +import re +import numpy as np + +""" +This stage loads the raw data from the French population income, poverty and living standards in tiled data. 
+""" + +def configure(context): + context.stage("data.spatial.departments") + context.config("data_path") + context.config("tiles_path", "tiles_2019/Filosofi2019_carreaux_200m_gpkg.zip") + context.config("tiles_file", "carreaux_200m_met.gpkg") + + +def execute(context): + # Find relevant departments + df_departments = context.stage("data.spatial.departments") + print("Expecting data for {} departments".format(len(df_departments))) + poly_dep = df_departments.unary_union + if context.config("tiles_path")[-4:] == ".zip": + with zipfile.ZipFile( + "{}/{}".format(context.config("data_path"), context.config("tiles_path")) + ) as archive: + with archive.open( + re.split(r"[/.]", context.config("tiles_path"))[1] + ".7z" + ) as f: + with py7zr.SevenZipFile(f) as archive: + archive.extract(context.path(), context.config("tiles_file")) + df_tiles = gpd.read_file( + f'{context.path()}/{context.config("tiles_file")}', + mask=poly_dep, + )[["idcar_200m", "lcog_geo", "ind", "men", "geometry"]].rename( + columns={"idcar_200m": "home_location_id", "men": "weight"} + ) + else: + df_tiles = gpd.read_file( + f'{context.config("data_path")}/{context.config("tiles_path")}/{context.config("tiles_file")}', + mask=poly_dep, + )[["idcar_200m", "lcog_geo", "ind", "men", "geometry"]].rename( + columns={"idcar_200m": "home_location_id", "men": "weight"} + ) + + df_tiles["home_location_id"] = df_tiles["home_location_id"].str[14:] + df_tiles["geometry"] = df_tiles["geometry"].centroid + df_tiles["department_id"] = df_tiles["lcog_geo"].str[:2] + + for department_id in df_departments["departement_id"].values: + assert np.count_nonzero(df_tiles["department_id"] == department_id) > 0 + + return df_tiles[["home_location_id", "weight", "geometry"]] + + +def validate(context): + if not os.path.exists( + "{}/{}".format(context.config("data_path"), context.config("tiles_path")) + ): + raise RuntimeError("Tiles 2019 data is not available") + + return os.path.getsize( + 
"{}/{}".format(context.config("data_path"), context.config("tiles_path")) + ) \ No newline at end of file diff --git a/docs/population.md b/docs/population.md index f8ffeace..0575dca0 100644 --- a/docs/population.md +++ b/docs/population.md @@ -12,6 +12,7 @@ This guide will cover the following steps: - [Gathering the data](#section-data) - [Running the pipeline](#section-population) +- [Analysing synthetic population](#section-analysis) ## Gathering the data @@ -344,6 +345,54 @@ To make use of the urban type, the following data is needed: Then, you should be able to run the pipeline with the configuration explained above. +### Exclude entreprise with no employee + +The pipeline allows to exclude all entreprise without any employee (trancheEffectifsEtablissement is NA, "NN" or "00") indicated in Sirene data for working place distribution. It can be activate via this configuration : + +```yaml +config: + # [...] + exclude_no_employee: true +``` + +### INSEE 200m tiles data + +The pipeline allows to use INSEE 200m tiles data in order to locate population instead of using BAN or BDTOPO data. Population is located in the center of the tiles with the INSEE population weight for each tile. + +- In order to use of this location,[download the 200m grid data from INSEE](https://www.insee.fr/fr/statistiques/7655475?sommaire=7655515). The pipeline is currently compatible with 2019 data set. +- Put the downloaded *zip* file into `data/tiles_2019`, so you will have the file `data/tiles_2019/Filosofi2019_carreaux_200m_gpkg.zip` + +Then, activate it via the configuration : + +```yaml +config: + # [...] + home_location_source: tiles +``` + +This parameter can also activate use of BDTOPO data only or with BAN data to locate population with respectively `building` and `addresses` values. 
+ +### Education activities locations + +The synthetic data generated by the pipeline so far distributes the population to education locations without any distinction of age or type of educational institution. +To avoid sending young children to high school, for example, a matching of educational institutions and persons by age range can be activated via the configuration: + +```yaml +config: + # [...] + education_location_source: weighted +``` + +For each type of institution, a weight is attributed by default in the pipeline. To realise a matching weighted by known student numbers per educational institution, the pipeline can also work with a list of educational institutions from an external geojson or geopackage file with `addresses` as parameter value. +This file must include `TYPEQU`, `commune_id`, `weight` and `geometry` as columns, with `weight` the number of students and `TYPEQU` the type of educational institution, with codes similar to the BPE ones. + +```yaml +config: + # [...] + education_location_source: addresses + education_file: education/education_addresses.geojson +``` + ### Income This pipeline allows using the [Bhepop2](https://github.com/tellae/bhepop2) package for income assignation. @@ -365,4 +414,30 @@ config: ``` Caution, this method will fail on communes where the Filosofi subpopulation distributions are missing. In this case, -we fall back to the `uniform` method. \ No newline at end of file +we fall back to the `uniform` method. + +## Analysing synthetic population + +In addition to creating synthetic populations, it is possible to output files for analysis. + +### Comparison population on grid + +Using the comparison_flow_volume pipeline in the Analysis directory, you can generate grids comparing the volumes of two synthetic populations on a grid of 1km² squares for each age group and each purpose of their trips. 
Like with population creation, the pipeline is run with the [synpp](https://github.com/eqasim-org/synpp) runner and all parameters needed must be included in the `config.yml` file. + +To be able to use this pipeline, you must already have created at least one synthetic population (1 for volume visualization and 2 for comparison) and [download France grid from INSEE](https://www.insee.fr/fr/statistiques/fichier/6214726/grille200m_gpkg.zip). From this *zip* file, you need to extract `grille200m_metropole.gpkg` and put it into `data/grid`. + +Then you need to open the `config.yml` and add the `analysis.grid.comparison_flow_volume` stage in the `run` section. To properly use the comparison_flow_volume pipeline, you'd have to provide the following config: + +```yaml +config: + output_prefix: name_output_studied_ + comparison_file_prefix: name_output_compared_ + analysis_from_file: true +``` + +Before running it, make sure that the populations have the same file format. +After running, you should find all grids for each age group and each trips' purpose in the `output` +folder as: `{output_prefix}_{age group}_{trip purpose}.html` + +Note: +With `analysis_from_file` at False, the last synthetic population is studied by default. Also if `output_prefix` and `comparison_file_prefix` refer to the same outputs, or `comparison_file_prefix` is not specified, then only a volume visualisation of this particular population is produced. 
diff --git a/synthesis/locations/education.py b/synthesis/locations/education.py index 418048ab..45a32a70 100644 --- a/synthesis/locations/education.py +++ b/synthesis/locations/education.py @@ -4,45 +4,95 @@ import geopandas as gpd def configure(context): - context.stage("data.bpe.cleaned") context.stage("data.spatial.municipalities") + if context.config("education_location_source","bpe") == "addresses": + context.stage("data.external.education", alias = "location_source") + else: + context.stage("data.bpe.cleaned", alias = "location_source") + +EDUCATION_WEIGHT_MAP = [ + ("C101", 100), # Preschools + ("C102", 50), # Intercommunal preschools + ("C104", 145), # Elemantary schools + ("C105", 80), # Intercommunal elemantary schools + ("C301", 700), # General and technological high schools, multi-purpose high schools + ("C302", 285), # Professional high schools + ("C303", 100), # Agricultural high schools + ("C304", 30), # General and technological classes in professional high schools + ("C305", 30), # Professional classes in general and technological high schools + ("C403", 1000), # Business schools + ("C501", 2000), # University +] + +def fake_education(missing_communes, c, df_locations, df_zones): + # Fake education destinations as the centroid of zones that have no other destinations + print( + "Adding fake education locations for %d municipalities" + % (len(missing_communes)) + ) + + df_added = [] + + for commune_id in sorted(missing_communes): + centroid = df_zones[df_zones["commune_id"] == commune_id][ + "geometry" + ].centroid.iloc[0] + + df_added.append({"commune_id": commune_id, "geometry": centroid}) + + df_added = gpd.GeoDataFrame( + pd.DataFrame.from_records(df_added), crs=df_locations.crs + ) + df_added["fake"] = True + df_added["education_type"] = c + df_added["weight"] = 1 + + return df_added + def execute(context): - df_locations = context.stage("data.bpe.cleaned")[[ - "enterprise_id", "activity_type", "commune_id", "geometry" - ]] + df_locations = 
context.stage("location_source") df_locations = df_locations[df_locations["activity_type"] == "education"] - df_locations = df_locations[["commune_id", "geometry"]].copy() + df_locations = df_locations[["education_type", "commune_id","weight", "geometry"]].copy() df_locations["fake"] = False # Add education destinations to the centroid of zones that have no other destinations df_zones = context.stage("data.spatial.municipalities") - required_communes = set(df_zones["commune_id"].unique()) - missing_communes = required_communes - set(df_locations["commune_id"].unique()) + required_communes = set(df_zones["commune_id"].unique()) + + if context.config("education_location_source") != 'bpe': # either weighted or addresses + for prefix, weight in EDUCATION_WEIGHT_MAP: + df_locations.loc[df_locations["education_type"]==prefix, "weight"] = ( + weight + ) + if context.config("education_location_source") != 'bpe' : - if len(missing_communes) > 0: - print("Adding fake education locations for %d/%d municipalities" % ( - len(missing_communes), len(required_communes) - )) + + # Add education destinations in function of level education + for c in ["C1", "C2", "C3"]: + missing_communes = required_communes - set(df_locations[df_locations["education_type"].str.startswith(c)]["commune_id"].unique()) - df_added = [] + if len(missing_communes) > 0: + df_locations = pd.concat([df_locations,fake_education(missing_communes, c, df_locations, df_zones)]) + + # Add education destinations for last level education + missing_communes = required_communes - set(df_locations[~(df_locations["education_type"].str.startswith(("C1", "C2", "C3")))]["commune_id"].unique()) - for commune_id in sorted(missing_communes): - centroid = df_zones[df_zones["commune_id"] == commune_id]["geometry"].centroid.iloc[0] + if len(missing_communes) > 0: - df_added.append({ - "commune_id": commune_id, "geometry": centroid - }) + df_locations = pd.concat([df_locations,fake_education(missing_communes, "C4", df_locations, 
df_zones)]) + else : - df_added = gpd.GeoDataFrame(pd.DataFrame.from_records(df_added), crs = df_locations.crs) - df_added["fake"] = True + missing_communes = required_communes - set(df_locations["commune_id"].unique()) + if len(missing_communes) > 0: - df_locations = pd.concat([df_locations, df_added]) + df_locations = pd.concat([df_locations,fake_education(missing_communes, "C0", df_locations, df_zones)]) + df_locations["education_type"] = df_locations["education_type"].str[:2].astype("category") # Define identifiers - df_locations["location_id"] = np.arange(len(df_locations)) + df_locations["location_id"]= np.arange(len(df_locations)) df_locations["location_id"] = "edu_" + df_locations["location_id"].astype(str) - - return df_locations[["location_id", "commune_id", "fake", "geometry"]] + + return df_locations[["location_id","education_type", "commune_id","weight","fake", "geometry"]] diff --git a/synthesis/locations/home/addresses.py b/synthesis/locations/home/addresses.py index afe0e7e4..01410a13 100644 --- a/synthesis/locations/home/addresses.py +++ b/synthesis/locations/home/addresses.py @@ -15,7 +15,7 @@ adresses. For instance, the assigned addresses of a building with 10 housing units and two addresses will have a weight of 5. -If no adresses matches a buidling, its centroid is taken as the unique address. +If no adresses matches a building, its centroid is taken as the unique address. 
""" def configure(context): @@ -57,18 +57,18 @@ def execute(context): # Put together matched and missing addresses df_addresses = pd.concat([df_addresses, df_missing]) - df_addresses = gpd.GeoDataFrame(df_addresses, crs = df_buildings.crs) + df_addresses = gpd.GeoDataFrame(df_addresses, crs = df_buildings.crs).rename(columns={"building_id":"home_location_id"}) # Obtain weights for all addresses if context.config("home_location_weight") == "housing": - df_count = df_addresses.groupby("building_id").size().reset_index(name = "count") - df_addresses = pd.merge(df_addresses, df_count, on = "building_id") + df_count = df_addresses.groupby("home_location_id").size().reset_index(name = "count") + df_addresses = pd.merge(df_addresses, df_count, on = "home_location_id") df_addresses["weight"] = df_addresses["housing"] / df_addresses["count"] else: df_addresses["weight"] = 1.0 - return df_addresses[["building_id", "weight", "geometry"]] + return df_addresses[["home_location_id", "weight", "geometry"]] def validate(context): - assert context.config("home_location_source") in ("addresses", "buildings") + assert context.config("home_location_source") in ("addresses", "buildings","tiles") assert context.config("home_location_weight") in ("uniform", "housing") diff --git a/synthesis/locations/home/locations.py b/synthesis/locations/home/locations.py index 6c319e83..391748ec 100644 --- a/synthesis/locations/home/locations.py +++ b/synthesis/locations/home/locations.py @@ -9,7 +9,10 @@ def configure(context): context.stage("data.spatial.iris") - context.stage("synthesis.locations.home.addresses") + if context.config("home_location_source", "addresses") == "tiles": + context.stage("data.tiles.raw", alias = "location_source") + else: + context.stage("synthesis.locations.home.addresses", alias = "location_source") def execute(context): # Find required IRIS @@ -17,7 +20,7 @@ def execute(context): required_iris = set(df_iris["iris_id"].unique()) # Load all addresses and add IRIS 
information - df_addresses = context.stage("synthesis.locations.home.addresses") + df_addresses = context.stage("location_source") print("Imputing IRIS into addresses ...") @@ -38,7 +41,6 @@ def execute(context): len(missing_iris), len(required_iris))) df_added = [] - for iris_id in sorted(missing_iris): centroid = df_iris[df_iris["iris_id"] == iris_id]["geometry"].centroid.iloc[0] @@ -46,7 +48,7 @@ def execute(context): "iris_id": iris_id, "geometry": centroid, "commune_id": iris_id[:5], "weight" : 1, - "building_id": -1 + "home_location_id": -1 }) df_added = gpd.GeoDataFrame(pd.DataFrame.from_records(df_added), crs = df_addresses.crs) diff --git a/synthesis/population/enriched.py b/synthesis/population/enriched.py index 94a9ee6b..15fc5649 100644 --- a/synthesis/population/enriched.py +++ b/synthesis/population/enriched.py @@ -84,5 +84,12 @@ def execute(context): df_population.loc[df_population["number_of_bikes"] < df_population["household_size"], "bike_availability"] = "some" df_population.loc[df_population["number_of_bikes"] == 0, "bike_availability"] = "none" df_population["bike_availability"] = df_population["bike_availability"].astype("category") - + + # Add age range for education + df_population["age_range"] = "higher_education" + df_population.loc[df_population["age"]<=10,"age_range"] = "primary_school" + df_population.loc[df_population["age"].between(11,14),"age_range"] = "middle_school" + df_population.loc[df_population["age"].between(15,17),"age_range"] = "high_school" + df_population["age_range"] = df_population["age_range"].astype("category") + return df_population diff --git a/synthesis/population/spatial/home/locations.py b/synthesis/population/spatial/home/locations.py index 0babb70b..9347e5ec 100644 --- a/synthesis/population/spatial/home/locations.py +++ b/synthesis/population/spatial/home/locations.py @@ -6,7 +6,8 @@ def configure(context): context.stage("synthesis.population.spatial.home.zones") 
context.stage("synthesis.locations.home.locations") - + context.config("home_location_source", "addresses") + context.config("random_seed") def _sample_locations(context, args): @@ -39,7 +40,7 @@ def _sample_locations(context, args): # Apply selection df_homes["geometry"] = df_locations.iloc[indices]["geometry"].values - df_homes["building_id"] = df_locations.iloc[indices]["building_id"].values + df_homes["home_location_id"] = df_locations.iloc[indices]["home_location_id"].values # Update progress context.progress.update() @@ -61,5 +62,6 @@ def execute(context): )) as parallel: seeds = random.randint(10000, size = len(unique_iris_ids)) df_homes = pd.concat(parallel.map(_sample_locations, zip(unique_iris_ids, seeds))) - - return df_homes[["household_id", "commune_id", "building_id", "geometry"]] + out = ["household_id", "commune_id", "home_location_id", "geometry"] + + return df_homes[out] diff --git a/synthesis/population/spatial/primary/candidates.py b/synthesis/population/spatial/primary/candidates.py index 909e862c..7af9963c 100644 --- a/synthesis/population/spatial/primary/candidates.py +++ b/synthesis/population/spatial/primary/candidates.py @@ -11,7 +11,15 @@ def configure(context): context.stage("synthesis.population.enriched") context.stage("synthesis.population.trips") + context.config("output_path") context.config("random_seed") + context.config("education_location_source", "bpe") + +EDUCATION_MAPPING = { + "primary_school": ["C1"], + "middle_school": ["C2"], + "high_school": ["C3"], + "higher_education": ["C4", "C5", "C6"]} def sample_destination_municipalities(context, arguments): # Load data @@ -37,7 +45,7 @@ def sample_locations(context, arguments): # Prepare state random = np.random.RandomState(random_seed) df_locations = df_locations[df_locations["commune_id"] == destination_id] - + # Determine demand df_flow = df_flow[df_flow["destination_id"] == destination_id] count = df_flow["count"].sum() @@ -47,7 +55,7 @@ def sample_locations(context, 
arguments): if "weight" in df_locations: weight = df_locations["weight"].values / df_locations["weight"].sum() - + location_counts = random.multinomial(count, weight) location_ids = df_locations["location_id"].values location_ids = np.repeat(location_ids, location_counts) @@ -67,7 +75,7 @@ def sample_locations(context, arguments): return df_result -def process(context, purpose, random, df_persons, df_od, df_locations): +def process(context, purpose, random, df_persons, df_od, df_locations,step_name): df_persons = df_persons[df_persons["has_%s_trip" % purpose]] # Sample commute flows based on population @@ -78,7 +86,7 @@ def process(context, purpose, random, df_persons, df_od, df_locations): df_flow = [] - with context.progress(label = "Sampling %s municipalities" % purpose, total = len(df_demand)) as progress: + with context.progress(label = "Sampling %s municipalities" % step_name, total = len(df_demand)) as progress: with context.parallel(dict(df_od = df_od)) as parallel: for df_partial in parallel.imap_unordered(sample_destination_municipalities, df_demand.itertuples(index = False, name = None)): df_flow.append(df_partial) @@ -102,7 +110,7 @@ def process(context, purpose, random, df_persons, df_od, df_locations): def execute(context): # Prepare population data - df_persons = context.stage("synthesis.population.enriched")[["person_id", "household_id"]].copy() + df_persons = context.stage("synthesis.population.enriched")[["person_id", "household_id", "age_range"]].copy() df_trips = context.stage("synthesis.population.trips") df_persons["has_work_trip"] = df_persons["person_id"].isin(df_trips[ @@ -125,18 +133,26 @@ def execute(context): df_locations = context.stage("synthesis.locations.work") df_locations["weight"] = df_locations["employees"] df_work = process(context, "work", random, df_persons, - df_work_od, df_locations + df_work_od, df_locations, "work" ) df_locations = context.stage("synthesis.locations.education") - df_education = process(context, 
"education", random, df_persons, - df_education_od, df_locations - ) + if context.config("education_location_source") == 'bpe': + df_education = process(context, "education", random, df_persons, df_education_od, df_locations,"education") + else : + df_education = [] + for prefix, education_type in EDUCATION_MAPPING.items(): + df_education.append( + process(context, "education", random, + df_persons[df_persons["age_range"]==prefix], + df_education_od[df_education_od["age_range"]==prefix],df_locations[df_locations["education_type"].isin(education_type)],prefix) + ) + df_education = pd.concat(df_education) return dict( work_candidates = df_work, education_candidates = df_education, persons = df_persons[df_persons["has_work_trip"] | df_persons["has_education_trip"]][[ - "person_id", "household_id", "commune_id", "has_work_trip", "has_education_trip" + "person_id", "household_id", "age_range", "commune_id", "has_work_trip", "has_education_trip" ]] ) diff --git a/synthesis/population/spatial/primary/locations.py b/synthesis/population/spatial/primary/locations.py index 14b3a586..136e18ac 100644 --- a/synthesis/population/spatial/primary/locations.py +++ b/synthesis/population/spatial/primary/locations.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import geopandas as gpd +from .candidates import EDUCATION_MAPPING def configure(context): context.stage("synthesis.population.spatial.primary.candidates") @@ -9,6 +10,9 @@ def configure(context): context.stage("synthesis.locations.work") context.stage("synthesis.locations.education") + context.config("education_location_source", "bpe") + + def define_distance_ordering(df_persons, df_candidates, progress): indices = [] @@ -106,13 +110,18 @@ def execute(context): df_work_candidates = pd.merge(df_work_candidates, df_locations, how = "left", on = "location_id") df_work_candidates = gpd.GeoDataFrame(df_work_candidates) - df_locations = context.stage("synthesis.locations.education")[["location_id", "geometry"]] + 
df_locations = context.stage("synthesis.locations.education")[["education_type", "location_id", "geometry"]] df_education_candidates = data["education_candidates"] df_education_candidates = pd.merge(df_education_candidates, df_locations, how = "left", on = "location_id") df_education_candidates = gpd.GeoDataFrame(df_education_candidates) # Assign destinations df_work = process(context, "work", df_work, df_work_candidates) - df_education = process(context, "education", df_education, df_education_candidates) - + if context.config("education_location_source") == 'bpe': + df_education = process(context, "education", df_education, df_education_candidates) + else : + education = [] + for prefix, education_type in EDUCATION_MAPPING.items(): + education.append(process(context, prefix,df_education[df_education["age_range"]==prefix],df_education_candidates[df_education_candidates["education_type"].isin(education_type)])) + df_education = pd.concat(education).sort_index() return df_work, df_education diff --git a/tests/testdata.py b/tests/testdata.py index e00d1b86..6e75f71d 100644 --- a/tests/testdata.py +++ b/tests/testdata.py @@ -597,8 +597,9 @@ def create(output_path): )) df_education["ARM"] = "Z" df_education["IPONDI"] = 1.0 + df_education["AGEREV10"] = 1 - columns = ["COMMUNE", "DCETUF", "ARM", "IPONDI"] + columns = ["COMMUNE", "DCETUF", "ARM", "IPONDI","AGEREV10"] df_education.columns = columns with zipfile.ZipFile("%s/rp_2019/RP2019_MOBSCO_csv.zip" % output_path, "w") as archive: