Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: improved location of activities and outputs analysis #252

Merged
merged 24 commits into from
Sep 24, 2024
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
cd68936
new param: exclude no employee
Jun 3, 2024
30ceb28
new home location source: tiles insee
Jun 3, 2024
88351a9
categorisation education : age distribution
Jun 3, 2024
ee147df
categorisation education : weight with type & correction
Jun 3, 2024
85ddbdb
categorisation education : weight and location with file
Jun 6, 2024
0dd4e21
Merge branch 'eqasim-org:develop' into feat/new_params_categorisation
MarieMcLaurent Jun 10, 2024
a25cc5d
categorisation education : setting new param education_location_source
Jun 10, 2024
0d6490a
new output: map graphs analysis with age and flow purpose
Jul 1, 2024
dce59e7
feat: option for not filtering entd on requested departments
vincent-leblond Jun 3, 2024
460ce26
fix: remove person with problem in entd
vincent-leblond Jun 3, 2024
569f840
Merge branch 'eqasim-org:develop' into feat/new_params_categorisation
MarieMcLaurent Jul 11, 2024
30709d3
categorisation education : new distribution of education od with age …
Jul 25, 2024
fdf37e9
clean up and correction output
Jul 29, 2024
dcc9ff0
change docs and test
Aug 5, 2024
b7b920f
Merge branch 'develop' into feat/new_params_categorisation
MarieMcLaurent Aug 5, 2024
614c24e
fix: test fake agerange
Aug 5, 2024
4d158ee
move flow_output to analysis
Aug 6, 2024
9af623b
rename flow_output.py to comparison_flow_volume.py
Aug 6, 2024
cff4cb0
update docs for analysis and correction analysis
Aug 7, 2024
191a797
fix: docs
vincent-leblond Aug 8, 2024
863ab98
Merge branch 'develop' into feat/new_params_categorisation
MarieMcLaurent Aug 26, 2024
bfaae74
fix: add changes from requests and recommendations
Sep 6, 2024
acf3b80
fix : remove errors
Sep 23, 2024
497c9b2
Merge branch 'develop' into feat/new_params_categorisation
MarieMcLaurent Sep 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions analysis/grid/comparison_flow_volume.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import pandas as pd
import geopandas as gpd

import plotly.express as px


SAMPLING_RATE = 0.05

def configure(context):
    """Declare configuration options and upstream stages for the flow comparison analysis.

    When "analysis_from_file" is False, the trip/person/location inputs are
    pulled from the synthesis pipeline cache; otherwise they are read back
    from previously written output files in execute().
    """
    if not context.config("analysis_from_file", False):
        context.stage("synthesis.population.trips")
        context.stage("synthesis.population.spatial.locations")
        context.stage("synthesis.population.enriched")

    # Fix: execute() always loads the departments (to mask the grid file),
    # so this stage must be declared regardless of "analysis_from_file".
    context.stage("data.spatial.departments")

    context.config("comparison_file_prefix", None)
    context.config("output_prefix", "ile_de_france_")
    context.config("output_formats", ["csv", "gpkg"])
    context.config("output_path")
    context.config("data_path")

def stat_grid(df_trips, df_locations, df_persons, df_grid):
    """Spatially join trip destinations, enriched with person attributes, onto the grid.

    Returns one row per (grid cell, trip destination) with the columns used by
    the per-age / per-purpose aggregations downstream.
    """
    # Attach the destination geometry of each trip via its following activity
    destination_geometry = df_locations[["person_id", "activity_index", "geometry"]].rename(
        columns={"activity_index": "following_activity_index"}
    )
    trips_with_geometry = pd.merge(
        df_trips, destination_geometry,
        how="left", on=["person_id", "following_activity_index"]
    )

    # Add person attributes (household id, age)
    trips_with_geometry = pd.merge(trips_with_geometry, df_persons, how="left", on=["person_id"])

    # Re-project from Lambert-93 (EPSG:2154) to WGS84 before joining with the grid
    trips_wgs84 = gpd.GeoDataFrame(trips_with_geometry, crs="EPSG:2154").to_crs("4326")

    grid_stats = gpd.sjoin(df_grid, trips_wgs84, how="left")
    return grid_stats[["id_carr_1km", "geometry", "person_id", "following_purpose", "household_id", "age"]]
def execute(context):
    """Produce per-age-group, per-purpose choropleth maps of activity flows.

    Either visualises the flow volumes of a single simulation (when the studied
    and comparison prefixes are identical) or the cell-by-cell volume difference
    between two simulation outputs. One HTML map is written per
    (age group, purpose) pair.
    """
    # Age brackets for which separate maps are produced
    figures = {
        "Yrs:0-10": {"min_age": 0, "max_age": 10},
        "Yrs:11-14": {"min_age": 11, "max_age": 14},
        "Yrs:15-18": {"min_age": 15, "max_age": 17},
        "Yrs:18-25": {"min_age": 18, "max_age": 25},
        "Yrs:25-50": {"min_age": 26, "max_age": 50},
        "Yrs:50-65": {"min_age": 51, "max_age": 65},
        "Yrs:65-75": {"min_age": 66, "max_age": 75},
        "Yrs:75+": {"min_age": 76, "max_age": 110},
    }

    output_path = context.config("output_path")
    output_prefix = context.config("output_prefix")
    comparison_file = (
        output_prefix
        if context.config("comparison_file_prefix") is None
        else context.config("comparison_file_prefix")
    )

    if not context.config("analysis_from_file"):
        print("Récupération simu données ...")
        # From the simulation cache
        df_trips = context.stage("synthesis.population.trips")
        df_persons = context.stage("synthesis.population.enriched")[["person_id", "household_id", "age"]]
        df_locations = context.stage("synthesis.population.spatial.locations")[[
            "person_id", "activity_index", "geometry"
        ]]
        df_trips["preceding_activity_index"] = df_trips["trip_index"]
        df_trips["following_activity_index"] = df_trips["trip_index"] + 1

        # Fix: the comparison frames were previously undefined on this branch,
        # causing a NameError below. Compare the simulation with itself, which
        # matches the single-simulation rendering path taken when
        # output_prefix == comparison_file.
        df_trips_comp = df_trips
        df_locations_comp = df_locations
        df_persons_comp = df_persons
    else:
        # From previously written trips, activities and persons files
        print("Récupération données ...")
        df_trips = pd.read_csv(f'{output_path}/{output_prefix}trips.csv', sep=';')[
            ["person_id", "trip_index", "following_activity_index", "following_purpose"]]
        df_locations = (
            gpd.read_parquet(f'{output_path}/{output_prefix}activities.geoparquet')
            if "geoparquet" in context.config("output_formats")
            else gpd.read_file(f'{output_path}/{output_prefix}activities.gpkg'))
        df_persons = pd.read_csv(f'{output_path}/{output_prefix}persons.csv', sep=';')[
            ["person_id", "household_id", "age"]]

        print("Récupération comp données ...")
        df_trips_comp = pd.read_csv(f'{output_path}/{comparison_file}trips.csv', sep=';')[
            ["person_id", "trip_index", "following_activity_index", "following_purpose"]]
        df_locations_comp = (
            gpd.read_parquet(f'{output_path}/{comparison_file}activities.geoparquet')
            if "geoparquet" in context.config("output_formats")
            else gpd.read_file(f'{output_path}/{comparison_file}activities.gpkg'))
        df_persons_comp = pd.read_csv(f'{output_path}/{comparison_file}persons.csv', sep=';')[
            ["person_id", "household_id", "age"]]

    list_purpose = list(df_trips["following_purpose"].unique())

    # Build the 1 km grid covering the studied departments from the 200 m tile file
    df_departments = context.stage("data.spatial.departments")
    poly_dep = df_departments.unary_union
    df_tiles = gpd.read_file(
        f'{context.config("data_path")}/grid/grille200m_metropole.gpkg',
        mask=poly_dep,
    )
    df_tiles = df_tiles.to_crs("4326")
    df_grid = df_tiles[["id_carr_1km", "geometry"]].dissolve(by="id_carr_1km").reset_index()

    df_stats = stat_grid(df_trips, df_locations, df_persons, df_grid)
    # Renamed from df_grids to avoid shadowing the grid-tile frame above
    df_stats_comp = stat_grid(df_trips_comp, df_locations_comp, df_persons_comp, df_grid)

    point = df_grid.unary_union.centroid  # map centre (tracks poly_dep)
    print("Printing grids...")

    for prefix, figure in figures.items():
        # Studied simulation: count destinations per (cell, purpose) for this age bracket
        df_select_age = df_stats[df_stats["age"].between(figure["min_age"], figure["max_age"])]
        df_select_age = df_select_age.dissolve(by=["id_carr_1km", "following_purpose"], aggfunc="count").reset_index()
        df_select_age = df_select_age[~(df_select_age["geometry"].isna())]
        df_select_age["following_purpose"] = df_select_age["following_purpose"].astype('str')

        # Comparison simulation: same aggregation
        df_comp_age = df_stats_comp[df_stats_comp["age"].between(figure["min_age"], figure["max_age"])]
        df_comp_age = df_comp_age.dissolve(by=["id_carr_1km", "following_purpose"], aggfunc="count").reset_index()
        df_comp_age = df_comp_age[~(df_comp_age["geometry"].isna())]
        df_comp_age["following_purpose"] = df_comp_age["following_purpose"].astype('str')

        for purpose in list_purpose:
            df_select = df_select_age[df_select_age["following_purpose"] == purpose].rename(columns={"person_id": "count"})
            df_comp_select = df_comp_age[df_comp_age["following_purpose"] == purpose].rename(columns={"person_id": "count"})

            if output_prefix == comparison_file:
                # Single simulation: plot raw volumes
                df_select = gpd.sjoin(df_select, df_grid, how='right', predicate="contains").fillna(0)
                df_select = df_select[df_select["count"] != 0]
                fig = px.choropleth_mapbox(df_select, geojson=df_select.geometry, locations=df_select.index, color="count", opacity=0.7, color_continuous_scale='reds',
                    mapbox_style='open-street-map', center=dict(lat=point.y, lon=point.x), title=f"Localisation flow distribution for {prefix} group with {purpose} purpose")
                fig.write_html(f'{output_path}/{output_prefix}{prefix}_{purpose}.html')
            else:
                # Two simulations: plot the per-cell volume difference
                df_comp_select = gpd.sjoin(df_comp_select, df_grid, how='right', predicate="contains").fillna(0)
                df_select = gpd.sjoin(df_select, df_comp_select.drop(columns=['index_left']), how='right', predicate="contains").rename(columns={"count_left": "volume_studied_simu", "count_right": "volume_compared_simu"}).fillna(0)
                df_select["volume_difference"] = df_select["volume_studied_simu"] - df_select["volume_compared_simu"]
                df_select = df_select[(df_select["volume_studied_simu"] != 0) | (df_select["volume_compared_simu"] != 0)]
                # NOTE(review): division by zero yields inf for cells absent from
                # the comparison run — presumably acceptable for hover display
                df_select["pourcentage_vol"] = df_select["volume_difference"] / df_select["volume_compared_simu"]
                px.choropleth_mapbox(df_select, geojson=df_select.geometry, locations=df_select.index, color="volume_difference", opacity=0.7, color_continuous_scale="picnic", color_continuous_midpoint=0, hover_name="id_carr_1km_right", hover_data=["volume_studied_simu", "volume_compared_simu", "pourcentage_vol"],
                    mapbox_style='open-street-map', center=dict(lat=point.y, lon=point.x), title=f"Comparison flow distribution with previous simulation for {prefix} group with {purpose} purpose").write_html(f'{output_path}/{output_prefix}{prefix}_{purpose}.html')


2 changes: 1 addition & 1 deletion data/bpe/cleaned.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def execute(context):
df.loc[outside_indices, "imputed"] = True

# Package up data set
df = df[["enterprise_id", "activity_type", "commune_id", "imputed", "x", "y"]]
df = df[["enterprise_id", "activity_type","TYPEQU", "commune_id", "imputed", "x", "y"]]

df = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.x, df.y),crs="EPSG:2154")

Expand Down
2 changes: 2 additions & 0 deletions data/hts/entd/cleaned.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,8 @@ def execute(context):

# Socioprofessional class
df_persons["socioprofessional_class"] = df_persons["CS24"].fillna(80).astype(int) // 10

hts.fix_activity_types(df_trips)
MarieMcLaurent marked this conversation as resolved.
Show resolved Hide resolved

# Fix activity types (because of 1 inconsistent ENTD data)
hts.fix_activity_types(df_trips)
Expand Down
12 changes: 11 additions & 1 deletion data/od/cleaned.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,22 @@ def execute(context):

assert not np.any(df_work["commute_mode"].isna())

# Clean age range for education
df_education["age_range"] = np.nan
df_education.loc[df_education["AGEREV10"] <= 6, "age_range"] = "primary_school"
df_education.loc[df_education["AGEREV10"] == 11, "age_range"] = "middle_school"
df_education.loc[df_education["AGEREV10"] == 15, "age_range"] = "high_school"
df_education.loc[df_education["AGEREV10"] >= 18, "age_range"] = "higher_education"
df_education["age_range"] = df_education["age_range"].astype("category")

assert not np.any(df_education["age_range"].isna())

# Aggregate the flows
print("Aggregating work ...")
df_work = df_work.groupby(["origin_id", "destination_id", "commute_mode"])["weight"].sum().reset_index()

print("Aggregating education ...")
df_education = df_education.groupby(["origin_id", "destination_id"])["weight"].sum().reset_index()
df_education = df_education.groupby(["origin_id", "destination_id","age_range"])["weight"].sum().reset_index()

df_work["weight"] = df_work["weight"].fillna(0.0)
df_education["weight"] = df_education["weight"].fillna(0.0)
Expand Down
3 changes: 2 additions & 1 deletion data/od/raw.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ def execute(context):
"COMMUNE":"str",
"ARM":"str",
"IPONDI":"float",
"DCETUF":"str"
"DCETUF":"str",
"AGEREV10":"int"
}

with zipfile.ZipFile(
Expand Down
32 changes: 20 additions & 12 deletions data/od/weighted.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,23 @@ def configure(context):
context.stage("data.od.cleaned")
context.stage("data.spatial.codes")

def fix_origins(df, commune_ids, purpose):
context.config("education_location_source","bpe")

def fix_origins(df, commune_ids, purpose, category):
    """Add flows for communes missing as origins.

    Every missing origin gets a self-loop of total weight 1.0, spread uniformly
    over the values of *category*, and zero-weight flows to all other communes.
    """
    known_origins = set(np.unique(df["origin_id"]))
    absent_origins = commune_ids - known_origins
    category_values = set(np.unique(df[category]))

    records = [
        (
            origin,
            destination,
            value,
            1.0 / len(category_values) if origin == destination else 0.0,
        )
        for origin in absent_origins
        for destination in commune_ids
        for value in category_values
    ]

    print("Fixing %d origins for %s" % (len(absent_origins), purpose))

    df_fixed = pd.concat([
        df,
        pd.DataFrame.from_records(
            records, columns=["origin_id", "destination_id", category, "weight"]
        ),
    ])
    return df_fixed.sort_values(["origin_id", "destination_id"])

def execute(context):
Expand All @@ -35,25 +39,29 @@ def execute(context):
# Load data
df_work, df_education = context.stage("data.od.cleaned")

# Aggregate work (we do not consider different modes at the moment)
df_work = df_work[["origin_id", "destination_id", "weight"]].groupby(["origin_id", "destination_id"]).sum().reset_index()

# Add missing origins
df_work = fix_origins(df_work, commune_ids, "work")
df_education = fix_origins(df_education, commune_ids, "education")
df_work = fix_origins(df_work, commune_ids, "work","commute_mode")
df_education = fix_origins(df_education, commune_ids, "education","age_range")

# Aggregate work (we do not consider different modes at the moment)
df_work = df_work[["origin_id", "destination_id", "weight"]].groupby(["origin_id", "destination_id"]).sum().reset_index()

sebhoerl marked this conversation as resolved.
Show resolved Hide resolved
# Compute totals
df_total = df_work[["origin_id", "weight"]].groupby("origin_id").sum().reset_index().rename({ "weight" : "total" }, axis = 1)
df_work = pd.merge(df_work, df_total, on = "origin_id")

df_total = df_education[["origin_id", "weight"]].groupby("origin_id").sum().reset_index().rename({ "weight" : "total" }, axis = 1)
df_education = pd.merge(df_education, df_total, on = "origin_id")

df_total = df_education[["origin_id","age_range", "weight"]].groupby(["origin_id","age_range"]).sum().reset_index().rename({ "weight" : "total" }, axis = 1)
df_education = pd.merge(df_education, df_total, on = ["origin_id","age_range"])

if context.config("education_location_source") == 'bpe':
# Aggregate education (we do not consider different age range with bpe source)
df_education = df_education[["origin_id", "destination_id", "weight","total"]].groupby(["origin_id", "destination_id"]).sum().reset_index()
sebhoerl marked this conversation as resolved.
Show resolved Hide resolved
# Compute weight
df_work["weight"] /= df_work["total"]
df_education["weight"] /= df_education["total"]

del df_work["total"]
del df_education["total"]

df_education = df_education.fillna(0.0)
sebhoerl marked this conversation as resolved.
Show resolved Hide resolved

return df_work, df_education
8 changes: 8 additions & 0 deletions data/sirene/cleaned.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ def configure(context):
context.stage("data.sirene.raw_siren", ephemeral = True)
context.stage("data.sirene.raw_siret", ephemeral = True)
context.stage("data.spatial.codes")
context.config("exclude_no_employee", False)

def execute(context):
df_sirene_establishments = context.stage("data.sirene.raw_siret")
Expand All @@ -22,6 +23,13 @@ def execute(context):
df_sirene = df_sirene[
df_sirene["etatAdministratifEtablissement"] == "A"
].copy()

if context.config("exclude_no_employee"):
# exclude "NN", "00", and NaN
df_sirene = df_sirene[
df_sirene["trancheEffectifsEtablissement"].notna()
& ~(df_sirene["trancheEffectifsEtablissement"].isin(["NN", "00"]))
].copy()

# Define work place weights by person under salary ....
df_sirene["minimum_employees"] = 1 # Includes "NN", "00", and NaN
Expand Down
65 changes: 65 additions & 0 deletions data/tiles/raw.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import os
import geopandas as gpd
import py7zr
import zipfile
import re
import numpy as np

"""
This stage loads the raw data from the French population income, poverty and living standards in tiled data.
"""

def configure(context):
    """Register the upstream departments stage and the tile-file configuration."""
    context.stage("data.spatial.departments")
    context.config("data_path")
    # Archive (or directory) holding the Filosofi 2019 200 m tiles, and the
    # name of the geopackage file inside it
    context.config("tiles_path", "tiles_2019/Filosofi2019_carreaux_200m_gpkg.zip")
    context.config("tiles_file", "carreaux_200m_met.gpkg")


def execute(context):
    """Load the Filosofi 2019 200 m tiles intersecting the requested departments.

    Returns a GeoDataFrame with one centroid point per tile, its household
    weight ("men" column) and its identifier.
    """
    # Find relevant departments
    df_departments = context.stage("data.spatial.departments")
    print("Expecting data for {} departments".format(len(df_departments)))
    poly_dep = df_departments.unary_union

    tiles_path = context.config("tiles_path")
    tiles_file = context.config("tiles_file")

    if tiles_path.endswith(".zip"):
        # Nested packaging: a 7z archive inside the downloaded zip. Extract the
        # geopackage into the stage cache before reading it. (Renamed the inner
        # handle — the original shadowed the outer "archive" variable.)
        with zipfile.ZipFile(
            "{}/{}".format(context.config("data_path"), tiles_path)
        ) as outer_archive:
            inner_name = re.split(r"[/.]", tiles_path)[1] + ".7z"
            with outer_archive.open(inner_name) as f:
                with py7zr.SevenZipFile(f) as inner_archive:
                    inner_archive.extract(context.path(), tiles_file)
        source = f"{context.path()}/{tiles_file}"
    else:
        # Uncompressed layout: read the geopackage directly from the data path
        source = f'{context.config("data_path")}/{tiles_path}/{tiles_file}'

    df_tiles = gpd.read_file(source, mask=poly_dep)[
        ["idcar_200m", "lcog_geo", "ind", "men", "geometry"]
    ].rename(columns={"idcar_200m": "id_tiles", "men": "weight"})

    # Drop the fixed-length prefix of the INSEE tile id, keeping the trailing part
    df_tiles["id_tiles"] = df_tiles["id_tiles"].str[14:]
    # Represent each 200 m tile by its centroid point
    df_tiles["geometry"] = df_tiles["geometry"].centroid
    # First two characters of lcog_geo identify the department
    df_tiles["department_id"] = df_tiles["lcog_geo"].str[:2]

    # Ensure every requested department is covered by at least one tile
    for department_id in df_departments["departement_id"].values:
        assert np.count_nonzero(df_tiles["department_id"] == department_id) > 0

    return df_tiles[["id_tiles", "weight", "geometry"]]


def validate(context):
    """Fail if the tiles archive is missing; otherwise return its size as cache token."""
    tiles_location = "{}/{}".format(
        context.config("data_path"), context.config("tiles_path")
    )

    if not os.path.exists(tiles_location):
        raise RuntimeError("Tiles 2019 data is not available")

    return os.path.getsize(tiles_location)
Loading
Loading