From cd68936611ab97d7bf4bcf44f09bbf9f32d16518 Mon Sep 17 00:00:00 2001
From: Marie Laurent
Date: Mon, 3 Jun 2024 11:33:17 +0200
Subject: [PATCH 01/19] new param: exclude no employee

---
 data/sirene/cleaned.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/data/sirene/cleaned.py b/data/sirene/cleaned.py
index 65df8612..9bef6da5 100644
--- a/data/sirene/cleaned.py
+++ b/data/sirene/cleaned.py
@@ -9,6 +9,7 @@ def configure(context):
     context.stage("data.sirene.raw_siren", ephemeral = True)
     context.stage("data.sirene.raw_siret", ephemeral = True)
     context.stage("data.spatial.codes")
+    context.config("exclude_no_employee", False)
 
 def execute(context):
     df_sirene_establishments = context.stage("data.sirene.raw_siret")
@@ -22,6 +23,13 @@ def execute(context):
     df_sirene = df_sirene[
         df_sirene["etatAdministratifEtablissement"] == "A"
     ].copy()
+
+    if context.config("exclude_no_employee"):
+        # exclude "NN", "00", and NaN
+        df_sirene = df_sirene[
+            df_sirene["trancheEffectifsEtablissement"].notna()
+            & ~(df_sirene["trancheEffectifsEtablissement"].isin(["NN", "00"]))
+        ].copy()
 
     # Define work place weights by person under salary ....
     df_sirene["minimum_employees"] = 1 # Includes "NN", "00", and NaN
From 30ceb2807a62ace66455ae99dc9f5184234ac147 Mon Sep 17 00:00:00 2001
From: Marie Laurent
Date: Mon, 3 Jun 2024 11:55:45 +0200
Subject: [PATCH 02/19] new home location source: tiles insee

---
 data/tiles/raw.py                        | 65 +++++++++++++++++++
 synthesis/locations/home/addresses.py    |  4 +-
 synthesis/locations/home/locations.py    | 19 ++++--
 .../population/spatial/home/locations.py | 17 +++--
 4 files changed, 94 insertions(+), 11 deletions(-)
 create mode 100644 data/tiles/raw.py

diff --git a/data/tiles/raw.py b/data/tiles/raw.py
new file mode 100644
index 00000000..756b1a0f
--- /dev/null
+++ b/data/tiles/raw.py
@@ -0,0 +1,65 @@
+import os
+import geopandas as gpd
+import py7zr
+import zipfile
+import re
+import numpy as np
+
+"""
+This stage loads the raw French tile data on population income, poverty and
+living standards (Filosofi 200m grid).
+"""
+
+def configure(context):
+    context.stage("data.spatial.departments")
+    context.config("data_path")
+    context.config("tiles_path", "tiles_2019/Filosofi2019_carreaux_200m_gpkg.zip")
+    context.config("tiles_file", "carreaux_200m_met.gpkg")
+
+
+def execute(context):
+    # Find relevant departments
+    df_departments = context.stage("data.spatial.departments")
+    print("Expecting data for {} departments".format(len(df_departments)))
+
+    poly_dep = df_departments.unary_union
+
+    if context.config("tiles_path")[-4:] == ".zip":
+        with zipfile.ZipFile(
+            "{}/{}".format(context.config("data_path"), context.config("tiles_path"))
+        ) as archive:
+            with archive.open(
+                re.split(r"[/.]", context.config("tiles_path"))[1] + ".7z"
+            ) as f:
+                with py7zr.SevenZipFile(f) as archive:
+                    archive.extract(context.path(), context.config("tiles_file"))
+        df_tiles = gpd.read_file(
+            f'{context.path()}/{context.config("tiles_file")}',
+            mask=poly_dep,
+        )[["idcar_200m", "lcog_geo", "ind", "men", "geometry"]].rename(
+            columns={"idcar_200m": "id_tiles", "men": "weight"}
+        )
+    else:
+        df_tiles = gpd.read_file(
+            f'{context.config("data_path")}/{context.config("tiles_path")}/{context.config("tiles_file")}',
+            mask=poly_dep,
+        )[["idcar_200m", "lcog_geo", "ind", "men", "geometry"]].rename(
+            columns={"idcar_200m": "id_tiles", "men": "weight"}
+        )
+
+    df_tiles["id_tiles"] = df_tiles["id_tiles"].str[14:]
+    df_tiles["geometry"] = df_tiles["geometry"].centroid
+    df_tiles["department_id"] = df_tiles["lcog_geo"].str[:2]
+
+    for department_id in df_departments["departement_id"].values:
+        assert np.count_nonzero(df_tiles["department_id"] == department_id) > 0
+
+    return df_tiles[["id_tiles", "weight", "geometry"]]
+
+
+def validate(context):
+    if not os.path.exists(
+        "{}/{}".format(context.config("data_path"), context.config("tiles_path"))
+    ):
+        raise RuntimeError("Tiles 2019 data is not available")
+
+    return os.path.getsize(
+        "{}/{}".format(context.config("data_path"), context.config("tiles_path"))
+    )
\ No newline at end of file
diff --git a/synthesis/locations/home/addresses.py b/synthesis/locations/home/addresses.py
index 5a34de80..79f6af71 100644
--- a/synthesis/locations/home/addresses.py
+++ b/synthesis/locations/home/addresses.py
@@ -15,7 +15,7 @@
 adresses. For instance, the assigned addresses of a building with 10 housing
 units and two addresses will have a weight of 5.
 
-If no adresses matches a buidling, its centroid is taken as the unique address.
+If no address matches a building, its centroid is taken as the unique address.
 """
 
 def configure(context):
@@ -70,5 +70,5 @@ def execute(context):
     return df_addresses[["building_id", "weight", "geometry"]]
 
 def validate(context):
-    assert context.config("home_location_source") in ("addresses", "buildings")
+    assert context.config("home_location_source") in ("addresses", "buildings", "tiles")
     assert context.config("home_location_weight") in ("uniform", "housing")
diff --git a/synthesis/locations/home/locations.py b/synthesis/locations/home/locations.py
index 6c319e83..c35dc5ff 100644
--- a/synthesis/locations/home/locations.py
+++ b/synthesis/locations/home/locations.py
@@ -9,7 +9,10 @@
 
 def configure(context):
     context.stage("data.spatial.iris")
-    context.stage("synthesis.locations.home.addresses")
+    if context.config("home_location_source", "addresses") == "tiles":
+        context.stage("data.tiles.raw")
+    else:
+        context.stage("synthesis.locations.home.addresses")
 
 def execute(context):
     # Find required IRIS
@@ -17,7 +20,11 @@ def execute(context):
     required_iris = set(df_iris["iris_id"].unique())
 
     # Load all addresses and add IRIS information
-    df_addresses = context.stage("synthesis.locations.home.addresses")
+    df_addresses = (
+        context.stage("data.tiles.raw")
+        if context.config("home_location_source") == "tiles"
+        else context.stage("synthesis.locations.home.addresses")
+    )
 
     print("Imputing IRIS into addresses ...")
 
@@ -38,7 +45,11 @@ def execute(context):
             len(missing_iris), len(required_iris)))
 
         df_added = []
-
+        id_name = (
+            "id_tiles"
+            if context.config("home_location_source") == "tiles"
+            else "building_id"
+        )
         for iris_id in sorted(missing_iris):
             centroid = df_iris[df_iris["iris_id"] == iris_id]["geometry"].centroid.iloc[0]
 
             df_added.append({
                 "iris_id": iris_id, "geometry": centroid,
                 "commune_id": iris_id[:5],
                 "weight" : 1,
-                "building_id": -1
+                id_name: -1
             })
 
         df_added = gpd.GeoDataFrame(pd.DataFrame.from_records(df_added), crs = df_addresses.crs)
diff --git a/synthesis/population/spatial/home/locations.py b/synthesis/population/spatial/home/locations.py
index 0babb70b..6733a16b 100644
--- a/synthesis/population/spatial/home/locations.py
+++ b/synthesis/population/spatial/home/locations.py
@@ -6,7 +6,8 @@ def configure(context):
     context.stage("synthesis.population.spatial.home.zones")
     context.stage("synthesis.locations.home.locations")
-
+    context.config("home_location_source", "addresses")
+    context.config("random_seed")
 
 def _sample_locations(context, args):
@@ -39,8 +40,10 @@ def _sample_locations(context, args):
 
     # Apply selection
     df_homes["geometry"] = df_locations.iloc[indices]["geometry"].values
-    df_homes["building_id"] = df_locations.iloc[indices]["building_id"].values
-
+    if context.config("home_location_source") == "tiles":
+        df_homes["id_tiles"] = df_locations.iloc[indices]["id_tiles"].values
+    else:
+        df_homes["building_id"] = df_locations.iloc[indices]["building_id"].values
     # Update progress
     context.progress.update()
@@ -61,5 +64,9 @@ def execute(context):
     )) as parallel:
         seeds = random.randint(10000, size = len(unique_iris_ids))
         df_homes = pd.concat(parallel.map(_sample_locations, zip(unique_iris_ids, seeds)))
-
-    return df_homes[["household_id", "commune_id", "building_id", "geometry"]]
+    out = (
+        ["household_id", "commune_id", "id_tiles", "geometry"]
+        if context.config("home_location_source") == "tiles"
+        else ["household_id", "commune_id", "building_id", "geometry"]
+    )
+    return df_homes[out]
From 88351a9f62199cda501a4d0626c95a5ec4f728a4 Mon Sep 17 00:00:00 2001
From: Marie Laurent
Date: Mon, 3 Jun 2024 12:23:54 +0200
Subject: [PATCH 03/19] categorisation education : age distribution

---
 data/bpe/cleaned.py                           |  2 +-
 synthesis/locations/education.py              | 58 ++++++++++++-------
 .../population/spatial/primary/candidates.py  | 28 +++++++--
 .../population/spatial/primary/locations.py   | 19 +++++-
 4 files changed, 77 insertions(+), 30 deletions(-)

diff --git a/data/bpe/cleaned.py b/data/bpe/cleaned.py
index e425e3b4..343dc8fa 100644
--- a/data/bpe/cleaned.py
+++ b/data/bpe/cleaned.py
@@ -134,7 +134,7 @@ def execute(context):
         df.loc[outside_indices, "imputed"] = True
 
     # Package up data set
-    df = df[["enterprise_id", "activity_type", "commune_id", "imputed", "x", "y"]]
+    df = df[["enterprise_id", "activity_type","TYPEQU" "commune_id", "imputed", "x", "y"]]
 
     df = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.x, df.y),crs="EPSG:2154")
 
diff --git a/synthesis/locations/education.py b/synthesis/locations/education.py
index 418048ab..d111399d 100644
--- a/synthesis/locations/education.py
+++ b/synthesis/locations/education.py
@@ -7,40 +7,58 @@ def configure(context):
     context.stage("data.bpe.cleaned")
     context.stage("data.spatial.municipalities")
 
+def fake_education(missing_communes, c, df_locations, df_zones):
+    # Fake education destinations as the centroid of zones that have no other destinations
+    print(
+        "Adding fake education locations for %d municipalities"
+        % (len(missing_communes))
+    )
+
+    df_added = []
+
+    for commune_id in sorted(missing_communes):
+        centroid = df_zones[df_zones["commune_id"] == commune_id][
+            "geometry"
+        ].centroid.iloc[0]
+
+        df_added.append({"commune_id": commune_id, "geometry": centroid})
+
+    df_added = gpd.GeoDataFrame(
+        pd.DataFrame.from_records(df_added), crs=df_locations.crs
+    )
+    df_added["fake"] = True
+    df_added["TYPEQU"] = c
+
+    return df_added
+
 def execute(context):
     df_locations = context.stage("data.bpe.cleaned")[[
-        "enterprise_id", "activity_type", "commune_id", "geometry"
+        "enterprise_id", "activity_type", "TYPEQU", "commune_id", "geometry"
     ]]
 
     df_locations = df_locations[df_locations["activity_type"] == "education"]
-    df_locations = df_locations[["commune_id", "geometry"]].copy()
+    df_locations = df_locations[["TYPEQU", "commune_id", "geometry"]].copy()
     df_locations["fake"] = False
 
     # Add education destinations to the centroid of zones that have no other destinations
     df_zones = context.stage("data.spatial.municipalities")
 
     required_communes = set(df_zones["commune_id"].unique())
-    missing_communes = required_communes - set(df_locations["commune_id"].unique())
 
-    if len(missing_communes) > 0:
-        print("Adding fake education locations for %d/%d municipalities" % (
-            len(missing_communes), len(required_communes)
-        ))
-
-        df_added = []
-
-        for commune_id in sorted(missing_communes):
-            centroid = df_zones[df_zones["commune_id"] == commune_id]["geometry"].centroid.iloc[0]
+    # Add education destinations in function of level education
+    for c in ["C1", "C2", "C3"]:
+        missing_communes = required_communes - set(
+            df_locations[df_locations["TYPEQU"].str.startswith(c)]["commune_id"].unique())
 
-            df_added.append({
-                "commune_id": commune_id, "geometry": centroid
-            })
-
-        df_added = gpd.GeoDataFrame(pd.DataFrame.from_records(df_added), crs = df_locations.crs)
-        df_added["fake"] = True
-
-        df_locations = pd.concat([df_locations, df_added])
+        if len(missing_communes) > 0:
+            df_locations = pd.concat([df_locations,fake_education(missing_communes, c, df_locations, df_zones)])
+
+    # Add education destinations in function of level education
+    missing_communes = required_communes - set(df_locations[~(df_locations["TYPEQU"].str.startswith(("C1", "C2", "C3")))]["commune_id"].unique())
 
+    if len(missing_communes) > 0:
+        df_locations = pd.concat([df_locations,fake_education(missing_communes, "C4", df_locations, df_zones)])
+
     # Define identifiers
     df_locations["location_id"] = np.arange(len(df_locations))
     df_locations["location_id"] = "edu_" + df_locations["location_id"].astype(str)
diff --git a/synthesis/population/spatial/primary/candidates.py b/synthesis/population/spatial/primary/candidates.py
index e0fcd412..9cfa9f1b 100644
--- a/synthesis/population/spatial/primary/candidates.py
+++ b/synthesis/population/spatial/primary/candidates.py
@@ -13,6 +13,17 @@ def configure(context):
 
     context.config("random_seed")
 
+EDUCATION_MAPPING = {
+    "primary_school": {"min_age": 0, "max_age": 10, "type_edu": "C1"},
+    "middle_school": {"min_age": 11, "max_age": 14, "type_edu": "C2"},
+    "high_school": {"min_age": 15, "max_age": 17, "type_edu": "C3"},
+    "higher_education": {
+        "min_age": 18,
+        "max_age": 110,
+        "type_edu": ("C4", "C5", "C6"),
+    },
+}
+
 def sample_destination_municipalities(context, arguments):
     # Load data
     origin_id, count, random_seed = arguments
@@ -64,7 +75,7 @@ def sample_locations(context, arguments):
     return df_result
 
 def process(context, purpose, random, df_persons, df_od, df_locations):
-    df_persons = df_persons[df_persons["has_%s_trip" % purpose]]
+    df_persons = df_persons[df_persons["has_%s_trip" % purpose.split("_")[0]]]
 
     # Sample commute flows based on population
     df_demand = df_persons.groupby("commune_id").size().reset_index(name = "count")
@@ -98,7 +109,7 @@ def process(context, purpose, random, df_persons, df_od, df_locations):
 
 def execute(context):
     # Prepare population data
-    df_persons = context.stage("synthesis.population.enriched")[["person_id", "household_id"]].copy()
+    df_persons = context.stage("synthesis.population.enriched")[["person_id", "household_id", "age"]].copy()
     df_trips = context.stage("synthesis.population.trips")
 
     df_persons["has_work_trip"] = df_persons["person_id"].isin(df_trips[
@@ -125,14 +136,19 @@ def execute(context):
     )
 
     df_locations = context.stage("synthesis.locations.education")
-    df_education = process(context, "education", random, df_persons,
-        df_education_od, df_locations
-    )
+    df_education = []
+    for prefix, education_type in EDUCATION_MAPPING.items():
+        df_education.append(
+            process(context, "education_" + prefix, random,
+            df_persons[df_persons["age"].between( education_type["min_age"],education_type["max_age"])],
+            df_education_od,df_locations[df_locations["TYPEQU"].str.startswith(education_type["type_edu"])])
+        )
+    df_education = pd.concat(df_education).sort_values(["origin_id", "destination_id"])
 
     return dict(
         work_candidates = df_work,
         education_candidates = df_education,
         persons = df_persons[df_persons["has_work_trip"] | df_persons["has_education_trip"]][[
-            "person_id", "household_id", "commune_id", "has_work_trip", "has_education_trip"
+            "person_id", "household_id", "age", "commune_id", "has_work_trip", "has_education_trip"
         ]]
     )
diff --git a/synthesis/population/spatial/primary/locations.py b/synthesis/population/spatial/primary/locations.py
index 14b3a586..4278d807 100644
--- a/synthesis/population/spatial/primary/locations.py
+++ b/synthesis/population/spatial/primary/locations.py
@@ -9,6 +9,17 @@ def configure(context):
     context.stage("synthesis.locations.work")
     context.stage("synthesis.locations.education")
 
+EDUCATION_MAPPING = {
+    "primary_school": {"min_age": 0, "max_age": 10, "type_edu": "C1"},
+    "middle_school": {"min_age": 11, "max_age": 14, "type_edu": "C2"},
+    "high_school": {"min_age": 15, "max_age": 17, "type_edu": "C3"},
+    "higher_education": {
+        "min_age": 18,
+        "max_age": 110,
+        "type_edu": ("C4", "C5", "C6"),
+    },
+}
+
 def define_distance_ordering(df_persons, df_candidates, progress):
     indices = []
 
@@ -106,13 +117,15 @@ def execute(context):
     df_work_candidates = pd.merge(df_work_candidates, df_locations, how = "left", on = "location_id")
     df_work_candidates = gpd.GeoDataFrame(df_work_candidates)
 
-    df_locations = context.stage("synthesis.locations.education")[["location_id", "geometry"]]
+    df_locations = context.stage("synthesis.locations.education")[["TYPEQU", "location_id", "geometry"]]
     df_education_candidates = data["education_candidates"]
     df_education_candidates = pd.merge(df_education_candidates, df_locations, how = "left", on = "location_id")
     df_education_candidates = gpd.GeoDataFrame(df_education_candidates)
 
     # Assign destinations
     df_work = process(context, "work", df_work, df_work_candidates)
-    df_education = process(context, "education", df_education, df_education_candidates)
-
+    education = []
+    for prefix, education_type in EDUCATION_MAPPING.items():
+        education.append(process(context, "education_" + prefix,df_education[df_education["age"].between(education_type["min_age"],education_type["max_age"])],df_education_candidates[df_education_candidates["TYPEQU"].str.startswith(education_type["type_edu"])]))
+    df_education = pd.concat(education).sort_index()
     return df_work, df_education

From ee147df5a9f154f7ce388a465d65fe81614ce0aa Mon Sep 17 00:00:00 2001
From: Marie Laurent
Date: Mon, 3 Jun 2024 13:49:57 +0200
Subject: [PATCH 04/19] categorisation education : weight with type & correction

---
 data/bpe/cleaned.py              |  2 +-
 synthesis/locations/education.py | 23 +++++++++++++++++++++--
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/data/bpe/cleaned.py b/data/bpe/cleaned.py
index 343dc8fa..04cb7211 100644
--- a/data/bpe/cleaned.py
+++ b/data/bpe/cleaned.py
@@ -134,7 +134,7 @@ def execute(context):
         df.loc[outside_indices, "imputed"] = True
 
     # Package up data set
-    df = df[["enterprise_id", "activity_type","TYPEQU" "commune_id", "imputed", "x", "y"]]
+    df = df[["enterprise_id", "activity_type","TYPEQU", "commune_id", "imputed", "x", "y"]]
 
     df = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.x, df.y),crs="EPSG:2154")
 
diff --git a/synthesis/locations/education.py b/synthesis/locations/education.py
index d111399d..6690dbff 100644
--- a/synthesis/locations/education.py
+++ b/synthesis/locations/education.py
@@ -7,6 +7,20 @@ def configure(context):
     context.stage("data.bpe.cleaned")
     context.stage("data.spatial.municipalities")
 
+EDUCATION_WEIGHT_MAP = [
+    ("C101", 100),  # Preschools
+    ("C102", 50),   # Intercommunal preschools
+    ("C104", 145),  # Elementary schools
+    ("C105", 80),   # Intercommunal elementary schools
+    ("C301", 700),  # General and technological high schools, multi-purpose high schools
+    ("C302", 285),  # Professional high schools
+    ("C303", 100),  # Agricultural high schools
+    ("C304", 30),   # General and technological classes in professional high schools
+    ("C305", 30),   # Professional classes in general and technological high schools
+    ("C403", 1000), # Business schools
+    ("C5", 2000),   # University
+]
+
 def fake_education(missing_communes, c, df_locations, df_zones):
@@ -28,6 +42,7 @@ def fake_education(missing_communes, c, df_locations, df_zones):
     )
     df_added["fake"] = True
     df_added["TYPEQU"] = c
+    df_added["weight"] = 1
 
     return df_added
 
@@ -39,7 +54,11 @@ def execute(context):
     df_locations = df_locations[df_locations["activity_type"] == "education"]
     df_locations = df_locations[["TYPEQU", "commune_id", "geometry"]].copy()
     df_locations["fake"] = False
-
+    df_locations["weight"] = 500
+    for prefix, weight in EDUCATION_WEIGHT_MAP:
+        df_locations.loc[df_locations["TYPEQU"].str.startswith(prefix), "weight"] = (
+            weight
+        )
     # Add education destinations to the centroid of zones that have no other destinations
     df_zones = context.stage("data.spatial.municipalities")
 
@@ -63,4 +82,4 @@ def execute(context):
     df_locations["location_id"] = np.arange(len(df_locations))
     df_locations["location_id"] = "edu_" + df_locations["location_id"].astype(str)
 
-    return df_locations[["location_id", "commune_id", "fake", "geometry"]]
+    return df_locations[["location_id","TYPEQU", "weight", "commune_id", "fake", "geometry"]]
From 85ddbdb76edc52844ee3c5866cde476d95c23363 Mon Sep 17 00:00:00 2001
From: Marie Laurent
Date: Thu, 6 Jun 2024 15:38:51 +0200
Subject: [PATCH 05/19] categorisation education : weight and location with file

---
 synthesis/locations/education.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/synthesis/locations/education.py b/synthesis/locations/education.py
index 6690dbff..ef18d190 100644
--- a/synthesis/locations/education.py
+++ b/synthesis/locations/education.py
@@ -7,6 +7,10 @@ def configure(context):
     context.stage("data.bpe.cleaned")
     context.stage("data.spatial.municipalities")
 
+    if context.config("education_location_source", "bpe") == 'addresses' :
+        context.config("data_path")
+        context.config("education_file", "education/education_addresses.geojson")
+
 EDUCATION_WEIGHT_MAP = [
     ("C101", 100),  # Preschools
     ("C102", 50),   # Intercommunal preschools
@@ -59,10 +63,20 @@ def execute(context):
         df_locations.loc[df_locations["TYPEQU"].str.startswith(prefix), "weight"] = (
             weight
         )
+
     # Add education destinations to the centroid of zones that have no other destinations
     df_zones = context.stage("data.spatial.municipalities")
 
-    required_communes = set(df_zones["commune_id"].unique())
+    required_communes = set(df_zones["commune_id"].unique())
+
+    if context.config("education_location_source") == 'addresses':
+        # Data following the format of data.bpe.cleaned
+        df_education = gpd.read_file("{}/{}".format(context.config("data_path"), context.config("education_file")))[["TYPEQU", "commune_id","weight", "geometry"]]
+        df_education["fake"] = False
+        df_education = df_education.to_crs("2154")
+        list_type = set(df_education["TYPEQU"].unique())
+        df_locations = pd.concat([df_locations[~(df_locations["TYPEQU"].isin(list_type))],df_education])
+
     # Add education destinations in function of level education
     for c in ["C1", "C2", "C3"]:

From a25cc5d0b5bd47f56207b77995fdce350805059e Mon Sep 17 00:00:00 2001
From: Marie Laurent
Date: Mon, 10 Jun 2024 17:40:41 +0200
Subject: [PATCH 06/19] categorisation education : setting new param
 education_location_source

---
 synthesis/locations/education.py              | 58 +++++++++++--------
 .../population/spatial/primary/candidates.py  | 20 ++++---
 .../population/spatial/primary/locations.py   | 13 +++--
 3 files changed, 54 insertions(+), 37 deletions(-)

diff --git a/synthesis/locations/education.py b/synthesis/locations/education.py
index ef18d190..836c63bc 100644
--- a/synthesis/locations/education.py
+++ b/synthesis/locations/education.py
@@ -58,40 +58,48 @@ def execute(context):
     df_locations = df_locations[df_locations["activity_type"] == "education"]
     df_locations = df_locations[["TYPEQU", "commune_id", "geometry"]].copy()
     df_locations["fake"] = False
-    df_locations["weight"] = 500
-    for prefix, weight in EDUCATION_WEIGHT_MAP:
-        df_locations.loc[df_locations["TYPEQU"].str.startswith(prefix), "weight"] = (
-            weight
-        )
+    df_locations["weight"] = 500
 
     # Add education destinations to the centroid of zones that have no other destinations
     df_zones = context.stage("data.spatial.municipalities")
 
     required_communes = set(df_zones["commune_id"].unique())
 
-    if context.config("education_location_source") == 'addresses':
-        # Data following the format of data.bpe.cleaned
-        df_education = gpd.read_file("{}/{}".format(context.config("data_path"), context.config("education_file")))[["TYPEQU", "commune_id","weight", "geometry"]]
-        df_education["fake"] = False
-        df_education = df_education.to_crs("2154")
-        list_type = set(df_education["TYPEQU"].unique())
-        df_locations = pd.concat([df_locations[~(df_locations["TYPEQU"].isin(list_type))],df_education])
+    if context.config("education_location_source") != 'bpe': # either weighted or addresses
+        for prefix, weight in EDUCATION_WEIGHT_MAP:
+            df_locations.loc[df_locations["TYPEQU"].str.startswith(prefix), "weight"] = (
+                weight
+            )
+
+        if context.config("education_location_source") == 'addresses':
+            # Data following the format of data.bpe.cleaned
+            df_education = gpd.read_file("{}/{}".format(context.config("data_path"), context.config("education_file")))[["TYPEQU", "commune_id","weight", "geometry"]]
+            df_education["fake"] = False
+            df_education = df_education.to_crs("2154")
+            list_type = set(df_education["TYPEQU"].unique())
+            df_locations = pd.concat([df_locations[~(df_locations["TYPEQU"].isin(list_type))],df_education])
 
-    # Add education destinations in function of level education
-    for c in ["C1", "C2", "C3"]:
-        missing_communes = required_communes - set(
-            df_locations[df_locations["TYPEQU"].str.startswith(c)]["commune_id"].unique())
+        # Add education destinations in function of level education
+        for c in ["C1", "C2", "C3"]:
+            missing_communes = required_communes - set(
+                df_locations[df_locations["TYPEQU"].str.startswith(c)]["commune_id"].unique())
 
-        if len(missing_communes) > 0:
-            df_locations = pd.concat([df_locations,fake_education(missing_communes, c, df_locations, df_zones)])
+            if len(missing_communes) > 0:
+                df_locations = pd.concat([df_locations,fake_education(missing_communes, c, df_locations, df_zones)])
 
-    # Add education destinations in function of level education
-    missing_communes = required_communes - set(df_locations[~(df_locations["TYPEQU"].str.startswith(("C1", "C2", "C3")))]["commune_id"].unique())
+        # Add education destinations for last level education
+        missing_communes = required_communes - set(df_locations[~(df_locations["TYPEQU"].str.startswith(("C1", "C2", "C3")))]["commune_id"].unique())
 
-    if len(missing_communes) > 0:
-        df_locations = pd.concat([df_locations,fake_education(missing_communes, "C4", df_locations, df_zones)])
+        if len(missing_communes) > 0:
+
+            df_locations = pd.concat([df_locations,fake_education(missing_communes, "C4", df_locations, df_zones)])
+    else :
+        missing_communes = required_communes - set(df_locations["commune_id"].unique())
+        if len(missing_communes) > 0:
+
+            df_locations = pd.concat([df_locations,fake_education(missing_communes, "C", df_locations, df_zones)])
 
     # Define identifiers
     df_locations["location_id"] = np.arange(len(df_locations))
     df_locations["location_id"] = "edu_" + df_locations["location_id"].astype(str)
diff --git a/synthesis/population/spatial/primary/candidates.py b/synthesis/population/spatial/primary/candidates.py
index 9cfa9f1b..0cf666a7 100644
--- a/synthesis/population/spatial/primary/candidates.py
+++ b/synthesis/population/spatial/primary/candidates.py
@@ -12,6 +12,7 @@ def configure(context):
     context.stage("synthesis.population.trips")
 
     context.config("random_seed")
+    context.config("education_location_source", "bpe")
 
 EDUCATION_MAPPING = {
     "primary_school": {"min_age": 0, "max_age": 10, "type_edu": "C1"},
@@ -136,14 +137,17 @@ def execute(context):
     )
 
     df_locations = context.stage("synthesis.locations.education")
-    df_education = []
-    for prefix, education_type in EDUCATION_MAPPING.items():
-        df_education.append(
-            process(context, "education_" + prefix, random,
-            df_persons[df_persons["age"].between( education_type["min_age"],education_type["max_age"])],
-            df_education_od,df_locations[df_locations["TYPEQU"].str.startswith(education_type["type_edu"])])
-        )
-    df_education = pd.concat(df_education).sort_values(["origin_id", "destination_id"])
+    if context.config("education_location_source") == 'bpe':
+        df_education = process(context, "education", random, df_persons, df_education_od, df_locations)
+    else :
+        df_education = []
+        for prefix, education_type in EDUCATION_MAPPING.items():
+            df_education.append(
+                process(context, "education_" + prefix, random,
+                df_persons[df_persons["age"].between( education_type["min_age"],education_type["max_age"])],
+                df_education_od,df_locations[df_locations["TYPEQU"].str.startswith(education_type["type_edu"])])
+            )
+        df_education = pd.concat(df_education).sort_values(["origin_id", "destination_id"])
 
     return dict(
         work_candidates = df_work,
diff --git a/synthesis/population/spatial/primary/locations.py b/synthesis/population/spatial/primary/locations.py
index 4278d807..143bc2b6 100644
--- a/synthesis/population/spatial/primary/locations.py
+++ b/synthesis/population/spatial/primary/locations.py
@@ -9,6 +9,8 @@ def configure(context):
     context.stage("synthesis.locations.work")
     context.stage("synthesis.locations.education")
 
+    context.config("education_location_source", "bpe")
+
 EDUCATION_MAPPING = {
     "primary_school": {"min_age": 0, "max_age": 10, "type_edu": "C1"},
     "middle_school": {"min_age": 11, "max_age": 14, "type_edu": "C2"},
@@ -124,8 +126,11 @@ def execute(context):
 
     # Assign destinations
     df_work = process(context, "work", df_work, df_work_candidates)
-    education = []
-    for prefix, education_type in EDUCATION_MAPPING.items():
-        education.append(process(context, "education_" + prefix,df_education[df_education["age"].between(education_type["min_age"],education_type["max_age"])],df_education_candidates[df_education_candidates["TYPEQU"].str.startswith(education_type["type_edu"])]))
-    df_education = pd.concat(education).sort_index()
+    if context.config("education_location_source") == 'bpe':
+        df_education = process(context, "education", df_education, df_education_candidates)
+    else :
+        education = []
+        for prefix, education_type in EDUCATION_MAPPING.items():
+            education.append(process(context, "education_" + prefix,df_education[df_education["age"].between(education_type["min_age"],education_type["max_age"])],df_education_candidates[df_education_candidates["TYPEQU"].str.startswith(education_type["type_edu"])]))
+        df_education = pd.concat(education).sort_index()
     return df_work, df_education
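After PATCH 05 and 06, the education behaviour hangs on a single parameter read in several stages. A minimal configuration sketch (the `education_file` value is the default declared in `configure` and is only read when the source is `addresses`; everything else is illustrative):

```yaml
config:
  # [...]
  # one of: bpe (default), weighted, addresses
  education_location_source: weighted
  # only used with education_location_source: addresses
  education_file: education/education_addresses.geojson
```
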
From 0d6490a4140d73c88a6652ed866d1013c800204c Mon Sep 17 00:00:00 2001
From: Marie Laurent
Date: Mon, 1 Jul 2024 15:49:53 +0200
Subject: [PATCH 07/19] new output: map graphs analysis with age and flow
 purpose

---
 documentation/flow_output.py | 108 +++++++++++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)
 create mode 100644 documentation/flow_output.py

diff --git a/documentation/flow_output.py b/documentation/flow_output.py
new file mode 100644
index 00000000..48db200e
--- /dev/null
+++ b/documentation/flow_output.py
@@ -0,0 +1,108 @@
+import pandas as pd
+import geopandas as gpd
+import numpy as np
+
+import matplotlib.pyplot as plt
+import matplotlib.ticker as tck
+import shapely.geometry as geo
+
+import plotly.express as px
+import documentation.plotting as plotting
+
+SAMPLING_RATE = 0.05
+
+def configure(context):
+
+    if not context.config("analysis_from_file",False) :
+        context.stage("synthesis.population.trips")
+        context.stage("synthesis.population.spatial.locations")
+        context.stage("synthesis.population.enriched")
+    context.stage("data.spatial.departments")
+
+    context.config("comparison_file",None)
+    context.config("output_prefix", "ile_de_france_")
+    context.config("output_formats", ["csv", "gpkg"])
+    context.config("output_path")
+    context.config("data_path")
+
+
+def execute(context):
+
+    figures = {
+        "Yrs:0-10":{"min_age": 0, "max_age": 10,},
+        "Yrs:11-14":{"min_age": 11, "max_age": 14,},
+        "Yrs:15-18":{"min_age": 15, "max_age": 17,},
+        "Yrs:18-25":{"min_age": 18, "max_age": 25,},
+        "Yrs:25-50":{"min_age": 26, "max_age": 50,},
+        "Yrs:50-65":{"min_age": 51, "max_age": 65,},
+        "Yrs:65-75":{"min_age": 66, "max_age": 75,},
+        "Yrs:75+":{"min_age": 76, "max_age": 110,},}
+
+    if not context.config("analysis_from_file"):
+        # from simulation cache
+        df_trips = context.stage("synthesis.population.trips")
+        df_persons = context.stage("synthesis.population.enriched")[["person_id", "household_id","age"]]
+        df_locations = context.stage("synthesis.population.spatial.locations")[[
+            "person_id", "activity_index", "geometry"
+        ]]
+        df_trips["preceding_activity_index"] = df_trips["trip_index"]
+        df_trips["following_activity_index"] = df_trips["trip_index"] + 1
+
+    else :
+        # from trips, activities and persons files
+        print("Loading data ...")
+        df_trips = pd.read_csv(f'{context.config("output_path")}/{context.config("output_prefix")}trips.csv',sep=';')[["person_id","trip_index" ,"following_activity_index","following_purpose"]]
+        df_locations = gpd.read_parquet(f'{context.config("output_path")}/{context.config("output_prefix")}activities.geoparquet') if "geoparquet" in context.config("output_formats") else gpd.read_file(f'{context.config("output_path")}/{context.config("output_prefix")}activities.gpkg')
+        df_persons = pd.read_csv(f'{context.config("output_path")}/{context.config("output_prefix")}persons.csv',sep=';')[["person_id", "household_id","age"]]
+
+    # Write spatial trips
+    df_spatial = pd.merge(df_trips, df_locations[[
+        "person_id", "activity_index", "geometry"
+    ]].rename(columns = {
+        "activity_index": "following_activity_index",
+    }), how = "left", on = ["person_id", "following_activity_index"])
+    df_spatial = pd.merge(df_spatial,df_persons,how = "left", on = ["person_id",])
+    df_spatial = gpd.GeoDataFrame(df_spatial, crs = "EPSG:2154").to_crs("4326")
+
+    list_purpose = list(df_spatial["following_purpose"].unique())
+
+    # grid 1km of location data
+    df_departments = context.stage("data.spatial.departments")
+    poly_dep = df_departments.unary_union
+    df_tiles = gpd.read_file(
+        f'{context.config("data_path")}/tiles_2019/grille200m_metropole.gpkg',
+        mask=poly_dep,
+    ) if context.config("comparison_file") is None else gpd.read_parquet(f'{context.config("data_path")}/tiles_2019/{context.config("comparison_file")}')
+    df_tiles = df_tiles.to_crs("4326")
+    df_tile = df_tiles[["id_carr_1km","geometry"]].dissolve(by="id_carr_1km").reset_index()
+
+    df_stats = gpd.sjoin(df_tile,df_spatial,how="left")
+
+
+    point = df_tiles.unary_union.centroid # changes with poly_dep
+    print("Printing maps ...")
+    for prefix, figure in figures.items():
+        df_select_age = df_stats[df_stats["age"].between(figure["min_age"],figure["max_age"])]
+        df_select_age = df_select_age.dissolve(by=["id_carr_1km","following_purpose"],aggfunc="count").reset_index()
+        print(prefix)
+        df_select_age = df_select_age[~(df_select_age["geometry"].isna())]
+        df_select_age["following_purpose"] = df_select_age["following_purpose"].astype('str')
+
+        for purpose in list_purpose :
+            df_select = df_select_age[df_select_age["following_purpose"]==purpose].rename(columns={"person_id":"count"})
+            df_tiles_select = pd.DataFrame() if context.config("comparison_file") is None else df_tiles[(df_tiles["age"]==prefix)&(df_tiles["following_purpose"]==purpose)]
+            if df_tiles_select.empty :
+                df_select = gpd.sjoin(df_select.drop(columns=['index_right']),df_tile,how='right',predicate="contains").fillna(0)
+                df_select = df_select[df_select["count"] != 0]
+                fig = px.choropleth_mapbox(df_select,geojson=df_select.geometry,locations=df_select.index,color="count", opacity= 0.7,color_continuous_scale='reds',
+                    mapbox_style = 'open-street-map',center=dict(lat= point.y,lon=point.x),title=f"Comparison flow distribution with previous simulation for {prefix} group with {purpose} purpose")
+                fig.write_html(f'{context.config("output_path")}/{context.config("output_prefix")}{prefix}_{purpose}.html')
+            else :
+                df_tiles_select = gpd.sjoin(df_tiles_select,df_tile,how='right',predicate="contains").fillna(0)
+                df_select = gpd.sjoin(df_select.drop(columns=['index_right']),df_tiles_select.drop(columns=[ 'index_left']),how='right',predicate="contains").rename(columns={"count_left":"volume_studied_simu","count_right":"volume_compared_simu"}).fillna(0)
+                df_select["volume_difference"] = df_select["volume_studied_simu"] - df_select["volume_compared_simu"]
+                #df_select = df_select[df_select["volume_difference"] != 0]
+                px.choropleth_mapbox(df_select,geojson=df_select.geometry,locations=df_select.index,color="volume_difference", opacity= 0.7,color_continuous_scale="picnic", color_continuous_midpoint= 0,hover_name="id_carr_1km_right", hover_data=["volume_studied_simu", "volume_compared_simu"],
+                    mapbox_style = 'open-street-map',center=dict(lat= point.y,lon=point.x),title=f"Comparison flow distribution with previous simulation for {prefix} group with {purpose} purpose").write_html(f'{context.config("output_path")}/{context.config("output_prefix")}{prefix}_{purpose}.html')
+
+    
\ No newline at end of file

From dce59e7afbc284cc283769b83fcfbb34947cddac Mon Sep 17 00:00:00 2001
From: Vincent Leblond
Date: Mon, 3 Jun 2024 17:33:10 +0200
Subject: [PATCH 08/19] feat: option for not filtering entd on requested
 departments

(cherry picked from commit a5f08efb2cd4416e5b98ac1f980a9dd78c3749ee)
---
 data/hts/entd/filtered.py | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/data/hts/entd/filtered.py b/data/hts/entd/filtered.py
index 9fc4793c..405da2f2 100644
--- a/data/hts/entd/filtered.py
+++ b/data/hts/entd/filtered.py
@@ -10,27 +10,31 @@ def configure(context):
     context.stage("data.hts.entd.cleaned")
     context.stage("data.spatial.codes")
 
+    context.config("filter_entd", False)
+
 def execute(context):
+    filter_entd = context.config("filter_entd")
     df_codes = context.stage("data.spatial.codes")
 
     df_households, df_persons, df_trips = context.stage("data.hts.entd.cleaned")
 
     # Filter for non-residents
-    requested_departments = df_codes["departement_id"].unique()
-    f = df_persons["departement_id"].astype(str).isin(requested_departments)
-    df_persons = df_persons[f]
+    if filter_entd:
+        requested_departments = df_codes["departement_id"].unique()
+        f = df_persons["departement_id"].astype(str).isin(requested_departments)
+        df_persons = df_persons[f]
 
-    # Filter for people going outside of the area (because they have NaN distances)
-    remove_ids = set()
+        # Filter for people going outside of the area (because they have NaN distances)
+        remove_ids = set()
 
-    remove_ids |= set(df_trips[
-        ~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments)
-    ]["person_id"].unique())
+        remove_ids |= set(df_trips[
+            ~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments)
+        ]["person_id"].unique())
 
-    df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)]
+        df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)]
 
-    # Only keep trips and households that still have a person
-    df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())]
-    df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]
+        # Only keep trips and households that still have a person
+        df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())]
+        df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]
 
     # Finish up
     df_households = df_households[hts.HOUSEHOLD_COLUMNS + ["urban_type", "income_class"]]

From 460ce2676169f7b442b5b275169c97437cc35664 Mon Sep 17 00:00:00 2001
From: Vincent Leblond
Date: Mon, 3 Jun 2024 18:08:32 +0200
Subject: [PATCH 09/19] fix: remove person with problem in entd

(cherry picked from commit b2835cad0bde8e4558a04c79d336e11435e9276d)
---
 data/hts/entd/filtered.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/data/hts/entd/filtered.py b/data/hts/entd/filtered.py
index 405da2f2..2e1db1b2 100644
--- a/data/hts/entd/filtered.py
+++ b/data/hts/entd/filtered.py
@@ -31,10 +31,13 @@ def execute(context):
         ]["person_id"].unique())
 
         df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)]
+
+    else:
+        df_persons = df_persons[~df_persons["person_id"].isin([34581])] # remove persons leading to activity types error
 
-        # Only keep trips and households that still have a person
-        df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())]
-        df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]
+    # Only keep trips and households that still have a person
+    df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())]
+    df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]

From 30709d31c3f9a05cdff333058b7b689e878ab1f6 Mon Sep 17 00:00:00 2001
From: Marie Laurent
Date: Thu, 25 Jul 2024 16:29:44 +0200
Subject: [PATCH 10/19] categorisation education : new distribution of
 education od with age range

---
 data/od/cleaned.py                            | 12 ++++++-
 data/od/raw.py                                |  3 +-
 data/od/weighted.py                           | 32 ++++++++++++-------
 synthesis/locations/education.py              |  7 ++--
 .../population/spatial/primary/candidates.py  |  4 +--
 5 files changed, 38 insertions(+), 20 deletions(-)

diff --git a/data/od/cleaned.py b/data/od/cleaned.py
index c8cf81a2..e13348f1 100644
--- a/data/od/cleaned.py
+++ b/data/od/cleaned.py
@@ -58,12 +58,22 @@ def execute(context):
 
     assert not np.any(df_work["commute_mode"].isna())
 
+    # Clean age range for education
+    df_education["age_range"] = np.nan
+    df_education.loc[df_education["AGEREV10"] <= 6, "age_range"] = "primary_school"
+    df_education.loc[df_education["AGEREV10"] == 11, "age_range"] = "middle_school"
+    df_education.loc[df_education["AGEREV10"] == 15, "age_range"] = "high_school"
+    df_education.loc[df_education["AGEREV10"] >= 18, "age_range"] = "higher_education"
+    df_education["age_range"] = df_education["age_range"].astype("category")
+
+    assert not np.any(df_education["age_range"].isna())
+
     # Aggregate the flows
     print("Aggregating work ...")
     df_work = df_work.groupby(["origin_id", "destination_id", "commute_mode"])["weight"].sum().reset_index()
 
     print("Aggregating education ...")
-    df_education = df_education.groupby(["origin_id", "destination_id"])["weight"].sum().reset_index()
+    df_education = df_education.groupby(["origin_id", "destination_id","age_range"])["weight"].sum().reset_index()
 
     df_work["weight"] = df_work["weight"].fillna(0.0)
     df_education["weight"] = df_education["weight"].fillna(0.0)
diff --git a/data/od/raw.py b/data/od/raw.py
index 0b1cad4d..41bc515b 100644
--- a/data/od/raw.py
+++ b/data/od/raw.py
@@ -57,7 +57,8 @@ def execute(context):
         "COMMUNE":"str",
         "ARM":"str",
         "IPONDI":"float",
-        "DCETUF":"str"
+        "DCETUF":"str",
+        "AGEREV10":"int"
     }
 
     with zipfile.ZipFile(
diff --git a/data/od/weighted.py b/data/od/weighted.py
index d0defebc..21997564 100644
--- a/data/od/weighted.py
+++ b/data/od/weighted.py
@@ -12,20 +12,24 @@ def configure(context):
     context.stage("data.od.cleaned")
     context.stage("data.spatial.codes")
 
+    context.config("output_path")
+    context.config("education_location_source","bpe")
 
-def fix_origins(df, commune_ids, purpose):
+def fix_origins(df, commune_ids, purpose,category):
     existing_ids = set(np.unique(df["origin_id"]))
     missing_ids = commune_ids - existing_ids
+    categories = set(np.unique(df[category]))
 
     rows = []
     for origin_id in missing_ids:
         for destination_id in commune_ids:
-            rows.append((origin_id, destination_id, 1.0 if origin_id == destination_id else 0.0))
+            for category_name in categories :
+                rows.append((origin_id, destination_id, category_name, 1.0/len(categories) if origin_id == destination_id else 0.0))
 
     print("Fixing %d origins for %s" % (len(missing_ids), purpose))
 
     return pd.concat([df, pd.DataFrame.from_records(
-        rows, columns = ["origin_id", "destination_id", "weight"]
+        rows, columns = ["origin_id", "destination_id", category, "weight"]
     )]).sort_values(["origin_id", "destination_id"])
 
 def execute(context):
@@ -35,25 +39,29 @@ def execute(context):
     # Load data
     df_work, df_education = context.stage("data.od.cleaned")
 
-    # Aggregate work (we do not consider different modes at the moment)
-    df_work = df_work[["origin_id", "destination_id", "weight"]].groupby(["origin_id", "destination_id"]).sum().reset_index()
-
     # Add missing origins
-    df_work = fix_origins(df_work, commune_ids, "work")
-    df_education = fix_origins(df_education, commune_ids, "education")
+    df_work = fix_origins(df_work, commune_ids, "work","commute_mode")
+    df_education = fix_origins(df_education, commune_ids, "education","age_range")
 
+    # Aggregate work (we do not consider different modes at the moment)
+    df_work = df_work[["origin_id", "destination_id", "weight"]].groupby(["origin_id", "destination_id"]).sum().reset_index()
+
     # Compute totals
     df_total = df_work[["origin_id", "weight"]].groupby("origin_id").sum().reset_index().rename({ "weight" : "total" }, axis = 1)
     df_work = pd.merge(df_work, df_total, on = "origin_id")
 
-    df_total = df_education[["origin_id", "weight"]].groupby("origin_id").sum().reset_index().rename({ "weight" : "total" }, axis = 1)
-    df_education = pd.merge(df_education, df_total, on = "origin_id")
-
+    df_total = df_education[["origin_id","age_range", "weight"]].groupby(["origin_id","age_range"]).sum().reset_index().rename({ "weight" : "total" }, axis = 1)
+    df_education = pd.merge(df_education, df_total, on = ["origin_id","age_range"])
+
+    if context.config("education_location_source") == 'bpe':
+        # Aggregate education (we do not consider different age range with bpe source)
+        df_education = df_education[["origin_id", "destination_id", "weight","total"]].groupby(["origin_id", "destination_id"]).sum().reset_index()
 
     # Compute weight
     df_work["weight"] /= df_work["total"]
     df_education["weight"] /= df_education["total"]
 
     del df_work["total"]
     del df_education["total"]
-
+    df_education = df_education.fillna(0.0)
+
     return df_work, df_education
diff --git a/synthesis/locations/education.py b/synthesis/locations/education.py
index 836c63bc..4fbd48fc 100644
--- a/synthesis/locations/education.py
+++ b/synthesis/locations/education.py
@@ -77,13 +77,12 @@ def execute(context):
             df_education["fake"] = False
             df_education = df_education.to_crs("2154")
             list_type = set(df_education["TYPEQU"].unique())
-            df_locations = pd.concat([df_locations[~(df_locations["TYPEQU"].isin(list_type))],df_education])
+            df_locations = pd.concat([df_locations[~(df_locations["TYPEQU"].str.startswith(tuple(list_type)))],df_education[df_education["commune_id"].isin(required_communes)]])
 
 
         # Add education destinations in function of level education
         for c in ["C1", "C2", "C3"]:
-            missing_communes = required_communes - set(
-                df_locations[df_locations["TYPEQU"].str.startswith(c)]["commune_id"].unique())
+            missing_communes = required_communes - set(df_locations[df_locations["TYPEQU"].str.startswith(c)]["commune_id"].unique())
 
             if len(missing_communes) > 0:
                 df_locations = pd.concat([df_locations,fake_education(missing_communes, c, df_locations, df_zones)])
@@ -93,7 +92,7 @@ def execute(context):
 
         if len(missing_communes) > 0:
 
-            df_locations = pd.concat([df_locations,fake_education(missing_communes, "C4", df_locations, df_zones)])
+            df_locations = pd.concat([df_locations,fake_education(missing_communes, "C4", df_locations, df_zones)])
diff --git a/synthesis/population/spatial/primary/candidates.py b/synthesis/population/spatial/primary/candidates.py
index 4fda302a..14cdedd4 100644
--- a/synthesis/population/spatial/primary/candidates.py
+++ b/synthesis/population/spatial/primary/candidates.py
@@ -10,7 +10,7 @@ def configure(context):
     context.stage("synthesis.population.spatial.home.zones")
     context.stage("synthesis.population.enriched")
     context.stage("synthesis.population.trips")
-
+    context.config("output_path")
     context.config("random_seed")
     context.config("education_location_source", "bpe")
 
@@ -149,7 +149,7 @@ def execute(context):
             df_education.append(
                 process(context, "education_" + prefix, random,
                 df_persons[df_persons["age"].between( education_type["min_age"],education_type["max_age"])],
-                df_education_od,df_locations[df_locations["TYPEQU"].str.startswith(education_type["type_edu"])])
+                df_education_od[df_education_od["age_range"]==prefix],df_locations[df_locations["TYPEQU"].str.startswith(education_type["type_edu"])])
             )
         df_education = pd.concat(df_education).sort_values(["origin_id", "destination_id"])

From fdf37e908cdf417bbcc2acc2030e57dd868c0241 Mon Sep 17 00:00:00 2001
From: Marie Laurent
Date: Mon, 29 Jul 2024 14:48:54 +0200
Subject: [PATCH 11/19] clean up and correction output

---
 data/hts/entd/cleaned.py                            |  2 ++
 data/hts/entd/filtered.py                           | 13 +++++--------
 data/od/weighted.py                                 |  2 +-
 documentation/flow_output.py                        |  7 ++++---
 synthesis/population/spatial/primary/candidates.py  |  2 +-
 5 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/data/hts/entd/cleaned.py b/data/hts/entd/cleaned.py
index a8814c94..e811cfed 100644
--- a/data/hts/entd/cleaned.py
+++ b/data/hts/entd/cleaned.py
@@ -247,6 +247,8 @@ def execute(context):
 
     # Socioprofessional class
     df_persons["socioprofessional_class"] = df_persons["CS24"].fillna(80).astype(int) // 10
+
+    hts.fix_activity_types(df_trips)
 
     return df_households, df_persons, df_trips
 
diff --git a/data/hts/entd/filtered.py b/data/hts/entd/filtered.py
index 2e1db1b2..476b409b 100644
--- a/data/hts/entd/filtered.py
+++ b/data/hts/entd/filtered.py
@@ -10,10 +10,10 @@ def configure(context):
     context.stage("data.hts.entd.cleaned")
     context.stage("data.spatial.codes")
 
-    context.config("filter_entd", False)
+    context.config("filter_hts", True)
 
 def execute(context):
-    filter_entd = context.config("filter_entd")
+    filter_entd = context.config("filter_hts")
     df_codes = context.stage("data.spatial.codes")
 
     df_households, df_persons, df_trips = context.stage("data.hts.entd.cleaned")
@@ -31,13 +31,10 @@ def execute(context):
         ]["person_id"].unique())
 
         df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)]
-
-    else:
-        df_persons = df_persons[~df_persons["person_id"].isin([34581])] # remove persons leading to activity types error
 
-    # Only keep trips and households that still have a person
-    df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())]
-    df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]
+    # Only keep trips and households that still have a person
+    df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())]
+    df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]
 
     # Finish up
     df_households = df_households[hts.HOUSEHOLD_COLUMNS + ["urban_type", "income_class"]]
diff --git a/data/od/weighted.py b/data/od/weighted.py
index 21997564..6e537f15 100644
--- a/data/od/weighted.py
+++ b/data/od/weighted.py
@@ -12,7 +12,7 @@ def configure(context):
     context.stage("data.od.cleaned")
     context.stage("data.spatial.codes")
 
-    context.config("output_path")
+
     context.config("education_location_source","bpe")
 
diff --git a/documentation/flow_output.py b/documentation/flow_output.py
index 48db200e..b8cf9b1a 100644
--- a/documentation/flow_output.py
+++ b/documentation/flow_output.py
@@ -39,6 +39,7 @@ def execute(context):
         "Yrs:75+":{"min_age": 76, "max_age": 110,},}
 
     if not context.config("analysis_from_file"):
+        print("Loading simulation data ...")
        # from simulation cache
         df_trips = context.stage("synthesis.population.trips")
         df_persons = context.stage("synthesis.population.enriched")[["person_id", "household_id","age"]]
@@ -84,7 +85,6 @@ def execute(context):
     for prefix, figure in figures.items():
         df_select_age = df_stats[df_stats["age"].between(figure["min_age"],figure["max_age"])]
         df_select_age = df_select_age.dissolve(by=["id_carr_1km","following_purpose"],aggfunc="count").reset_index()
-        print(prefix)
         df_select_age = df_select_age[~(df_select_age["geometry"].isna())]
         df_select_age["following_purpose"] = df_select_age["following_purpose"].astype('str')
 
@@ -101,8 +101,9 @@ def execute(context):
                 df_tiles_select = gpd.sjoin(df_tiles_select,df_tile,how='right',predicate="contains").fillna(0)
                 df_select = gpd.sjoin(df_select.drop(columns=['index_right']),df_tiles_select.drop(columns=[ 'index_left']),how='right',predicate="contains").rename(columns={"count_left":"volume_studied_simu","count_right":"volume_compared_simu"}).fillna(0)
                 df_select["volume_difference"] = df_select["volume_studied_simu"] - df_select["volume_compared_simu"]
-                #df_select = df_select[df_select["volume_difference"] != 0]
-                px.choropleth_mapbox(df_select,geojson=df_select.geometry,locations=df_select.index,color="volume_difference", opacity= 0.7,color_continuous_scale="picnic", color_continuous_midpoint= 0,hover_name="id_carr_1km_right", hover_data=["volume_studied_simu", "volume_compared_simu"],
+                df_select = df_select[(df_select["volume_studied_simu"] != 0 )| (df_select["volume_compared_simu"] != 0)]
+                df_select["pourcentage_vol"] = df_select["volume_difference"] / df_select["volume_compared_simu"]
+                px.choropleth_mapbox(df_select,geojson=df_select.geometry,locations=df_select.index,color="volume_difference", opacity= 0.7,color_continuous_scale="picnic", color_continuous_midpoint= 0,hover_name="id_carr_1km_right", hover_data=["volume_studied_simu", "volume_compared_simu","pourcentage_vol"],
                     mapbox_style = 'open-street-map',center=dict(lat= point.y,lon=point.x),title=f"Comparison flow distribution with previous simulation for {prefix} group with {purpose} purpose").write_html(f'{context.config("output_path")}/{context.config("output_prefix")}{prefix}_{purpose}.html')
 
diff --git a/synthesis/population/spatial/primary/candidates.py b/synthesis/population/spatial/primary/candidates.py
index 14cdedd4..fd61c99e 100644
--- a/synthesis/population/spatial/primary/candidates.py
+++ b/synthesis/population/spatial/primary/candidates.py
@@ -10,7 +10,7 @@ def configure(context):
     context.stage("synthesis.population.spatial.home.zones")
     context.stage("synthesis.population.enriched")
     context.stage("synthesis.population.trips")
-    context.config("output_path")
+
     context.config("random_seed")
     context.config("education_location_source", "bpe")
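PATCH 11 above renames the `filter_entd` option introduced in PATCH 08 to `filter_hts` and flips its default to `true`. A configuration sketch for keeping the survey unfiltered (the key is as declared in `data/hts/entd/filtered.py`):

```yaml
config:
  # [...]
  filter_hts: false
```
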
From dcc9ff07a130aeb230c8ecce8e206303c1c85e26 Mon Sep 17 00:00:00 2001
From: Marie Laurent
Date: Mon, 5 Aug 2024 11:53:04 +0200
Subject: [PATCH 12/19] change docs and test

---
 docs/population.md | 37 +++++++++++++++++++++++++++++++++++++
 tests/testdata.py  |  2 +-
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/docs/population.md b/docs/population.md
index a94d9ad0..eb4a2cf0 100644
--- a/docs/population.md
+++ b/docs/population.md
@@ -343,3 +343,40 @@ To make use of the urban type, the following data is needed:
 - Put the downloaded *zip* file into `data/urban_type`, so you will have the file `data/urban_type/UU2020_au_01-01-2023.zip`
 
 Then, you should be able to run the pipeline with the configuration explained above.
+
+### No employee enterprise
+The pipeline can exclude from the workplace distribution all enterprises for which no employee is indicated in the Sirene data. This is activated via the configuration:
+
+```yaml
+config:
+  # [...]
+  exclude_no_employee: true
+```
+### INSEE tiles
+
+The pipeline can work with INSEE's 200m grid data to locate the population, instead of using BAN or BDTOPO data. Persons are located at the centre of the tiles, using the INSEE population weight of each tile.
+
+- In order to use this location source, [download the 200m grid data from INSEE](https://www.insee.fr/fr/statistiques/7655475?sommaire=7655515). The pipeline is currently compatible with the 2019 data set.
+- Put the downloaded *zip* file into `data/tiles_2019`, so you will have the file `data/tiles_2019/Filosofi2019_carreaux_200m_gpkg.zip`
+
+Then, activate it via the configuration:
+
+```yaml
+config:
+  # [...]
+  home_location_source: tiles
+```
+
+This parameter can also select BDTOPO data alone (value `buildings`) or BDTOPO together with BAN data (value `addresses`, the default) to locate the population.
+
+### Education distribution
+
+So far, the synthetic data generated by the pipeline assigns persons to education locations without any distinction of age or type of educational institution. To avoid sending children to university, for example, a matching of educational institutions and persons by age range can be activated via the configuration:
+
+```yaml
+config:
+  # [...]
+  education_location_source: weighted
+```
+
+For each type of institution, a default weight is attributed in the pipeline. To perform a matching weighted by known student numbers per educational institution, the pipeline can also work with a list of educational institutions from an external geojson or geopackage file, using `addresses` as the parameter value. This file must include the columns `TYPEQU`, `commune_id`, `weight` and `geometry`, where `weight` is the number of students and `TYPEQU` the type of educational institution, coded like the BPE ones.
\ No newline at end of file
diff --git a/tests/testdata.py b/tests/testdata.py
index 48c78399..cffd7992 100644
--- a/tests/testdata.py
+++ b/tests/testdata.py
@@ -546,7 +546,7 @@ def create(output_path):
     df_education["ARM"] = "Z"
     df_education["IPONDI"] = 1.0
 
-    columns = ["COMMUNE", "DCETUF", "ARM", "IPONDI"]
+    columns = ["COMMUNE", "DCETUF", "ARM", "IPONDI","AGEREV10"]
     df_education.columns = columns
 
     with zipfile.ZipFile("%s/rp_2019/RP2019_MOBSCO_csv.zip" % output_path, "w") as archive:
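The external education file documented above only needs to respect the column contract. The following is an illustrative sketch of producing such a file with geopandas; the two school records, their `TYPEQU` codes, weights and coordinates are invented for the example, only the column names follow the documentation:

```python
import geopandas as gpd
from shapely.geometry import Point

# Hypothetical records: one elementary school and one university,
# with "weight" holding the known number of students per institution
# and "TYPEQU" a BPE-style equipment code.
df_schools = gpd.GeoDataFrame({
    "TYPEQU": ["C104", "C501"],
    "commune_id": ["75101", "75105"],
    "weight": [320, 15000],
    "geometry": [Point(651200, 6862300), Point(652600, 6860500)],
}, crs="EPSG:2154")  # Lambert-93; the pipeline reprojects to EPSG:2154 anyway

df_schools.to_file("education/education_addresses.geojson", driver="GeoJSON")
```
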
df_locations[[
+        "person_id", "activity_index", "geometry"
+    ]].rename(columns = {
+        "activity_index": "following_activity_index",
+    }), how = "left", on = ["person_id", "following_activity_index"])
+    df_spatial = pd.merge(df_spatial,df_persons,how = "left", on = ["person_id",])
+    df_spatial = gpd.GeoDataFrame(df_spatial, crs = "EPSG:2154").to_crs("4326")
+    df_stats = gpd.sjoin(df_grid,df_spatial,how="left")
+    return df_stats[['id_carr_1km', 'geometry','person_id', 'following_purpose', 'household_id', 'age']]
 
 def execute(context):
     figures = {
@@ -37,6 +44,7 @@ def execute(context):
         "Yrs:50-65":{"min_age": 51, "max_age": 65,},
         "Yrs:65-75":{"min_age": 66, "max_age": 75,},
         "Yrs:75+":{"min_age": 76, "max_age": 110,},}
+    comparison_file = context.config("output_prefix") if context.config("comparison_file_prefix") is None else context.config("comparison_file_prefix")
 
     if not context.config("analysis_from_file"):
         print("Loading simulation data ...")
@@ -49,57 +57,56 @@ def execute(context):
 
         df_trips["preceding_activity_index"] = df_trips["trip_index"]
         df_trips["following_activity_index"] = df_trips["trip_index"] + 1
-    else :
+    else : # load trips, activities and persons from files
         print("Loading data from files ...")
         df_trips = pd.read_csv(f'{context.config("output_path")}/{context.config("output_prefix")}trips.csv',sep=';')[["person_id","trip_index" ,"following_activity_index","following_purpose"]]
         df_locations = gpd.read_parquet(f'{context.config("output_path")}/{context.config("output_prefix")}activities.geoparquet') if "geoparquet" in context.config("output_formats") else gpd.read_file(f'{context.config("output_path")}/{context.config("output_prefix")}activities.gpkg')
-        df_persons = pd.read_csv(f'{context.config("output_path")}/{context.config("output_prefix")}persons.csv',sep=';')[["person_id", "household_id","age"]]
-
-    # Write spatial trips
-    df_spatial = pd.merge(df_trips, df_locations[[
-        "person_id", "activity_index", "geometry"
-    ]].rename(columns = {
-        "activity_index": "following_activity_index",
-    }), how = "left", on = ["person_id", "following_activity_index"])
-    df_spatial = pd.merge(df_spatial,df_persons,how = "left", on = ["person_id",])
-    df_spatial = gpd.GeoDataFrame(df_spatial, crs = "EPSG:2154").to_crs("4326")
-
-    list_purpose = list(df_spatial["following_purpose"].unique())
+        df_persons = pd.read_csv(f'{context.config("output_path")}/{context.config("output_prefix")}persons.csv',sep=';')[["person_id", "household_id","age"]]
+        print("Loading comparison data ...")
+        df_trips_comp = pd.read_csv(f'{context.config("output_path")}/{comparison_file}trips.csv',sep=';')[["person_id","trip_index" ,"following_activity_index","following_purpose"]]
+        df_locations_comp = gpd.read_parquet(f'{context.config("output_path")}/{comparison_file}activities.geoparquet') if "geoparquet" in context.config("output_formats") else gpd.read_file(f'{context.config("output_path")}/{comparison_file}activities.gpkg')
+        df_persons_comp = pd.read_csv(f'{context.config("output_path")}/{comparison_file}persons.csv',sep=';')[["person_id", "household_id","age"]]
+
+    list_purpose = list(df_trips["following_purpose"].unique())
 
     # 1km grid of location data
     df_departments = context.stage("data.spatial.departments")
     poly_dep = df_departments.unary_union
-    df_tiles = gpd.read_file(
-        f'{context.config("data_path")}/tiles_2019/grille200m_metropole.gpkg',
+    df_grids = gpd.read_file(
+        f'{context.config("data_path")}/grid/grille200m_metropole.gpkg',
         mask=poly_dep,
-    ) if context.config("comparison_file") is None else 
gpd.read_parquet(f'{context.config("data_path")}/tiles_2019/{context.config("comparison_file")}')
-    df_tiles = df_tiles.to_crs("4326")
-    df_tile = df_tiles[["id_carr_1km","geometry"]].dissolve(by="id_carr_1km").reset_index()
-
-    df_stats = gpd.sjoin(df_tile,df_spatial,how="left")
-
-
-    point = df_tiles.unary_union.centroid # changes with poly_dep
-    print("Printing maps ...")
+    )
+    df_grids = df_grids.to_crs("4326")
+    df_grid = df_grids[["id_carr_1km","geometry"]].dissolve(by="id_carr_1km").reset_index()
+    
+    df_stats = stat_grid(df_trips,df_locations,df_persons,df_grid)
+    df_grids = stat_grid(df_trips_comp,df_locations_comp,df_persons_comp,df_grid)
+    point = df_grid.unary_union.centroid # changes with poly_dep
+    print("Printing grids...")
     for prefix, figure in figures.items():
         df_select_age = df_stats[df_stats["age"].between(figure["min_age"],figure["max_age"])]
         df_select_age = df_select_age.dissolve(by=["id_carr_1km","following_purpose"],aggfunc="count").reset_index()
         df_select_age = df_select_age[~(df_select_age["geometry"].isna())]
         df_select_age["following_purpose"] = df_select_age["following_purpose"].astype('str')
+        df_grids_age = df_grids[df_grids["age"].between(figure["min_age"],figure["max_age"])]
+        df_grids_age = df_grids_age.dissolve(by=["id_carr_1km","following_purpose"],aggfunc="count").reset_index()
+        df_grids_age = df_grids_age[~(df_grids_age["geometry"].isna())]
+        df_grids_age["following_purpose"] = df_grids_age["following_purpose"].astype('str')
+
         for purpose in list_purpose :
             df_select = df_select_age[df_select_age["following_purpose"]==purpose].rename(columns={"person_id":"count"})
-            df_tiles_select = pd.DataFrame() if context.config("comparison_file") is None else df_tiles[(df_tiles["age"]==prefix)&(df_tiles["following_purpose"]==purpose)]
-            if df_tiles_select.empty :
-                df_select = gpd.sjoin(df_select.drop(columns=['index_right']),df_tile,how='right',predicate="contains").fillna(0)
+            df_grids_select = df_grids_age[df_grids_age["following_purpose"]==purpose].rename(columns={"person_id":"count"})
+            if context.config("output_prefix") == comparison_file :
+                df_select = gpd.sjoin(df_select,df_grid,how='right',predicate="contains").fillna(0)
                 df_select = df_select[df_select["count"] != 0]
                 fig = px.choropleth_mapbox(df_select,geojson=df_select.geometry,locations=df_select.index,color="count",
                                         opacity= 0.7,color_continuous_scale='reds',
-                                        mapbox_style = 'open-street-map',center=dict(lat= point.y,lon=point.x),title=f"Comparison flow distribution with previous simulation for {prefix} group with {purpose} purpose")
+                                        mapbox_style = 'open-street-map',center=dict(lat= point.y,lon=point.x),title=f"Location flow distribution for {prefix} group with {purpose} purpose")
                 fig.write_html(f'{context.config("output_path")}/{context.config("output_prefix")}{prefix}_{purpose}.html')
             else :
-                df_tiles_select = gpd.sjoin(df_tiles_select,df_tile,how='right',predicate="contains").fillna(0)
-                df_select = gpd.sjoin(df_select.drop(columns=['index_right']),df_tiles_select.drop(columns=[ 'index_left']),how='right',predicate="contains").rename(columns={"count_left":"volume_studied_simu","count_right":"volume_compared_simu"}).fillna(0)
+                df_grids_select = gpd.sjoin(df_grids_select,df_grid,how='right',predicate="contains").fillna(0)
+                df_select = gpd.sjoin(df_select,df_grids_select.drop(columns=[ 'index_left']),how='right',predicate="contains").rename(columns={"count_left":"volume_studied_simu","count_right":"volume_compared_simu"}).fillna(0)
                 df_select["volume_difference"] = 
df_select["volume_studied_simu"] - df_select["volume_compared_simu"]
                df_select = df_select[(df_select["volume_studied_simu"] != 0 )| (df_select["volume_compared_simu"] != 0)]
                df_select["pourcentage_vol"] = df_select["volume_difference"] / df_select["volume_compared_simu"]
diff --git a/docs/population.md b/docs/population.md
index 09c6b803..d45552d4 100644
--- a/docs/population.md
+++ b/docs/population.md
@@ -12,6 +12,7 @@ This guide will cover the following steps:
 
 - [Gathering the data](#section-data)
 - [Running the pipeline](#section-population)
+- [Analysing synthetic population](#section-analysis)
 
 ## Gathering the data
 
@@ -405,3 +406,28 @@ config:
 
 Caution, this method will fail on communes where the Filosofi subpopulation distributions
 are missing. In this case, we fall back to the `uniform` method.
+
+## Analysing synthetic population
+
+In addition to creating synthetic populations, it is possible to output files for analysis.
+
+### Comparison of populations on a grid
+
+Using the comparison_flow_volume pipeline in the Analysis directory, you can generate grids comparing the volumes of 2 synthetic populations on a grid of 1km² squares for each age group and each trip purpose. Like with population creation, this pipeline is run with the [synpp](https://github.com/eqasim-org/synpp) runner and all parameters needed must be included in the `config.yml` file.
+
+To be able to use this pipeline, you must already have created at least one synthetic population (one for volume visualisation, two for comparison) and [download the France grid from INSEE](https://www.insee.fr/fr/statistiques/fichier/6214726/grille200m_gpkg.zip). From this *zip* file, you need to extract `grille200m_metropole.gpkg` and put it into `data/grid`.
+
+Then you need to open the `config.yml` and add the `analysis.grid.comparison_flow_volume` stage in the `run` section. To properly use the comparison_flow_volume pipeline, you have to provide the following config:
+
+```yaml
+config:
+  output_prefix: name_output_studied_
+  comparison_file_prefix: name_output_compared_
+  analysis_from_file: true
+```
+Before running it, make sure that both populations use the same file format.
+After running, you should find all grids for each age group and each trip purpose in the `output`
+folder as: `{output_prefix}_{age group}_{trip purpose}.html`
+
+Note:
+With `analysis_from_file` at False, the last synthetic population is studied by default. Also if `output_prefix` and `comparison_file_prefix` refer to the same outputs, or `comparison_file_prefix` is not specified, then only a volume visualisation of this particular population is produced.
\ No newline at end of file
From 191a79706ef8494dad0699b914382ecdfd0f77e5 Mon Sep 17 00:00:00 2001
From: Vincent Leblond
Date: Thu, 8 Aug 2024 11:51:50 +0200
Subject: [PATCH 17/19] fix: docs

---
 docs/population.md | 42 ++++++++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/docs/population.md b/docs/population.md
index d45552d4..0575dca0 100644
--- a/docs/population.md
+++ b/docs/population.md
@@ -345,22 +345,24 @@ To make use of the urban type, the following data is needed:
 
 Then, you should be able to run the pipeline with the configuration explained above. 
 
-### No employee enterprise
-The pipeline allows to exclude all enterprises without any employees indicated in the Sirene data from the workplace distribution. 
It can be activated via the configuration:
+
+### Exclude enterprises with no employee
+
+The pipeline allows to exclude all enterprises without any employee (trancheEffectifsEtablissement is NA, "NN" or "00") indicated in the Sirene data from the workplace distribution. It can be activated via this configuration:
 
 ```yaml
 config:
   # [...]
   exclude_no_employee: true
 ```
-### INSEE tiles
+### INSEE 200m tiles data
 
-The pipeline allows to work with INSEE's 200m grid data to locate the population instead of using BAN or BDTOPO data. The population is located at the centre of each tile, weighted by the INSEE population weight of the tile.
+The pipeline can use the INSEE 200m tiles data to locate the population instead of using BAN or BDTOPO data. The population is located at the center of each tile, weighted by the INSEE population weight of the tile.
 
 - In order to use this location source, [download the 200m grid data from INSEE](https://www.insee.fr/fr/statistiques/7655475?sommaire=7655515). The pipeline is currently compatible with the 2019 data set.
 - Put the downloaded *zip* file into `data/tiles_2019`, so you will have the file `data/tiles_2019/Filosofi2019_carreaux_200m_gpkg.zip`.
 
 Then, activate it via the configuration:
 
 ```yaml
 config:
@@ -368,12 +370,12 @@ config:
   home_location_source: tiles
 ```
 
 This parameter can also select BDTOPO data alone (value `building`) or combined with BAN data (value `addresses`) to locate the population.
 
-### Education distribution
+### Education activities locations
 
 The synthetic data generated by the pipeline so far assigns the population to education locations without any distinction of age or type of educational institution.
-To avoid sending children to university, for example, a matching of educational institutions and persons by age range can be activated via the configuration:
+To avoid sending young children to high school, for example, a matching of educational institutions and persons by age range can be activated via the configuration:
 
 ```yaml
 config:
@@ -381,8 +383,16 @@ config:
   education_location_source: weighted
 ```
 
-For each type of institution, a weight is attributed by default in the pipeline. To perform a matching weighted by known student numbers per educational institution, the pipeline can also work with a list of educational institutions from an external geojson or geopackage file with `addresses` as parameter value.
-This file must include `TYPEQU`, `commune_id`, `weight` and `geometry` as columns, with `weight` the number of students and `TYPEQU` the type of educational institution, coded like the BPE codes.
+For each type of institution, a weight is attributed by default in the pipeline. To perform a matching weighted by known student numbers per educational institution, the pipeline can also work with a list of educational institutions from an external geojson or geopackage file with `addresses` as the parameter value.
+This file must include `TYPEQU`, `commune_id`, `weight` and `geometry` as columns, with `weight` the number of students and `TYPEQU` the type of educational institution, coded like the BPE codes. The file location is set via `education_file`:
+
+```yaml
+config:
+  # [...]
+  education_location_source: addresses
+  education_file: education/education_addresses.geojson
+```
+
 ### Income
 
 This pipeline allows using the [Bhepop2](https://github.com/tellae/bhepop2) package for income assignation.
@@ -405,14 +416,13 @@ config:
 
 Caution, this method will fail on communes where the Filosofi subpopulation distributions
 are missing. In this case, we fall back to the `uniform` method.
-
 ## Analysing synthetic population
 
 In addition to creating synthetic populations, it is possible to output files for analysis.
 
 ### Comparison of populations on a grid
 
-Using the comparison_flow_volume pipeline in the Analysis directory, you can generate grids comparing the volumes of 2 synthetic populations on a grid of 1km² squares for each age group and each trip purpose. Like with population creation, this pipeline is run with the [synpp](https://github.com/eqasim-org/synpp) runner and all parameters needed must be included in the `config.yml` file.
+Using the comparison_flow_volume pipeline in the Analysis directory, you can generate grids comparing the volumes of two synthetic populations on a grid of 1km² squares for each age group and each trip purpose. Like with population creation, the pipeline is run with the [synpp](https://github.com/eqasim-org/synpp) runner and all parameters needed must be included in the `config.yml` file.
 
 To be able to use this pipeline, you must already have created at least one synthetic population (one for volume visualisation, two for comparison) and [download the France grid from INSEE](https://www.insee.fr/fr/statistiques/fichier/6214726/grille200m_gpkg.zip). From this *zip* file, you need to extract `grille200m_metropole.gpkg` and put it into `data/grid`.
 
 Then you need to open the `config.yml` and add the `analysis.grid.comparison_flow_volume` stage in the `run` section. To properly use the comparison_flow_volume pipeline, you have to provide the following config:
 
 ```yaml
 config:
   output_prefix: name_output_studied_
   comparison_file_prefix: name_output_compared_
   analysis_from_file: true
 ```
+
 Before running it, make sure that both populations use the same file format.
 After running, you should find all grids for each age group and each trip purpose in the `output`
 folder as: `{output_prefix}_{age group}_{trip purpose}.html`
 
-Note:
-With `analysis_from_file` at False, the last synthetic population is studied by default. Also if `output_prefix` and `comparison_file_prefix` refer to the same outputs, or `comparison_file_prefix` is not specified, then only a volume visualisation of this particular population is produced.
\ No newline at end of file
+Note:
+With `analysis_from_file` set to False, the last synthetic population is studied by default. Also, if `output_prefix` and `comparison_file_prefix` refer to the same outputs, or `comparison_file_prefix` is not specified, only a volume visualisation of this particular population is produced.
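+
+As a reference, a complete configuration could look as follows. This is only a sketch: the prefix values are placeholders for your own run names, and `output_path` must point to the folder containing both populations:
+
+```yaml
+run:
+  - analysis.grid.comparison_flow_volume
+
+config:
+  # [...]
+  output_path: output
+  output_prefix: idf_studied_
+  comparison_file_prefix: idf_compared_
+  analysis_from_file: true
+```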
From bfaae74455dca75b54a88f9551d6ae4d8b365b7b Mon Sep 17 00:00:00 2001
From: Marie Laurent
Date: Fri, 6 Sep 2024 17:45:07 +0200
Subject: [PATCH 18/19] fix: apply review requests and recommendations

---
 data/bpe/cleaned.py                           |  5 +-
 data/external/education.py                    | 33 ++++++++++++
 data/hts/entd/cleaned.py                      |  2 -
 data/od/weighted.py                           |  2 +-
 data/tiles/raw.py                             |  8 +--
 synthesis/locations/education.py              | 52 ++++++++-----------
 synthesis/locations/home/addresses.py         |  8 +--
 synthesis/locations/home/locations.py         | 17 ++----
 synthesis/population/enriched.py              |  9 +++-
 .../population/spatial/home/locations.py      | 13 ++---
 .../population/spatial/primary/candidates.py  | 42 +++++++--------
 .../population/spatial/primary/locations.py   | 15 ++----
 12 files changed, 106 insertions(+), 100 deletions(-)
 create mode 100644 data/external/education.py

diff --git a/data/bpe/cleaned.py b/data/bpe/cleaned.py
index 04cb7211..30e1cad3 100644
--- a/data/bpe/cleaned.py
+++ b/data/bpe/cleaned.py
@@ -57,6 +57,9 @@ def execute(context):
 
     df["activity_type"] = df["activity_type"].astype("category")
 
+    # Rename the education type column and assign a default weight
+    df = df.rename(columns={"TYPEQU":"education_type"})
+    df["weight"] = 500
     # Clean coordinates
     df["x"] = df["LAMBERT_X"].astype(str).str.replace(",", ".").astype(float)
     df["y"] = df["LAMBERT_Y"].astype(str).str.replace(",", ".").astype(float)
@@ -134,7 +137,7 @@ def execute(context):
         df.loc[outside_indices, "imputed"] = True
 
     # Package up data set
-    df = df[["enterprise_id", "activity_type","TYPEQU", "commune_id", "imputed", "x", "y"]]
+    df = df[["enterprise_id", "activity_type","education_type", "commune_id", "imputed", "x", "y","weight"]]
 
     df = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.x, df.y),crs="EPSG:2154")
 
diff --git a/data/external/education.py b/data/external/education.py
new file mode 100644
index 00000000..78950ce1
--- /dev/null
+++ b/data/external/education.py
@@ -0,0 +1,33 @@
+import pandas as pd
+import geopandas as gpd
+
+def configure(context):
+    context.stage("data.bpe.cleaned")
+    context.stage("data.spatial.municipalities")
+
+    context.config("data_path")
+    context.config("education_file", "education/education_addresses.geojson")
+
+def execute(context):
+    # BPE education locations, keeping their default weight
+    df_locations = context.stage("data.bpe.cleaned")[[
+        "activity_type", "education_type", "commune_id","weight", "geometry"
+    ]]
+
+    df_locations = df_locations[df_locations["activity_type"] == "education"]
+    df_locations = df_locations[["activity_type","education_type", "commune_id", "weight", "geometry"]].copy()
+    df_locations["fake"] = False
+
+    df_zones = context.stage("data.spatial.municipalities")
+    required_communes = set(df_zones["commune_id"].unique())
+
+    # External education addresses replace BPE locations of the same types
+    df_education = gpd.read_file("{}/{}".format(context.config("data_path"), context.config("education_file")))[["education_type", "commune_id","weight", "geometry"]]
+    df_education["fake"] = False
+    df_education = df_education.to_crs("2154")
+    df_education["activity_type"] = "education"
+    list_type = set(df_education["education_type"].unique())
+    df_locations = pd.concat([df_locations[~(df_locations["education_type"].str.startswith(tuple(list_type)))],df_education[df_education["commune_id"].isin(required_communes)]])
+
+    return df_locations
diff --git a/data/hts/entd/cleaned.py b/data/hts/entd/cleaned.py
index be2ffc94..51bfd966 100644
--- a/data/hts/entd/cleaned.py
+++ b/data/hts/entd/cleaned.py
@@ -247,8 +247,6 @@ def execute(context):
 
     # Socioprofessional class
     df_persons["socioprofessional_class"] = df_persons["CS24"].fillna(80).astype(int) // 10
-
-    
hts.fix_activity_types(df_trips) # Fix activity types (because of 1 inconsistent ENTD data) hts.fix_activity_types(df_trips) diff --git a/data/od/weighted.py b/data/od/weighted.py index 6e537f15..f50702f6 100644 --- a/data/od/weighted.py +++ b/data/od/weighted.py @@ -24,7 +24,7 @@ def fix_origins(df, commune_ids, purpose,category): for origin_id in missing_ids: for destination_id in commune_ids: for category_name in categories : - rows.append((origin_id, destination_id, category_name, 1.0/len(categories) if origin_id == destination_id else 0.0)) + rows.append((origin_id, destination_id, category_name, 1.0 if origin_id == destination_id else 0.0)) print("Fixing %d origins for %s" % (len(missing_ids), purpose)) diff --git a/data/tiles/raw.py b/data/tiles/raw.py index 756b1a0f..b42a5d33 100644 --- a/data/tiles/raw.py +++ b/data/tiles/raw.py @@ -34,24 +34,24 @@ def execute(context): f'{context.path()}/{context.config("tiles_file")}', mask=poly_dep, )[["idcar_200m", "lcog_geo", "ind", "men", "geometry"]].rename( - columns={"idcar_200m": "id_tiles", "men": "weight"} + columns={"idcar_200m": "home_location_id", "men": "weight"} ) else: df_tiles = gpd.read_file( f'{context.config("data_path")}/{context.config("tiles_path")}/{context.config("tiles_file")}', mask=poly_dep, )[["idcar_200m", "lcog_geo", "ind", "men", "geometry"]].rename( - columns={"idcar_200m": "id_tiles", "men": "weight"} + columns={"idcar_200m": "home_location_id", "men": "weight"} ) - df_tiles["id_tiles"] = df_tiles["id_tiles"].str[14:] + df_tiles["home_location_id"] = df_tiles["home_location_id"].str[14:] df_tiles["geometry"] = df_tiles["geometry"].centroid df_tiles["department_id"] = df_tiles["lcog_geo"].str[:2] for department_id in df_departments["departement_id"].values: assert np.count_nonzero(df_tiles["department_id"] == department_id) > 0 - return df_tiles[["id_tiles", "weight", "geometry"]] + return df_tiles[["home_location_id", "weight", "geometry"]] def validate(context): diff --git a/synthesis/locations/education.py b/synthesis/locations/education.py index 4fbd48fc..629e3683 100644 --- a/synthesis/locations/education.py +++ b/synthesis/locations/education.py @@ -4,12 +4,12 @@ import geopandas as gpd def configure(context): - context.stage("data.bpe.cleaned") context.stage("data.spatial.municipalities") - if context.config("education_location_source", "bpe") == 'addresses' : - context.config("data_path") - context.config("education_file", "education/education_addresses.geojson") + if context.config("education_location_source","bpe") == "addresses": + context.stage("data.external.education", alias = "location_source") + else: + context.stage("data.bpe.cleaned", alias = "location_source") EDUCATION_WEIGHT_MAP = [ ("C101", 100), # Preschools @@ -22,7 +22,7 @@ def configure(context): ("C304", 30), # General and technological classes in professional high schools ("C305", 30), # Professional classes in general and technological high schools ("C403", 1000), # Business schools - ("C5", 2000), # University + ("C501", 2000), # University ] def fake_education(missing_communes, c, df_locations, df_zones): @@ -45,62 +45,54 @@ def fake_education(missing_communes, c, df_locations, df_zones): pd.DataFrame.from_records(df_added), crs=df_locations.crs ) df_added["fake"] = True - df_added["TYPEQU"] = c + df_added["education_type"] = c df_added["weight"] = 1 return df_added def execute(context): - df_locations = context.stage("data.bpe.cleaned")[[ - "enterprise_id", "activity_type", "TYPEQU", "commune_id", "geometry" - ]] + df_locations = 
context.stage("location_source")
 
     df_locations = df_locations[df_locations["activity_type"] == "education"]
-    df_locations = df_locations[["TYPEQU", "commune_id", "geometry"]].copy()
+    df_locations = df_locations[["education_type", "commune_id","weight", "geometry"]].copy()
     df_locations["fake"] = False
-    df_locations["weight"] = 500
 
     # Add education destinations to the centroid of zones that have no other destinations
     df_zones = context.stage("data.spatial.municipalities")
     required_communes = set(df_zones["commune_id"].unique())
-
-    for prefix, weight in EDUCATION_WEIGHT_MAP:
-        df_locations.loc[df_locations["TYPEQU"].str.startswith(prefix), "weight"] = (
-            weight
-        )
-
-    if context.config("education_location_source") == 'addresses':
-        # Data in model of bpe cleaned
-        df_education = gpd.read_file("{}/{}".format(context.config("data_path"), context.config("education_file")))[["TYPEQU", "commune_id","weight", "geometry"]]
-        df_education["fake"] = False
-        df_education = df_education.to_crs("2154")
-        list_type = set(df_education["TYPEQU"].unique())
-        df_locations = pd.concat([df_locations[~(df_locations["TYPEQU"].str.startswith(tuple(list_type)))],df_education[df_education["commune_id"].isin(required_communes)]])
-
-
-    for c in ["C1", "C2", "C3"]:
-        missing_communes = required_communes - set(df_locations[df_locations["TYPEQU"].str.startswith(c)]["commune_id"].unique())
-
-        if len(missing_communes) > 0:
-            df_locations = pd.concat([df_locations,fake_education(missing_communes, c, df_locations, df_zones)])
-
-    # Add education destinations for last level education
-    missing_communes = required_communes - set(df_locations[~(df_locations["TYPEQU"].str.startswith(("C1", "C2", "C3")))]["commune_id"].unique())
-
-    if len(missing_communes) > 0:
-        df_locations = pd.concat([df_locations,fake_education(missing_communes, "C4", df_locations, df_zones)])
-    else :
-        if len(missing_communes) > 0:
-            df_locations = pd.concat([df_locations,fake_education(missing_communes, "C", df_locations, df_zones)])
+    if context.config("education_location_source") != 'bpe': # either weighted or addresses
+        for prefix, weight in EDUCATION_WEIGHT_MAP:
+            df_locations.loc[df_locations["education_type"]==prefix, "weight"] = (
+                weight
+            )
+
+    if context.config("education_location_source") != 'bpe':
+        # Add education destinations according to education level
+        for c in ["C1", "C2", "C3"]:
+            missing_communes = required_communes - set(df_locations[df_locations["education_type"].str.startswith(c)]["commune_id"].unique())
+
+            if len(missing_communes) > 0:
+                df_locations = pd.concat([df_locations,fake_education(missing_communes, c, df_locations, df_zones)])
+
+        # Add education destinations for last level education
+        missing_communes = required_communes - set(df_locations[~(df_locations["education_type"].str.startswith(("C1", "C2", "C3")))]["commune_id"].unique())
+
+        if len(missing_communes) > 0:
+            df_locations = pd.concat([df_locations,fake_education(missing_communes, "C4", df_locations, df_zones)])
+    else :
+        missing_communes = required_communes - set(df_locations["commune_id"].unique())
+        if len(missing_communes) > 0:
+            df_locations = pd.concat([df_locations,fake_education(missing_communes, "C0", df_locations, df_zones)])
 
+    df_locations["education_type"] = df_locations["education_type"].str[:2].astype("category")
     # Define identifiers
     df_locations["location_id"] = np.arange(len(df_locations))
     df_locations["location_id"] = "edu_" + df_locations["location_id"].astype(str)
-
-    return df_locations[["location_id","TYPEQU", "weight", "commune_id", "fake", "geometry"]]
+    print(df_locations.columns)
+    return df_locations[["location_id","education_type", "commune_id","weight","fake", "geometry"]]
diff --git a/synthesis/locations/home/addresses.py b/synthesis/locations/home/addresses.py
index d66e5388..01410a13 100644
--- a/synthesis/locations/home/addresses.py
+++ b/synthesis/locations/home/addresses.py
@@ -57,17 +57,17 @@ def execute(context):
 
     # Put together 
matched and missing addresses df_addresses = pd.concat([df_addresses, df_missing]) - df_addresses = gpd.GeoDataFrame(df_addresses, crs = df_buildings.crs) + df_addresses = gpd.GeoDataFrame(df_addresses, crs = df_buildings.crs).rename(columns={"building_id":"home_location_id"}) # Obtain weights for all addresses if context.config("home_location_weight") == "housing": - df_count = df_addresses.groupby("building_id").size().reset_index(name = "count") - df_addresses = pd.merge(df_addresses, df_count, on = "building_id") + df_count = df_addresses.groupby("home_location_id").size().reset_index(name = "count") + df_addresses = pd.merge(df_addresses, df_count, on = "home_location_id") df_addresses["weight"] = df_addresses["housing"] / df_addresses["count"] else: df_addresses["weight"] = 1.0 - return df_addresses[["building_id", "weight", "geometry"]] + return df_addresses[["home_location_id", "weight", "geometry"]] def validate(context): assert context.config("home_location_source") in ("addresses", "buildings","tiles") diff --git a/synthesis/locations/home/locations.py b/synthesis/locations/home/locations.py index c35dc5ff..391748ec 100644 --- a/synthesis/locations/home/locations.py +++ b/synthesis/locations/home/locations.py @@ -10,9 +10,9 @@ def configure(context): context.stage("data.spatial.iris") if context.config("home_location_source", "addresses") == "tiles": - context.stage("data.tiles.raw") + context.stage("data.tiles.raw", alias = "location_source") else: - context.stage("synthesis.locations.home.addresses") + context.stage("synthesis.locations.home.addresses", alias = "location_source") def execute(context): # Find required IRIS @@ -20,11 +20,7 @@ def execute(context): required_iris = set(df_iris["iris_id"].unique()) # Load all addresses and add IRIS information - df_addresses = ( - context.stage("data.tiles.raw") - if context.config("home_location_source") == "tiles" - else context.stage("synthesis.locations.home.addresses") - ) + df_addresses = context.stage("location_source") print("Imputing IRIS into addresses ...") @@ -45,11 +41,6 @@ def execute(context): len(missing_iris), len(required_iris))) df_added = [] - id_name = ( - "id_tiles" - if context.config("home_location_source") == "tiles" - else "building_id" - ) for iris_id in sorted(missing_iris): centroid = df_iris[df_iris["iris_id"] == iris_id]["geometry"].centroid.iloc[0] @@ -57,7 +48,7 @@ def execute(context): "iris_id": iris_id, "geometry": centroid, "commune_id": iris_id[:5], "weight" : 1, - id_name: -1 + "home_location_id": -1 }) df_added = gpd.GeoDataFrame(pd.DataFrame.from_records(df_added), crs = df_addresses.crs) diff --git a/synthesis/population/enriched.py b/synthesis/population/enriched.py index 94a9ee6b..15fc5649 100644 --- a/synthesis/population/enriched.py +++ b/synthesis/population/enriched.py @@ -84,5 +84,12 @@ def execute(context): df_population.loc[df_population["number_of_bikes"] < df_population["household_size"], "bike_availability"] = "some" df_population.loc[df_population["number_of_bikes"] == 0, "bike_availability"] = "none" df_population["bike_availability"] = df_population["bike_availability"].astype("category") - + + # Add age range for education + df_population["age_range"] = "higher_education" + df_population.loc[df_population["age"]<=10,"age_range"] = "primary_school" + df_population.loc[df_population["age"].between(11,14),"age_range"] = "middle_school" + df_population.loc[df_population["age"].between(15,17),"age_range"] = "high_school" + df_population["age_range"] = 
df_population["age_range"].astype("category") + return df_population diff --git a/synthesis/population/spatial/home/locations.py b/synthesis/population/spatial/home/locations.py index 6733a16b..9347e5ec 100644 --- a/synthesis/population/spatial/home/locations.py +++ b/synthesis/population/spatial/home/locations.py @@ -40,10 +40,8 @@ def _sample_locations(context, args): # Apply selection df_homes["geometry"] = df_locations.iloc[indices]["geometry"].values - if context.config("home_location_source") == "tiles": - df_homes["id_tiles"] = df_locations.iloc[indices]["id_tiles"].values - else: - df_homes["building_id"] = df_locations.iloc[indices]["building_id"].values + df_homes["home_location_id"] = df_locations.iloc[indices]["home_location_id"].values + # Update progress context.progress.update() @@ -64,9 +62,6 @@ def execute(context): )) as parallel: seeds = random.randint(10000, size = len(unique_iris_ids)) df_homes = pd.concat(parallel.map(_sample_locations, zip(unique_iris_ids, seeds))) - out = ( - ["household_id", "commune_id", "id_tiles", "geometry"] - if context.config("home_location_source") == "tiles" - else ["household_id", "commune_id", "building_id", "geometry"] - ) + out = ["household_id", "commune_id", "home_location_id", "geometry"] + return df_homes[out] diff --git a/synthesis/population/spatial/primary/candidates.py b/synthesis/population/spatial/primary/candidates.py index fd61c99e..414ee19a 100644 --- a/synthesis/population/spatial/primary/candidates.py +++ b/synthesis/population/spatial/primary/candidates.py @@ -11,19 +11,15 @@ def configure(context): context.stage("synthesis.population.enriched") context.stage("synthesis.population.trips") + context.config("output_path") context.config("random_seed") context.config("education_location_source", "bpe") EDUCATION_MAPPING = { - "primary_school": {"min_age": 0, "max_age": 10, "type_edu": "C1"}, - "middle_school": {"min_age": 11, "max_age": 14, "type_edu": "C2"}, - "high_school": {"min_age": 15, "max_age": 17, "type_edu": "C3"}, - "higher_education": { - "min_age": 18, - "max_age": 110, - "type_edu": ("C4", "C5", "C6"), - }, -} + "primary_school": ["C1"], + "middle_school": ["C2"], + "high_school": ["C3"], + "higher_education": ["C4", "C5", "C6"]} def sample_destination_municipalities(context, arguments): # Load data @@ -45,11 +41,11 @@ def sample_locations(context, arguments): # Load data destination_id, random_seed = arguments df_locations, df_flow = context.data("df_locations"), context.data("df_flow") - + df_locations.to_csv(context.config("output_path")+"/erreur.csv") # Prepare state random = np.random.RandomState(random_seed) df_locations = df_locations[df_locations["commune_id"] == destination_id] - + # Determine demand df_flow = df_flow[df_flow["destination_id"] == destination_id] count = df_flow["count"].sum() @@ -59,7 +55,7 @@ def sample_locations(context, arguments): if "weight" in df_locations: weight = df_locations["weight"].values / df_locations["weight"].sum() - + location_counts = random.multinomial(count, weight) location_ids = df_locations["location_id"].values location_ids = np.repeat(location_ids, location_counts) @@ -79,8 +75,8 @@ def sample_locations(context, arguments): return df_result -def process(context, purpose, random, df_persons, df_od, df_locations): - df_persons = df_persons[df_persons["has_%s_trip" % purpose.split("_")[0]]] +def process(context, purpose, random, df_persons, df_od, df_locations,step_name): + df_persons = df_persons[df_persons["has_%s_trip" % purpose]] # Sample commute flows based 
on population df_demand = df_persons.groupby("commune_id").size().reset_index(name = "count") @@ -90,7 +86,7 @@ def process(context, purpose, random, df_persons, df_od, df_locations): df_flow = [] - with context.progress(label = "Sampling %s municipalities" % purpose, total = len(df_demand)) as progress: + with context.progress(label = "Sampling %s municipalities" % step_name, total = len(df_demand)) as progress: with context.parallel(dict(df_od = df_od)) as parallel: for df_partial in parallel.imap_unordered(sample_destination_municipalities, df_demand.itertuples(index = False, name = None)): df_flow.append(df_partial) @@ -114,7 +110,7 @@ def process(context, purpose, random, df_persons, df_od, df_locations): def execute(context): # Prepare population data - df_persons = context.stage("synthesis.population.enriched")[["person_id", "household_id", "age"]].copy() + df_persons = context.stage("synthesis.population.enriched")[["person_id", "household_id", "age_range"]].copy() df_trips = context.stage("synthesis.population.trips") df_persons["has_work_trip"] = df_persons["person_id"].isin(df_trips[ @@ -137,26 +133,26 @@ def execute(context): df_locations = context.stage("synthesis.locations.work") df_locations["weight"] = df_locations["employees"] df_work = process(context, "work", random, df_persons, - df_work_od, df_locations + df_work_od, df_locations, "work" ) df_locations = context.stage("synthesis.locations.education") if context.config("education_location_source") == 'bpe': - df_education = process(context, "education", random, df_persons, df_education_od, df_locations) + df_education = process(context, "education", random, df_persons, df_education_od, df_locations,"education") else : df_education = [] for prefix, education_type in EDUCATION_MAPPING.items(): df_education.append( - process(context, "education_" + prefix, random, - df_persons[df_persons["age"].between( education_type["min_age"],education_type["max_age"])], - df_education_od[df_education_od["age_range"]==prefix],df_locations[df_locations["TYPEQU"].str.startswith(education_type["type_edu"])]) + process(context, "education", random, + df_persons[df_persons["age_range"]==prefix], + df_education_od[df_education_od["age_range"]==prefix],df_locations[df_locations["education_type"].isin(education_type)],prefix) ) - df_education = pd.concat(df_education).sort_values(["origin_id", "destination_id"]) + df_education = pd.concat(df_education) return dict( work_candidates = df_work, education_candidates = df_education, persons = df_persons[df_persons["has_work_trip"] | df_persons["has_education_trip"]][[ - "person_id", "household_id", "age", "commune_id", "has_work_trip", "has_education_trip" + "person_id", "household_id", "age_range", "commune_id", "has_work_trip", "has_education_trip" ]] ) diff --git a/synthesis/population/spatial/primary/locations.py b/synthesis/population/spatial/primary/locations.py index 143bc2b6..136e18ac 100644 --- a/synthesis/population/spatial/primary/locations.py +++ b/synthesis/population/spatial/primary/locations.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import geopandas as gpd +from .candidates import EDUCATION_MAPPING def configure(context): context.stage("synthesis.population.spatial.primary.candidates") @@ -11,16 +12,6 @@ def configure(context): context.config("education_location_source", "bpe") -EDUCATION_MAPPING = { - "primary_school": {"min_age": 0, "max_age": 10, "type_edu": "C1"}, - "middle_school": {"min_age": 11, "max_age": 14, "type_edu": "C2"}, - "high_school": {"min_age": 15, 
"max_age": 17, "type_edu": "C3"}, - "higher_education": { - "min_age": 18, - "max_age": 110, - "type_edu": ("C4", "C5", "C6"), - }, -} def define_distance_ordering(df_persons, df_candidates, progress): indices = [] @@ -119,7 +110,7 @@ def execute(context): df_work_candidates = pd.merge(df_work_candidates, df_locations, how = "left", on = "location_id") df_work_candidates = gpd.GeoDataFrame(df_work_candidates) - df_locations = context.stage("synthesis.locations.education")[["TYPEQU", "location_id", "geometry"]] + df_locations = context.stage("synthesis.locations.education")[["education_type", "location_id", "geometry"]] df_education_candidates = data["education_candidates"] df_education_candidates = pd.merge(df_education_candidates, df_locations, how = "left", on = "location_id") df_education_candidates = gpd.GeoDataFrame(df_education_candidates) @@ -131,6 +122,6 @@ def execute(context): else : education = [] for prefix, education_type in EDUCATION_MAPPING.items(): - education.append(process(context, "education_" + prefix,df_education[df_education["age"].between(education_type["min_age"],education_type["max_age"])],df_education_candidates[df_education_candidates["TYPEQU"].str.startswith(education_type["type_edu"])])) + education.append(process(context, prefix,df_education[df_education["age_range"]==prefix],df_education_candidates[df_education_candidates["education_type"].isin(education_type)])) df_education = pd.concat(education).sort_index() return df_work, df_education From acf3b80ffdb072d8fb7ace1c8c54b18798770549 Mon Sep 17 00:00:00 2001 From: Marie Laurent Date: Mon, 23 Sep 2024 15:49:02 +0200 Subject: [PATCH 19/19] fix : remove errors --- synthesis/locations/education.py | 2 +- synthesis/population/spatial/primary/candidates.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/synthesis/locations/education.py b/synthesis/locations/education.py index 629e3683..45a32a70 100644 --- a/synthesis/locations/education.py +++ b/synthesis/locations/education.py @@ -94,5 +94,5 @@ def execute(context): # Define identifiers df_locations["location_id"]= np.arange(len(df_locations)) df_locations["location_id"] = "edu_" + df_locations["location_id"].astype(str) - print(df_locations.columns) + return df_locations[["location_id","education_type", "commune_id","weight","fake", "geometry"]] diff --git a/synthesis/population/spatial/primary/candidates.py b/synthesis/population/spatial/primary/candidates.py index 414ee19a..7af9963c 100644 --- a/synthesis/population/spatial/primary/candidates.py +++ b/synthesis/population/spatial/primary/candidates.py @@ -41,7 +41,7 @@ def sample_locations(context, arguments): # Load data destination_id, random_seed = arguments df_locations, df_flow = context.data("df_locations"), context.data("df_flow") - df_locations.to_csv(context.config("output_path")+"/erreur.csv") + # Prepare state random = np.random.RandomState(random_seed) df_locations = df_locations[df_locations["commune_id"] == destination_id]