Skip to content

Commit

Permalink
feat: use urban typology for activity chain matching (#209)
Browse files Browse the repository at this point in the history
* alpha version

* alpha version with scripts add...

* fix: properly handling cities with districts

* config file update

* clean branch from extra config files for interlab use

* cleanup

* further cleanup

* make matching attributes configurable

* monkey patching openpyxl to read excel sheet

* make configurable

* add test data to ENTD

* add documentation

* update tests

* testing and egt

* update docs

---------

Co-authored-by: Arthur BURIANNE <[email protected]>
Co-authored-by: Tarek Chouaki <[email protected]>
Co-authored-by: Sebastian Hörl <[email protected]>
  • Loading branch information
4 people authored Mar 18, 2024
1 parent 3c9b137 commit 43af03e
Show file tree
Hide file tree
Showing 13 changed files with 269 additions and 36 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

**Under development**

- feat: make statistical matching attribute list configurable
- feat: add urban type classification (unité urbaine)
- feat: functionality to make use of INSEE population projection data
- update: don't remove households with people not living/studying in Île-de-France anymore to be more consistent with other use cases
- fix bug where always one household_id existed twice
Expand Down
18 changes: 17 additions & 1 deletion data/census/cleaned.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ def configure(context):
context.stage("data.census.raw")
context.stage("data.spatial.codes")

if context.config("use_urban_type", False):
context.stage("data.spatial.urban_type")

def execute(context):
df = context.stage("data.census.raw")

Expand Down Expand Up @@ -96,11 +99,24 @@ def execute(context):
# Consumption units
df = pd.merge(df, hts.calculate_consumption_units(df), on = "household_id")

return df[[
df = df[[
"person_id", "household_id", "weight",
"iris_id", "commune_id", "departement_id",
"age", "sex", "couple",
"commute_mode", "employed",
"studies", "number_of_vehicles", "household_size",
"consumption_units", "socioprofessional_class"
]]

if context.config("use_urban_type"):
df_urban_type = context.stage("data.spatial.urban_type")[[
"commune_id", "urban_type"
]]

# Impute urban type
df = pd.merge(df, df_urban_type, on = "commune_id", how = "left")
df.loc[df["commune_id"] == "undefined", "urban_type"] = "none"
df["commune_id"] = df["commune_id"].astype("category")
assert ~np.any(df["urban_type"].isna())

return df
21 changes: 21 additions & 0 deletions data/hts/egt/cleaned.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
def configure(context):
context.stage("data.hts.egt.raw")

if context.config("use_urban_type", False):
context.stage("data.spatial.urban_type")

INCOME_CLASS_BOUNDS = [800, 1200, 1600, 2000, 2400, 3000, 3500, 4500, 5500, 1e6]

PURPOSE_MAP = {
Expand Down Expand Up @@ -111,6 +114,24 @@ def execute(context):
df_households.loc[df_households["income_class"].isin([10.0, 11.0, np.nan]), "income_class"] = -1
df_households["income_class"] = df_households["income_class"].astype(int)

# Impute urban type
if context.config("use_urban_type"):
df_urban_type = context.stage("data.spatial.urban_type")[[
"commune_id", "urban_type"
]]

# Household municipality
df_households["commune_id"] = df_households["RESCOMM"].astype("category")
df_persons = pd.merge(df_persons, df_households[["household_id", "commune_id"]], how = "left")
assert np.all(~df_persons["commune_id"].isna())

# Impute urban type
df_persons = pd.merge(df_persons, df_urban_type, on = "commune_id", how = "left")
df_persons["urban_type"] = df_persons["urban_type"].fillna("none").astype("category")

df_households.drop(columns = ["commune_id"])
df_persons.drop(columns = ["commune_id"])

# Trip purpose
df_trips["following_purpose"] = "other"
df_trips["preceding_purpose"] = "other"
Expand Down
13 changes: 9 additions & 4 deletions data/hts/egt/filtered.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ def configure(context):

def execute(context):
df_codes = context.stage("data.spatial.codes")
assert (df_codes["region_id"] == 11).all() # Otherwise EGT doesn't make sense

df_households, df_persons, df_trips = context.stage("data.hts.egt.cleaned")

Expand All @@ -39,9 +38,15 @@ def execute(context):
df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]

# Finish up
df_households = df_households[hts.HOUSEHOLD_COLUMNS + ["income_class"] + ["egt_household_id"]]
df_persons = df_persons[hts.PERSON_COLUMNS + ["egt_household_id", "egt_person_id"]]
df_trips = df_trips[hts.TRIP_COLUMNS + ["euclidean_distance"] + ["egt_household_id", "egt_person_id", "egt_trip_id"]]
household_columns = hts.HOUSEHOLD_COLUMNS + ["income_class"] + ["egt_household_id"]
df_households = df_households[household_columns]

person_columns = hts.PERSON_COLUMNS + ["egt_household_id", "egt_person_id"]
if "urban_type" in df_persons: person_columns.append("urban_type")
df_persons = df_persons[person_columns]

trip_columns = hts.TRIP_COLUMNS + ["euclidean_distance"] + ["egt_household_id", "egt_person_id", "egt_trip_id"]
df_trips = df_trips[trip_columns]

hts.check(df_households, df_persons, df_trips)

Expand Down
11 changes: 11 additions & 0 deletions data/hts/entd/cleaned.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,17 @@ def execute(context):
df_trips["origin_departement_id"] = df_trips["V2_MORIDEP"].fillna("undefined").astype("category")
df_trips["destination_departement_id"] = df_trips["V2_MDESDEP"].fillna("undefined").astype("category")

# Clean urban type
df_households["urban_type"] = df_households["numcom_UU2010"].replace({
"B": "suburb",
"C": "central_city",
"I": "isolated_city",
"R": "none"
})

assert np.all(~df_households["urban_type"].isna())
df_households["urban_type"] = df_households["urban_type"].astype("category")

# Clean employment
df_persons["employed"] = df_persons["SITUA"].isin([1, 2])

Expand Down
2 changes: 1 addition & 1 deletion data/hts/entd/filtered.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def execute(context):
df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]

# Finish up
df_households = df_households[hts.HOUSEHOLD_COLUMNS + ["income_class"]]
df_households = df_households[hts.HOUSEHOLD_COLUMNS + ["urban_type", "income_class"]]
df_persons = df_persons[hts.PERSON_COLUMNS]
df_trips = df_trips[hts.TRIP_COLUMNS + ["routed_distance"]]

Expand Down
2 changes: 1 addition & 1 deletion data/hts/entd/raw.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

Q_TCM_MENAGE_COLUMNS = [
"NPERS", "PONDV1", "TrancheRevenuMensuel",
"DEP", "idENT_MEN", "RG"
"DEP", "idENT_MEN", "RG", "numcom_UU2010"
]

Q_INDIVIDU_COLUMNS = [
Expand Down
73 changes: 73 additions & 0 deletions data/spatial/urban_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import pandas as pd
import os
import zipfile
import numpy as np

# START Monkey patching openpyxl to parse INSEE file
from openpyxl.styles.colors import WHITE, RGB
__old_rgb_set__ = RGB.__set__

def __rgb_set_fixed__(self, instance, value):
    """Set an openpyxl RGB descriptor value, tolerating malformed colors.

    Delegates to the original ``RGB.__set__`` (kept in ``__old_rgb_set__``).
    If that call rejects ``value`` with the 'Colors must be aRGB hex values'
    error, the color is replaced by ``WHITE`` so that the file can still be
    parsed.

    Raises:
        ValueError: for any other failure reported by the original setter.
            Such errors are propagated unchanged instead of being silently
            discarded.
    """
    try:
        __old_rgb_set__(self, instance, value)
    except ValueError as e:
        # Only the known "invalid color" failure is patched over. Everything
        # else is re-raised: swallowing it would leave the attribute unset
        # without any hint of what went wrong.
        if not e.args or e.args[0] != 'Colors must be aRGB hex values':
            raise

        __old_rgb_set__(self, instance, WHITE)

RGB.__set__ = __rgb_set_fixed__
# END Monkey patching openpyxl

# Loads the input data for the urban type (unité urbaine)

def configure(context):
    """Declare the upstream stage and the configuration options of this stage."""
    # The municipalities are used to restrict the output to the requested area
    context.stage("data.spatial.municipalities")

    # Location of the INSEE archive, relative to the data directory
    default_archive = "urban_type/UU2020_au_01-01-2023.zip"

    context.config("data_path")
    context.config("urban_type_path", default_archive)

def execute(context):
    """Load the INSEE urban type (unité urbaine) of the requested municipalities.

    Reads the "Composition_communale" sheet of the single file contained in the
    configured zip archive, expands Paris, Lyon and Marseille into their
    districts, translates the status codes into readable labels and keeps only
    the communes present in the "data.spatial.municipalities" stage.

    Returns a data frame with the columns "commune_id" and "urban_type"
    (categorical) in which every commune appears exactly once.
    """
    archive_path = "{}/{}".format(
        context.config("data_path"), context.config("urban_type_path"))

    with zipfile.ZipFile(archive_path) as archive:
        # The archive is expected to contain nothing but the Excel file
        assert len(archive.filelist) == 1

        with archive.open(archive.filelist[0]) as f:
            df_raw = pd.read_excel(f, sheet_name = "Composition_communale", skiprows = 5)

    df = df_raw[["CODGEO", "STATUT_2017"]].copy()
    df = df.rename(columns = { "CODGEO": "commune_id", "STATUT_2017": "urban_type" })

    # The UU file only lists cities with districts as a whole, whereas the
    # municipalities file lists every district with its own INSEE code
    cities_with_districts = {
        "75056": [str(code) for code in range(75101, 75121)], # Paris
        "69123": [str(code) for code in range(69001, 69010)], # Lyon
        "13055": [str(code) for code in range(13201, 13216)], # Marseille
    }

    # Every district inherits the urban type of the city it belongs to
    frames = [df]

    for city_code, district_codes in cities_with_districts.items():
        base_type = df.loc[df["commune_id"] == city_code, "urban_type"].iloc[0]

        frames.append(pd.DataFrame({
            "commune_id": district_codes,
            "urban_type": [base_type] * len(district_codes)
        }))

    df = pd.concat(frames)

    # The city-level rows are now replaced by their districts
    df = df[~df["commune_id"].isin(list(cities_with_districts))]

    # Clean unités urbaines
    status_labels = {
        "B": "suburb",
        "C": "central_city",
        "I": "isolated_city",
        "H": "none"
    }

    df["urban_type"] = df["urban_type"].replace(status_labels)
    assert df["urban_type"].notna().all()
    df["urban_type"] = df["urban_type"].astype("category")

    # Restrict to the communes that are part of the requested area
    df_municipalities = context.stage("data.spatial.municipalities")
    requested_communes = set(df_municipalities["commune_id"].unique())
    df = df[df["commune_id"].isin(requested_communes)]

    # No commune may be listed more than once
    assert len(df["commune_id"].unique()) == len(df)

    return df

def validate(context):
    """Check that the urban type archive exists and return its size in bytes.

    Raises a RuntimeError if nothing exists at the configured location.
    """
    archive_path = "%s/%s" % (context.config("data_path"), context.config("urban_type_path"))

    if os.path.exists(archive_path):
        return os.path.getsize(archive_path)

    raise RuntimeError("Urban type data is not available")
28 changes: 28 additions & 0 deletions docs/population.md
Original file line number Diff line number Diff line change
Expand Up @@ -313,3 +313,31 @@ config:
# [...]
projection_scenario: 00_central
```

### Urban type

The pipeline can make use of INSEE's urban type classification (unité urbaine), which classifies municipalities as *center cities*, *suburbs*, *isolated cities*, or unclassified. To impute the data (currently only for some HTS), activate it via the configuration:

```yaml
config:
# [...]
use_urban_type: true
```

In order to make use of it for activity chain matching, you can set a custom list of matching attributes like so:

```yaml
config:
# [...]
matching_attributes: ["urban_type", "*default*"]
```

The `*default*` trigger will be replaced by the default list of matching attributes.

Note that not all HTS implement the urban type, so matching may not work with some implementations. Most of them, however, contain the data; the code just needs to be updated to read it in.

To make use of the urban type, the following data is needed:
- [Download the urban type data from INSEE](https://www.insee.fr/fr/information/4802589). The pipeline is currently compatible with the 2023 data set (referencing 2020 boundaries).
- Put the downloaded *zip* file into `data/urban_type`, so you will have the file `data/urban_type/UU2020_au_01-01-2023.zip`

Then, you should be able to run the pipeline with the configuration explained above.
40 changes: 31 additions & 9 deletions synthesis/population/matched.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,16 @@
"entd": data.hts.entd.cleaned.calculate_income_class,
}

DEFAULT_MATCHING_ATTRIBUTES = [
"sex", "any_cars", "age_class", "socioprofessional_class",
"departement_id"
]

def configure(context):
context.config("processes")
context.config("random_seed")
context.config("matching_minimum_observations", 20)
context.config("matching_attributes", DEFAULT_MATCHING_ATTRIBUTES)

context.stage("synthesis.population.sampled")
context.stage("synthesis.population.income")
Expand Down Expand Up @@ -112,6 +118,9 @@ def statistical_matching(progress, df_source, source_identifier, weight, df_targ

progress.update(np.count_nonzero(unassigned_mask))

if np.count_nonzero(unassigned_mask) > 0:
raise RuntimeError("Some target observations could not be matched. Minimum observations configured too high?")

assert np.count_nonzero(unassigned_mask) == 0
assert np.count_nonzero(assigned_indices == -1) == 0

Expand Down Expand Up @@ -165,27 +174,40 @@ def execute(context):

df_target = context.stage("synthesis.population.sampled")

columns = context.config("matching_attributes")

try:
default_index = columns.index("*default*")
columns[default_index:default_index + 1] = DEFAULT_MATCHING_ATTRIBUTES
except ValueError: pass

# Define matching attributes
AGE_BOUNDARIES = [14, 29, 44, 59, 74, 1000]
df_target["age_class"] = np.digitize(df_target["age"], AGE_BOUNDARIES, right = True)
df_source["age_class"] = np.digitize(df_source["age"], AGE_BOUNDARIES, right = True)

if "age_class" in columns:
df_target["age_class"] = np.digitize(df_target["age"], AGE_BOUNDARIES, right = True)
df_source["age_class"] = np.digitize(df_source["age"], AGE_BOUNDARIES, right = True)

if "income_class" in df_source:
if "income_class" in columns:
df_income = context.stage("synthesis.population.income")[["household_id", "household_income"]]

df_target = pd.merge(df_target, df_income)
df_target["income_class"] = INCOME_CLASS[hts](df_target)

df_target["any_cars"] = df_target["number_of_vehicles"] > 0
df_source["any_cars"] = df_source["number_of_vehicles"] > 0

columns = ["sex", "any_cars", "age_class", "socioprofessional_class"]
if "income_class" in df_source: columns += ["income_class"]
columns += ["departement_id"]
if "any_cars" in columns:
df_target["any_cars"] = df_target["number_of_vehicles"] > 0
df_source["any_cars"] = df_source["number_of_vehicles"] > 0

# Perform statistical matching
df_source = df_source.rename(columns = { "person_id": "hts_id" })

for column in columns:
if not column in df_source:
raise RuntimeError("Attribute not available in source (HTS) for matching: {}".format(column))

if not column in df_target:
raise RuntimeError("Attribute not available in target (census) for matching: {}".format(column))

df_assignment, levels = parallel_statistical_matching(
context,
df_source, "hts_id", "person_weight",
Expand Down
12 changes: 10 additions & 2 deletions tests/test_determinism.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,11 @@ def _test_determinism(index, data_path, tmpdir):
regions = [10, 11], sampling_rate = 1.0, hts = "entd",
random_seed = 1000, processes = 1,
secloc_maximum_iterations = 10,
maven_skip_tests = True
maven_skip_tests = True,
matching_attributes = [
"sex", "any_cars", "age_class", "socioprofessional_class",
"income_class", "departement_id"
]
)

stages = [
Expand Down Expand Up @@ -111,7 +115,11 @@ def _test_determinism_matsim(index, data_path, tmpdir):
regions = [10, 11], sampling_rate = 1.0, hts = "entd",
random_seed = 1000, processes = 1,
secloc_maximum_iterations = 10,
maven_skip_tests = True
maven_skip_tests = True,
matching_attributes = [
"sex", "any_cars", "age_class", "socioprofessional_class",
"income_class", "departement_id"
]
)

stages = [
Expand Down
Loading

0 comments on commit 43af03e

Please sign in to comment.