feat: use bhepop2 package for income assignment (#243)

* feat: use bhepop2 for eqasim income assignment * todo: manage zip * corrections from rebase (Filosofi is now read from .zip file) * restore default config file * add missing configuration of income_com_path * add missing request of income_com_xlsx config * remove columns filters and rename from _read_filosofi_excel * cleanup * convert income to integer * remove pandas warning * improve error catches and code layout * include population size in plot title * remove MODALITIES constant describe attribute modalities when giving attribute selection * wip use bhepop2 * merged municipality_attributes.py in municipality.py municipality stage now returns a DataFrame containing income deciles per attributes in addition to the usual global deciles. Two columns "attribute" and "modality" have been added to specify the related attribute and the value it takes (modality). Attribute and modality for global deciles are "all". Filter on "all" attribute and modality have been added where data.income.municipality were used. * move income_uniform_sample and MAXIMUM_INCOME_FACTOR to a utils module * change MAXIMUM_INCOME_FACTOR back to original value * renamed "eqasim method" into "uniform method" * move compare_methods.py to analysis/methods/income/ * renamed bhepop2_income.py module into bhepop2.py * refactor: improve bhepop2 integration merge municipality_attributes.py into municipality.py move compare_methods.py to analysis/methods/income/ created a utils.py module in synthesis/population/income/ to store common functions added test dataset and tests --------- Co-authored-by: leo-desbureaux-tellae <[email protected]> * fix: conflicts * try to make test_determinism work (#6) * add income_assignation_method config documentation * remove blank lines * remove debug print * remove float casting * update docs * renamed "modality" column in "value" --------- Co-authored-by: Valentin LE BESCOND <[email protected]>
eqasim-org · Jul 15, 2024 · f74bd98 · f74bd98
1 parent 90e9416
commit f74bd98
Show file tree

Hide file tree

Showing 19 changed files with 497 additions and 48 deletions.
diff --git a/.gitignore b/.gitignore
@@ -6,5 +6,6 @@ output
 data/nantes_2015
 data/lyon_2015
 .vscode
+.idea
 
-config_local_*.yml
+config_local_*.yml
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,7 @@
 
 **Under development**
 
+- feat: add a new method for attributing income to housholds using the bhepop2 package
 - fix: fixed special case in repairing ENTD for completely overlapping trips
 - feat: make it possible to disable the test run of MATSim before writing everything out
 - feat: check availability of open data sources for every PR

diff --git a/analysis/methods/income/compare_methods.py b/analysis/methods/income/compare_methods.py
@@ -0,0 +1,103 @@
+from synthesis.population.income.utils import MAXIMUM_INCOME_FACTOR
+from bhepop2.tools import add_household_size_attribute, add_household_type_attribute
+from bhepop2.sources.marginal_distributions import QuantitativeMarginalDistributions
+import pandas as pd
+import os
+
+"""
+Compare income assignation methods available in Eqasim.
+
+Comparison is realised on the synthetic population of the most populated commune.
+"""
+
+COMPARE_INCOME_FOLDER = "compare_income_methods"
+
+
+def configure(context):
+    context.config("output_path")
+    context.stage("data.income.municipality")
+    context.stage("synthesis.population.income.uniform", alias="uniform")
+    context.stage("synthesis.population.income.bhepop2", alias="bhepop2")
+    context.stage("synthesis.population.sampled")
+
+
+def execute(context):
+
+    # get complete population (needed to add attributes)
+    df_population = context.stage("synthesis.population.sampled")
+    df_population = add_household_size_attribute(df_population)
+    df_population = add_household_type_attribute(df_population)
+
+    # get most populated commune
+    commune_id = df_population.groupby(["commune_id"], observed=True)["commune_id"].count().drop("undefined").idxmax()
+
+    # get income distributions by attributes
+    income_df = context.stage("data.income.municipality").query(f"commune_id == '{commune_id}'")
+    income_df = income_df.rename(
+        columns={
+            "value": "modality",
+            "q1": "D1",
+            "q2": "D2",
+            "q3": "D3",
+            "q4": "D4",
+            "q5": "D5",
+            "q6": "D6",
+            "q7": "D7",
+            "q8": "D8",
+            "q9": "D9",
+        }
+    )
+
+    households_with_attributes = df_population[[
+        "household_id", "commune_id", "size", "family_comp"
+    ]].drop_duplicates("household_id")
+
+    # get enriched population with different methods
+    uniform_pop_df = context.stage("uniform")
+    uniform_pop_df = uniform_pop_df.merge(households_with_attributes, on="household_id")
+    uniform_pop_df["household_income"] = (
+            uniform_pop_df["household_income"] * 12 / uniform_pop_df["consumption_units"]
+    )
+    uniform_pop_df = uniform_pop_df.query(f"commune_id == '{commune_id}'")
+
+    bhepop2_pop_df = context.stage("bhepop2")
+    bhepop2_pop_df = bhepop2_pop_df.merge(households_with_attributes, on="household_id")
+    bhepop2_pop_df["household_income"] = (
+            bhepop2_pop_df["household_income"] * 12 / bhepop2_pop_df["consumption_units"]
+    )
+    bhepop2_pop_df = bhepop2_pop_df.query(f"commune_id == '{commune_id}'")
+
+    # prepare populations analysis
+
+    # create a source from the Filosofi distributions
+    marginal_distributions_source = QuantitativeMarginalDistributions(
+        income_df,
+        "Filosofi",
+        ["size", "family_comp"],
+        0,
+        relative_maximum=MAXIMUM_INCOME_FACTOR,
+        delta_min=1000
+    )
+
+    # check output folder existence
+    compare_output_path = os.path.join(context.config("output_path"), COMPARE_INCOME_FOLDER)
+    if not os.path.exists(compare_output_path):
+        os.mkdir(compare_output_path)
+
+    # create an analysis instance
+    analysis = marginal_distributions_source.compare_with_populations(
+        {
+            "Uniform": uniform_pop_df,
+            "Bhepop2": bhepop2_pop_df
+        },
+        feature_name="household_income",
+        output_folder=compare_output_path
+    )
+    analysis.plot_title_format = analysis.plot_title_format + f" \n(commune={commune_id})"
+
+    analysis.generate_analysis_plots()
+    analysis.generate_analysis_error_table()
+
+    print(f"Generated compared analysis of income assignation methods in {compare_output_path}")
+
+
diff --git a/analysis/synthesis/income.py b/analysis/synthesis/income.py
@@ -8,7 +8,7 @@
 
 def configure(context):
     acquisition_sample_size = context.config("acquisition_sample_size")
-    bs.configure(context, "synthesis.population.income", acquisition_sample_size)
+    bs.configure(context, "synthesis.population.income.selected", acquisition_sample_size)
 
 def execute(context):
     acquisition_sample_size = context.config("acquisition_sample_size")
@@ -17,7 +17,7 @@ def execute(context):
     quantiles = []
 
     with context.progress(label = "Processing commute data ...", total = acquisition_sample_size) as progress:
-        for df_income in bs.get_stages(context, "synthesis.population.income", acquisition_sample_size):
+        for df_income in bs.get_stages(context, "synthesis.population.income.selected", acquisition_sample_size):
             income = 12 * df_income["household_income"] / df_income["consumption_units"]
             quantiles.append([income.quantile(p) for p in probabilities])
             progress.update()

diff --git a/config.yml b/config.yml
@@ -35,3 +35,6 @@ config:
   # generate_vehicles_file: True
   # generate_vehicles_method: fleet_sample
   # vehicles_data_year: 2015
+
+  # Uncomment to use the bhepop2 package for attributing income
+  # income_assignation_method: bhepop2
diff --git a/data/income/municipality.py b/data/income/municipality.py
@@ -3,37 +3,41 @@
 from sklearn.neighbors import KDTree
 import os
 import zipfile
+from bhepop2.tools import read_filosofi_attributes, filosofi_attributes
 
 """
 Loads and prepares income distributions by municipality:
-- Load data with centiles per municipality
-- For those which only provide median: Attach another distribution with most similar median
-- For those which are missing: Attach the distribution of the municiality with the nearest centroid
+- For global distributions
+    - Load data with centiles per municipality
+    - For those which only provide median: Attach another distribution with most similar median
+    - For those which are missing: Attach the distribution of the municiality with the nearest centroid
+- For attribute distributions, read the adequate Filosofi sheet and get the percentiles
 """
 
+
+# for now, only household size and family comp can be inferred from eqasim population
+EQASIM_INCOME_ATTRIBUTES = ["size", "family_comp"]
+
+# final columns of the income DataFrame
+INCOME_DF_COLUMNS = ["commune_id", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "attribute", "value", "is_imputed", "is_missing", "reference_median"]
+
+
 def configure(context):
     context.config("data_path")
     context.stage("data.spatial.municipalities")
     context.config("income_com_path", "filosofi_2019/indic-struct-distrib-revenu-2019-COMMUNES.zip")
     context.config("income_com_xlsx", "FILO2019_DISP_COM.xlsx")
     context.config("income_year", 19)
 
-def execute(context):
-    # Load income distribution
-    year = str(context.config("income_year"))
-
-    with zipfile.ZipFile("{}/{}".format(
-        context.config("data_path"), context.config("income_com_path"))) as archive:
-        with archive.open(context.config("income_com_xlsx")) as f:
-            df = pd.read_excel(f,
-                sheet_name = "ENSEMBLE", skiprows = 5
-            )[["CODGEO"] + [("D%d" % q) + year if q != 5 else "Q2" + year for q in range(1, 10)]]
-            df.columns = ["commune_id", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9"]
-            df["reference_median"] = df["q5"].values
 
-    # Verify spatial data for education
-    df_municipalities = context.stage("data.spatial.municipalities")
+def _income_distributions_from_filosofi_ensemble_sheet(filsofi_sheets, year, df_municipalities):
     requested_communes = set(df_municipalities["commune_id"].unique())
+
+    df = filsofi_sheets["ENSEMBLE"][["CODGEO"] + [("D%d" % q) + year if q != 5 else "Q2" + year for q in range(1, 10)]]
+    df.columns = ["commune_id", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9"]
+    df.loc[:, "reference_median"] = df["q5"].values
+
+    # filter requested communes
     df = df[df["commune_id"].isin(requested_communes)]
 
     # Find communes without data
@@ -86,7 +90,83 @@ def execute(context):
     assert len(df) == len(df["commune_id"].unique())
     assert len(requested_communes - set(df["commune_id"].unique())) == 0
 
-    return df[["commune_id", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "is_imputed", "is_missing", "reference_median"]]
+    # add attribute and value "all" (ie ENSEMBLE)
+    df["attribute"] = "all"
+    df["value"] = "all"
+
+    return df[INCOME_DF_COLUMNS]
+
+
+def _income_distributions_from_filosofi_attribute_sheets(filsofi_sheets, year, df_municipalities, attributes):
+    requested_communes = set(df_municipalities["commune_id"].unique())
+
+    # read attributes
+    df_with_attributes = read_filosofi_attributes(filsofi_sheets, year, attributes, requested_communes)
+
+    df_with_attributes.rename(
+        columns={
+            "modality": "value",
+            "D1": "q1",
+            "D2": "q2",
+            "D3": "q3",
+            "D4": "q4",
+            "D5": "q5",
+            "D6": "q6",
+            "D7": "q7",
+            "D8": "q8",
+            "D9": "q9",
+        },
+        inplace=True,
+    )
+
+    # add eqasim columns, data is not imputed nor missing
+    df_with_attributes["is_imputed"] = False
+    df_with_attributes["is_missing"] = False
+
+    return df_with_attributes[INCOME_DF_COLUMNS]
+
+
+def _read_filosofi_excel(context):
+
+    # browse Filosofi attributes, select those available in Eqasim
+    attributes = []
+    sheet_list = ["ENSEMBLE"]
+    for attr in filosofi_attributes:
+        if attr["name"] in EQASIM_INCOME_ATTRIBUTES:
+            # stored attributes read from Filosofi
+            attributes.append(attr)
+            # build list of sheets to read
+            sheet_list = sheet_list + [x["sheet"] for x in attr["modalities"]]
+
+    # open and read income data file
+    with zipfile.ZipFile("{}/{}".format(
+            context.config("data_path"), context.config("income_com_path"))
+    ) as archive:
+        with archive.open(context.config("income_com_xlsx")) as f:
+            df = pd.read_excel(f, sheet_name=sheet_list, skiprows=5)
+
+    return df, attributes
+
+
+def execute(context):
+    # Verify spatial data for education
+    df_municipalities = context.stage("data.spatial.municipalities")
+
+    # Get year of income data
+    year = str(context.config("income_year"))
+
+    # Load income distribution
+    filosofi_excel, attributes = _read_filosofi_excel(context)
+
+    # Read ENSEMBLE sheet: global distributions, by commune
+    ensemble_distributions = _income_distributions_from_filosofi_ensemble_sheet(filosofi_excel, year, df_municipalities)
+
+    # Read attribute sheets: distributions on individuals with specific attribute values
+    # (ex: sheet TYPMENR_2 corresponds to households with `family_comp`=`Single_wom`)
+    attribute_distributions = _income_distributions_from_filosofi_attribute_sheets(filosofi_excel, year, df_municipalities, attributes)
+
+    return pd.concat([ensemble_distributions, attribute_distributions])
+
 
 def validate(context):
     if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("income_com_path"))):

diff --git a/docs/population.md b/docs/population.md
@@ -343,3 +343,26 @@ To make use of the urban type, the following data is needed:
 - Put the downloaded *zip* file into `data/urban_type`, so you will have the file `data/urban_type/UU2020_au_01-01-2023.zip`
 
 Then, you should be able to run the pipeline with the configuration explained above.
+
+### Income
+
+This pipeline allows using the [Bhepop2](https://github.com/tellae/bhepop2) package for income assignation. 
+
+By default, Eqasim infers income from the global income distribution by municipality from the Filosofi data set. 
+An income value is drawn from this distribution, independent of the household characteristics. This method is called
+`uniform`.
+
+Bhepop2 uses income distributions on subpopulations. For instance, Filosofi provides distributions depending on household size.
+Bhepop2 tries to match all the available distributions, instead of just the global one. This results in more
+accurate income assignation on subpopulations, but also on the global synthetic population. 
+See the [documentation](https://bhepop2.readthedocs.io/en/latest/) for more information on the affectation algorithm.
+
+To use the `bhepop2` method, provide the following config:
+
+```yaml
+config:
+  income_assignation_method: bhepop2
+```
+
+Caution, this method will fail on communes where the Filosofi subpopulation distributions are missing. In this case,
+we fall back to the `uniform` method.
diff --git a/documentation/info/collect.py b/documentation/info/collect.py
@@ -66,6 +66,7 @@ def execute(context):
 
     # Income
     df_income_municipality = context.stage("data.income.municipality")
+    df_income_municipality = df_income_municipality[(df_income_municipality["attribute"] == "all") & (df_income_municipality["value"] == "all")]
     df_income_region = context.stage("data.income.region")
 
     info["income"] = {

diff --git a/documentation/plots/income.py b/documentation/plots/income.py
@@ -19,6 +19,7 @@ def execute(context):
 
     # Income imputation
     df_income = context.stage("data.income.municipality")
+    df_income = df_income[(df_income["attribute"] == "all") & (df_income["value"] == "all")]
     df_imputed = df_income[df_income["is_imputed"]]
 
     plt.figure()

diff --git a/documentation/shapes.py b/documentation/shapes.py
@@ -14,6 +14,7 @@ def execute(context):
 
     # Spatial income distribution
     df_income = context.stage("data.income.municipality")
+    df_income = df_income[(df_income["attribute"] == "all") & (df_income["value"] == "all")]
     df_income = pd.merge(df_communes, df_income, how = "inner", on = "commune_id")
     df_income["is_imputed"] = df_income["is_imputed"].astype(int)
     df_income["commune_id"] = df_income["commune_id"].astype(str)

diff --git a/environment.yml b/environment.yml
@@ -29,3 +29,4 @@ dependencies:
 
   - pip:
     - synpp==1.5.1
+    - bhepop2==2.0.0
diff --git a/synthesis/population/enriched.py b/synthesis/population/enriched.py
@@ -16,7 +16,7 @@
 def configure(context):
     context.stage("synthesis.population.matched")
     context.stage("synthesis.population.sampled")
-    context.stage("synthesis.population.income")
+    context.stage("synthesis.population.income.selected")
 
     hts = context.config("hts")
     context.stage("data.hts.selected", alias = "hts")
@@ -53,7 +53,7 @@ def execute(context):
     ]], on = "hts_household_id")
 
     # Attach income
-    df_income = context.stage("synthesis.population.income")
+    df_income = context.stage("synthesis.population.income.selected")
     df_population = pd.merge(df_population, df_income[[
         "household_id", "household_income"
     ]], on = "household_id")
Original file line number	Diff line number	Diff line change
Expand Up		@@ -29,3 +29,4 @@ dependencies:

		- pip:
		- synpp==1.5.1
		- bhepop2==2.0.0