Skip to content

Commit

Permalink
feat: use bhepop2 package for income assignment (#243)
Browse files Browse the repository at this point in the history
* feat: use bhepop2 for eqasim income assignment

* todo: manage zip

* corrections from rebase (Filosofi is now read from .zip file)

* restore default config file

* add missing configuration of income_com_path

* add missing request of income_com_xlsx config

* remove columns filters and rename from _read_filosofi_excel

* cleanup

* convert income to integer

* remove pandas warning

* improve error catches and code layout

* include population size in plot title

* remove MODALITIES constant
describe attribute modalities when giving attribute selection

* wip use bhepop2

* merged municipality_attributes.py in municipality.py

municipality stage now returns a DataFrame containing income deciles per attributes in addition to the usual global deciles.
Two columns "attribute" and "modality" have been added to specify the related attribute and the value it takes (modality).
Attribute and modality for global deciles are "all".
Filter on "all" attribute and modality have been added where data.income.municipality were used.

* move income_uniform_sample and MAXIMUM_INCOME_FACTOR to a utils module

* change MAXIMUM_INCOME_FACTOR back to original value

* renamed "eqasim method" into "uniform method"

* move compare_methods.py to analysis/methods/income/

* renamed bhepop2_income.py module into bhepop2.py

* refactor: improve bhepop2 integration

merge municipality_attributes.py into municipality.py
move compare_methods.py to analysis/methods/income/
created a utils.py module in synthesis/population/income/ to store common functions
added test dataset and tests
---------

Co-authored-by: leo-desbureaux-tellae <[email protected]>

* fix: conflicts

* try to make test_determinism work (#6)

* add income_assignation_method config documentation

* remove blank lines

* remove debug print

* remove float casting

* update docs

* renamed "modality" column in "value"

---------

Co-authored-by: Valentin LE BESCOND <[email protected]>
  • Loading branch information
leo-desbureaux-tellae and Nitnelav authored Jul 15, 2024
1 parent 90e9416 commit f74bd98
Show file tree
Hide file tree
Showing 19 changed files with 497 additions and 48 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@ output
data/nantes_2015
data/lyon_2015
.vscode
.idea

config_local_*.yml
config_local_*.yml
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

**Under development**

- feat: add a new method for attributing income to housholds using the bhepop2 package
- fix: fixed special case in repairing ENTD for completely overlapping trips
- feat: make it possible to disable the test run of MATSim before writing everything out
- feat: check availability of open data sources for every PR
Expand Down
103 changes: 103 additions & 0 deletions analysis/methods/income/compare_methods.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
from synthesis.population.income.utils import MAXIMUM_INCOME_FACTOR
from bhepop2.tools import add_household_size_attribute, add_household_type_attribute
from bhepop2.sources.marginal_distributions import QuantitativeMarginalDistributions
import pandas as pd
import os

"""
Compare income assignation methods available in Eqasim.
Comparison is realised on the synthetic population of the most populated commune.
"""

COMPARE_INCOME_FOLDER = "compare_income_methods"


def configure(context):
context.config("output_path")
context.stage("data.income.municipality")
context.stage("synthesis.population.income.uniform", alias="uniform")
context.stage("synthesis.population.income.bhepop2", alias="bhepop2")
context.stage("synthesis.population.sampled")


def execute(context):

# get complete population (needed to add attributes)
df_population = context.stage("synthesis.population.sampled")
df_population = add_household_size_attribute(df_population)
df_population = add_household_type_attribute(df_population)

# get most populated commune
commune_id = df_population.groupby(["commune_id"], observed=True)["commune_id"].count().drop("undefined").idxmax()

# get income distributions by attributes
income_df = context.stage("data.income.municipality").query(f"commune_id == '{commune_id}'")
income_df = income_df.rename(
columns={
"value": "modality",
"q1": "D1",
"q2": "D2",
"q3": "D3",
"q4": "D4",
"q5": "D5",
"q6": "D6",
"q7": "D7",
"q8": "D8",
"q9": "D9",
}
)

households_with_attributes = df_population[[
"household_id", "commune_id", "size", "family_comp"
]].drop_duplicates("household_id")

# get enriched population with different methods
uniform_pop_df = context.stage("uniform")
uniform_pop_df = uniform_pop_df.merge(households_with_attributes, on="household_id")
uniform_pop_df["household_income"] = (
uniform_pop_df["household_income"] * 12 / uniform_pop_df["consumption_units"]
)
uniform_pop_df = uniform_pop_df.query(f"commune_id == '{commune_id}'")

bhepop2_pop_df = context.stage("bhepop2")
bhepop2_pop_df = bhepop2_pop_df.merge(households_with_attributes, on="household_id")
bhepop2_pop_df["household_income"] = (
bhepop2_pop_df["household_income"] * 12 / bhepop2_pop_df["consumption_units"]
)
bhepop2_pop_df = bhepop2_pop_df.query(f"commune_id == '{commune_id}'")

# prepare populations analysis

# create a source from the Filosofi distributions
marginal_distributions_source = QuantitativeMarginalDistributions(
income_df,
"Filosofi",
["size", "family_comp"],
0,
relative_maximum=MAXIMUM_INCOME_FACTOR,
delta_min=1000
)

# check output folder existence
compare_output_path = os.path.join(context.config("output_path"), COMPARE_INCOME_FOLDER)
if not os.path.exists(compare_output_path):
os.mkdir(compare_output_path)

# create an analysis instance
analysis = marginal_distributions_source.compare_with_populations(
{
"Uniform": uniform_pop_df,
"Bhepop2": bhepop2_pop_df
},
feature_name="household_income",
output_folder=compare_output_path
)
analysis.plot_title_format = analysis.plot_title_format + f" \n(commune={commune_id})"

analysis.generate_analysis_plots()
analysis.generate_analysis_error_table()

print(f"Generated compared analysis of income assignation methods in {compare_output_path}")


4 changes: 2 additions & 2 deletions analysis/synthesis/income.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

def configure(context):
acquisition_sample_size = context.config("acquisition_sample_size")
bs.configure(context, "synthesis.population.income", acquisition_sample_size)
bs.configure(context, "synthesis.population.income.selected", acquisition_sample_size)

def execute(context):
acquisition_sample_size = context.config("acquisition_sample_size")
Expand All @@ -17,7 +17,7 @@ def execute(context):
quantiles = []

with context.progress(label = "Processing commute data ...", total = acquisition_sample_size) as progress:
for df_income in bs.get_stages(context, "synthesis.population.income", acquisition_sample_size):
for df_income in bs.get_stages(context, "synthesis.population.income.selected", acquisition_sample_size):
income = 12 * df_income["household_income"] / df_income["consumption_units"]
quantiles.append([income.quantile(p) for p in probabilities])
progress.update()
Expand Down
3 changes: 3 additions & 0 deletions config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,6 @@ config:
# generate_vehicles_file: True
# generate_vehicles_method: fleet_sample
# vehicles_data_year: 2015

# Uncomment to use the bhepop2 package for attributing income
# income_assignation_method: bhepop2
116 changes: 98 additions & 18 deletions data/income/municipality.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,37 +3,41 @@
from sklearn.neighbors import KDTree
import os
import zipfile
from bhepop2.tools import read_filosofi_attributes, filosofi_attributes

"""
Loads and prepares income distributions by municipality:
- Load data with centiles per municipality
- For those which only provide median: Attach another distribution with most similar median
- For those which are missing: Attach the distribution of the municiality with the nearest centroid
- For global distributions
- Load data with centiles per municipality
- For those which only provide median: Attach another distribution with most similar median
- For those which are missing: Attach the distribution of the municiality with the nearest centroid
- For attribute distributions, read the adequate Filosofi sheet and get the percentiles
"""


# for now, only household size and family comp can be inferred from eqasim population
EQASIM_INCOME_ATTRIBUTES = ["size", "family_comp"]

# final columns of the income DataFrame
INCOME_DF_COLUMNS = ["commune_id", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "attribute", "value", "is_imputed", "is_missing", "reference_median"]


def configure(context):
context.config("data_path")
context.stage("data.spatial.municipalities")
context.config("income_com_path", "filosofi_2019/indic-struct-distrib-revenu-2019-COMMUNES.zip")
context.config("income_com_xlsx", "FILO2019_DISP_COM.xlsx")
context.config("income_year", 19)

def execute(context):
# Load income distribution
year = str(context.config("income_year"))

with zipfile.ZipFile("{}/{}".format(
context.config("data_path"), context.config("income_com_path"))) as archive:
with archive.open(context.config("income_com_xlsx")) as f:
df = pd.read_excel(f,
sheet_name = "ENSEMBLE", skiprows = 5
)[["CODGEO"] + [("D%d" % q) + year if q != 5 else "Q2" + year for q in range(1, 10)]]
df.columns = ["commune_id", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9"]
df["reference_median"] = df["q5"].values

# Verify spatial data for education
df_municipalities = context.stage("data.spatial.municipalities")
def _income_distributions_from_filosofi_ensemble_sheet(filsofi_sheets, year, df_municipalities):
requested_communes = set(df_municipalities["commune_id"].unique())

df = filsofi_sheets["ENSEMBLE"][["CODGEO"] + [("D%d" % q) + year if q != 5 else "Q2" + year for q in range(1, 10)]]
df.columns = ["commune_id", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9"]
df.loc[:, "reference_median"] = df["q5"].values

# filter requested communes
df = df[df["commune_id"].isin(requested_communes)]

# Find communes without data
Expand Down Expand Up @@ -86,7 +90,83 @@ def execute(context):
assert len(df) == len(df["commune_id"].unique())
assert len(requested_communes - set(df["commune_id"].unique())) == 0

return df[["commune_id", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "is_imputed", "is_missing", "reference_median"]]
# add attribute and value "all" (ie ENSEMBLE)
df["attribute"] = "all"
df["value"] = "all"

return df[INCOME_DF_COLUMNS]


def _income_distributions_from_filosofi_attribute_sheets(filsofi_sheets, year, df_municipalities, attributes):
requested_communes = set(df_municipalities["commune_id"].unique())

# read attributes
df_with_attributes = read_filosofi_attributes(filsofi_sheets, year, attributes, requested_communes)

df_with_attributes.rename(
columns={
"modality": "value",
"D1": "q1",
"D2": "q2",
"D3": "q3",
"D4": "q4",
"D5": "q5",
"D6": "q6",
"D7": "q7",
"D8": "q8",
"D9": "q9",
},
inplace=True,
)

# add eqasim columns, data is not imputed nor missing
df_with_attributes["is_imputed"] = False
df_with_attributes["is_missing"] = False

return df_with_attributes[INCOME_DF_COLUMNS]


def _read_filosofi_excel(context):

# browse Filosofi attributes, select those available in Eqasim
attributes = []
sheet_list = ["ENSEMBLE"]
for attr in filosofi_attributes:
if attr["name"] in EQASIM_INCOME_ATTRIBUTES:
# stored attributes read from Filosofi
attributes.append(attr)
# build list of sheets to read
sheet_list = sheet_list + [x["sheet"] for x in attr["modalities"]]

# open and read income data file
with zipfile.ZipFile("{}/{}".format(
context.config("data_path"), context.config("income_com_path"))
) as archive:
with archive.open(context.config("income_com_xlsx")) as f:
df = pd.read_excel(f, sheet_name=sheet_list, skiprows=5)

return df, attributes


def execute(context):
# Verify spatial data for education
df_municipalities = context.stage("data.spatial.municipalities")

# Get year of income data
year = str(context.config("income_year"))

# Load income distribution
filosofi_excel, attributes = _read_filosofi_excel(context)

# Read ENSEMBLE sheet: global distributions, by commune
ensemble_distributions = _income_distributions_from_filosofi_ensemble_sheet(filosofi_excel, year, df_municipalities)

# Read attribute sheets: distributions on individuals with specific attribute values
# (ex: sheet TYPMENR_2 corresponds to households with `family_comp`=`Single_wom`)
attribute_distributions = _income_distributions_from_filosofi_attribute_sheets(filosofi_excel, year, df_municipalities, attributes)

return pd.concat([ensemble_distributions, attribute_distributions])


def validate(context):
if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("income_com_path"))):
Expand Down
23 changes: 23 additions & 0 deletions docs/population.md
Original file line number Diff line number Diff line change
Expand Up @@ -343,3 +343,26 @@ To make use of the urban type, the following data is needed:
- Put the downloaded *zip* file into `data/urban_type`, so you will have the file `data/urban_type/UU2020_au_01-01-2023.zip`

Then, you should be able to run the pipeline with the configuration explained above.

### Income

This pipeline allows using the [Bhepop2](https://github.com/tellae/bhepop2) package for income assignation.

By default, Eqasim infers income from the global income distribution by municipality from the Filosofi data set.
An income value is drawn from this distribution, independent of the household characteristics. This method is called
`uniform`.

Bhepop2 uses income distributions on subpopulations. For instance, Filosofi provides distributions depending on household size.
Bhepop2 tries to match all the available distributions, instead of just the global one. This results in more
accurate income assignation on subpopulations, but also on the global synthetic population.
See the [documentation](https://bhepop2.readthedocs.io/en/latest/) for more information on the affectation algorithm.

To use the `bhepop2` method, provide the following config:

```yaml
config:
income_assignation_method: bhepop2
```

Caution, this method will fail on communes where the Filosofi subpopulation distributions are missing. In this case,
we fall back to the `uniform` method.
1 change: 1 addition & 0 deletions documentation/info/collect.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def execute(context):

# Income
df_income_municipality = context.stage("data.income.municipality")
df_income_municipality = df_income_municipality[(df_income_municipality["attribute"] == "all") & (df_income_municipality["value"] == "all")]
df_income_region = context.stage("data.income.region")

info["income"] = {
Expand Down
1 change: 1 addition & 0 deletions documentation/plots/income.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def execute(context):

# Income imputation
df_income = context.stage("data.income.municipality")
df_income = df_income[(df_income["attribute"] == "all") & (df_income["value"] == "all")]
df_imputed = df_income[df_income["is_imputed"]]

plt.figure()
Expand Down
1 change: 1 addition & 0 deletions documentation/shapes.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ def execute(context):

# Spatial income distribution
df_income = context.stage("data.income.municipality")
df_income = df_income[(df_income["attribute"] == "all") & (df_income["value"] == "all")]
df_income = pd.merge(df_communes, df_income, how = "inner", on = "commune_id")
df_income["is_imputed"] = df_income["is_imputed"].astype(int)
df_income["commune_id"] = df_income["commune_id"].astype(str)
Expand Down
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,4 @@ dependencies:

- pip:
- synpp==1.5.1
- bhepop2==2.0.0
4 changes: 2 additions & 2 deletions synthesis/population/enriched.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
def configure(context):
context.stage("synthesis.population.matched")
context.stage("synthesis.population.sampled")
context.stage("synthesis.population.income")
context.stage("synthesis.population.income.selected")

hts = context.config("hts")
context.stage("data.hts.selected", alias = "hts")
Expand Down Expand Up @@ -53,7 +53,7 @@ def execute(context):
]], on = "hts_household_id")

# Attach income
df_income = context.stage("synthesis.population.income")
df_income = context.stage("synthesis.population.income.selected")
df_population = pd.merge(df_population, df_income[[
"household_id", "household_income"
]], on = "household_id")
Expand Down
Loading

0 comments on commit f74bd98

Please sign in to comment.