diff --git a/CHANGELOG.md b/CHANGELOG.md index 47f33f4e..97c4914f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ **Under development** +- feat: functionality to make use of INSEE population projection data +- update: don't remove households with people not living/studying in Île-de-France anymore to be more consistent with other use cases +- fix bug where always one household_id existed twice - Fix read order when exploring files using `glob` - Modes are only written now to `trips.csv` if `mode_choice` is activated - Update to `eqasim-java` commit `7cbe85b` diff --git a/data/census/cleaned.py b/data/census/cleaned.py index 23c0d204..79e6c1cd 100644 --- a/data/census/cleaned.py +++ b/data/census/cleaned.py @@ -93,10 +93,6 @@ def execute(context): # Socioprofessional category df["socioprofessional_class"] = df["CS1"].astype(int) - # Place of work or education - df["work_outside_region"] = df["ILT"].isin(("4", "5", "6")) - df["education_outside_region"] = df["ILETUD"].isin(("4", "5", "6")) - # Consumption units df = pd.merge(df, hts.calculate_consumption_units(df), on = "household_id") @@ -106,6 +102,5 @@ def execute(context): "age", "sex", "couple", "commute_mode", "employed", "studies", "number_of_vehicles", "household_size", - "work_outside_region", "education_outside_region", "consumption_units", "socioprofessional_class" ]] diff --git a/data/census/filtered.py b/data/census/filtered.py index 3c52b342..ecd3bdcd 100644 --- a/data/census/filtered.py +++ b/data/census/filtered.py @@ -14,16 +14,6 @@ def configure(context): def execute(context): df = context.stage("data.census.cleaned") - # We remove people who study or work in another region - f = df["work_outside_region"] | df["education_outside_region"] - remove_ids = df[f]["household_id"].unique() - - initial_households = len(df["household_id"].unique()) - removed_households = len(remove_ids) - - initial_persons = len(df["person_id"].unique()) - removed_persons = np.count_nonzero(df["household_id"].isin(remove_ids)) - # Filter requested codes df_codes = context.stage("data.spatial.codes") @@ -38,44 +28,4 @@ def execute(context): if not excess_iris == {"undefined"}: raise RuntimeError("Found additional IRIS: %s" % excess_iris) - # TODO: This filtering is not really compatible with defining multiple regions - # or departments. This used to be a filter to avoid people going outside of - # Île-de-France, but we should consider removing this filter altogether, or - # find some smarter way (e.g. using OD matrices and filter out people in - # each municipality by the share of outside workers). - df_codes = context.stage("data.spatial.codes") - - if len(df_codes["region_id"].unique()) > 1: - raise RuntimeError(""" - Multiple regions are defined, so the filtering for people going outside - of Île-de-France does not make sense in that case. Consider adjusting the - data.census.filtered stage! - """) - - print( - "Removing %d/%d (%.2f%%) households (with %d/%d persons, %.2f%%) because at least one person is working outside of Île-de-France" % ( - removed_households, initial_households, 100 * removed_households / initial_households, - removed_persons, initial_persons, 100 * removed_persons / initial_persons - )) - - context.set_info("filtered_households_share", removed_households / initial_households) - context.set_info("filtered_persons_share", removed_persons / initial_persons) - - df = df[~df["household_id"].isin(remove_ids)] - - - # Household size - df_size = df[["household_id"]].groupby("household_id").size().reset_index(name = "household_size2") - df = pd.merge(df, df_size) - - f = df["household_size"] != df["household_size2"] - print(np.count_nonzero(f)) - print(df[f]) - - print(df[df["household_id"] == 8958513]) - - assert np.all(df["household_size"] == df["household_size2"]) - print("all good") - exit() - return df diff --git a/data/census/raw.py b/data/census/raw.py index 21663cd2..73eebd4a 100644 --- a/data/census/raw.py +++ b/data/census/raw.py @@ -22,9 +22,7 @@ def configure(context): "COUPLE":"str", "CS1":"str", "DEPT":"str", - "ETUD":"str", - "ILETUD":"str", - "ILT":"str", + "ETUD":"str", "IPONDI":"str", "IRIS":"str", "REGION":"str", diff --git a/synthesis/population/sampled.py b/synthesis/population/sampled.py index c71a69ea..0ec0811b 100644 --- a/synthesis/population/sampled.py +++ b/synthesis/population/sampled.py @@ -25,14 +25,6 @@ def execute(context): sampling_rate = context.config("sampling_rate") random = np.random.RandomState(context.config("random_seed")) - # Household size - df_size = df_census[["household_id"]].groupby("household_id").size().reset_index(name = "household_size2") - df_census = pd.merge(df_census, df_size) - - assert np.all(df_census["household_size"] == df_census["household_size2"]) - print("all good") - exit() - # Perform stochastic rounding for the population (and scale weights) df_rounding = df_census[["household_id", "weight", "household_size"]].drop_duplicates("household_id") df_rounding["multiplicator"] = np.floor(df_rounding["weight"])