diff --git a/CHANGELOG.md b/CHANGELOG.md index 343092bf..10f8649e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ **Under development** +- feat: add `filter_hts` option to disable filtering of the HTS by the requested departments - fix: secondary location model used same random seed in every parallel thread - feat: add a new method for attributing income to housholds using the bhepop2 package - fix: fixed special case in repairing ENTD for completely overlapping trips diff --git a/data/hts/edgt_44/filtered.py b/data/hts/edgt_44/filtered.py index 993a602f..df52ab89 100644 --- a/data/hts/edgt_44/filtered.py +++ b/data/hts/edgt_44/filtered.py @@ -8,28 +8,31 @@ def configure(context): context.stage("data.hts.edgt_44.cleaned") context.stage("data.spatial.codes") - + + context.config("filter_hts",True) def execute(context): + filter_edgt = context.config("filter_hts") df_codes = context.stage("data.spatial.codes") df_households, df_persons, df_trips = context.stage("data.hts.edgt_44.cleaned") - # Filter for non-residents - requested_departments = df_codes["departement_id"].unique() - f = df_persons["departement_id"].astype(str).isin(requested_departments) - df_persons = df_persons[f] + if filter_edgt : + # Filter for non-residents + requested_departments = df_codes["departement_id"].unique() + f = df_persons["departement_id"].astype(str).isin(requested_departments) + df_persons = df_persons[f] - # Filter for people going outside of the area - remove_ids = set() + # Filter for people going outside of the area + remove_ids = set() - remove_ids |= set(df_trips[ - ~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments) - ]["person_id"].unique()) + remove_ids |= set(df_trips[ + ~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments) + ]["person_id"].unique()) - df_persons =
df_persons[~df_persons["person_id"].isin(remove_ids)] + df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)] - # Only keep trips and households that still have a person - df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())] - df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])] + # Only keep trips and households that still have a person + df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())] + df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])] # Finish up df_households = df_households[hts.HOUSEHOLD_COLUMNS] diff --git a/data/hts/edgt_lyon/filtered.py b/data/hts/edgt_lyon/filtered.py index 7f0af4ab..cf957685 100644 --- a/data/hts/edgt_lyon/filtered.py +++ b/data/hts/edgt_lyon/filtered.py @@ -18,28 +18,31 @@ def configure(context): raise RuntimeError("Unknown Lyon EDGT source (only 'cerema' and 'adisp' are supported): %s" % edgt_lyon_source) context.stage("data.spatial.codes") - + + context.config("filter_hts",True) def execute(context): + filter_edgt = context.config("filter_hts") df_codes = context.stage("data.spatial.codes") df_households, df_persons, df_trips = context.stage("data.hts.edgt_lyon.cleaned") - # Filter for non-residents - requested_departments = df_codes["departement_id"].unique() - f = df_persons["departement_id"].astype(str).isin(requested_departments) - df_persons = df_persons[f] + if filter_edgt : + # Filter for non-residents + requested_departments = df_codes["departement_id"].unique() + f = df_persons["departement_id"].astype(str).isin(requested_departments) + df_persons = df_persons[f] - # Filter for people going outside of the area - remove_ids = set() + # Filter for people going outside of the area + remove_ids = set() - remove_ids |= set(df_trips[ - ~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | 
~df_trips["destination_departement_id"].astype(str).isin(requested_departments) - ]["person_id"].unique()) + remove_ids |= set(df_trips[ + ~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments) + ]["person_id"].unique()) - df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)] + df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)] - # Only keep trips and households that still have a person - df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())] - df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])] + # Only keep trips and households that still have a person + df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())] + df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])] # Finish up df_households = df_households[hts.HOUSEHOLD_COLUMNS] diff --git a/data/hts/egt/filtered.py b/data/hts/egt/filtered.py index c2286055..29f06604 100644 --- a/data/hts/egt/filtered.py +++ b/data/hts/egt/filtered.py @@ -10,32 +10,35 @@ def configure(context): context.stage("data.hts.egt.cleaned") context.stage("data.spatial.codes") + context.config("filter_hts",True) def execute(context): + filter_egt = context.config("filter_hts") df_codes = context.stage("data.spatial.codes") df_households, df_persons, df_trips = context.stage("data.hts.egt.cleaned") - # Filter for non-residents - requested_departments = df_codes["departement_id"].unique() - f = df_persons["departement_id"].astype(str).isin(requested_departments) # pandas bug! - df_persons = df_persons[f] + if filter_egt : + # Filter for non-residents + requested_departments = df_codes["departement_id"].unique() + f = df_persons["departement_id"].astype(str).isin(requested_departments) # pandas bug! 
+ df_persons = df_persons[f] - # Filter for people going outside of the area (because they have NaN distances) - remove_ids = set() + # Filter for people going outside of the area (because they have NaN distances) + remove_ids = set() - remove_ids |= set(df_trips[ - ~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments) - ]["person_id"].unique()) + remove_ids |= set(df_trips[ + ~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments) + ]["person_id"].unique()) - remove_ids |= set(df_persons[ - ~df_persons["departement_id"].isin(requested_departments) - ]) + remove_ids |= set(df_persons[ + ~df_persons["departement_id"].isin(requested_departments) + ]) - df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)] + df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)] - # Only keep trips and households that still have a person - df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())] - df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])] + # Only keep trips and households that still have a person + df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())] + df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])] # Finish up household_columns = hts.HOUSEHOLD_COLUMNS + ["income_class"] + ["egt_household_id"] diff --git a/data/hts/entd/cleaned.py b/data/hts/entd/cleaned.py index a8814c94..51bfd966 100644 --- a/data/hts/entd/cleaned.py +++ b/data/hts/entd/cleaned.py @@ -248,6 +248,9 @@ def execute(context): # Socioprofessional class df_persons["socioprofessional_class"] = df_persons["CS24"].fillna(80).astype(int) // 10 + # Fix activity types (because of 1 inconsistent ENTD data) + hts.fix_activity_types(df_trips) + return 
df_households, df_persons, df_trips def calculate_income_class(df): diff --git a/data/hts/entd/filtered.py b/data/hts/entd/filtered.py index 9fc4793c..e9bb2ca1 100644 --- a/data/hts/entd/filtered.py +++ b/data/hts/entd/filtered.py @@ -10,27 +10,30 @@ def configure(context): context.stage("data.hts.entd.cleaned") context.stage("data.spatial.codes") + context.config("filter_hts",True) def execute(context): + filter_entd = context.config("filter_hts") df_codes = context.stage("data.spatial.codes") df_households, df_persons, df_trips = context.stage("data.hts.entd.cleaned") - # Filter for non-residents - requested_departments = df_codes["departement_id"].unique() - f = df_persons["departement_id"].astype(str).isin(requested_departments) - df_persons = df_persons[f] + if filter_entd : + # Filter for non-residents + requested_departments = df_codes["departement_id"].unique() + f = df_persons["departement_id"].astype(str).isin(requested_departments) + df_persons = df_persons[f] - # Filter for people going outside of the area (because they have NaN distances) - remove_ids = set() + # Filter for people going outside of the area (because they have NaN distances) + remove_ids = set() - remove_ids |= set(df_trips[ - ~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments) - ]["person_id"].unique()) + remove_ids |= set(df_trips[ + ~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments) + ]["person_id"].unique()) - df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)] + df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)] - # Only keep trips and households that still have a person - df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())] - df_households = 
df_households[df_households["household_id"].isin(df_persons["household_id"])] + # Only keep trips and households that still have a person + df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())] + df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])] # Finish up df_households = df_households[hts.HOUSEHOLD_COLUMNS + ["urban_type", "income_class"]]