Skip to content

Commit

Permalink
feat : option param to remove filtering for requesting dep in hts (#249)
Browse files Browse the repository at this point in the history
* feat : option param to remove filtering for requesting dep in hts

* update changelog

* change param default value

---------

Co-authored-by: Marie Laurent <[email protected]>
  • Loading branch information
MarieMcLaurent and Marie Laurent authored Aug 20, 2024
1 parent be43c17 commit f82e503
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 57 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

**Under development**

- feat: optional `filter_hts` parameter to disable filtering of the HTS by the requested departments
- fix: secondary location model used same random seed in every parallel thread
- feat: add a new method for attributing income to households using the bhepop2 package
- fix: fixed special case in repairing ENTD for completely overlapping trips
Expand Down
31 changes: 17 additions & 14 deletions data/hts/edgt_44/filtered.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,28 +8,31 @@
def configure(context):
context.stage("data.hts.edgt_44.cleaned")
context.stage("data.spatial.codes")


context.config("filter_hts",True)
def execute(context):
filter_edgt = context.config("filter_hts")
df_codes = context.stage("data.spatial.codes")
df_households, df_persons, df_trips = context.stage("data.hts.edgt_44.cleaned")

# Filter for non-residents
requested_departments = df_codes["departement_id"].unique()
f = df_persons["departement_id"].astype(str).isin(requested_departments)
df_persons = df_persons[f]
if filter_edgt :
# Filter for non-residents
requested_departments = df_codes["departement_id"].unique()
f = df_persons["departement_id"].astype(str).isin(requested_departments)
df_persons = df_persons[f]

# Filter for people going outside of the area
remove_ids = set()
# Filter for people going outside of the area
remove_ids = set()

remove_ids |= set(df_trips[
~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments)
]["person_id"].unique())
remove_ids |= set(df_trips[
~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments)
]["person_id"].unique())

df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)]
df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)]

# Only keep trips and households that still have a person
df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())]
df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]
# Only keep trips and households that still have a person
df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())]
df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]

# Finish up
df_households = df_households[hts.HOUSEHOLD_COLUMNS]
Expand Down
31 changes: 17 additions & 14 deletions data/hts/edgt_lyon/filtered.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,28 +18,31 @@ def configure(context):
raise RuntimeError("Unknown Lyon EDGT source (only 'cerema' and 'adisp' are supported): %s" % edgt_lyon_source)

context.stage("data.spatial.codes")


context.config("filter_hts",True)
def execute(context):
filter_edgt = context.config("filter_hts")
df_codes = context.stage("data.spatial.codes")
df_households, df_persons, df_trips = context.stage("data.hts.edgt_lyon.cleaned")

# Filter for non-residents
requested_departments = df_codes["departement_id"].unique()
f = df_persons["departement_id"].astype(str).isin(requested_departments)
df_persons = df_persons[f]
if filter_edgt :
# Filter for non-residents
requested_departments = df_codes["departement_id"].unique()
f = df_persons["departement_id"].astype(str).isin(requested_departments)
df_persons = df_persons[f]

# Filter for people going outside of the area
remove_ids = set()
# Filter for people going outside of the area
remove_ids = set()

remove_ids |= set(df_trips[
~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments)
]["person_id"].unique())
remove_ids |= set(df_trips[
~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments)
]["person_id"].unique())

df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)]
df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)]

# Only keep trips and households that still have a person
df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())]
df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]
# Only keep trips and households that still have a person
df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())]
df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]

# Finish up
df_households = df_households[hts.HOUSEHOLD_COLUMNS]
Expand Down
35 changes: 19 additions & 16 deletions data/hts/egt/filtered.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,32 +10,35 @@ def configure(context):
context.stage("data.hts.egt.cleaned")
context.stage("data.spatial.codes")

context.config("filter_hts",True)
def execute(context):
filter_egt = context.config("filter_hts")
df_codes = context.stage("data.spatial.codes")

df_households, df_persons, df_trips = context.stage("data.hts.egt.cleaned")

# Filter for non-residents
requested_departments = df_codes["departement_id"].unique()
f = df_persons["departement_id"].astype(str).isin(requested_departments) # pandas bug!
df_persons = df_persons[f]
if filter_egt :
# Filter for non-residents
requested_departments = df_codes["departement_id"].unique()
f = df_persons["departement_id"].astype(str).isin(requested_departments) # pandas bug!
df_persons = df_persons[f]

# Filter for people going outside of the area (because they have NaN distances)
remove_ids = set()
# Filter for people going outside of the area (because they have NaN distances)
remove_ids = set()

remove_ids |= set(df_trips[
~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments)
]["person_id"].unique())
remove_ids |= set(df_trips[
~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments)
]["person_id"].unique())

remove_ids |= set(df_persons[
~df_persons["departement_id"].isin(requested_departments)
])
remove_ids |= set(df_persons[
~df_persons["departement_id"].isin(requested_departments)
])

df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)]
df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)]

# Only keep trips and households that still have a person
df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())]
df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]
# Only keep trips and households that still have a person
df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())]
df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]

# Finish up
household_columns = hts.HOUSEHOLD_COLUMNS + ["income_class"] + ["egt_household_id"]
Expand Down
3 changes: 3 additions & 0 deletions data/hts/entd/cleaned.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,9 @@ def execute(context):
# Socioprofessional class
df_persons["socioprofessional_class"] = df_persons["CS24"].fillna(80).astype(int) // 10

# Fix activity types (because of 1 inconsistent ENTD data)
hts.fix_activity_types(df_trips)

return df_households, df_persons, df_trips

def calculate_income_class(df):
Expand Down
29 changes: 16 additions & 13 deletions data/hts/entd/filtered.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,27 +10,30 @@ def configure(context):
context.stage("data.hts.entd.cleaned")
context.stage("data.spatial.codes")

context.config("filter_hts",True)
def execute(context):
filter_entd = context.config("filter_hts")
df_codes = context.stage("data.spatial.codes")
df_households, df_persons, df_trips = context.stage("data.hts.entd.cleaned")

# Filter for non-residents
requested_departments = df_codes["departement_id"].unique()
f = df_persons["departement_id"].astype(str).isin(requested_departments)
df_persons = df_persons[f]
if filter_entd :
# Filter for non-residents
requested_departments = df_codes["departement_id"].unique()
f = df_persons["departement_id"].astype(str).isin(requested_departments)
df_persons = df_persons[f]

# Filter for people going outside of the area (because they have NaN distances)
remove_ids = set()
# Filter for people going outside of the area (because they have NaN distances)
remove_ids = set()

remove_ids |= set(df_trips[
~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments)
]["person_id"].unique())
remove_ids |= set(df_trips[
~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments)
]["person_id"].unique())

df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)]
df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)]

# Only keep trips and households that still have a person
df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())]
df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]
# Only keep trips and households that still have a person
df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())]
df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]

# Finish up
df_households = df_households[hts.HOUSEHOLD_COLUMNS + ["urban_type", "income_class"]]
Expand Down

0 comments on commit f82e503

Please sign in to comment.