Skip to content

Commit

Permalink
Pipeline: Extends CHR back to 2011 (#3895)
Browse files Browse the repository at this point in the history
# Description and Motivation
<!--- bulleted, high level items. use keywords (eg "closes #144" or
"fixes #4323") -->

- checks in source data excel downloads from CHR to our data/
- updates pipeline to handle formatting/labeling inconsistencies between
each year of data

## Has this been tested? How?

- tests updated and passing

## Types of changes

(leave all that apply)

- New content or feature
- Refactor / chore
  • Loading branch information
benhammondmusic authored Dec 30, 2024
1 parent e18e3d5 commit 3174573
Show file tree
Hide file tree
Showing 17 changed files with 149,685 additions and 58,147 deletions.
3 changes: 2 additions & 1 deletion cspell.config.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
"frontend/uncompressed_assets/",
"frontend/src/assets/",
"frontend_server/",
"composer-local-dev/"
"composer-local-dev/",
"**/[^.]*"
],
"allowCompoundWords": true,
"minWordLength": 3,
Expand Down
Binary file not shown.
Binary file not shown.
Binary file added data/chr/2013CountyHealthRankingsNationalData.xls
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added data/chr/2017CountyHealthRankingsData.xls
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
202 changes: 143 additions & 59 deletions python/datasources/chr.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,34 +4,119 @@
from typing import List, Dict, Optional, Tuple
import pandas as pd

# NOTE: col values for numerator and denominator are NULL
"""
Files downloaded from County Health Rankings website
Recent years:
- countyhealthrankings.org/health-data/methodology-and-sources/data-documentation
Older years:
- countyhealthrankings.org/health-data/methodology-and-sources/data-documentation/national-data-documentation-2010-2022
File names vary as do the download link text, but it's generally the first file in each section.
The text contains the words `National Data`
"""

CHR_DIR = "chr"

CHR_FILE_LOOKUP = {
"2011": "2011 County Health Rankings National Data_v2_0.xls",
"2012": "2012 County Health Rankings National Data_v2_0.xls",
"2013": "2013CountyHealthRankingsNationalData.xls",
"2014": "2014 County Health Rankings Data - v6.xls",
"2015": "2015 County Health Rankings Data - v3.xls",
"2016": "2016 County Health Rankings Data - v3.xls",
"2017": "2017CountyHealthRankingsData.xls",
"2018": "2018 County Health Rankings Data - v2.xls",
"2019": "2019 County Health Rankings Data - v3.xls",
"2020": "2020 County Health Rankings Data - v2.xlsx",
"2021": "2021 County Health Rankings Data - v1.xlsx",
"2022": "2022 County Health Rankings Data - v1.xlsx",
"2023": "2023 County Health Rankings Data - v2.xlsx",
"2024": "2024_county_health_release_data_-_v1.xlsx",
}


def get_het_to_source_select_topic_all_to_race_prefix_map(year: str | None = None) -> dict[str, dict[str, str | None]]:
het_to_source_select_topic_all_to_race_prefix_map: dict[str, dict[str, str | None]] = {}

if year is None or int(year) >= 2012:
het_to_source_select_topic_all_to_race_prefix_map[std_col.EXCESSIVE_DRINKING_PREFIX] = {
"% Excessive Drinking": None
}

if year is None or int(year) >= 2015:
het_to_source_select_topic_all_to_race_prefix_map[std_col.PREVENTABLE_HOSP_PREFIX] = {
"Preventable Hosp. Rate": None
}

if year is None or year == "2019":
het_to_source_select_topic_all_to_race_prefix_map[std_col.PREVENTABLE_HOSP_PREFIX] = {
"Preventable Hosp. Rate": "Preventable Hosp. Rate"
}

if year is None or int(year) > 2019:
het_to_source_select_topic_all_to_race_prefix_map[std_col.PREVENTABLE_HOSP_PREFIX] = {
"Preventable Hospitalization Rate": "Preventable Hosp. Rate"
}

return het_to_source_select_topic_all_to_race_prefix_map


def get_het_to_source_additional_topic_all_to_race_prefix_map(
year: str | None = None,
) -> dict[str, dict[str, str | None]]:
het_to_source_additional_topic_all_to_race_prefix_map: dict[str, dict[str, str | None]] = {}

if year is None or int(year) >= 2011:
het_to_source_additional_topic_all_to_race_prefix_map[std_col.DIABETES_PREFIX] = {"Diabetes": None}

het_to_source_select_topic_all_to_race_prefix_map: Dict[str, Dict[str, Optional[str]]] = {
std_col.PREVENTABLE_HOSP_PREFIX: {"Preventable Hospitalization Rate": "Preventable Hosp. Rate"},
std_col.EXCESSIVE_DRINKING_PREFIX: {"% Excessive Drinking": None},
if year is None or int(year) >= 2012:
het_to_source_additional_topic_all_to_race_prefix_map[std_col.DIABETES_PREFIX] = {"% diabetic": None}

if year is None or int(year) >= 2014:
het_to_source_additional_topic_all_to_race_prefix_map[std_col.DIABETES_PREFIX] = {"% Diabetic": None}

if year is None or int(year) >= 2016:
het_to_source_additional_topic_all_to_race_prefix_map[std_col.FREQUENT_MENTAL_DISTRESS_PREFIX] = {
"% Frequent Mental Distress": None
}

if year is None or int(year) >= 2020:
het_to_source_additional_topic_all_to_race_prefix_map[std_col.SUICIDE_PREFIX] = {"Crude Rate": "Suicide Rate"}
het_to_source_additional_topic_all_to_race_prefix_map[std_col.DIABETES_PREFIX] = {
"% Adults with Diabetes": None
}

if year is None or int(year) >= 2023:
het_to_source_additional_topic_all_to_race_prefix_map[std_col.VOTER_PARTICIPATION_PREFIX] = {
"% Voter Turnout": None
}

return het_to_source_additional_topic_all_to_race_prefix_map


default_source_race_to_id_map = {
"(AIAN)": std_col.Race.AIAN_NH.value,
"(Asian)": std_col.Race.API_NH.value,
"(Black)": std_col.Race.BLACK_NH.value,
"(Hispanic)": std_col.Race.HISP.value,
"(White)": std_col.Race.WHITE_NH.value,
}

het_to_source_additional_topic_all_to_race_prefix_map: Dict[str, Dict[str, Optional[str]]] = {
std_col.SUICIDE_PREFIX: {"Crude Rate": "Suicide Rate"},
std_col.FREQUENT_MENTAL_DISTRESS_PREFIX: {"% Frequent Mental Distress": None},
std_col.DIABETES_PREFIX: {"% Adults with Diabetes": None},
std_col.VOTER_PARTICIPATION_PREFIX: {"% Voter Turnout": None},
source_race_few_to_id_map = {
"(Black)": std_col.Race.BLACK_NH.value,
"(Hispanic)": std_col.Race.HISP.value,
"(White)": std_col.Race.WHITE_NH.value,
}

# frequent mental distress
source_race_to_id_map = {
source_race_w_to_id_map = {
"(AIAN)": std_col.Race.AIAN_NH.value,
"(Asian)": std_col.Race.API_NH.value,
"(Black)": std_col.Race.BLACK_NH.value,
"(Hispanic)": std_col.Race.HISP.value,
"(White)": std_col.Race.WHITE_NH.value,
"(white)": std_col.Race.WHITE_NH.value,
}

# suicide 2024
source_nh_race_code_to_id_map_2024 = {
source_nh_race_to_id_map = {
"(Hispanic (all races))": std_col.Race.HISP.value,
"(Non-Hispanic AIAN)": std_col.Race.AIAN_NH.value,
"(Non-Hispanic Asian)": std_col.Race.ASIAN_NH.value,
Expand All @@ -41,19 +126,25 @@
"(Non-Hispanic White)": std_col.Race.WHITE_NH.value,
}

# suicide 2023
source_race_code_to_id_map_2023 = {
"(AIAN)": std_col.Race.AIAN_NH.value,
"(Asian)": std_col.Race.ASIAN_NH.value,
"(Black)": std_col.Race.BLACK_NH.value,
"(Hispanic)": std_col.Race.HISP.value,
"(White)": std_col.Race.WHITE_NH.value,
}

def get_race_map(year: str, sheet_name: str) -> Dict[str, str]:
"""
Returns a dict of CHR source race names to the HET standard race code.
Some source years/sheets use inconsistent labels for the column names.
"""
special_race_maps = {
("2019", "Ranked Measure Data"): source_race_few_to_id_map,
("2022", "Ranked Measure Data"): source_race_w_to_id_map,
("2022", "Additional Measure Data"): source_race_w_to_id_map,
("2024", "Additional Measure Data"): source_nh_race_to_id_map,
}
return special_race_maps.get((year, sheet_name), default_source_race_to_id_map)


source_fips_col = "FIPS"
source_per_100k = "Rate"
source_pct_rate = "%"
source_pct_rate_cols_no_symbol = ["Diabetes"]


class CHRData(DataSource):
Expand All @@ -75,7 +166,7 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs):

dfs = []

for year in ["2023", "2024"]:
for year in CHR_FILE_LOOKUP.keys():

main_sheet_name = "Select Measure Data" if year == "2024" else "Ranked Measure Data"
main_source_df = get_df_from_chr_excel_sheet(year, main_sheet_name)
Expand All @@ -87,14 +178,15 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs):
}
)

# # drop national and state-level rows
year_df = year_df[~year_df[std_col.COUNTY_FIPS_COL].str.endswith("000")]
# # drop any national and state-level rows
year_df = year_df[~year_df[std_col.COUNTY_FIPS_COL].astype(str).str.endswith("000")]
melt_map = get_melt_map(year)
year_df = dataset_utils.melt_to_het_style_df(
year_df, std_col.RACE_CATEGORY_ID_COL, [std_col.COUNTY_FIPS_COL], melt_map
)
year_df[std_col.STATE_FIPS_COL] = year_df[std_col.COUNTY_FIPS_COL].str[:2]
year_df[std_col.TIME_PERIOD_COL] = year
year_df.insert(loc=0, column=std_col.TIME_PERIOD_COL, value=year)

dfs.append(year_df)

df = pd.concat(dfs)
Expand All @@ -116,8 +208,8 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs):
float_cols = timeview_float_cols_map[timeview]
df_for_bq, float_cols = convert_some_pct_rate_to_100k(df, float_cols)

topic_prefixes = list(het_to_source_select_topic_all_to_race_prefix_map.keys()) + list(
het_to_source_additional_topic_all_to_race_prefix_map.keys()
topic_prefixes = list(get_het_to_source_select_topic_all_to_race_prefix_map().keys()) + list(
get_het_to_source_additional_topic_all_to_race_prefix_map().keys()
)
topic_prefixes.append("chr_population")

Expand All @@ -138,19 +230,16 @@ def get_source_usecols(year: str, sheet_name: str) -> List[str]:
source_usecols = [source_fips_col]

sheet_topic_map: Dict[str, Dict[str, Optional[str]]] = {}
sheet_race_map: Dict[str, str] = {}
sheet_race_map = get_race_map(year, sheet_name)
if sheet_name in ["Ranked Measure Data", "Select Measure Data"]:
sheet_topic_map = het_to_source_select_topic_all_to_race_prefix_map
sheet_race_map = source_race_to_id_map
sheet_topic_map = get_het_to_source_select_topic_all_to_race_prefix_map(year)
if sheet_name == "Additional Measure Data":
sheet_topic_map = het_to_source_additional_topic_all_to_race_prefix_map
sheet_race_map = source_nh_race_code_to_id_map_2024 if year == "2024" else source_race_code_to_id_map_2023
sheet_topic_map = get_het_to_source_additional_topic_all_to_race_prefix_map(year)

for source_topic_all_to_race_prefix_map in sheet_topic_map.values():
for source_topic, source_topic_race_prefix in source_topic_all_to_race_prefix_map.items():
source_usecols.append(source_topic)

# some topics only have ALLs
if source_topic_race_prefix is not None:
for race_suffix in sheet_race_map.keys():
source_usecols.append(f"{source_topic_race_prefix} {race_suffix}")
Expand All @@ -167,49 +256,49 @@ def get_melt_map(year: str) -> Dict[str, Dict[str, str]]:
dict: A nested dict
"""

race_code_to_id_map = source_nh_race_code_to_id_map_2024 if year == "2024" else source_race_code_to_id_map_2023

melt_map: Dict[str, Dict[str, str]] = {}
# each topic get its own sub-mapping
for het_prefix, source_all_race_map in het_to_source_select_topic_all_to_race_prefix_map.items():
select_topic_melt_map: Dict[str, str] = {}
# maps the sources by race topic column name to the needed HET race column values

ranked_race_code_to_id_map = get_race_map(year, "Ranked Measure Data")
for het_prefix, source_all_race_map in get_het_to_source_select_topic_all_to_race_prefix_map(year).items():
topic_melt_map: Dict[str, str] = {}

for source_all, source_race_prefix in source_all_race_map.items():
select_topic_melt_map[source_all] = std_col.Race.ALL.value
topic_melt_map[source_all] = std_col.Race.ALL.value

# some topics only have ALLs
# some topics have by race columns
if source_race_prefix is not None:
for source_race_suffix, het_race_id in source_race_to_id_map.items():
select_topic_melt_map[f"{source_race_prefix} {source_race_suffix}"] = het_race_id
for source_race_suffix, het_race_id in ranked_race_code_to_id_map.items():
topic_melt_map[f"{source_race_prefix} {source_race_suffix}"] = het_race_id

# assign 100k or pct_rate as needed
rate_suffix = ""
source_all_col = list(source_all_race_map.keys())[0]
if source_per_100k in source_all_col:
rate_suffix = std_col.PER_100K_SUFFIX
if source_pct_rate in source_all_col:
if source_pct_rate in source_all_col or source_all_col in source_pct_rate_cols_no_symbol:
rate_suffix = std_col.PCT_RATE_SUFFIX

# set this metrics sub melt map
melt_map[f"{het_prefix}_{rate_suffix}"] = select_topic_melt_map
melt_map[f"{het_prefix}_{rate_suffix}"] = topic_melt_map

for het_prefix, source_all_race_map in het_to_source_additional_topic_all_to_race_prefix_map.items():
additional_race_code_to_id_map = get_race_map(year, "Additional Measure Data")
for het_prefix, source_all_race_map in get_het_to_source_additional_topic_all_to_race_prefix_map(year).items():
additional_topic_melt_map: Dict[str, str] = {}
# maps the sources by race topic column name to the needed HET race column values
for source_all, source_race_prefix in source_all_race_map.items():
additional_topic_melt_map[source_all] = std_col.Race.ALL.value

# some topics only have ALLs
if source_race_prefix is not None:
for source_race_suffix, het_race_id in race_code_to_id_map.items():
for source_race_suffix, het_race_id in additional_race_code_to_id_map.items():
additional_topic_melt_map[f"{source_race_prefix} {source_race_suffix}"] = het_race_id

# assign 100k or pct_rate as needed
rate_suffix = ""
source_all_col = list(source_all_race_map.keys())[0]
if source_per_100k in source_all_col:
rate_suffix = std_col.PER_100K_SUFFIX
if source_pct_rate in source_all_col:
if source_pct_rate in source_all_col or source_all_col in source_pct_rate_cols_no_symbol:
rate_suffix = std_col.PCT_RATE_SUFFIX

# set this metrics sub melt map
Expand All @@ -231,15 +320,15 @@ def get_float_cols() -> Dict[str, List[str]]:
historical_float_cols = []

# include all numerical columns in the time map
all_topics = list(het_to_source_select_topic_all_to_race_prefix_map.keys()) + list(
het_to_source_additional_topic_all_to_race_prefix_map.keys()
all_topics = list(get_het_to_source_select_topic_all_to_race_prefix_map().keys()) + list(
get_het_to_source_additional_topic_all_to_race_prefix_map().keys()
)
for topic_prefix in all_topics:

# assign 100k or pct_rate as needed based on the source col name
source_dict = {
**het_to_source_select_topic_all_to_race_prefix_map,
**het_to_source_additional_topic_all_to_race_prefix_map,
**get_het_to_source_select_topic_all_to_race_prefix_map(),
**get_het_to_source_additional_topic_all_to_race_prefix_map(),
}.get(topic_prefix)

if source_dict is None:
Expand All @@ -263,12 +352,7 @@ def get_float_cols() -> Dict[str, List[str]]:
def get_df_from_chr_excel_sheet(year: str, sheet_name: str) -> pd.DataFrame:
source_usecols = get_source_usecols(year, sheet_name)

file_name_lookup = {
"2024": "2024_county_health_release_data_-_v1.xlsx",
"2023": "2023 County Health Rankings Data - v2.xlsx",
}

file_name = file_name_lookup[year]
file_name = CHR_FILE_LOOKUP[year]

return gcs_to_bq_util.load_xlsx_as_df_from_data_dir(
CHR_DIR,
Expand Down
Loading

0 comments on commit 3174573

Please sign in to comment.