Pipeline: Extends CHR back to 2011 (#3895)

# Description and Motivation  - checks in source data excel downloads from CHR to our data/ - updates pipeline to handle formatting/labeling inconsistencies between each year of data ## Has this been tested? How? - tests updated and passing ## Types of changes (leave all that apply) - New content or feature - Refactor / chore
SatcherInstitute · Dec 30, 2024 · 3174573 · 3174573
1 parent e18e3d5
commit 3174573
Show file tree

Hide file tree

Showing 17 changed files with 149,685 additions and 58,147 deletions.
diff --git a/cspell.config.json b/cspell.config.json
@@ -12,7 +12,8 @@
 		"frontend/uncompressed_assets/",
 		"frontend/src/assets/",
 		"frontend_server/",
-		"composer-local-dev/"
+		"composer-local-dev/",
+		"**/[^.]*"
 	],
 	"allowCompoundWords": true,
 	"minWordLength": 3,

diff --git a/data/chr/2011 County Health Rankings National Data_v2_0.xls b/data/chr/2011 County Health Rankings National Data_v2_0.xls
diff --git a/data/chr/2012 County Health Rankings National Data_v2_0.xls b/data/chr/2012 County Health Rankings National Data_v2_0.xls
diff --git a/data/chr/2013CountyHealthRankingsNationalData.xls b/data/chr/2013CountyHealthRankingsNationalData.xls
diff --git a/data/chr/2014 County Health Rankings Data - v6.xls b/data/chr/2014 County Health Rankings Data - v6.xls
diff --git a/data/chr/2015 County Health Rankings Data - v3.xls b/data/chr/2015 County Health Rankings Data - v3.xls
diff --git a/data/chr/2016 County Health Rankings Data - v3.xls b/data/chr/2016 County Health Rankings Data - v3.xls
diff --git a/data/chr/2017CountyHealthRankingsData.xls b/data/chr/2017CountyHealthRankingsData.xls
diff --git a/data/chr/2018 County Health Rankings Data - v2.xls b/data/chr/2018 County Health Rankings Data - v2.xls
diff --git a/data/chr/2019 County Health Rankings Data - v3.xls b/data/chr/2019 County Health Rankings Data - v3.xls
diff --git a/data/chr/2020 County Health Rankings Data - v2.xlsx b/data/chr/2020 County Health Rankings Data - v2.xlsx
diff --git a/data/chr/2021 County Health Rankings Data - v1.xlsx b/data/chr/2021 County Health Rankings Data - v1.xlsx
diff --git a/data/chr/2022 County Health Rankings Data - v1.xlsx b/data/chr/2022 County Health Rankings Data - v1.xlsx
diff --git a/python/datasources/chr.py b/python/datasources/chr.py
@@ -4,34 +4,119 @@
 from typing import List, Dict, Optional, Tuple
 import pandas as pd
 
-# NOTE: col values for numerator and denominator are NULL
+"""
+Files downloaded from County Health Rankings website
+Recent years:
+- countyhealthrankings.org/health-data/methodology-and-sources/data-documentation
+Older years:
+- countyhealthrankings.org/health-data/methodology-and-sources/data-documentation/national-data-documentation-2010-2022
+
+File names vary as do the download link text, but it's generally the first file in each section.
+The text contains the words `National Data`
+"""
 
 CHR_DIR = "chr"
 
+CHR_FILE_LOOKUP = {
+    "2011": "2011 County Health Rankings National Data_v2_0.xls",
+    "2012": "2012 County Health Rankings National Data_v2_0.xls",
+    "2013": "2013CountyHealthRankingsNationalData.xls",
+    "2014": "2014 County Health Rankings Data - v6.xls",
+    "2015": "2015 County Health Rankings Data - v3.xls",
+    "2016": "2016 County Health Rankings Data - v3.xls",
+    "2017": "2017CountyHealthRankingsData.xls",
+    "2018": "2018 County Health Rankings Data - v2.xls",
+    "2019": "2019 County Health Rankings Data - v3.xls",
+    "2020": "2020 County Health Rankings Data - v2.xlsx",
+    "2021": "2021 County Health Rankings Data - v1.xlsx",
+    "2022": "2022 County Health Rankings Data - v1.xlsx",
+    "2023": "2023 County Health Rankings Data - v2.xlsx",
+    "2024": "2024_county_health_release_data_-_v1.xlsx",
+}
+
+
+def get_het_to_source_select_topic_all_to_race_prefix_map(year: str | None = None) -> dict[str, dict[str, str | None]]:
+    het_to_source_select_topic_all_to_race_prefix_map: dict[str, dict[str, str | None]] = {}
+
+    if year is None or int(year) >= 2012:
+        het_to_source_select_topic_all_to_race_prefix_map[std_col.EXCESSIVE_DRINKING_PREFIX] = {
+            "% Excessive Drinking": None
+        }
+
+    if year is None or int(year) >= 2015:
+        het_to_source_select_topic_all_to_race_prefix_map[std_col.PREVENTABLE_HOSP_PREFIX] = {
+            "Preventable Hosp. Rate": None
+        }
+
+    if year is None or year == "2019":
+        het_to_source_select_topic_all_to_race_prefix_map[std_col.PREVENTABLE_HOSP_PREFIX] = {
+            "Preventable Hosp. Rate": "Preventable Hosp. Rate"
+        }
+
+    if year is None or int(year) > 2019:
+        het_to_source_select_topic_all_to_race_prefix_map[std_col.PREVENTABLE_HOSP_PREFIX] = {
+            "Preventable Hospitalization Rate": "Preventable Hosp. Rate"
+        }
+
+    return het_to_source_select_topic_all_to_race_prefix_map
+
+
+def get_het_to_source_additional_topic_all_to_race_prefix_map(
+    year: str | None = None,
+) -> dict[str, dict[str, str | None]]:
+    het_to_source_additional_topic_all_to_race_prefix_map: dict[str, dict[str, str | None]] = {}
+
+    if year is None or int(year) >= 2011:
+        het_to_source_additional_topic_all_to_race_prefix_map[std_col.DIABETES_PREFIX] = {"Diabetes": None}
 
-het_to_source_select_topic_all_to_race_prefix_map: Dict[str, Dict[str, Optional[str]]] = {
-    std_col.PREVENTABLE_HOSP_PREFIX: {"Preventable Hospitalization Rate": "Preventable Hosp. Rate"},
-    std_col.EXCESSIVE_DRINKING_PREFIX: {"% Excessive Drinking": None},
+    if year is None or int(year) >= 2012:
+        het_to_source_additional_topic_all_to_race_prefix_map[std_col.DIABETES_PREFIX] = {"% diabetic": None}
+
+    if year is None or int(year) >= 2014:
+        het_to_source_additional_topic_all_to_race_prefix_map[std_col.DIABETES_PREFIX] = {"% Diabetic": None}
+
+    if year is None or int(year) >= 2016:
+        het_to_source_additional_topic_all_to_race_prefix_map[std_col.FREQUENT_MENTAL_DISTRESS_PREFIX] = {
+            "% Frequent Mental Distress": None
+        }
+
+    if year is None or int(year) >= 2020:
+        het_to_source_additional_topic_all_to_race_prefix_map[std_col.SUICIDE_PREFIX] = {"Crude Rate": "Suicide Rate"}
+        het_to_source_additional_topic_all_to_race_prefix_map[std_col.DIABETES_PREFIX] = {
+            "% Adults with Diabetes": None
+        }
+
+    if year is None or int(year) >= 2023:
+        het_to_source_additional_topic_all_to_race_prefix_map[std_col.VOTER_PARTICIPATION_PREFIX] = {
+            "% Voter Turnout": None
+        }
+
+    return het_to_source_additional_topic_all_to_race_prefix_map
+
+
+default_source_race_to_id_map = {
+    "(AIAN)": std_col.Race.AIAN_NH.value,
+    "(Asian)": std_col.Race.API_NH.value,
+    "(Black)": std_col.Race.BLACK_NH.value,
+    "(Hispanic)": std_col.Race.HISP.value,
+    "(White)": std_col.Race.WHITE_NH.value,
 }
 
-het_to_source_additional_topic_all_to_race_prefix_map: Dict[str, Dict[str, Optional[str]]] = {
-    std_col.SUICIDE_PREFIX: {"Crude Rate": "Suicide Rate"},
-    std_col.FREQUENT_MENTAL_DISTRESS_PREFIX: {"% Frequent Mental Distress": None},
-    std_col.DIABETES_PREFIX: {"% Adults with Diabetes": None},
-    std_col.VOTER_PARTICIPATION_PREFIX: {"% Voter Turnout": None},
+source_race_few_to_id_map = {
+    "(Black)": std_col.Race.BLACK_NH.value,
+    "(Hispanic)": std_col.Race.HISP.value,
+    "(White)": std_col.Race.WHITE_NH.value,
 }
 
-# frequent mental distress
-source_race_to_id_map = {
+source_race_w_to_id_map = {
     "(AIAN)": std_col.Race.AIAN_NH.value,
     "(Asian)": std_col.Race.API_NH.value,
     "(Black)": std_col.Race.BLACK_NH.value,
     "(Hispanic)": std_col.Race.HISP.value,
-    "(White)": std_col.Race.WHITE_NH.value,
+    "(white)": std_col.Race.WHITE_NH.value,
 }
 
-# suicide 2024
-source_nh_race_code_to_id_map_2024 = {
+source_nh_race_to_id_map = {
     "(Hispanic (all races))": std_col.Race.HISP.value,
     "(Non-Hispanic AIAN)": std_col.Race.AIAN_NH.value,
     "(Non-Hispanic Asian)": std_col.Race.ASIAN_NH.value,
@@ -41,19 +126,25 @@
     "(Non-Hispanic White)": std_col.Race.WHITE_NH.value,
 }
 
-# suicide 2023
-source_race_code_to_id_map_2023 = {
-    "(AIAN)": std_col.Race.AIAN_NH.value,
-    "(Asian)": std_col.Race.ASIAN_NH.value,
-    "(Black)": std_col.Race.BLACK_NH.value,
-    "(Hispanic)": std_col.Race.HISP.value,
-    "(White)": std_col.Race.WHITE_NH.value,
-}
+
+def get_race_map(year: str, sheet_name: str) -> Dict[str, str]:
+    """
+    Returns a dict of CHR source race names to the HET standard race code.
+    Some source years/sheets use inconsistent labels for the column names.
+    """
+    special_race_maps = {
+        ("2019", "Ranked Measure Data"): source_race_few_to_id_map,
+        ("2022", "Ranked Measure Data"): source_race_w_to_id_map,
+        ("2022", "Additional Measure Data"): source_race_w_to_id_map,
+        ("2024", "Additional Measure Data"): source_nh_race_to_id_map,
+    }
+    return special_race_maps.get((year, sheet_name), default_source_race_to_id_map)
 
 
 source_fips_col = "FIPS"
 source_per_100k = "Rate"
 source_pct_rate = "%"
+source_pct_rate_cols_no_symbol = ["Diabetes"]
 
 
 class CHRData(DataSource):
@@ -75,7 +166,7 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs):
 
         dfs = []
 
-        for year in ["2023", "2024"]:
+        for year in CHR_FILE_LOOKUP.keys():
 
             main_sheet_name = "Select Measure Data" if year == "2024" else "Ranked Measure Data"
             main_source_df = get_df_from_chr_excel_sheet(year, main_sheet_name)
@@ -87,14 +178,15 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs):
                 }
             )
 
-            # # drop national and state-level rows
-            year_df = year_df[~year_df[std_col.COUNTY_FIPS_COL].str.endswith("000")]
+            # # drop any national and state-level rows
+            year_df = year_df[~year_df[std_col.COUNTY_FIPS_COL].astype(str).str.endswith("000")]
             melt_map = get_melt_map(year)
             year_df = dataset_utils.melt_to_het_style_df(
                 year_df, std_col.RACE_CATEGORY_ID_COL, [std_col.COUNTY_FIPS_COL], melt_map
             )
             year_df[std_col.STATE_FIPS_COL] = year_df[std_col.COUNTY_FIPS_COL].str[:2]
-            year_df[std_col.TIME_PERIOD_COL] = year
+            year_df.insert(loc=0, column=std_col.TIME_PERIOD_COL, value=year)
+
             dfs.append(year_df)
 
         df = pd.concat(dfs)
@@ -116,8 +208,8 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs):
             float_cols = timeview_float_cols_map[timeview]
             df_for_bq, float_cols = convert_some_pct_rate_to_100k(df, float_cols)
 
-            topic_prefixes = list(het_to_source_select_topic_all_to_race_prefix_map.keys()) + list(
-                het_to_source_additional_topic_all_to_race_prefix_map.keys()
+            topic_prefixes = list(get_het_to_source_select_topic_all_to_race_prefix_map().keys()) + list(
+                get_het_to_source_additional_topic_all_to_race_prefix_map().keys()
             )
             topic_prefixes.append("chr_population")
 
@@ -138,19 +230,16 @@ def get_source_usecols(year: str, sheet_name: str) -> List[str]:
     source_usecols = [source_fips_col]
 
     sheet_topic_map: Dict[str, Dict[str, Optional[str]]] = {}
-    sheet_race_map: Dict[str, str] = {}
+    sheet_race_map = get_race_map(year, sheet_name)
     if sheet_name in ["Ranked Measure Data", "Select Measure Data"]:
-        sheet_topic_map = het_to_source_select_topic_all_to_race_prefix_map
-        sheet_race_map = source_race_to_id_map
+        sheet_topic_map = get_het_to_source_select_topic_all_to_race_prefix_map(year)
     if sheet_name == "Additional Measure Data":
-        sheet_topic_map = het_to_source_additional_topic_all_to_race_prefix_map
-        sheet_race_map = source_nh_race_code_to_id_map_2024 if year == "2024" else source_race_code_to_id_map_2023
+        sheet_topic_map = get_het_to_source_additional_topic_all_to_race_prefix_map(year)
 
     for source_topic_all_to_race_prefix_map in sheet_topic_map.values():
         for source_topic, source_topic_race_prefix in source_topic_all_to_race_prefix_map.items():
             source_usecols.append(source_topic)
 
-            # some topics only have ALLs
             if source_topic_race_prefix is not None:
                 for race_suffix in sheet_race_map.keys():
                     source_usecols.append(f"{source_topic_race_prefix} {race_suffix}")
@@ -167,49 +256,49 @@ def get_melt_map(year: str) -> Dict[str, Dict[str, str]]:
         dict: A nested dict
     """
 
-    race_code_to_id_map = source_nh_race_code_to_id_map_2024 if year == "2024" else source_race_code_to_id_map_2023
-
     melt_map: Dict[str, Dict[str, str]] = {}
-    # each topic get its own sub-mapping
-    for het_prefix, source_all_race_map in het_to_source_select_topic_all_to_race_prefix_map.items():
-        select_topic_melt_map: Dict[str, str] = {}
-        # maps the sources by race topic column name to the needed HET race column values
+
+    ranked_race_code_to_id_map = get_race_map(year, "Ranked Measure Data")
+    for het_prefix, source_all_race_map in get_het_to_source_select_topic_all_to_race_prefix_map(year).items():
+        topic_melt_map: Dict[str, str] = {}
+
         for source_all, source_race_prefix in source_all_race_map.items():
-            select_topic_melt_map[source_all] = std_col.Race.ALL.value
+            topic_melt_map[source_all] = std_col.Race.ALL.value
 
-            # some topics only have ALLs
+            # some topics have by race columns
             if source_race_prefix is not None:
-                for source_race_suffix, het_race_id in source_race_to_id_map.items():
-                    select_topic_melt_map[f"{source_race_prefix} {source_race_suffix}"] = het_race_id
+                for source_race_suffix, het_race_id in ranked_race_code_to_id_map.items():
+                    topic_melt_map[f"{source_race_prefix} {source_race_suffix}"] = het_race_id
 
         # assign 100k or pct_rate as needed
         rate_suffix = ""
         source_all_col = list(source_all_race_map.keys())[0]
         if source_per_100k in source_all_col:
             rate_suffix = std_col.PER_100K_SUFFIX
-        if source_pct_rate in source_all_col:
+        if source_pct_rate in source_all_col or source_all_col in source_pct_rate_cols_no_symbol:
             rate_suffix = std_col.PCT_RATE_SUFFIX
 
         # set this metrics sub melt map
-        melt_map[f"{het_prefix}_{rate_suffix}"] = select_topic_melt_map
+        melt_map[f"{het_prefix}_{rate_suffix}"] = topic_melt_map
 
-    for het_prefix, source_all_race_map in het_to_source_additional_topic_all_to_race_prefix_map.items():
+    additional_race_code_to_id_map = get_race_map(year, "Additional Measure Data")
+    for het_prefix, source_all_race_map in get_het_to_source_additional_topic_all_to_race_prefix_map(year).items():
         additional_topic_melt_map: Dict[str, str] = {}
         # maps the sources by race topic column name to the needed HET race column values
         for source_all, source_race_prefix in source_all_race_map.items():
             additional_topic_melt_map[source_all] = std_col.Race.ALL.value
 
             # some topics only have ALLs
             if source_race_prefix is not None:
-                for source_race_suffix, het_race_id in race_code_to_id_map.items():
+                for source_race_suffix, het_race_id in additional_race_code_to_id_map.items():
                     additional_topic_melt_map[f"{source_race_prefix} {source_race_suffix}"] = het_race_id
 
         # assign 100k or pct_rate as needed
         rate_suffix = ""
         source_all_col = list(source_all_race_map.keys())[0]
         if source_per_100k in source_all_col:
             rate_suffix = std_col.PER_100K_SUFFIX
-        if source_pct_rate in source_all_col:
+        if source_pct_rate in source_all_col or source_all_col in source_pct_rate_cols_no_symbol:
             rate_suffix = std_col.PCT_RATE_SUFFIX
 
         # set this metrics sub melt map
@@ -231,15 +320,15 @@ def get_float_cols() -> Dict[str, List[str]]:
     historical_float_cols = []
 
     # include all numerical columns in the time map
-    all_topics = list(het_to_source_select_topic_all_to_race_prefix_map.keys()) + list(
-        het_to_source_additional_topic_all_to_race_prefix_map.keys()
+    all_topics = list(get_het_to_source_select_topic_all_to_race_prefix_map().keys()) + list(
+        get_het_to_source_additional_topic_all_to_race_prefix_map().keys()
     )
     for topic_prefix in all_topics:
 
         # assign 100k or pct_rate as needed based on the source col name
         source_dict = {
-            **het_to_source_select_topic_all_to_race_prefix_map,
-            **het_to_source_additional_topic_all_to_race_prefix_map,
+            **get_het_to_source_select_topic_all_to_race_prefix_map(),
+            **get_het_to_source_additional_topic_all_to_race_prefix_map(),
         }.get(topic_prefix)
 
         if source_dict is None:
@@ -263,12 +352,7 @@ def get_float_cols() -> Dict[str, List[str]]:
 def get_df_from_chr_excel_sheet(year: str, sheet_name: str) -> pd.DataFrame:
     source_usecols = get_source_usecols(year, sheet_name)
 
-    file_name_lookup = {
-        "2024": "2024_county_health_release_data_-_v1.xlsx",
-        "2023": "2023 County Health Rankings Data - v2.xlsx",
-    }
-
-    file_name = file_name_lookup[year]
+    file_name = CHR_FILE_LOOKUP[year]
 
     return gcs_to_bq_util.load_xlsx_as_df_from_data_dir(
         CHR_DIR,