Merge pull request #105 from UrbanInstitute/version0.0.4

Version0.0.4
UrbanInstitute · Oct 27, 2024 · 4b07296 · 4b07296
2 parents 458f549 + 72eabac
commit 4b07296
Show file tree

Hide file tree

Showing 88 changed files with 4,342 additions and 351 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -1,7 +1,11 @@
+^renv$
+^renv\.lock$
 ^syntheval\.Rproj$
 ^\.Rproj\.user$
 ^LICENSE\.md$
 ^README\.Rmd$
+^README\.qmd$
+^README_files$
 ^data-raw$
 ^test.R
-disriminators.qmd
+^project-standards.md$
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,52 @@
-.Rproj.user
+# History files
 .Rhistory
-.DS_Store
+.Rapp.history
+
+# Mac system file
+.DS_Store
+
+# Session Data files
+.RData
+
+# Example code in package build process
+*-Ex.R
+
+# Output files from R CMD build
+/*.tar.gz
+
+# Output files from R CMD check
+/*.Rcheck/
+
+# RStudio files
+.Rproj.user/
+
+# produced vignettes
+vignettes/*.html
+vignettes/*.pdf
+
+# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
+.httr-oauth
+
+# knitr and R markdown default cache directories
+/*_cache/
+/cache/
+
+# Temporary files created by R markdown
+*.utf8.md
+*.knit.md
+
+# Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
+rsconnect/
+
+# log files
+*.log
+*\.html
+
+# renv environment files
+renv/
+
+# README build files
+README_files/
+
+# Plot outputs from unit tests
+tests/testthat/Rplots.pdf
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,10 +1,16 @@
 Package: syntheval
 Title: A set of tools for evaluating synthetic data utility and disclosure risk
-Version: 0.0.3
+Version: 0.0.4
 Authors@R: c(
-    person(given = "Aaron R.", family = "Williams", 
+    person(given = "Aaron R.", 
+           family = "Williams", 
            email = "[email protected]", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0001-5564-1938")),
+    person(given = "Jeremy",
+           family = "Seeman", 
+           email = "[email protected]",
+           role = "aut",
+           comment = c(ORCID = "0000-0003-3526-3209")),        
     person("Gabe", "Morrison", , "[email protected]", role = "ctb"),
     person("Elyse", "McFalls", , "[email protected]", role = "ctb")
   )
@@ -16,24 +22,31 @@ License: AGPL (>= 3)
 BugReports: https://github.com/UI-Research/syntheval/issues
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.2
 Suggests: 
+    forcats,
+    stringr,
     testthat (>= 3.0.0)
 Config/testthat/edition: 3
 Imports: 
     broom,
     dplyr,
+    ggplot2,
     gower,
+    gridExtra,
     Hmisc,
     magrittr,
     parsnip,
+    pillar,
+    purrr,
     recipes,
     rlang,
     rsample,
     tibble,
     tidyr,
     tidyselect,
     tune,
+    twosamples,
     workflows,
     yardstick
 Suggestions:

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,14 +1,28 @@
 # Generated by roxygen2: do not edit by hand
 
+S3method(print,eval_data)
 export("%>%")
 export(add_discriminator_auc)
 export(add_pmse)
 export(add_pmse_ratio)
 export(add_propensities)
 export(add_propensities_tuned)
 export(add_specks)
+export(aggregate_qid_eval)
+export(convert_na_to_level)
+export(create_cormat_plot)
+export(disc_baseline)
 export(disc_mit)
 export(discrimination)
+export(eval_data)
+export(is_eval_data)
+export(plot_categorical_bar)
+export(plot_cormat)
+export(plot_numeric_hist_kde)
+export(plot_prob_qid_abs_err)
+export(plot_prob_qid_partition)
+export(prep_combined_data_for_na.rm)
+export(prep_discrete_eval_data)
 export(util_ci_overlap)
 export(util_co_occurrence)
 export(util_corr_fit)
@@ -20,4 +34,5 @@ export(util_tails)
 export(util_totals)
 export(weighted_skewness)
 importFrom(magrittr,"%>%")
+importFrom(rlang,":=")
 importFrom(rlang,.data)
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,11 @@
+# syntheval 0.0.4
+
+* Add empirical disclosure risk metrics.
+* Add comparison visualization utilities.
+* Add `na.rm` functionality to most functions to handle `NA` values.
+* Add families to roxygen2 headers.
+* Ensure that all functions return ungrouped output.
+
 # syntheval 0.0.3
 
 * Add a README with examples.

diff --git a/R/add_discriminator_auc.R b/R/add_discriminator_auc.R
@@ -18,7 +18,8 @@ add_discriminator_auc <- function(discrimination, split = TRUE) {
       dplyr::group_by(.data$.sample) %>%
       yardstick::roc_auc(".source_label", ".pred_synthetic") %>%
       dplyr::mutate(.sample = factor(.data$.sample, levels = c("training", "testing"))) %>%
-      dplyr::arrange(.data$.sample)
+      dplyr::arrange(.data$.sample) %>%
+      dplyr::ungroup()
 
   } else {
 

diff --git a/R/co_occurence.R b/R/co_occurence.R
@@ -1,10 +1,11 @@
 #' Construct a co-occurrence matrix
 #'
 #' @param data A tibble with numeric variables
+#' @param na.rm A logical indicating whether to drop pairs with a missing value.
 #'
 #' @return A co-occurrence matrix
 #' 
-co_occurrence <- function(data) {
+co_occurrence <- function(data, na.rm = FALSE) {
 
   # create a vector of variable names
   data_names <- names(data)
@@ -19,8 +20,20 @@ co_occurrence <- function(data) {
 
     for (col_name in data_names) {
 
+      row_var <- dplyr::pull(data, row_name)
+      col_var <- dplyr::pull(data, col_name)
+
+      if (na.rm) {
+
+        # remove missing values
+        na_lgl <- !is.na(row_var) & !is.na(col_var)
+        row_var <- row_var[na_lgl]
+        col_var <- col_var[na_lgl]
+
+      } 
+
       co_occurence_matrix[row_name, col_name] <- 
-        mean(dplyr::pull(data, row_name) != 0 & dplyr::pull(data, col_name) != 0)
+        mean(row_var != 0 & col_var != 0)
 
     }
 

diff --git a/R/data.R b/R/data.R
@@ -0,0 +1,132 @@
+#' American Community Survey confidential microdata
+#'
+#' An extract constructed from the 2019 American Community Survey containing a 
+#' random sample of n = 1000 Nebraska respondents.
+#'
+#' Original data source:
+#' Steven Ruggles, Sarah Flood, Matthew Sobek, Daniel Backman, Annie Chen, 
+#' Grace Cooper, Stephanie Richards, Renae Rogers, and Megan Schouweiler. 
+#' IPUMS USA: Version 15.0 \[dataset\]. Minneapolis, MN: IPUMS, 2024. 
+#' https://doi.org/10.18128/D010.V15.0
+#'
+#' @format ## `acs_conf`
+#' A data frame with 1,000 rows and 11 columns:
+#' \describe{
+#'   \item{county}{fct, county}
+#'   \item{gq}{fct, group quarters type}
+#'   \item{sex}{fct, sex}
+#'   \item{marst}{fct, marital status}
+#'   \item{hcovany}{fct, health insurance status}
+#'   \item{empstat}{fct, employment status}
+#'   \item{classwkr}{fct, employment kind (e.g., self-employed)}
+#'   \item{age}{dbl, age (in years)}
+#'   \item{famsize}{dbl, household/family size}
+#'   \item{transit_time}{dbl, transit time to work (in minutes)}
+#'   \item{inctot}{dbl, annual income}
+#' }
+#' @source <https://usa.ipums.org/usa/>
+"acs_conf"
+
+#' American Community Survey holdout microdata
+#'
+#' An extract constructed from the 2019 American Community Survey containing a 
+#' random sample of n = 1000 Nebraska respondents. This sample is distinct from 
+#' `acs_conf` and is not used in producing the synthetic data available in this 
+#' package.
+#'
+#' Original data source:
+#' Steven Ruggles, Sarah Flood, Matthew Sobek, Daniel Backman, Annie Chen, 
+#' Grace Cooper, Stephanie Richards, Renae Rogers, and Megan Schouweiler. 
+#' IPUMS USA: Version 15.0 \[dataset\]. Minneapolis, MN: IPUMS, 2024. 
+#' https://doi.org/10.18128/D010.V15.0
+#'
+#' @format ## `acs_holdout`
+#' A data frame with 1,000 rows and 11 columns:
+#' \describe{
+#'   \item{county}{fct, county}
+#'   \item{gq}{fct, group quarters type}
+#'   \item{sex}{fct, sex}
+#'   \item{marst}{fct, marital status}
+#'   \item{hcovany}{fct, health insurance status}
+#'   \item{empstat}{fct, employment status}
+#'   \item{classwkr}{fct, employment kind (e.g., self-employed)}
+#'   \item{age}{dbl, age (in years)}
+#'   \item{famsize}{dbl, household/family size}
+#'   \item{transit_time}{dbl, transit time to work (in minutes)}
+#'   \item{inctot}{dbl, annual income}
+#' }
+#' @source <https://usa.ipums.org/usa/>
+"acs_holdout"
+
+#' American Community Survey lower-risk synthetic data
+#'
+#' A list of 30 samples of synthetic data derived from `acs_conf`, 
+#' generated using noise infusion for both categorical and numeric random variables. 
+#' These are referred to as "lower-risk" relative to the "higher-risk" synthetic data
+#' also available in this package; the synthetic data is purely for testing purposes.
+#' 
+#' Categorical random variables are selected by resampling from a mixture of the 
+#' original multivariate cell proportions and a uniform distribution. Numeric 
+#' random variables are first modelled using regression trees, and discrete 
+#' two-sided geometric noise is then added to each newly sampled value. 
+#'
+#' Original data source:
+#' Steven Ruggles, Sarah Flood, Matthew Sobek, Daniel Backman, Annie Chen, 
+#' Grace Cooper, Stephanie Richards, Renae Rogers, and Megan Schouweiler. 
+#' IPUMS USA: Version 15.0 \[dataset\]. Minneapolis, MN: IPUMS, 2024. 
+#' https://doi.org/10.18128/D010.V15.0
+#'
+#' @format ## `acs_lr_synths`
+#' A list of 30 data frames with 1,000 rows and 11 columns:
+#' \describe{
+#'   \item{county}{fct, county}
+#'   \item{gq}{fct, group quarters type}
+#'   \item{sex}{fct, sex}
+#'   \item{marst}{fct, marital status}
+#'   \item{hcovany}{fct, health insurance status}
+#'   \item{empstat}{fct, employment status}
+#'   \item{classwkr}{fct, employment kind (e.g., self-employed)}
+#'   \item{age}{dbl, age (in years)}
+#'   \item{famsize}{dbl, household/family size}
+#'   \item{transit_time}{dbl, transit time to work (in minutes)}
+#'   \item{inctot}{dbl, annual income}
+#' }
+#' @source <https://usa.ipums.org/usa/>
+"acs_lr_synths"
+
+
+#' American Community Survey higher-risk synthetic data
+#'
+#' A list of 30 samples of partial synthetic data derived from `acs_conf`, 
+#' generated using models that intentionally overfit to the confidential data. 
+#' These are referred to as "higher-risk" relative to the "lower-risk" synthetic 
+#' data also available in this package; the synthetic data is purely for testing purposes.
+#' 
+#' Categorical variables are primarily kept "as-is" in this partially synthetic data,
+#' with a small proportion of categorical records resampled from the data. Numeric
+#' variables are resampled from decision tree models that are intentionally designed
+#' to overfit to the confidential data.
+#' 
+#' Original data source:
+#' Steven Ruggles, Sarah Flood, Matthew Sobek, Daniel Backman, Annie Chen, 
+#' Grace Cooper, Stephanie Richards, Renae Rogers, and Megan Schouweiler. 
+#' IPUMS USA: Version 15.0 \[dataset\]. Minneapolis, MN: IPUMS, 2024. 
+#' https://doi.org/10.18128/D010.V15.0
+#'
+#' @format ## `acs_hr_synths`
+#' A list of 30 data frames with 1,000 rows and 11 columns:
+#' \describe{
+#'   \item{county}{fct, county}
+#'   \item{gq}{fct, group quarters type}
+#'   \item{sex}{fct, sex}
+#'   \item{marst}{fct, marital status}
+#'   \item{hcovany}{fct, health insurance status}
+#'   \item{empstat}{fct, employment status}
+#'   \item{classwkr}{fct, employment kind (e.g., self-employed)}
+#'   \item{age}{dbl, age (in years)}
+#'   \item{famsize}{dbl, household/family size}
+#'   \item{transit_time}{dbl, transit time to work (in minutes)}
+#'   \item{inctot}{dbl, annual income}
+#' }
+#' @source <https://usa.ipums.org/usa/>
+"acs_hr_synths"