diff --git a/.Rbuildignore b/.Rbuildignore
index 7c7e6da..2be7410 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -1,7 +1,11 @@
+^renv$
+^renv\.lock$
^syntheval\.Rproj$
^\.Rproj\.user$
^LICENSE\.md$
^README\.Rmd$
+^README\.qmd$
+^README_files$
^data-raw$
^test.R
-disriminators.qmd
\ No newline at end of file
+^project-standards.md$
diff --git a/.gitignore b/.gitignore
index 6957ee4..5b26526 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,52 @@
-.Rproj.user
+# History files
.Rhistory
-.DS_Store
\ No newline at end of file
+.Rapp.history
+
+# Mac system file
+.DS_Store
+
+# Session Data files
+.RData
+
+# Example code in package build process
+*-Ex.R
+
+# Output files from R CMD build
+/*.tar.gz
+
+# Output files from R CMD check
+/*.Rcheck/
+
+# RStudio files
+.Rproj.user/
+
+# produced vignettes
+vignettes/*.html
+vignettes/*.pdf
+
+# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
+.httr-oauth
+
+# knitr and R markdown default cache directories
+/*_cache/
+/cache/
+
+# Temporary files created by R markdown
+*.utf8.md
+*.knit.md
+
+# Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
+rsconnect/
+
+# log files
+*.log
+
+# HTML output files
+*.html
+
+# renv environment files
+renv/
+
+# README build files
+README_files/
+
+# Plot outputs from unit tests
+tests/testthat/Rplots.pdf
diff --git a/DESCRIPTION b/DESCRIPTION
index fff2beb..0db6a7c 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,10 +1,16 @@
Package: syntheval
Title: A set of tools for evaluating synthetic data utility and disclosure risk
-Version: 0.0.3
+Version: 0.0.4
Authors@R: c(
- person(given = "Aaron R.", family = "Williams",
+ person(given = "Aaron R.",
+ family = "Williams",
email = "awilliams@urban.org", role = c("aut", "cre"),
comment = c(ORCID = "0000-0001-5564-1938")),
+ person(given = "Jeremy",
+ family = "Seeman",
+ email = "jseeman@urban.org",
+ role = "aut",
+ comment = c(ORCID = "0000-0003-3526-3209")),
person("Gabe", "Morrison", , "gmorrison@urban.org", role = "ctb"),
person("Elyse", "McFalls", , "emcfalls@urban.org", role = "ctb")
)
@@ -16,17 +22,23 @@ License: AGPL (>= 3)
BugReports: https://github.com/UI-Research/syntheval/issues
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.2
Suggests:
+ forcats,
+ stringr,
testthat (>= 3.0.0)
Config/testthat/edition: 3
Imports:
broom,
dplyr,
+ ggplot2,
gower,
+ gridExtra,
Hmisc,
magrittr,
parsnip,
+ pillar,
+ purrr,
recipes,
rlang,
rsample,
@@ -34,6 +46,7 @@ Imports:
tidyr,
tidyselect,
tune,
+ twosamples,
workflows,
yardstick
Suggestions:
diff --git a/NAMESPACE b/NAMESPACE
index 222f974..da8de60 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand
+S3method(print,eval_data)
export("%>%")
export(add_discriminator_auc)
export(add_pmse)
@@ -7,8 +8,21 @@ export(add_pmse_ratio)
export(add_propensities)
export(add_propensities_tuned)
export(add_specks)
+export(aggregate_qid_eval)
+export(convert_na_to_level)
+export(create_cormat_plot)
+export(disc_baseline)
export(disc_mit)
export(discrimination)
+export(eval_data)
+export(is_eval_data)
+export(plot_categorical_bar)
+export(plot_cormat)
+export(plot_numeric_hist_kde)
+export(plot_prob_qid_abs_err)
+export(plot_prob_qid_partition)
+export(prep_combined_data_for_na.rm)
+export(prep_discrete_eval_data)
export(util_ci_overlap)
export(util_co_occurrence)
export(util_corr_fit)
@@ -20,4 +34,5 @@ export(util_tails)
export(util_totals)
export(weighted_skewness)
importFrom(magrittr,"%>%")
+importFrom(rlang,":=")
importFrom(rlang,.data)
diff --git a/NEWS.md b/NEWS.md
index 4812b60..80549e8 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,11 @@
+# syntheval 0.0.4
+
+* Add empirical disclosure risk metrics.
+* Add comparison visualization utilities.
+* Add `na.rm` functionality to most functions to handle `NA` values.
+* Add families to roxygen2 headers.
+* Ensure that all functions return ungrouped output.
+
# syntheval 0.0.3
* Add a README with examples.
diff --git a/R/add_discriminator_auc.R b/R/add_discriminator_auc.R
index cc8e213..3fc1de9 100644
--- a/R/add_discriminator_auc.R
+++ b/R/add_discriminator_auc.R
@@ -18,7 +18,8 @@ add_discriminator_auc <- function(discrimination, split = TRUE) {
dplyr::group_by(.data$.sample) %>%
yardstick::roc_auc(".source_label", ".pred_synthetic") %>%
dplyr::mutate(.sample = factor(.data$.sample, levels = c("training", "testing"))) %>%
- dplyr::arrange(.data$.sample)
+ dplyr::arrange(.data$.sample) %>%
+ dplyr::ungroup()
} else {
diff --git a/R/co_occurence.R b/R/co_occurence.R
index 70dbf47..316c49c 100644
--- a/R/co_occurence.R
+++ b/R/co_occurence.R
@@ -1,10 +1,11 @@
#' Construct a co-occurrence matrix
#'
#' @param data A tibble with numeric variables
+#' @param na.rm a logical indicating whether missing values should be removed.
#'
#' @return A co-occurrence matrix
#'
-co_occurrence <- function(data) {
+co_occurrence <- function(data, na.rm = FALSE) {
# create a vector of variable names
data_names <- names(data)
@@ -19,8 +20,20 @@ co_occurrence <- function(data) {
for (col_name in data_names) {
+ row_var <- dplyr::pull(data, row_name)
+ col_var <- dplyr::pull(data, col_name)
+
+ if (na.rm) {
+
+ # remove missing values
+ na_lgl <- !is.na(row_var) & !is.na(col_var)
+ row_var <- row_var[na_lgl]
+ col_var <- col_var[na_lgl]
+
+ }
+
co_occurence_matrix[row_name, col_name] <-
- mean(dplyr::pull(data, row_name) != 0 & dplyr::pull(data, col_name) != 0)
+ mean(row_var != 0 & col_var != 0)
}
diff --git a/R/data.R b/R/data.R
new file mode 100644
index 0000000..72eae02
--- /dev/null
+++ b/R/data.R
@@ -0,0 +1,132 @@
+#' American Community Survey confidential microdata
+#'
+#' An extract constructed from the 2019 American Community Survey containing a
+#' random sample of n = 1000 Nebraska respondents.
+#'
+#' Original data source:
+#' Steven Ruggles, Sarah Flood, Matthew Sobek, Daniel Backman, Annie Chen,
+#' Grace Cooper, Stephanie Richards, Renae Rogers, and Megan Schouweiler.
+#' IPUMS USA: Version 15.0 \[dataset\]. Minneapolis, MN: IPUMS, 2024.
+#' https://doi.org/10.18128/D010.V15.0
+#'
+#' @format ## `acs_conf`
+#' A data frame with 1,000 rows and 11 columns:
+#' \describe{
+#' \item{county}{fct, county}
+#' \item{gq}{fct, group quarter kind}
+#' \item{sex}{fct, sex}
+#' \item{marst}{fct, marital status}
+#' \item{hcovany}{fct, health insurance status}
+#' \item{empstat}{fct, employment status}
+#'   \item{classwkr}{fct, employment kind (e.g., self-employed)}
+#' \item{age}{dbl, age (in years)}
+#' \item{famsize}{dbl, household/family size}
+#' \item{transit_time}{dbl, transit time to work (in minutes)}
+#' \item{inctot}{dbl, annual income}
+#' }
+#' @source \url{https://doi.org/10.18128/D010.V15.0}
+"acs_conf"
+
+#' American Community Survey holdout microdata
+#'
+#' An extract constructed from the 2019 American Community Survey containing a
+#' random sample of n = 1000 Nebraska respondents. This sample is distinct from
+#' `acs_conf` and is not used in producing the synthetic data available in this
+#' package.
+#'
+#' Original data source:
+#' Steven Ruggles, Sarah Flood, Matthew Sobek, Daniel Backman, Annie Chen,
+#' Grace Cooper, Stephanie Richards, Renae Rogers, and Megan Schouweiler.
+#' IPUMS USA: Version 15.0 \[dataset\]. Minneapolis, MN: IPUMS, 2024.
+#' https://doi.org/10.18128/D010.V15.0
+#'
+#' @format ## `acs_holdout`
+#' A data frame with 1,000 rows and 11 columns:
+#' \describe{
+#' \item{county}{fct, county}
+#' \item{gq}{fct, group quarter kind}
+#' \item{sex}{fct, sex}
+#' \item{marst}{fct, marital status}
+#' \item{hcovany}{fct, health insurance status}
+#' \item{empstat}{fct, employment status}
+#'   \item{classwkr}{fct, employment kind (e.g., self-employed)}
+#' \item{age}{dbl, age (in years)}
+#' \item{famsize}{dbl, household/family size}
+#' \item{transit_time}{dbl, transit time to work (in minutes)}
+#' \item{inctot}{dbl, annual income}
+#' }
+#' @source \url{https://doi.org/10.18128/D010.V15.0}
+"acs_holdout"
+
+#' American Community Survey lower-risk synthetic data
+#'
+#' A list of 30 samples of synthetic data derived from `acs_conf`,
+#' generated using noise infusion for both categorical and numeric random variables.
+#' These are referred to as "lower-risk" relative to the "higher-risk" synthetic data
+#' also available in this package; the synthetic data is purely for testing purposes.
+#'
+#' Categorical random variables are resampled from a mixture of the original
+#' multivariate cell proportions and a uniform distribution. Numeric random
+#' variables are first modelled using regression trees, and new sampled values
+#' each have additional discrete two-sided geometric noise added to them.
+#'
+#' Original data source:
+#' Steven Ruggles, Sarah Flood, Matthew Sobek, Daniel Backman, Annie Chen,
+#' Grace Cooper, Stephanie Richards, Renae Rogers, and Megan Schouweiler.
+#' IPUMS USA: Version 15.0 \[dataset\]. Minneapolis, MN: IPUMS, 2024.
+#' https://doi.org/10.18128/D010.V15.0
+#'
+#' @format ## `acs_lr_synths`
+#' A list of 30 data frames with 1,000 rows and 11 columns:
+#' \describe{
+#' \item{county}{fct, county}
+#' \item{gq}{fct, group quarter kind}
+#' \item{sex}{fct, sex}
+#' \item{marst}{fct, marital status}
+#' \item{hcovany}{fct, health insurance status}
+#' \item{empstat}{fct, employment status}
+#'   \item{classwkr}{fct, employment kind (e.g., self-employed)}
+#' \item{age}{dbl, age (in years)}
+#' \item{famsize}{dbl, household/family size}
+#' \item{transit_time}{dbl, transit time to work (in minutes)}
+#' \item{inctot}{dbl, annual income}
+#' }
+#' @source \url{https://doi.org/10.18128/D010.V15.0}
+"acs_lr_synths"
+
+
+#' American Community Survey higher-risk synthetic data
+#'
+#' A list of 30 samples of partial synthetic data derived from `acs_conf`,
+#' generated using models that intentionally overfit to the confidential data.
+#' These are referred to as "higher-risk" relative to the "lower-risk" synthetic
+#' data also available in this package; the synthetic data is purely for testing purposes.
+#'
+#' Categorical variables are primarily kept "as-is" in this partially synthetic data,
+#' with a small proportion of categorical records resampled from the data. Numeric
+#' variables are resampled from decision tree models that are intentionally designed
+#' to overfit to the confidential data.
+#'
+#' Original data source:
+#' Steven Ruggles, Sarah Flood, Matthew Sobek, Daniel Backman, Annie Chen,
+#' Grace Cooper, Stephanie Richards, Renae Rogers, and Megan Schouweiler.
+#' IPUMS USA: Version 15.0 \[dataset\]. Minneapolis, MN: IPUMS, 2024.
+#' https://doi.org/10.18128/D010.V15.0
+#'
+#' @format ## `acs_hr_synths`
+#' A list of 30 data frames with 1,000 rows and 11 columns:
+#' \describe{
+#' \item{county}{fct, county}
+#' \item{gq}{fct, group quarter kind}
+#' \item{sex}{fct, sex}
+#' \item{marst}{fct, marital status}
+#' \item{hcovany}{fct, health insurance status}
+#' \item{empstat}{fct, employment status}
+#'   \item{classwkr}{fct, employment kind (e.g., self-employed)}
+#' \item{age}{dbl, age (in years)}
+#' \item{famsize}{dbl, household/family size}
+#' \item{transit_time}{dbl, transit time to work (in minutes)}
+#' \item{inctot}{dbl, annual income}
+#' }
+#' @source \url{https://doi.org/10.18128/D010.V15.0}
+"acs_hr_synths"
\ No newline at end of file
diff --git a/R/disc_baseline.R b/R/disc_baseline.R
new file mode 100644
index 0000000..506a3a7
--- /dev/null
+++ b/R/disc_baseline.R
@@ -0,0 +1,252 @@
+#'
+#' Compute baseline disclosure risk metrics using confidential data.
+#'
+#' @param eval_data An `eval_data` object or a tibble / data.frame corresponding
+#' to the confidential data.
+#' @param qid_keys A character vector of quasi-identifying keys. Must be provided
+#' as `factor` type variables. Defaults to all factor variables in `eval_data`.
+#' @param sens_keys An optional character vector of sensitive variables of interest.
+#' Must be disjoint from `qid_keys`, or `FALSE` to provide no l-diversity or t-closeness
+#' metrics. Defaults to the complement of `qid_keys`.
+#' @param tclose_metric_cat A string describing the t-closeness distance metric
+#' between proportions of categorical variables. One of `"l1"` (L1 distance),
+#' `"l2"` (L2 distance), or `"linf"` (L-infinity distance), defaults to `"linf"`.
+#' @param tclose_metric_numeric A string describing the t-closeness distance metric
+#' between numeric variable empirical CDFs. One of `"ks"` (Kolmogorov-Smirnov),
+#' `"wass"` (Wasserstein L1 distance), `"cvm"` (Cramer-von-Mises distance), or
+#' `"ad"` (Anderson-Darling), defaults, to `"ks"`.
+#' @param na.rm A Boolean; if TRUE, removes `NA` values from numeric `sens_keys`.
+#'
+#' @returns A `baseline_metrics` object.
+#'
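+#' @examples
+#' # A minimal sketch using the packaged ACS extract: treat two factor
+#' # variables as quasi-identifiers and skip the l-diversity / t-closeness
+#' # step by setting sens_keys = FALSE.
+#' disc_baseline(acs_conf, qid_keys = c("county", "gq"), sens_keys = FALSE)
+#'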
+#' @export
+#'
+disc_baseline <- function(
+ eval_data,
+ qid_keys = NULL,
+ sens_keys = NULL,
+ tclose_metric_cat = c("linf", "l1", "l2"),
+ tclose_metric_numeric = c("ks", "wass", "cvm", "ad"),
+ na.rm = FALSE
+ ) {
+
+ # default argument parsing
+ tclose_metric_cat <- match.arg(tclose_metric_cat)
+ tclose_metric_numeric <- match.arg(tclose_metric_numeric)
+
+ # either use provided data.frame as-is or extract it from eval_data
+ conf_data <- if (is_eval_data(eval_data)) eval_data$conf_data else eval_data
+ stopifnot(is.data.frame(conf_data))
+
+ conf_data_types <- unlist(purrr::map(conf_data, pillar::type_sum))
+
+  # if no qid_keys provided, use all factor columns in conf_data
+ if (is.null(qid_keys)) {
+
+ qid_keys <- names(conf_data)[conf_data_types == "fct"]
+
+ }
+
+ # require factor types for qid_keys
+ stopifnot(length(qid_keys) > 0)
+ stopifnot(all(conf_data_types[qid_keys] == "fct"))
+
+ # construct unique QID combinations
+ qid_agg <- .aggregate_qid(conf_data, keys = qid_keys)
+ qid_metrics <- tidyr::pivot_longer(
+ qid_agg,
+ -dplyr::one_of(c("key_id", qid_keys)),
+ names_to = "metric"
+ )
+
+ # return result early if not computing l-diversity or t-closeness metrics
+ if (identical(sens_keys, FALSE)) {
+
+ return(
+
+ list(
+ "qid_keys" = qid_keys,
+ "qid_metrics" = qid_metrics,
+ "sens_keys" = NULL,
+ "sens_metrics" = NULL
+ ) %>%
+ structure(class = "baseline_metrics")
+
+ )
+
+ }
+
+ # if no sensitive keys provided use the complement of qid_keys
+ if (is.null(sens_keys)) {
+
+ sens_keys <- setdiff(x = names(conf_data), y = qid_keys)
+
+ }
+
+ # ensure no overlap between qid_keys and sens_keys
+ stopifnot(length(intersect(sens_keys, qid_keys)) == 0)
+
+ # map sensitive keys to metrics
+ sens_key_types <- unlist(conf_data_types[sens_keys])
+
+ # calculate global distribution statistics
+ complete_dist_stats <- purrr::map(
+ .x = sens_keys,
+ .f = \(x) {
+
+ # for factor variables...
+ if (sens_key_types[[x]] == "fct") {
+
+ return(
+ # use the proportion of values in each factor level
+ c(table(conf_data[[x]], exclude=NULL)) /
+ nrow(conf_data)
+ )
+
+ } else {
+
+      if (!na.rm && any(is.na(conf_data[[x]]))) {
+
+ warning(
+ paste(
+ "NA values will not be included in t-closeness calculations but",
+ "will be part of sample size calculations for:",
+ x
+ )
+ )
+
+ }
+ # use observed values to construct empirical CDF
+ return(if (na.rm) stats::na.omit(conf_data[[x]]) else conf_data[[x]])
+
+ }
+
+ }
+ )
+ names(complete_dist_stats) <- sens_keys
+
+ # use unique, consistent QID key names constructed from .aggregate_qid
+ conf_w_key_ids <- conf_data %>%
+ dplyr::inner_join(
+ qid_agg %>%
+ dplyr::select(dplyr::all_of(c(qid_keys, "key_id"))),
+ by = qid_keys
+ )
+
+ # define per-metrics function per grouped data frame and key name
+ per_qik_metrics <- function(gdf, names) {
+
+ # calculate distinct l-diversity across each sensitive key column
+ l_div <- gdf %>%
+ dplyr::summarise(
+ dplyr::across(dplyr::all_of(sens_keys), dplyr::n_distinct)
+ )
+
+ # define t-closeness function (defined here because it depends on
+ # `gdf` and `names`)
+ t_closeness_func <- function(x) {
+
+ if (sens_key_types[dplyr::cur_column()] == "fct") {
+
+ # for factor variables, calculate per-level probabilities
+ qid_prop <- c(table(x, exclude=NULL)) / nrow(gdf)
+ complete_prop <- complete_dist_stats[[dplyr::cur_column()]]
+
+ # calculate appropriate distance between probability vectors
+ return(
+ dplyr::case_when(
+ tclose_metric_cat == "l1" ~ (
+ sum(abs(qid_prop - complete_prop))
+ ),
+ tclose_metric_cat == "l2" ~ (
+ sqrt(sum((qid_prop - complete_prop)**2))
+ ),
+ # default is L-infinity (maximum difference)
+ TRUE ~ (
+ max(abs(qid_prop - complete_prop))
+ )
+ )
+ )
+
+ } else {
+
+ # for numerics, first collect samples for each column
+ qid_samps <- dplyr::pull(gdf, dplyr::cur_column())
+ complete_samps <- complete_dist_stats[[dplyr::cur_column()]]
+
+ return(
+
+ # calculate appropriate distance between empirical CDFs
+ dplyr::case_when(
+ tclose_metric_numeric == "wass" ~ (
+ twosamples::wass_stat(qid_samps, complete_samps)
+ ),
+ tclose_metric_numeric == "cvm" ~ (
+ twosamples::cvm_stat(qid_samps, complete_samps)
+ ),
+ tclose_metric_numeric == "ad" ~ (
+ twosamples::ad_stat(qid_samps, complete_samps)
+ ),
+ # default is Kolmogorov-Smirnov
+ TRUE ~ (
+ twosamples::ks_stat(qid_samps, complete_samps)
+ )
+ )
+
+ )
+
+ }
+
+ }
+
+ # calculate t-closeness according to the specified metric
+ t_close <- gdf %>%
+ dplyr::summarise(
+ dplyr::across(dplyr::all_of(sens_keys), t_closeness_func))
+
+ return(
+ data.frame(
+ "l_div" = l_div,
+ "t_close" = t_close,
+ "key_id" = as.numeric(gdf[1, "key_id"])
+ )
+ )
+
+ }
+
+ # for observed keys, calculate additional metrics
+ sens_metrics <- conf_w_key_ids %>%
+ dplyr::group_by(
+ dplyr::across(dplyr::all_of(qid_keys))
+ ) %>%
+ dplyr::group_map(per_qik_metrics) %>%
+ dplyr::bind_rows()
+
+ # join together k-anonymity and l-diversity / t-closeness metrics...
+ joined_sens_metrics <- dplyr::inner_join(
+ qid_agg %>% # first, select k-anonymity results with QID keys
+ dplyr::select(dplyr::all_of(c("key_id", qid_keys, "raw_n", "prop"))),
+ # next, create rows for each key_id per metric and sensitive variable
+ tidyr::pivot_longer(
+ sens_metrics,
+ # pivot longer on all variables except key_id
+ -dplyr::one_of("key_id"),
+      # break up metrics and associated variables (e.g., "t_close.sens_col1")
+ names_sep = "\\.",
+ names_to = c("metric", "sens_var")
+ ),
+ by = "key_id"
+ )
+
+ result <- list(
+ "qid_keys" = qid_keys,
+ "qid_metrics" = qid_metrics,
+ "sens_keys" = sens_keys,
+ "sens_metrics" = joined_sens_metrics
+ ) %>%
+ structure(class = "baseline_metrics")
+
+ return(result)
+
+}
+
diff --git a/R/disc_dd_helper.R b/R/disc_dd_helper.R
new file mode 100644
index 0000000..6e4034d
--- /dev/null
+++ b/R/disc_dd_helper.R
@@ -0,0 +1,316 @@
+#'
+#' Aggregate and count factor-level quasi-identifiers.
+#'
+#' @param df A data.frame.
+#' @param keys A character vector of column names.
+#'
+#' @return A data.frame of aggregated quasi-identifiers.
+#'
+.aggregate_qid <- function(df, keys) {
+
+ return(
+ df %>%
+ dplyr::group_by(
+ dplyr::across(dplyr::all_of(keys)),
+ .drop=FALSE
+ ) %>%
+ dplyr::summarise(raw_n = dplyr::n()) %>%
+ dplyr::ungroup() %>%
+ dplyr::mutate(prop = .data$raw_n / nrow(df)) %>%
+ dplyr::arrange(!!!rlang::syms(keys)) %>%
+ tibble::remove_rownames() %>%
+ tibble::rowid_to_column("key_id")
+ )
+
+}
+
+
+#'
+#' Input validation for categorical disclosure risk metrics.
+#'
+#' @param eval_data An `eval_data` object.
+#' @param keys A character vector of column names, or NULL to use all column names.
+#'
+#' @return `0` if all validation checks pass.
+#'
+.validate_eval_keys <- function(eval_data, keys = NULL) {
+
+ # check eval_data components
+ stopifnot(is_eval_data(eval_data))
+
+ # determine if using inferred keys or user-specified keys
+ if (is.null(keys)) {
+
+ keys <- names(eval_data$conf_data)
+
+ } else {
+
+ stopifnot(all(keys %in% names(eval_data$conf_data)))
+
+ }
+
+ # check all confidential data keys present in synthetic data
+ if (eval_data$n_rep > 1) {
+
+ for (ix in seq(eval_data$n_rep)) {
+
+ stopifnot(all(keys %in% names(eval_data$synth_data[[ix]])))
+
+ }
+
+ } else {
+
+ stopifnot(all(keys %in% names(eval_data$synth_data)))
+
+ }
+
+ if (!is.null(eval_data$holdout_data)) {
+
+ stopifnot(all(keys %in% names(eval_data$holdout_data)))
+
+ }
+
+ # check factor levels in confidential and synthetic data
+ for (key in keys) {
+
+ if (eval_data$n_rep > 1) {
+
+ for (ix in seq(eval_data$n_rep)) {
+
+ if (!identical(
+ levels(dplyr::pull(eval_data$conf_data, key)),
+ levels(dplyr::pull(eval_data$synth_data[[ix]], key))
+ )) {
+
+ stop(
+ "Key {key} has mismatched levels in confidential and synthetic data"
+ )
+
+ }
+
+ }
+
+ } else {
+
+ if (!identical(
+ levels(dplyr::pull(eval_data$conf_data, key)),
+ levels(dplyr::pull(eval_data$synth_data, key))
+ )) {
+
+ stop(
+ "Key {key} has mismatched levels in confidential and synthetic data"
+ )
+
+ }
+
+ }
+
+ if (!is.null(eval_data$holdout_data)) {
+
+ if (!identical(
+ levels(dplyr::pull(eval_data$conf_data, key)),
+ levels(dplyr::pull(eval_data$holdout_data, key))
+ )) {
+
+ stop(
+ "Key {key} has mismatched levels in confidential and holdout data"
+ )
+
+ }
+
+ }
+
+ }
+
+ # return 0 if all checks passed
+ return(0)
+
+}
+
+#'
+#' Discretize numeric columns for disclosure risk metrics
+#'
+#' @param eval_data An `eval_data` object.
+#' @param col_map A list mapping each column to its discretization parameters, one
+#' of either "k" (for specifying the total number of categories) or "width" (for
+#' specifying the width of each bin)
+#'
+#' @return An `eval_data` object.
+#'
+#' @examples
+#'
+#' prep_discrete_eval_data(
+#' eval_data(acs_conf, acs_lr_synths),
+#' col_map = list(
+#' "age" = list("k" = 10),
+#' "inctot" = list("width" = 10000)
+#' )
+#' )
+#'
+#' @export
+#'
+prep_discrete_eval_data <- function(eval_data, col_map) {
+
+ stopifnot(is_eval_data(eval_data))
+
+ eval_data_disc <- eval_data
+
+ for (col in names(col_map)) {
+
+ stopifnot(any(c("k", "width") %in% names(col_map[[col]])))
+
+ # get the observed extrema from all eval_data components, starting with
+ # the confidential data
+ col_min_candidates <- c(
+ min(dplyr::pull(eval_data$conf_data, col), na.rm = TRUE)
+ )
+ col_max_candidates <- c(
+ max(dplyr::pull(eval_data$conf_data, col), na.rm = TRUE)
+ )
+
+ # if using a single synthetic data replicate, add extrema to candidate list
+ if (eval_data$n_rep == 1) {
+
+ col_min_candidates <- c(
+ col_min_candidates,
+ min(dplyr::pull(eval_data$synth_data, col), na.rm = TRUE)
+ )
+
+ col_max_candidates <- c(
+ col_max_candidates,
+ max(dplyr::pull(eval_data$synth_data, col), na.rm = TRUE)
+ )
+
+ # else add column-wise extrema to candidate list across replicates
+ } else {
+
+ col_min_candidates <- c(
+ col_min_candidates,
+ min(
+ purrr::map_dbl(
+ .x = eval_data$synth_data,
+ .f = ~ min(dplyr::pull(.x, col), na.rm = TRUE)
+ )
+ )
+ )
+
+ col_max_candidates <- c(
+ col_max_candidates,
+ max(
+ purrr::map_dbl(
+ .x = eval_data$synth_data,
+ .f = ~ max(dplyr::pull(.x, col), na.rm = TRUE)
+ )
+ )
+ )
+
+ }
+
+ # if holdout data is supplied, add to candidates
+ if (!is.null(eval_data$holdout_data)) {
+
+ col_min_candidates <- c(
+ col_min_candidates,
+ min(dplyr::pull(eval_data$holdout_data, col), na.rm = TRUE)
+ )
+
+ col_max_candidates <- c(
+ col_max_candidates,
+ max(dplyr::pull(eval_data$holdout_data, col), na.rm = TRUE)
+ )
+
+ }
+
+ col_min <- min(col_min_candidates, na.rm = TRUE)
+ col_max <- max(col_max_candidates, na.rm = TRUE)
+
+ # apply col_map parameters to calculate discretization bounds
+ if ("k" %in% names(col_map[[col]])) {
+
+ breaks <- seq(
+ from = col_min,
+ to = col_max,
+ length.out = col_map[[col]][["k"]] + 1
+ )
+
+ } else {
+
+ breaks <- seq(
+ from = col_min,
+ to = col_max + col_map[[col]][["width"]],
+ by = col_map[[col]][["width"]]
+ )
+
+ }
+
+ # define mutate function
+ discretize_col <- function(data, col_name, breaks) {
+
+ # apply non-NA breaks
+ non_na_cut <- base::cut(
+ x = dplyr::pull(data, col_name),
+ breaks = breaks,
+ include.lowest = TRUE
+ )
+
+ # reintroduce NA as a factor level
+
+ return(
+ data %>%
+ dplyr::mutate(
+ {{col_name}} := factor(
+ non_na_cut,
+ levels = c(NA, levels(non_na_cut)),
+ exclude = NULL
+ )
+ )
+ )
+
+ }
+
+ # apply bounds to all relevant columns
+ eval_data_disc$conf_data <- discretize_col(
+ data = eval_data_disc$conf_data,
+ col_name = col,
+ breaks = breaks
+ )
+
+ if (eval_data_disc$n_rep == 1) {
+
+ eval_data_disc$synth_data <- discretize_col(
+ data = eval_data_disc$synth_data,
+ col_name = col,
+ breaks = breaks
+ )
+
+ } else {
+
+ for (ix in seq(eval_data_disc$n_rep)) {
+
+ eval_data_disc$synth_data[[ix]] <- discretize_col(
+ data = eval_data_disc$synth_data[[ix]],
+ col_name = col,
+ breaks = breaks
+ )
+
+ }
+
+ }
+
+ if (!is.null(eval_data_disc$holdout_data)) {
+
+ eval_data_disc$holdout_data <- discretize_col(
+ data = eval_data_disc$holdout_data,
+ col_name = col,
+ breaks = breaks
+ )
+
+ }
+
+ }
+
+ return(eval_data_disc)
+
+}
+
+
diff --git a/R/disc_mit.R b/R/disc_mit.R
index 0c4a3c3..70ad6cc 100644
--- a/R/disc_mit.R
+++ b/R/disc_mit.R
@@ -1,7 +1,7 @@
-#' Run a membership inference test
+#' Perform a nearest-neighbor membership inference test on one synthetic dataset
#'
-#' @param postsynth A postsynth object or tibble with synthetic data generated from the data input
-#' @param data A data frame with a subset of the original data
+#' @param synth_data A data frame with synthetic data generated from the confidential data
+#' @param conf_data A data frame with a subset of the original data
#' @param holdout_data A dataframe with observations similar to the original but
#' not used to train the synthesizer. The data should have the same variables as
#' postsynth.
@@ -9,24 +9,19 @@
#' percentile will be predicted as in the training data. If the
#' threshold_percentile is not provided, the function calculates it with the
#' following formula: `nrow(data)/(nrow(data) + nrow(holdout_data))`
+#' @param summary A Boolean; if TRUE, returns summary statistics; if FALSE, returns
+#' two disaggregated data frames of individual distances and ROC curve points.
#'
-#' @return A list with precision, recall, the confusion matrix, and ROC AUC
-#'
-#' @family Disclosure risk metrics
-#'
-#' @export
+#' @return Either a list with precision, recall, the confusion matrix, and ROC AUC,
+#' or a list with two disaggregated data frames (if `summary = FALSE`).
#'
-disc_mit <- function(postsynth, data, holdout_data, threshold_percentile = NULL) {
-
- if (is_postsynth(postsynth)) {
-
- synthetic_data <- postsynth$synthetic_data
-
- } else {
-
- synthetic_data <- postsynth
-
- }
+nn_membership_inference <- function(
+ synth_data,
+ conf_data,
+ holdout_data,
+ threshold_percentile = NULL,
+ summary = TRUE
+ ) {
# calculate threshold percentile for when the data are imbalanced
if (!is.null(threshold_percentile)) {
@@ -40,13 +35,13 @@ disc_mit <- function(postsynth, data, holdout_data, threshold_percentile = NULL)
} else {
- threshold_percentile <- nrow(data) / (nrow(data) + nrow(holdout_data))
+ threshold_percentile <- nrow(conf_data) / (nrow(conf_data) + nrow(holdout_data))
}
-
+
# combine records from the training data and holdout data
blended_data <- dplyr::bind_rows(
- training = data,
+ training = conf_data,
holdout = holdout_data,
.id = "source"
) %>%
@@ -56,7 +51,7 @@ disc_mit <- function(postsynth, data, holdout_data, threshold_percentile = NULL)
# record in the synthetic data
distances <- gower::gower_topn(
x = dplyr::select(blended_data, -source),
- y = synthetic_data,
+ y = synth_data,
n = 1
)
@@ -70,19 +65,188 @@ disc_mit <- function(postsynth, data, holdout_data, threshold_percentile = NULL)
blended_data <- dplyr::bind_cols(
blended_data,
- prediction = prediction,
- pseudo_probability = pseudo_probabilities
+ distance = distances$distance[1, ],
+ pseudo_probability = pseudo_probabilities,
+ prediction = prediction
) %>%
dplyr::mutate(prediction = factor(prediction, levels = c("training", "holdout")))
- # calculate metrics
- membership_metrics <- list(
- precision = yardstick::precision(blended_data, truth = source, estimate = prediction)$.estimate,
- recall = yardstick::recall(blended_data, truth = source, estimate = prediction)$.estimate,
- auc = yardstick::roc_auc_vec(truth = blended_data$source, estimate = blended_data$pseudo_probability),
- conf_mat = yardstick::conf_mat(blended_data, truth = source, estimate = prediction)
- )
+ if (summary) {
+
+ # calculate metrics
+ membership_metrics <- list(
+ precision = yardstick::precision(blended_data, truth = source, estimate = prediction)$.estimate,
+ recall = yardstick::recall(blended_data, truth = source, estimate = prediction)$.estimate,
+ auc = yardstick::roc_auc_vec(truth = blended_data$source, estimate = blended_data$pseudo_probability),
+ conf_mat = yardstick::conf_mat(blended_data, truth = source, estimate = prediction)
+ )
+
+ return(membership_metrics)
+
+ } else {
+
+ # calculate complete ROC
+ roc <- yardstick::roc_curve(
+ data = blended_data,
+ truth = tidyselect::all_of("source"),
+ tidyselect::all_of("pseudo_probability")
+ )
+
+ return(
+ list(
+ "results" = blended_data,
+ "roc" = roc
+ )
+ )
+
+ }
+
+}
- return(membership_metrics)
+#' Run a nearest-neighbor membership inference test
+#'
+#' @param eval_data An `eval_data` object.
+#' @param threshold_percentile Distances below the value associated with this
+#' percentile will be predicted as in the training data. If the
+#' threshold_percentile is not provided, the function calculates it with the
+#' following formula: `nrow(conf_data) / (nrow(conf_data) + nrow(holdout_data))`
+#' @param summary A Boolean; if TRUE, returns summary statistics; if FALSE, returns
+#' two disaggregated data frames of individual distances and ROC curve points.
+#'
+#' @return Either a list with precision, recall, the confusion matrix, and ROC AUC,
+#' or a list with two disaggregated data frames (if `summary = FALSE`).
+#'
+#' @family Disclosure risk metrics
+#'
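+#' @examples
+#' # A minimal sketch assuming the packaged ACS extracts: one synthetic
+#' # replicate evaluated against the confidential and holdout samples.
+#' ev <- eval_data(acs_conf, acs_lr_synths[[1]], holdout_data = acs_holdout)
+#' disc_mit(ev)
+#'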
+#' @export
+#'
+disc_mit <- function(eval_data,
+ threshold_percentile = NULL,
+ summary = TRUE) {
-}
\ No newline at end of file
+ # if single replicate supplied
+ if (eval_data[["n_rep"]] == 1) {
+
+ return(
+ nn_membership_inference(
+ synth_data = eval_data[["synth_data"]],
+ conf_data = eval_data[["conf_data"]],
+ holdout_data = eval_data[["holdout_data"]],
+ threshold_percentile = threshold_percentile,
+ summary = summary
+ )
+ )
+
+ # if multiple replicates supplied
+ } else {
+
+ # calculate threshold percentile for when the data are imbalanced
+ if (!is.null(threshold_percentile)) {
+
+ # test the threshold percentile
+ if (threshold_percentile < 0 || threshold_percentile > 1) {
+
+ stop("error: threshold_percentile must be in [0, 1]")
+
+ }
+
+ } else {
+
+ threshold_percentile <- (
+ nrow(eval_data[["conf_data"]]) / (
+ nrow(eval_data[["conf_data"]]) + nrow(eval_data[["holdout_data"]])
+ )
+ )
+
+ }
+
+ # concatenate synthetic data and add synthesis id
+ synths <- purrr::imap(
+ .x = eval_data[["synth_data"]],
+ .f = \(x, idx) {
+ dplyr::mutate(x, synth_id = idx)
+ }
+ )
+
+ conf_data_id <- eval_data[["conf_data"]] %>%
+ tibble::rowid_to_column("nn_mi_id")
+
+ holdout_data_id <- eval_data[["holdout_data"]] %>%
+ tibble::rowid_to_column("nn_mi_id")
+
+ blended_data <- dplyr::bind_rows(
+ training = conf_data_id,
+ holdout = holdout_data_id,
+ .id = "source"
+ ) %>%
+ dplyr::mutate(source = factor(source, levels = c("training", "holdout")))
+
+ # for each record in the blended data, calculate the distance to the closest
+ # record in each synthetic dataset
+ synth_distances <- purrr::map(
+ .x = synths,
+ .f = \(.x) {
+
+ gower::gower_topn(
+ x = dplyr::select(blended_data, -source),
+ y = .x,
+ n = 1
+ )$distance[1, ]
+
+ }
+ )
+
+ all_distances <- purrr::reduce(
+ .x = synth_distances,
+ .f = c
+ )
+
+ pseudo_probabilities <- 1 - (all_distances / max(all_distances))
+
+ # convert distances into predictions for if the record from the blended data
+ # was used to train the synthetic data
+ threshold <- stats::quantile(all_distances, probs = threshold_percentile)
+ prediction <- ifelse(all_distances <= threshold, "training", "holdout")
+
+ blended_data <- dplyr::bind_cols(
+ dplyr::bind_rows(rep(list(blended_data), eval_data[["n_rep"]])),
+ distance = all_distances,
+ pseudo_probability = pseudo_probabilities,
+ prediction = prediction
+ ) %>%
+ dplyr::mutate(prediction = factor(prediction, levels = c("training", "holdout")))
+
+ if (summary) {
+
+ # calculate metrics
+ membership_metrics <- list(
+ precision = yardstick::precision(blended_data, truth = source, estimate = prediction)$.estimate,
+ recall = yardstick::recall(blended_data, truth = source, estimate = prediction)$.estimate,
+ auc = yardstick::roc_auc_vec(truth = blended_data$source, estimate = blended_data$pseudo_probability),
+ conf_mat = yardstick::conf_mat(blended_data, truth = source, estimate = prediction)
+ )
+
+ return(membership_metrics)
+
+ } else {
+
+ # calculate complete ROC
+ roc <- yardstick::roc_curve(
+ data = blended_data,
+ truth = tidyselect::all_of("source"),
+ tidyselect::all_of("pseudo_probability")
+ )
+
+ return(
+ list(
+ "results" = blended_data,
+ "roc" = roc
+ )
+ )
+
+ }
+
+ }
+
+}
+
+
diff --git a/R/disc_qid_mi.R b/R/disc_qid_mi.R
new file mode 100644
index 0000000..3e41347
--- /dev/null
+++ b/R/disc_qid_mi.R
@@ -0,0 +1,180 @@
+#'
+#' Aggregate and count factor-level quasi-identifiers for `eval_data`
+#'
+#' @param eval_data An `eval_data` object.
+#' @param keys A character vector of quasi-identifier column names
+#'
+#' @returns A data.frame with per-key counts and proportions for the
+#' confidential, synthetic, and (if provided) holdout data, plus selection
+#' indicators and proportion error metrics.
+#'
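+#' @examples
+#' # A minimal sketch assuming the packaged multi-replicate synthetic data:
+#' ev <- eval_data(acs_conf, acs_lr_synths, holdout_data = acs_holdout)
+#' aggregate_qid_eval(ev, keys = c("county", "gq"))
+#'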
+#' @export
+#'
+aggregate_qid_eval <- function(eval_data, keys) {
+
+ # check to make sure multiple replicates provided
+ stopifnot(is_eval_data(eval_data))
+ stopifnot(eval_data$n_rep > 1)
+
+ # aggregate quasi-identifiers
+ conf_agg <- .aggregate_qid(eval_data$conf_data, keys)
+
+ synth_aggs <- purrr::map(
+ .x = eval_data$synth_data,
+ .f = \(x) { .aggregate_qid(x, keys) }
+ )
+
+ # join on key_id, which is consistently constructed from .aggregate_qid
+ result <- dplyr::inner_join(
+ conf_agg,
+ # apply quasi-identifier aggregation to each row
+ dplyr::bind_rows(
+ synth_aggs,
+ .id = "synth_id"
+ ) %>%
+ # only select outputs from aggregation and key_id to prevent duplicates
+ dplyr::select(dplyr::all_of(c("synth_id", "key_id", "raw_n", "prop"))),
+ by = "key_id",
+ suffix = c("_conf", "_synth")
+ ) %>%
+ # add indicators for whether keys were "selected" and proportion error metrics
+ dplyr::mutate(
+ s_synth = (.data[["raw_n_synth"]] > 0),
+ s_conf = (.data[["raw_n_conf"]] > 0),
+ prop_err_conf = .data[["prop_synth"]] - .data[["prop_conf"]],
+ prop_abserr_conf = abs(.data[["prop_err_conf"]])
+ )
+
+ # if no holdout data provided, end the process
+ if (is.null(eval_data$holdout_data)) {
+
+ return(result)
+
+ } else {
+
+ # else, aggregate holdout data
+ holdout_agg <- .aggregate_qid(eval_data$holdout_data, keys)
+ return(
+ result %>%
+ dplyr::inner_join(
+ holdout_agg %>%
+ # select only non-duplicate statistics
+ dplyr::select(dplyr::all_of(c("key_id", "raw_n", "prop"))) %>%
+ # explicitly rename count and proportion statistics to refer to
+          # holdout data to prevent duplicate suffixes on existing results
+ dplyr::rename(
+ "raw_n_holdout" = rlang::sym("raw_n"),
+ "prop_holdout" = rlang::sym("prop")
+ ),
+ by = "key_id"
+ ) %>%
+ # add indicators for whether keys were "selected" and proportion error metrics
+ dplyr::mutate(
+ s_holdout = (.data[["raw_n_holdout"]] > 0),
+ prop_err_holdout = .data[["prop_synth"]] - .data[["prop_holdout"]],
+ prop_abserr_holdout = abs(.data[["prop_err_holdout"]])
+ )
+ )
+
+ }
+}
+
+
+#'
+#' Plot partition selection probabilities
+#'
+#' @param agg_eval_data Output from `aggregate_qid_eval`
+#' @param keys A character vector of column names
+#' @param max_k The largest confidential quasi-identifier count to include in the plot
+#'
+#' @returns A `ggplot2` plot.
+#'
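+#' @examples
+#' # A minimal sketch building on `aggregate_qid_eval()` output:
+#' ev <- eval_data(acs_conf, acs_lr_synths)
+#' agg <- aggregate_qid_eval(ev, keys = c("county", "gq"))
+#' plot_prob_qid_partition(agg, keys = c("county", "gq"))
+#'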
+#' @export
+#'
+plot_prob_qid_partition <- function(
+ agg_eval_data,
+ keys,
+ max_k = 20) {
+
+ # out of all possible quasi-identifiers...
+ prob_plot_data <- agg_eval_data %>%
+ dplyr::group_by(
+ dplyr::across(dplyr::all_of(keys)),
+ .drop = FALSE
+ ) %>%
+ # compute the proportion of synthetic replicates that select at least one
+ # quasi-identifying key
+ dplyr::summarise(
+ raw_n_conf = mean(.data[["raw_n_conf"]]),
+ s_synth = mean(.data[["s_synth"]])
+ ) %>%
+ # filter to those with at most max_k confidential observations
+ dplyr::filter(.data[["raw_n_conf"]] <= max_k)
+
+ plot_result <- ggplot2::ggplot(prob_plot_data) +
+ # create one boxplot for count of confidential observations <= max_k
+ ggplot2::geom_boxplot(
+ ggplot2::aes(
+ x = factor(.data[["raw_n_conf"]]),
+ y = .data[["s_synth"]]
+ )
+ ) +
+ ggplot2::xlab("n_orig") +
+ ggplot2::ylab("Estimated Prob(n_synth > 0)") +
+ ggplot2::ggtitle(paste("Quasi-ID keys: ", paste(keys, collapse=", ")))
+
+ return(plot_result)
+
+}
+
+
+#'
+#' Plot absolute error probabilities
+#'
+#' @param agg_eval_data Output from `aggregate_qid_eval`
+#' @param keys A character vector of column names
+#' @param max_k The largest confidential quasi-identifier count to include in the plot
+#' @param probs Quantiles at which to summarize the absolute proportion errors
+#' @param holdout A Boolean; if TRUE, compute errors relative to the holdout data
+#' instead of the confidential data
+#'
+#' @returns A `ggplot2` plot.
+#'
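+#' @examples
+#' # A minimal sketch building on `aggregate_qid_eval()` output:
+#' ev <- eval_data(acs_conf, acs_lr_synths)
+#' agg <- aggregate_qid_eval(ev, keys = c("county", "gq"))
+#' plot_prob_qid_abs_err(agg, keys = c("county", "gq"))
+#'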
+#' @export
+#'
+plot_prob_qid_abs_err <- function(
+ agg_eval_data,
+ keys,
+ max_k = 20,
+ probs = c(0.5, 0.75, 0.9),
+ holdout = FALSE) {
+
+ err_var <- if (holdout) "prop_abserr_holdout" else "prop_abserr_conf"
+
+ qtile_data <- agg_eval_data %>%
+ # for each confidential count value...
+ dplyr::group_by(dplyr::across(dplyr::all_of("raw_n_conf"))) %>%
+ dplyr::reframe(
+ # create one row per quantile probability and value
+ qtile_value = stats::quantile(.data[[err_var]], probs = probs),
+ qtile = paste(probs * 100, "%ile", sep="")
+ ) %>%
+ # restrict plotted data where confidential observations <= max_k
+ dplyr::filter(.data[["raw_n_conf"]] <= max_k)
+
+ plot_result <- ggplot2::ggplot(
+ data = qtile_data,
+ # create one line plot per quantile
+ mapping = ggplot2::aes(
+ x = .data[["raw_n_conf"]],
+ y = .data[["qtile_value"]],
+ color = .data[["qtile"]]
+ )
+ ) +
+ ggplot2::geom_point() +
+ ggplot2::geom_line() +
+ ggplot2::xlab("n_orig") +
+ ggplot2::ylab("Estimated Quantile for |n_synth - n_conf|") +
+ ggplot2::ggtitle(paste("Quasi-ID keys: ", paste(keys, collapse=", ")))
+
+ return(plot_result)
+
+}
diff --git a/R/eval_data.R b/R/eval_data.R
new file mode 100644
index 0000000..facc73c
--- /dev/null
+++ b/R/eval_data.R
@@ -0,0 +1,143 @@
+#' Create evaluation data container
+#'
+#' @param conf_data A confidential dataframe
+#' @param synth_data A single (or list of) dataframe(s) or `postsynth` object(s).
+#' @param holdout_data An optional holdout dataframe containing the same columns
+#' as the confidential dataframe
+#'
+#' @return An `eval_data` object.
+#'
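+#' @examples
+#' # A minimal sketch using the packaged ACS extracts: a single synthetic
+#' # replicate with a holdout, and a list of replicates.
+#' eval_data(acs_conf, acs_lr_synths[[1]], holdout_data = acs_holdout)
+#' eval_data(acs_conf, acs_lr_synths)
+#'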
+#' @export
+#'
+eval_data <- function(conf_data, synth_data, holdout_data = NULL) {
+
+ stopifnot(inherits(conf_data, "data.frame"))
+
+ if (!is.null(holdout_data)) {
+
+ # check holdout data is dataframe
+ stopifnot(inherits(holdout_data, "data.frame"))
+
+ }
+
+ # single replicate logic
+ if (is_postsynth(synth_data)) {
+
+ synth_data <- synth_data[["synthetic_data"]]
+ n_rep <- 1
+
+ } else if (inherits(synth_data, "data.frame")) {
+
+ n_rep <- 1
+
+ } else {
+
+ stopifnot("list" %in% class(synth_data))
+
+ # multiple replicate logic
+ n_rep <- length(synth_data)
+
+ if (inherits(synth_data[[1]], "postsynth")) {
+
+ stopifnot(
+ all(
+ purrr::map_lgl(
+ .x = synth_data,
+ .f = ~ inherits(.x, "postsynth")
+ )
+ )
+ )
+
+ synth_data <- purrr::map(
+ .x = synth_data,
+ .f = ~ .x[["synthetic_data"]]
+ )
+
+ } else {
+
+ stopifnot(
+ all(
+ purrr::map_lgl(
+ .x = synth_data,
+ .f = ~ inherits(.x, "data.frame")
+ )
+ )
+ )
+
+ }
+
+ }
+
+ eval_data <- list(
+ conf_data = conf_data,
+ synth_data = synth_data,
+ holdout_data = holdout_data,
+ n_rep = n_rep
+ )
+
+ eval_data <- structure(eval_data, class = "eval_data")
+
+ return(eval_data)
+
+}
+
+#'
+#' Check if object is `eval_data`
+#'
+#' @param x object
+#'
+#' @return A boolean
+#'
+#' @export
+is_eval_data <- function(x) {
+ inherits(x, "eval_data")
+}
+
+#' @export
+print.eval_data <- function(x, ...) {
+
+ cat("Evaluation Data \n")
+ cat("Confidential Data: ",
+ dim(x$conf_data)[1],
+ " rows x ",
+ dim(x$conf_data)[2],
+ " columns \n")
+
+ cat("Synthetic Data: ",
+ x$n_rep,
+ " replicate(s) \n")
+
+ if (x$n_rep == 1) {
+
+ cat(
+ " ",
+ dim(x$synth_data)[1],
+ " rows x ",
+ dim(x$synth_data)[2],
+ " columns \n"
+ )
+
+ } else {
+
+ cat(
+ " First synthetic dataset: ",
+ dim(x$synth_data[[1]])[1],
+ " rows x ",
+ dim(x$synth_data[[1]])[2],
+ " columns \n"
+ )
+
+ }
+
+
+ cat("Holdout Data: ",
+ dim(x$holdout_data)[1],
+ " rows x ",
+ dim(x$holdout_data)[2],
+ " columns \n")
+ }
+
+ invisible(x)
+
+}
diff --git a/R/syntheval-package.R b/R/syntheval-package.R
index 52c0c07..9b057b3 100644
--- a/R/syntheval-package.R
+++ b/R/syntheval-package.R
@@ -3,5 +3,6 @@
## usethis namespace: start
#' @importFrom rlang .data
+#' @importFrom rlang :=
## usethis namespace: end
NULL
diff --git a/R/util_ci_overlap.R b/R/util_ci_overlap.R
index 8d34019..903c131 100644
--- a/R/util_ci_overlap.R
+++ b/R/util_ci_overlap.R
@@ -4,8 +4,33 @@
#' @param data A data frame with the original data
#' @param formula A formula for a linear regression model
#'
-#' @return A list with the regression confidence interval overlap and estimated
-#' coefficients
+#' @return A list of two data frames:
+#' * `ci_overlap`: one row per model parameter with utility metrics:
+#'     * `overlap`: symmetric overlap metric, calculated as the average of the
+#'       interval overlap contained in the synthetic confidence interval and the
+#'       interval overlap contained in the confidential confidence interval.
+#'     * `coef_diff`: the synthetic parameter estimate minus the confidential
+#'       parameter estimate.
+#'     * `std_coef_diff`: `coef_diff` divided by the standard error estimated
+#'       from the confidential data.
+#'     * `sign_match`: a Boolean for whether the synthetic and confidential
+#'       parameter estimates have the same sign.
+#'     * `significance_match`: a Boolean for whether the null hypothesis tests
+#'       that the parameter is 0 (at p < .05) agree in the confidential and
+#'       synthetic data.
+#'     * `ss`: a Boolean for whether both `sign_match` and `significance_match` are true.
+#'     * `sso`: a Boolean for whether `sign_match` is true and `overlap` is positive.
+#' * `coefficient`: one row per model parameter and data source (confidential or
+#'   synthetic) listing parameter estimates, standard errors, test statistics,
+#'   p-values for null hypothesis tests, and 95% confidence interval bounds.
+#'
+#' @examples
+#' conf_data <- mtcars
+#' synth_data <- mtcars %>%
+#' dplyr::slice_sample(n = nrow(mtcars) / 2)
+#'
+#' util_ci_overlap(
+#'   postsynth = synth_data,
+#'   data = conf_data,
+#'   formula = mpg ~ disp + vs + am
+#' )
#'
#' @family Utility metrics
#'
@@ -74,4 +99,4 @@ util_ci_overlap <- function(postsynth, data, formula) {
coefficient = coefficients
)
-}
\ No newline at end of file
+}
diff --git a/R/util_co_ocurrence.R b/R/util_co_ocurrence.R
index be12018..87ae2e4 100644
--- a/R/util_co_ocurrence.R
+++ b/R/util_co_ocurrence.R
@@ -1,7 +1,10 @@
#' Calculate the co-occurrence fit metric of a confidential data set.
#'
-#' @param postsynth A postsynth object from tidysynthesis or a tibble
+#' @param postsynth a postsynth object from tidysynthesis or a tibble
#' @param data an original (observed) data set.
+#' @param na.rm a logical indicating whether missing values should be removed.
+#' Note: values are jointly removed for each pair of variables even if only one
+#' value is missing.
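+#'
+#' @examples
+#' # A minimal sketch with toy data (hypothetical values) showing joint
+#' # removal: row 3 is dropped for the (a, b) pair because b is missing.
+#' toy <- dplyr::tibble(a = c(1, 0, 2), b = c(0, 1, NA))
+#' util_co_occurrence(toy, toy, na.rm = TRUE)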
#'
#' @return A `list` of fit metrics:
#' - `co_occurrence_original`: co-occurrence matrix of the original data.
@@ -19,7 +22,7 @@
#'
#' @export
#'
-util_co_occurrence <- function(postsynth, data) {
+util_co_occurrence <- function(postsynth, data, na.rm = FALSE) {
if (is_postsynth(postsynth)) {
@@ -44,7 +47,7 @@ util_co_occurrence <- function(postsynth, data) {
co_occurrence_matrix <-
x %>%
dplyr::select_if(is.numeric) %>%
- co_occurrence()
+ co_occurrence(na.rm = na.rm)
# set the values in the upper triangle to zero to avoid double counting
co_occurrence_matrix[upper.tri(co_occurrence_matrix, diag = TRUE)] <- NA
diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R
index 0e393ea..5c21209 100644
--- a/R/util_corr_fit.R
+++ b/R/util_corr_fit.R
@@ -2,6 +2,10 @@
#'
#' @param postsynth A postsynth object from tidysynthesis or a tibble
#' @param data an original (observed) data set.
+#' @param use optional character string giving a method for computing
+#' covariances in the presence of missing values. This must be (an abbreviation
+#' of) one of the strings "everything", "all.obs", "complete.obs",
+#' "na.or.complete", or "pairwise.complete.obs".
#'
#' @return A `list` of fit metrics:
#' - `correlation_original`: correlation matrix of the original data.
@@ -16,7 +20,7 @@
#'
#' @export
-util_corr_fit <- function(postsynth, data) {
+util_corr_fit <- function(postsynth, data, use = "everything") {
if (is_postsynth(postsynth)) {
@@ -35,13 +39,13 @@ util_corr_fit <- function(postsynth, data) {
data <- dplyr::select(data, names(synthetic_data))
# helper function to find a correlation matrix with the upper tri set to zeros
- lower_triangle <- function(x) {
+ lower_triangle <- function(x, use) {
# find the linear correlation matrix of numeric variables from a data set
correlation_matrix <-
x %>%
dplyr::select_if(is.numeric) %>%
- stats::cor()
+ stats::cor(use = use)
# set the values in the upper triangle to zero to avoid double counting
correlation_matrix[upper.tri(correlation_matrix, diag = TRUE)] <- NA
@@ -50,10 +54,10 @@ util_corr_fit <- function(postsynth, data) {
}
# find the lower triangle of the original data linear correlation matrix
- original_lt <- lower_triangle(data)
+ original_lt <- lower_triangle(data, use = use)
# find the lower triangle of the synthetic data linear correlation matrix
- synthetic_lt <- lower_triangle(synthetic_data)
+ synthetic_lt <- lower_triangle(synthetic_data, use = use)
# compare names
if (any(rownames(original_lt) != rownames(synthetic_lt))) {
diff --git a/R/util_ks_distance.R b/R/util_ks_distance.R
index b2894fc..19a787a 100644
--- a/R/util_ks_distance.R
+++ b/R/util_ks_distance.R
@@ -1,8 +1,9 @@
#' Calculate the Kolmogorov-Smirnov distance (D) for each numeric variable in
#' the synthetic and confidential data
#'
-#' @param postsynth A postsynth object or tibble with synthetic data
-#' @param data A data frame with the original data
+#' @param postsynth a postsynth object or tibble with synthetic data
+#' @param data a data frame with the original data
+#' @param na.rm a logical indicating whether missing values should be removed.
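+#'
+#' @examples
+#' # A minimal sketch with toy data (hypothetical values): missing values
+#' # are dropped per variable before computing the eCDFs.
+#' synth <- dplyr::tibble(x = c(1.1, 2.3, NA))
+#' conf <- dplyr::tibble(x = c(1.0, 2.5, 3.2))
+#' util_ks_distance(synth, conf, na.rm = TRUE)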
#'
#' @return A tibble with the D and location of the largest distance for each
#' numeric variable
@@ -11,7 +12,7 @@
#'
#' @export
#'
-util_ks_distance <- function(postsynth, data) {
+util_ks_distance <- function(postsynth, data, na.rm = FALSE) {
if ("postsynth" %in% class(postsynth)) {
@@ -55,20 +56,31 @@ util_ks_distance <- function(postsynth, data) {
names(distances) <- variables
for (var in variables) {
+ var_synth <- dplyr::pull(synthetic_data, var)
+ var_data <- dplyr::pull(data, var)
+
+ # drop missing values
+ if (na.rm) {
+
+ var_synth <- var_synth[!is.na(var_synth)]
+ var_data <- var_data[!is.na(var_data)]
+
+ }
+
# find the eCDFs for both variables
- ecdf_synth <- stats::ecdf(dplyr::pull(synthetic_data, var))
- ecdf_orig <- stats::ecdf(dplyr::pull(data, var))
+ ecdf_synth <- stats::ecdf(var_synth)
+ ecdf_orig <- stats::ecdf(var_data)
# calculate the minimum and maximum across both variables
- minimum <- min(c(dplyr::pull(synthetic_data, var), dplyr::pull(data, var)))
- maximum <- max(c(dplyr::pull(synthetic_data, var), dplyr::pull(data, var)))
+ minimum <- min(c(var_synth, var_data))
+ maximum <- max(c(var_synth, var_data))
# create a grid of values for calculating the distances between the two
# eCDFs
z <- seq(
from = minimum,
to = maximum,
- length.out = min(nrow(synthetic_data), nrow(data), 10000)
+ length.out = min(length(var_synth), length(var_data), 10000)
)
# for each variable, find D and the location of D
diff --git a/R/util_moments.R b/R/util_moments.R
index c68dc17..0a047db 100644
--- a/R/util_moments.R
+++ b/R/util_moments.R
@@ -7,6 +7,7 @@
#' @param drop_zeros A logical for if zeros should be dropped
#' @param common_vars A logical for if only common variables should be kept
#' @param synth_vars A logical for if only synthesized variables should be kept
+#' @param na.rm A logical for ignoring `NA` values in computations.
#'
#' @return A `tibble` of summary statistics.
#'
@@ -20,7 +21,8 @@ util_moments <- function(postsynth,
group_by = NULL,
drop_zeros = FALSE,
common_vars = TRUE,
- synth_vars = TRUE) {
+ synth_vars = TRUE,
+ na.rm = FALSE) {
# catch binding error
. <- NULL
@@ -77,14 +79,15 @@ util_moments <- function(postsynth,
`synthetic` = synthetic_data,
.id = "source"
)
-
- na.rm_toggle <- FALSE
- if (drop_zeros) {
-
- combined_data[combined_data == 0] <- NA
- na.rm_toggle <- TRUE
-
- }
+
+ # prep data for NA handling
+ combined_data <- prep_combined_data_for_na.rm(
+ combined_data,
+ na.rm = na.rm,
+ drop_zeros = drop_zeros,
+ drop_zeros_exclude = group_by
+ )
+ na.rm_flag <- (na.rm | drop_zeros)
# calculate summary statistics
summary_stats <- combined_data %>%
@@ -94,11 +97,11 @@ util_moments <- function(postsynth,
dplyr::across(
.cols = -".temp_weight",
.fns = list(
- count = ~ sum((. != 0) * .data$.temp_weight, na.rm = na.rm_toggle),
- mean = ~ stats::weighted.mean(x = ., w = .data$.temp_weight, na.rm = na.rm_toggle),
- sd = ~ weighted_sd(x = ., w = .data$.temp_weight, na.rm = na.rm_toggle),
- skewness = ~ weighted_skewness(x = ., w = .data$.temp_weight, na.rm = na.rm_toggle),
- kurtosis = ~ weighted_kurtosis(x = ., w = .data$.temp_weight, na.rm = na.rm_toggle)
+ count = ~ sum((. != 0) * .data$.temp_weight, na.rm = na.rm_flag),
+ mean = ~ stats::weighted.mean(x = ., w = .data$.temp_weight, na.rm = na.rm_flag),
+ sd = ~ weighted_sd(x = ., w = .data$.temp_weight, na.rm = na.rm_flag),
+ skewness = ~ weighted_skewness(x = ., w = .data$.temp_weight, na.rm = na.rm_flag),
+ kurtosis = ~ weighted_kurtosis(x = ., w = .data$.temp_weight, na.rm = na.rm_flag)
)
)
) %>%
diff --git a/R/util_na_helper.R b/R/util_na_helper.R
new file mode 100644
index 0000000..c0e3bb7
--- /dev/null
+++ b/R/util_na_helper.R
@@ -0,0 +1,130 @@
+#'
+#' Check whether na.rm is compatible with univariate utility metrics
+#'
+#' @param combined_data A data frame or tibble
+#' @param na.rm A boolean for whether to ignore missing values
+#' @param drop_zeros A boolean for whether to ignore zero values in utility metrics
+#' @param drop_zeros_exclude An optional character vector of column names to exclude from zero-dropping
+#'
+#' @return A data frame or tibble with missing and/or zero values set to NA
+#'
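+#' @examples
+#' # A minimal sketch with toy data (hypothetical values): zeros become NA
+#' # everywhere except the excluded grouping column.
+#' toy <- dplyr::tibble(group = c("a", "b"), x = c(0, 1.5))
+#' prep_combined_data_for_na.rm(
+#'   toy,
+#'   drop_zeros = TRUE,
+#'   drop_zeros_exclude = "group"
+#' )
+#'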
+#' @export
+#'
+prep_combined_data_for_na.rm <- function(
+ combined_data,
+ na.rm = FALSE,
+ drop_zeros = FALSE,
+ drop_zeros_exclude = NULL) {
+
+  # message if missing values are present
+  if (!na.rm) {
+
+ na_cols <- combined_data %>%
+ purrr::map_lgl(.f = ~ any(is.na(.x)))
+
+ if (any(na_cols)) {
+
+ message(
+ paste(
+ "Some variables contain missing data: ",
+ paste(names(combined_data)[na_cols], collapse=", ")
+ )
+ )
+
+ # stop if drop_zeros incompatible with keeping NAs
+ if (drop_zeros) {
+
+ stop("Cannot set na.rm == FALSE and drop_zeros == TRUE with missing data")
+
+ }
+
+ }
+
+ }
+
+ if (drop_zeros) {
+
+ if (is.null(drop_zeros_exclude)) {
+
+ combined_data[combined_data == 0] <- NA
+
+    } else {
+
+ combined_data <- combined_data %>%
+ dplyr::mutate(
+ dplyr::across(
+ -dplyr::all_of(drop_zeros_exclude),
+ \(x) {
+ dplyr::if_else(x == 0, NA, x)
+ }
+ )
+ )
+
+ }
+
+ }
+
+ return(combined_data)
+
+}
+
+
+#'
+#' Convert `NA` values to `"NA"` for categorical variables
+#'
+#' @param data A data frame or tibble
+#'
+#' @return A data frame or tibble with `NA` converted to `"NA"`
+#'
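+#' @examples
+#' # A minimal sketch with a toy factor column containing a missing value:
+#' toy <- dplyr::tibble(x = factor(c("a", NA, "b")))
+#' convert_na_to_level(toy)
+#'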
+#' @export
+#'
+convert_na_to_level <- function(data) {
+
+ na_to_level <- function(x) {
+
+ # do nothing if x isn't a character or factor
+ if (!pillar::type_sum(x) %in% c("chr", "ord", "fct")) return(x)
+
+ # test if NA is already a level
+ if (sum(x == "NA", na.rm = TRUE) > 0) {
+ stop("ERROR: can't convert NA to 'NA' because 'NA' already exists")
+ }
+
+ # replace `NA` with `"NA"`
+ if (all(!is.na(x))) {
+
+ return(x)
+
+ } else if (pillar::type_sum(x) == "chr") {
+
+ x <- tidyr::replace_na(data = x, replace = "NA")
+
+ } else if (pillar::type_sum(x) %in% c("ord", "fct")) {
+
+ ordinal_flag <- pillar::type_sum(x) == "ord"
+
+ # store the factor levels
+ x_levels <- c(levels(x), "NA")
+
+ # convert to character and replace the NA
+ x_chr <- as.character(x)
+
+ x_chr <- tidyr::replace_na(data = x_chr, replace = "NA")
+
+ # convert back to a factor
+ x <- factor(x_chr, levels = x_levels, ordered = ordinal_flag)
+
+ }
+
+ return(x)
+
+ }
+
+ data_converted <- data %>%
+ dplyr::mutate(dplyr::across(.cols = dplyr::everything(), .fns = na_to_level))
+
+ return(data_converted)
+
+}
diff --git a/R/util_percentiles.R b/R/util_percentiles.R
index 187c5fd..771501b 100644
--- a/R/util_percentiles.R
+++ b/R/util_percentiles.R
@@ -12,6 +12,7 @@
#' This option will frequently result in an error because quantile() is strict
#' about missing values.
#' @param synth_vars A logical for if only synthesized variables should be kept
+#' @param na.rm A logical for ignoring `NA` values in computations.
#'
#' @return A `tibble` of summary statistics.
#'
@@ -26,7 +27,8 @@ util_percentiles <- function(postsynth,
group_by = NULL,
drop_zeros = FALSE,
common_vars = TRUE,
- synth_vars = TRUE) {
+ synth_vars = TRUE,
+ na.rm = FALSE) {
# catch binding error
. <- NULL
@@ -85,14 +87,14 @@ util_percentiles <- function(postsynth,
)
- na.rm_toggle <- FALSE
- if (drop_zeros) {
-
- combined_data[combined_data == 0] <- NA
- na.rm_toggle <- TRUE
-
- }
-
+ # prep data for NA handling
+ combined_data <- prep_combined_data_for_na.rm(
+ combined_data,
+ na.rm = na.rm,
+ drop_zeros = drop_zeros,
+ drop_zeros_exclude = group_by
+ )
+ na.rm_flag <- (na.rm | drop_zeros)
# set weight to 1 for unweighted statistics
if (missing(weight_var)) {
@@ -105,7 +107,7 @@ util_percentiles <- function(postsynth,
.fns = ~ stats::quantile(
x = .x,
probs = probs,
- na.rm = na.rm_toggle
+ na.rm = na.rm_flag
)
),
p = probs
@@ -128,7 +130,7 @@ util_percentiles <- function(postsynth,
x = .,
weights = {{ weight_var }},
probs = probs,
- na.rm = na.rm_toggle
+ na.rm = na.rm_flag
)
),
p = probs
diff --git a/R/util_plots.R b/R/util_plots.R
new file mode 100644
index 0000000..f3cb667
--- /dev/null
+++ b/R/util_plots.R
@@ -0,0 +1,222 @@
+#' Create a histogram + KDE estimate for a numeric variable.
+#'
+#' @param joint_data A data.frame combining rows from confidential and synthetic
+#' data, with the column 'source' identifying the two.
+#' @param var_name Numeric variable name to plot.
+#' @param cat1_name Optional categorical variable to group by for subplots.
+#' @param cat2_name Optional categorical variable to group by for subplots.
+#'
+#' @return A `ggplot2` plot
+#'
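+#' @examples
+#' # A minimal sketch stacking the packaged confidential data and one
+#' # synthetic replicate; the `source` column labels the two.
+#' joint <- dplyr::bind_rows(
+#'   confidential = acs_conf,
+#'   synthetic = acs_lr_synths[[1]],
+#'   .id = "source"
+#' )
+#' plot_numeric_hist_kde(joint, var_name = "age")
+#'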
+#' @export
+plot_numeric_hist_kde <- function(joint_data,
+ var_name,
+ cat1_name = NULL,
+ cat2_name = NULL) {
+
+ # check data types
+ stopifnot(pillar::type_sum(joint_data[[var_name]]) == "dbl")
+
+ if (!is.null(cat1_name)) {
+
+ stopifnot(pillar::type_sum(joint_data[[cat1_name]]) == "fct")
+
+ }
+
+ if (!is.null(cat2_name)) {
+
+ stopifnot(pillar::type_sum(joint_data[[cat2_name]]) == "fct")
+
+ }
+
+ # check source variable
+ stopifnot(all(c("confidential", "synthetic") %in%
+ (joint_data[["source"]] %>% unique)))
+
+ plot <- ggplot2::ggplot(
+ data = joint_data,
+ mapping = ggplot2::aes(
+ x = !!rlang::sym(var_name),
+ y = ggplot2::after_stat(!!rlang::sym('density')),
+ fill = source)
+ ) +
+ ggplot2::geom_histogram(
+ position = "identity",
+ bins = 30,
+ color = "black",
+ alpha = 0.3
+ ) +
+ ggplot2::geom_density(
+ ggplot2::aes(color = source),
+ alpha = 0.3
+ ) +
+ ggplot2::theme(
+ axis.text.x = ggplot2::element_text(angle = 90)
+ )
+
+  if (!is.null(cat1_name) && is.null(cat2_name)) {
+
+ plot <- plot +
+ ggplot2::facet_wrap(ggplot2::vars(!!rlang::sym(cat1_name)))
+
+ }
+
+  if (!is.null(cat1_name) && !is.null(cat2_name)) {
+
+ plot <- plot +
+ ggplot2::facet_grid(
+ rows = ggplot2::vars(!!rlang::sym(cat1_name)),
+ cols = ggplot2::vars(!!rlang::sym(cat2_name))
+ )
+
+ }
+
+ return(plot)
+
+}
+
+#' Create bar charts for a categorical random variable.
+#'
+#' @param joint_data A data.frame combining rows from confidential and synthetic
+#' data, with the column 'source' identifying the two.
+#' @param var_name Categorical variable name to plot.
+#' @param cat1_name Optional categorical variable to group by for subplots.
+#' @param cat2_name Optional categorical variable to group by for subplots.
+#'
+#' @return A `ggplot2` plot
+#'
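+#' @examples
+#' # A minimal sketch stacking the packaged confidential data and one
+#' # synthetic replicate; the `source` column labels the two.
+#' joint <- dplyr::bind_rows(
+#'   confidential = acs_conf,
+#'   synthetic = acs_lr_synths[[1]],
+#'   .id = "source"
+#' )
+#' plot_categorical_bar(joint, var_name = "marst")
+#'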
+#' @export
+plot_categorical_bar <- function(joint_data,
+ var_name,
+ cat1_name = NULL,
+ cat2_name = NULL) {
+
+ # check data types
+ stopifnot(pillar::type_sum(joint_data[[var_name]]) == "fct")
+
+ if (!is.null(cat1_name)) {
+
+ stopifnot(pillar::type_sum(joint_data[[cat1_name]]) == "fct")
+
+ }
+
+ if (!is.null(cat2_name)) {
+
+ stopifnot(pillar::type_sum(joint_data[[cat2_name]]) == "fct")
+
+ }
+
+ # check source variable
+ stopifnot(all(c("confidential", "synthetic") %in%
+ (joint_data[["source"]] %>% unique)))
+
+ plot <- ggplot2::ggplot(
+ data = joint_data,
+ mapping = ggplot2::aes(x = !!rlang::sym(var_name),
+ fill = source,
+ group = source)
+ ) +
+ ggplot2::geom_bar(position = ggplot2::position_dodge()) +
+ ggplot2::theme(
+ axis.text.x = ggplot2::element_text(angle = 90)
+ )
+
+  if (!is.null(cat1_name) && is.null(cat2_name)) {
+
+ plot <- plot +
+ ggplot2::facet_wrap(ggplot2::vars(!!rlang::sym(cat1_name)))
+
+ }
+
+  if (!is.null(cat1_name) && !is.null(cat2_name)) {
+
+ plot <- plot +
+ ggplot2::facet_grid(
+ rows = ggplot2::vars(!!rlang::sym(cat1_name)),
+ cols = ggplot2::vars(!!rlang::sym(cat2_name))
+ )
+
+ }
+
+ return(plot)
+
+}
+
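+# Usage sketch (not run), reusing the combined `joint_data` frame sketched
+# above:
+#
+# plot_categorical_bar(joint_data, var_name = "marst", cat1_name = "sex")
+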
+#' Create a correlation heatmap for numeric random variables.
+#'
+#' @param data A data.frame.
+#' @param cor_method A correlation method to pass to `stats::cor(., method=)`
+#'
+#' @return A `ggplot2` plot
+#'
+#' @export
+create_cormat_plot <- function(data, cor_method = "pearson") {
+
+ # get numeric variable names
+ dtypes <- data %>% purrr::map_chr(~ pillar::type_sum(.x))
+ num_vars <- names(dtypes[dtypes == "dbl"])
+
+ # get lower triangular correlation matrix
+ cmat <- stats::cor(data[num_vars], method = cor_method) %>%
+ round(digits = 2)
+ cmat[upper.tri(cmat)] <- NA
+
+ # generate plot
+ cmat_df <- as.data.frame.table(cmat) %>%
+ tidyr::drop_na()
+
+ plot <- ggplot2::ggplot(data = cmat_df,
+ mapping = ggplot2::aes(x = .data$Var1,
+ y = .data$Var2,
+ fill = .data$Freq)) +
+ ggplot2::geom_tile(color = "white") +
+    ggplot2::scale_fill_gradient2(
+      low = "firebrick",
+      high = "chartreuse4",
+      mid = "white",
+      midpoint = 0,
+      limit = c(-1, 1),
+      space = "Lab",
+      name = "Correlation"
+    ) +
+    ggplot2::theme(
+      axis.text.x = ggplot2::element_text(angle = 90, vjust = 1),
+ axis.title.x = ggplot2::element_blank(),
+ axis.title.y = ggplot2::element_blank(),
+ panel.border = ggplot2::element_blank(),
+ panel.background = ggplot2::element_blank(),
+ panel.grid.major = ggplot2::element_blank(),
+ axis.ticks = ggplot2::element_blank()
+ ) +
+ ggplot2::coord_fixed() +
+ ggplot2::geom_text(
+ ggplot2::aes(label = .data$Freq),
+ color = "black",
+ size = 4
+ )
+
+ return(plot)
+
+}
+
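+# Usage sketch (not run): numeric columns are detected automatically, so a
+# single data frame is enough, e.g. the packaged confidential ACS extract:
+#
+# create_cormat_plot(acs_conf, cor_method = "spearman")
+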
+#' Create side-by-side correlation heatmaps for numeric random variables.
+#'
+#' @param conf_data Confidential data
+#' @param synth_data Synthetic data
+#' @param cor_method A correlation method to pass to `stats::cor(., method=)`
+#'
+#' @return A `ggplot2` plot
+#'
+#' @export
+plot_cormat <- function(conf_data, synth_data, cor_method = "pearson") {
+
+ p1 <- create_cormat_plot(conf_data, cor_method = cor_method) +
+ ggplot2::ggtitle("Confidential data")
+ p2 <- create_cormat_plot(synth_data, cor_method = cor_method) +
+ ggplot2::ggtitle("Synthetic data")
+
+  gridExtra::grid.arrange(p1, p2, nrow = 1)
+
+}
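+
+# Usage sketch (not run), comparing the confidential data against one of the
+# packaged lower-risk syntheses:
+#
+# plot_cormat(acs_conf, acs_lr_synths[[1]])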
diff --git a/R/util_proportions.R b/R/util_proportions.R
index 46f2069..4dd1e3b 100644
--- a/R/util_proportions.R
+++ b/R/util_proportions.R
@@ -6,6 +6,9 @@
#' @param group_by An unquoted name of a (or multiple) grouping variable(s)
#' @param common_vars A logical for if only common variables should be kept
#' @param synth_vars A logical for if only synthesized variables should be kept
+#' @param keep_empty_levels A logical for keeping all class levels in the group_by
+#' statements, including missing levels.
+#' @param na.rm A logical for ignoring `NA` values in proportion calculations.
#'
#' @return A tibble with variables, classes, and relative frequencies
#'
@@ -18,7 +21,9 @@ util_proportions <- function(postsynth,
weight_var = NULL,
group_by = NULL,
common_vars = TRUE,
- synth_vars = TRUE) {
+ synth_vars = TRUE,
+ keep_empty_levels = FALSE,
+ na.rm = FALSE) {
if (is_postsynth(postsynth)) {
@@ -27,7 +32,7 @@ util_proportions <- function(postsynth,
variable_order <-
levels(postsynth$jth_synthesis_time$variable)
-
+
} else {
synthetic_data <- postsynth
@@ -113,22 +118,125 @@ util_proportions <- function(postsynth,
# combining confidential and synthetic data
combined_data <- dplyr::bind_rows(
- synthetic = synthetic_data,
- original = data,
- .id = "source"
+ synthetic = synthetic_data,
+ original = data,
+ .id = "source"
+ )
+
+ group_by_weights <- combined_data %>%
+ tidyr::pivot_longer(
+ cols = -c(source, {{ group_by }}, ".temp_weight"),
+ names_to = "variable",
+ values_to = "class"
+ ) %>%
+ dplyr::group_by(
+ dplyr::across({{ group_by }}), source, variable,
+ .drop = !keep_empty_levels
)
+ # convert NAs to separate levels
+ combined_data <- convert_na_to_level(combined_data)
+
# lengthening combined data to find proportions for each level
- combined_data <- combined_data %>%
+ combined_data_long <- combined_data %>%
tidyr::pivot_longer(
cols = -c(source, {{ group_by }}, ".temp_weight"),
names_to = "variable",
values_to = "class"
)
+ # if flagged, join in empty levels to pivoted data
+ if (keep_empty_levels) {
+
+ # first, get explicit column names for variables where we need to keep
+ # empty levels (excludes variables in group_by, excluded by common_vars, etc)
+ prop_col_names <- names(
+ combined_data %>%
+ dplyr::select(-c(source, {{ group_by }}, ".temp_weight"))
+ )
+
+ extract_levels <- function(x) {
+
+ # for character columns, convert to factor before extracting levels
+ if (pillar::type_sum(combined_data[[x]]) == "chr") {
+
+ return(
+ data.frame("class" = levels(factor(combined_data[[x]]))) %>%
+ dplyr::mutate("variable" = x)
+ )
+
+ } else {
+
+ return(
+ # else, use pre-existing column factor levels (in case empty levels
+ # are unsynthesized)
+ data.frame("class" = levels(combined_data[[x]])) %>%
+ dplyr::mutate("variable" = x)
+ )
+
+ }
+
+ }
+
+ all_levels <- purrr::map(
+ .x = prop_col_names,
+ .f = extract_levels
+ ) %>%
+ dplyr::bind_rows() %>%
+ # create one copy of the complete levels for each source and groupby level
+ # by cross-joining
+ dplyr::cross_join(
+ combined_data %>%
+ dplyr::select(c(source, {{ group_by }})) %>%
+ dplyr::distinct()
+ )
+
+ # create the join specification depending on whether group_by is specified
+ if (is.null(group_by)) {
+
+ join_spec <- dplyr::join_by(class, variable, source)
+
+ } else {
+
+ join_spec <- dplyr::join_by(class, variable, source, {{ group_by }})
+
+ }
+
+ # join in all_levels to combined data and set weights to 0 where levels are
+ # unobserved
+ combined_data <- dplyr::left_join(
+ all_levels,
+ combined_data_long,
+ by = join_spec,
+ na_matches = "na"
+ ) %>%
+ tidyr::replace_na(list(".temp_weight" = 0))
+
+ } else {
+
+ combined_data <- combined_data_long
+
+ }
+
+ # if flagged, remove NA levels
+ if (na.rm) {
+
+ combined_data <- combined_data %>%
+ dplyr::filter(
+ !is.na(class) & (class != "NA")
+ )
+
+ }
+
# calculating proportions for each level of each variable
combined_data <- combined_data %>%
- dplyr::group_by(dplyr::across({{ group_by }}), .data$source, .data$variable, .data$class) %>%
+ dplyr::group_by(
+ dplyr::across({{ group_by }}),
+ .data$source,
+ .data$variable,
+ .data$class,
+ .drop = !keep_empty_levels
+ ) %>%
dplyr::summarise(.total_weight = sum(.data$.temp_weight)) %>%
dplyr::mutate(prop = (.data$.total_weight) / sum(.data$.total_weight)) %>%
dplyr::ungroup()
@@ -136,11 +244,17 @@ util_proportions <- function(postsynth,
# formatting results, getting proportion difference
combined_data <- combined_data %>%
tidyr::pivot_wider(names_from = source, values_from = "prop") %>%
- dplyr::group_by(dplyr::across({{ group_by }}), .data$variable, .data$class) %>%
+ dplyr::group_by(
+ dplyr::across({{ group_by }}),
+ .data$variable,
+ .data$class,
+ .drop = !keep_empty_levels
+ ) %>%
dplyr::summarise(synthetic = sum(.data$synthetic, na.rm = TRUE),
original = sum(.data$original, na.rm = TRUE)) %>%
- dplyr::mutate(difference = .data$synthetic - .data$original)
-
+ dplyr::mutate(difference = .data$synthetic - .data$original) %>%
+ dplyr::ungroup()
+
# (group_by) -- variable -- class -- original -- synthetic -- difference
return(combined_data)
diff --git a/R/util_tails.R b/R/util_tails.R
index 227ec82..66bd14a 100644
--- a/R/util_tails.R
+++ b/R/util_tails.R
@@ -60,7 +60,8 @@ util_tails <- function(postsynth,
# calculate proportion of total contained in each observation
long_data <- long_data %>%
dplyr::group_by(source, .data$variable) %>%
- dplyr::mutate(.weighted_prop = .data$.weighted_value / sum(.data$.weighted_value))
+ dplyr::mutate(.weighted_prop = .data$.weighted_value / sum(.data$.weighted_value)) %>%
+ dplyr::ungroup()
# keep top n
if (end == "max") {
diff --git a/R/util_totals.R b/R/util_totals.R
index 03e425a..88d7f49 100644
--- a/R/util_totals.R
+++ b/R/util_totals.R
@@ -6,6 +6,7 @@
#' @param group_by The unquoted name of a (or multiple) grouping variable(s)
#' @param common_vars A logical for if only common variables should be kept
#' @param synth_vars A logical for if only synthesized variables should be kept
+#' @param na.rm A logical for ignoring `NA` values in computations.
#'
#' @return A `tibble` of totals.
#'
@@ -18,7 +19,8 @@ util_totals<- function(postsynth,
weight_var = 1,
group_by = NULL,
common_vars = TRUE,
- synth_vars = TRUE) {
+ synth_vars = TRUE,
+ na.rm = FALSE) {
# catch binding error
. <- NULL
@@ -75,8 +77,6 @@ util_totals<- function(postsynth,
`synthetic` = synthetic_data,
.id = "source"
)
-
- na.rm_toggle <- FALSE
# calculate summary statistics
totals <- combined_data %>%
@@ -86,8 +86,8 @@ util_totals<- function(postsynth,
dplyr::across(
.cols = -".temp_weight",
.fns = list(
- count = ~ sum((. != 0) * .data$.temp_weight, na.rm = na.rm_toggle),
- total = ~ sum(. * .data$.temp_weight, na.rm = na.rm_toggle)
+ count = ~ sum((. != 0) * .data$.temp_weight, na.rm = na.rm),
+ total = ~ sum(. * .data$.temp_weight, na.rm = na.rm)
)
)
) %>%
diff --git a/README.md b/README.md
index e01349c..772cfd1 100644
--- a/README.md
+++ b/README.md
@@ -76,7 +76,6 @@ util_proportions(
```
# A tibble: 2 × 5
- # Groups: variable [1]
variable class synthetic original difference
1 sex female 0.529 0.495 0.0330
@@ -92,7 +91,6 @@ util_proportions(
```
# A tibble: 8 × 5
- # Groups: variable [3]
variable class synthetic original difference
1 island Biscoe 0.465 0.489 -0.0240
@@ -470,16 +468,16 @@ disc1 |>
# A tibble: 666 × 10
.pred_synthetic .source_label .sample species island sex bill_length_mm
- 1 0.290 original training Adelie Torgersen male 39.1
- 2 0.290 original training Adelie Torgersen fema… 39.5
- 3 0.290 original training Adelie Torgersen fema… 40.3
- 4 0.656 original training Adelie Torgersen fema… 36.7
- 5 0.290 original testing Adelie Torgersen male 39.3
- 6 0.290 original training Adelie Torgersen fema… 38.9
- 7 0.290 original testing Adelie Torgersen male 39.2
- 8 0.290 original training Adelie Torgersen fema… 41.1
- 9 0.290 original testing Adelie Torgersen male 38.6
- 10 0.656 original training Adelie Torgersen male 34.6
+ 1 0.0556 original training Adelie Torgersen male 39.1
+ 2 0.355 original training Adelie Torgersen fema… 39.5
+ 3 0.636 original training Adelie Torgersen fema… 40.3
+ 4 0.375 original training Adelie Torgersen fema… 36.7
+ 5 0.76 original testing Adelie Torgersen male 39.3
+ 6 0.355 original training Adelie Torgersen fema… 38.9
+ 7 0.375 original training Adelie Torgersen male 39.2
+ 8 0.355 original training Adelie Torgersen fema… 41.1
+ 9 0.4 original training Adelie Torgersen male 38.6
+ 10 0.789 original testing Adelie Torgersen male 34.6
# ℹ 656 more rows
# ℹ 3 more variables: bill_depth_mm , flipper_length_mm ,
# body_mass_g
@@ -498,66 +496,66 @@ disc1 |>
node), split, n, loss, yval, (yprob)
* denotes terminal node
- 1) root 498 249 synthetic (0.5000000 0.5000000)
- 2) bill_length_mm< 38.25 115 46 synthetic (0.6000000 0.4000000)
- 4) body_mass_g>=3187.5 90 31 synthetic (0.6555556 0.3444444) *
- 5) body_mass_g< 3187.5 25 10 original (0.4000000 0.6000000)
- 10) bill_depth_mm>=17.6 11 4 synthetic (0.6363636 0.3636364) *
- 11) bill_depth_mm< 17.6 14 3 original (0.2142857 0.7857143) *
- 3) bill_length_mm>=38.25 383 180 original (0.4699739 0.5300261)
- 6) island=Biscoe,Dream 352 171 original (0.4857955 0.5142045)
- 12) body_mass_g< 3375 28 9 synthetic (0.6785714 0.3214286) *
- 13) body_mass_g>=3375 324 152 original (0.4691358 0.5308642)
- 26) bill_length_mm>=41.2 280 136 original (0.4857143 0.5142857)
- 52) flipper_length_mm< 189.5 18 4 synthetic (0.7777778 0.2222222) *
- 53) flipper_length_mm>=189.5 262 122 original (0.4656489 0.5343511)
- 106) body_mass_g>=3587.5 252 120 original (0.4761905 0.5238095)
- 212) flipper_length_mm< 192.5 15 5 synthetic (0.6666667 0.3333333) *
- 213) flipper_length_mm>=192.5 237 110 original (0.4641350 0.5358650)
- 426) body_mass_g>=4287.5 185 91 original (0.4918919 0.5081081)
- 852) bill_length_mm>=50.45 40 15 synthetic (0.6250000 0.3750000)
- 1704) bill_length_mm< 52.35 30 8 synthetic (0.7333333 0.2666667) *
- 1705) bill_length_mm>=52.35 10 3 original (0.3000000 0.7000000) *
- 853) bill_length_mm< 50.45 145 66 original (0.4551724 0.5448276)
- 1706) bill_length_mm< 47.05 80 37 synthetic (0.5375000 0.4625000)
- 3412) bill_length_mm>=46.65 8 1 synthetic (0.8750000 0.1250000) *
- 3413) bill_length_mm< 46.65 72 36 synthetic (0.5000000 0.5000000)
- 6826) body_mass_g< 5025 58 26 synthetic (0.5517241 0.4482759) *
- 6827) body_mass_g>=5025 14 4 original (0.2857143 0.7142857) *
- 1707) bill_length_mm>=47.05 65 23 original (0.3538462 0.6461538)
- 3414) bill_depth_mm>=15.65 28 14 synthetic (0.5000000 0.5000000)
- 6828) body_mass_g< 5525 12 3 synthetic (0.7500000 0.2500000) *
- 6829) body_mass_g>=5525 16 5 original (0.3125000 0.6875000) *
- 3415) bill_depth_mm< 15.65 37 9 original (0.2432432 0.7567568) *
- 427) body_mass_g< 4287.5 52 19 original (0.3653846 0.6346154)
- 854) bill_depth_mm>=19.65 12 4 synthetic (0.6666667 0.3333333) *
- 855) bill_depth_mm< 19.65 40 11 original (0.2750000 0.7250000) *
- 107) body_mass_g< 3587.5 10 2 original (0.2000000 0.8000000) *
- 27) bill_length_mm< 41.2 44 16 original (0.3636364 0.6363636)
- 54) bill_depth_mm>=18.95 17 7 synthetic (0.5882353 0.4117647) *
- 55) bill_depth_mm< 18.95 27 6 original (0.2222222 0.7777778) *
- 7) island=Torgersen 31 9 original (0.2903226 0.7096774) *
+ 1) root 498 249 synthetic (0.50000000 0.50000000)
+ 2) bill_length_mm< 34.8 19 4 synthetic (0.78947368 0.21052632) *
+ 3) bill_length_mm>=34.8 479 234 original (0.48851775 0.51148225)
+ 6) bill_depth_mm>=18.85 99 44 synthetic (0.55555556 0.44444444)
+ 12) bill_length_mm< 50.35 85 32 synthetic (0.62352941 0.37647059)
+ 24) species=Chinstrap 21 2 synthetic (0.90476190 0.09523810) *
+ 25) species=Adelie 64 30 synthetic (0.53125000 0.46875000)
+ 50) flipper_length_mm< 192.5 40 15 synthetic (0.62500000 0.37500000)
+ 100) bill_length_mm>=38.9 25 6 synthetic (0.76000000 0.24000000) *
+ 101) bill_length_mm< 38.9 15 6 original (0.40000000 0.60000000) *
+ 51) flipper_length_mm>=192.5 24 9 original (0.37500000 0.62500000) *
+ 13) bill_length_mm>=50.35 14 2 original (0.14285714 0.85714286) *
+ 7) bill_depth_mm< 18.85 380 179 original (0.47105263 0.52894737)
+ 14) bill_length_mm>=50.55 45 17 synthetic (0.62222222 0.37777778) *
+ 15) bill_length_mm< 50.55 335 151 original (0.45074627 0.54925373)
+ 30) bill_length_mm< 46.05 218 107 original (0.49082569 0.50917431)
+ 60) bill_depth_mm< 18.25 177 83 synthetic (0.53107345 0.46892655)
+ 120) bill_depth_mm>=13.75 169 77 synthetic (0.54437870 0.45562130)
+ 240) bill_length_mm>=36.55 142 61 synthetic (0.57042254 0.42957746)
+ 480) bill_length_mm< 38.35 27 8 synthetic (0.70370370 0.29629630) *
+ 481) bill_length_mm>=38.35 115 53 synthetic (0.53913043 0.46086957)
+ 962) bill_length_mm>=41.65 73 29 synthetic (0.60273973 0.39726027) *
+ 963) bill_length_mm< 41.65 42 18 original (0.42857143 0.57142857)
+ 1926) flipper_length_mm>=193.5 11 4 synthetic (0.63636364 0.36363636) *
+ 1927) flipper_length_mm< 193.5 31 11 original (0.35483871 0.64516129) *
+ 241) bill_length_mm< 36.55 27 11 original (0.40740741 0.59259259) *
+ 121) bill_depth_mm< 13.75 8 2 original (0.25000000 0.75000000) *
+ 61) bill_depth_mm>=18.25 41 13 original (0.31707317 0.68292683)
+ 122) flipper_length_mm>=191.5 23 11 synthetic (0.52173913 0.47826087)
+ 244) bill_depth_mm>=18.55 10 2 synthetic (0.80000000 0.20000000) *
+ 245) bill_depth_mm< 18.55 13 4 original (0.30769231 0.69230769) *
+ 123) flipper_length_mm< 191.5 18 1 original (0.05555556 0.94444444) *
+ 31) bill_length_mm>=46.05 117 44 original (0.37606838 0.62393162)
+ 62) flipper_length_mm>=229.5 9 3 synthetic (0.66666667 0.33333333) *
+ 63) flipper_length_mm< 229.5 108 38 original (0.35185185 0.64814815)
+ 126) flipper_length_mm< 214.5 54 24 original (0.44444444 0.55555556)
+ 252) body_mass_g>=3925 26 10 synthetic (0.61538462 0.38461538) *
+ 253) body_mass_g< 3925 28 8 original (0.28571429 0.71428571) *
+ 127) flipper_length_mm>=214.5 54 14 original (0.25925926 0.74074074) *
$discriminator_auc
# A tibble: 2 × 4
.sample .metric .estimator .estimate
- 1 training roc_auc binary 0.726
- 2 testing roc_auc binary 0.449
+ 1 training roc_auc binary 0.743
+ 2 testing roc_auc binary 0.461
$pmse
# A tibble: 2 × 4
.source .pmse .null_pmse .pmse_ratio
- 1 training 0.0417 0.0374 1.12
- 2 testing 0.0450 0.0361 1.25
+ 1 training 0.0465 0.0327 1.42
+ 2 testing 0.0483 0.0325 1.49
$specks
# A tibble: 2 × 2
.source .specks
- 1 training 0.382
- 2 testing 0.107
+ 1 training 0.386
+ 2 testing 0.0833
attr(,"class")
[1] "discrimination"
@@ -655,16 +653,16 @@ disc2 |>
# A tibble: 666 × 10
.pred_synthetic .source_label .sample species island sex bill_length_mm
- 1 0.439 original testing Adelie Torgersen male 39.1
- 2 0.483 original testing Adelie Torgersen fema… 39.5
- 3 0.461 original training Adelie Torgersen fema… 40.3
- 4 0.499 original training Adelie Torgersen fema… 36.7
- 5 0.443 original training Adelie Torgersen male 39.3
- 6 0.511 original training Adelie Torgersen fema… 38.9
- 7 0.416 original testing Adelie Torgersen male 39.2
- 8 0.492 original training Adelie Torgersen fema… 41.1
- 9 0.456 original training Adelie Torgersen male 38.6
- 10 0.447 original training Adelie Torgersen male 34.6
+ 1 0.5 original training Adelie Torgersen male 39.1
+ 2 0.5 original training Adelie Torgersen fema… 39.5
+ 3 0.5 original training Adelie Torgersen fema… 40.3
+ 4 0.5 original training Adelie Torgersen fema… 36.7
+ 5 0.5 original testing Adelie Torgersen male 39.3
+ 6 0.5 original testing Adelie Torgersen fema… 38.9
+ 7 0.5 original training Adelie Torgersen male 39.2
+ 8 0.5 original training Adelie Torgersen fema… 41.1
+ 9 0.5 original training Adelie Torgersen male 38.6
+ 10 0.5 original testing Adelie Torgersen male 34.6
# ℹ 656 more rows
# ℹ 3 more variables: bill_depth_mm , flipper_length_mm ,
# body_mass_g
@@ -686,76 +684,76 @@ disc2 |>
Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~1)
Df %Dev Lambda
- 1 0 0.00 0.047070
- 2 1 0.11 0.042890
- 3 1 0.20 0.039080
- 4 1 0.27 0.035600
- 5 2 0.37 0.032440
- 6 2 0.48 0.029560
- 7 3 0.57 0.026930
- 8 3 0.65 0.024540
- 9 4 0.74 0.022360
- 10 5 0.85 0.020370
- 11 6 0.97 0.018560
- 12 6 1.09 0.016920
- 13 6 1.20 0.015410
- 14 6 1.29 0.014040
- 15 6 1.36 0.012800
- 16 6 1.42 0.011660
- 17 6 1.48 0.010620
- 18 6 1.52 0.009679
- 19 7 1.56 0.008820
- 20 8 1.60 0.008036
- 21 8 1.64 0.007322
- 22 8 1.67 0.006672
- 23 8 1.69 0.006079
- 24 9 1.73 0.005539
- 25 9 1.79 0.005047
- 26 9 1.84 0.004599
- 27 10 1.88 0.004190
- 28 10 1.91 0.003818
- 29 11 1.94 0.003479
- 30 11 1.96 0.003170
- 31 11 1.98 0.002888
- 32 11 2.00 0.002631
- 33 11 2.01 0.002398
- 34 12 2.03 0.002185
- 35 12 2.04 0.001991
- 36 12 2.05 0.001814
- 37 12 2.06 0.001653
- 38 12 2.06 0.001506
- 39 13 2.07 0.001372
- 40 13 2.08 0.001250
- 41 13 2.08 0.001139
- 42 13 2.09 0.001038
- 43 13 2.09 0.000946
- 44 13 2.09 0.000862
- 45 13 2.10 0.000785
- 46 13 2.10 0.000715
+ 1 0 0.00 0.041780
+ 2 1 0.09 0.038070
+ 3 1 0.16 0.034680
+ 4 1 0.22 0.031600
+ 5 2 0.27 0.028800
+ 6 2 0.36 0.026240
+ 7 2 0.43 0.023910
+ 8 4 0.51 0.021780
+ 9 4 0.58 0.019850
+ 10 4 0.63 0.018080
+ 11 4 0.68 0.016480
+ 12 6 0.72 0.015010
+ 13 6 0.78 0.013680
+ 14 6 0.83 0.012460
+ 15 6 0.87 0.011360
+ 16 7 0.90 0.010350
+ 17 7 0.94 0.009429
+ 18 7 0.97 0.008592
+ 19 7 0.99 0.007828
+ 20 7 1.02 0.007133
+ 21 8 1.06 0.006499
+ 22 8 1.15 0.005922
+ 23 8 1.22 0.005396
+ 24 9 1.29 0.004916
+ 25 9 1.34 0.004480
+ 26 11 1.39 0.004082
+ 27 11 1.44 0.003719
+ 28 11 1.49 0.003389
+ 29 11 1.53 0.003088
+ 30 11 1.56 0.002813
+ 31 11 1.58 0.002563
+ 32 11 1.61 0.002336
+ 33 11 1.62 0.002128
+ 34 11 1.64 0.001939
+ 35 11 1.65 0.001767
+ 36 12 1.66 0.001610
+ 37 12 1.68 0.001467
+ 38 12 1.69 0.001337
+ 39 12 1.70 0.001218
+ 40 11 1.70 0.001110
+ 41 11 1.71 0.001011
+ 42 11 1.71 0.000921
+ 43 11 1.72 0.000839
+ 44 11 1.72 0.000765
+ 45 11 1.72 0.000697
+ 46 11 1.73 0.000635
...
- and 5 more lines.
+ and 6 more lines.
$discriminator_auc
# A tibble: 2 × 4
.sample .metric .estimator .estimate
- 1 training roc_auc binary 0.589
- 2 testing roc_auc binary 0.428
+ 1 training roc_auc binary 0.5
+ 2 testing roc_auc binary 0.5
$pmse
# A tibble: 2 × 4
- .source .pmse .null_pmse .pmse_ratio
-
- 1 training 0.00385 0.00272 1.41
- 2 testing 0.00413 0.00280 1.47
+ .source .pmse .null_pmse .pmse_ratio
+
+ 1 training 1.23e-32 1.23e-32 1
+ 2 testing 1.23e-32 1.23e-32 1
$specks
# A tibble: 2 × 2
- .source .specks
-
- 1 training 0.137
- 2 testing 0.131
+ .source .specks
+
+ 1 training 4.86e-17
+ 2 testing 6.94e-17
attr(,"class")
[1] "discrimination"
diff --git a/README_files/figure-commonmark/unnamed-chunk-21-1.png b/README_files/figure-commonmark/unnamed-chunk-21-1.png
index 4a41413..f28dafd 100644
Binary files a/README_files/figure-commonmark/unnamed-chunk-21-1.png and b/README_files/figure-commonmark/unnamed-chunk-21-1.png differ
diff --git a/README_files/figure-commonmark/unnamed-chunk-21-2.png b/README_files/figure-commonmark/unnamed-chunk-21-2.png
index 0547e00..6bf8aa0 100644
Binary files a/README_files/figure-commonmark/unnamed-chunk-21-2.png and b/README_files/figure-commonmark/unnamed-chunk-21-2.png differ
diff --git a/README_files/figure-commonmark/unnamed-chunk-22-1.png b/README_files/figure-commonmark/unnamed-chunk-22-1.png
index 3591740..75f0de6 100644
Binary files a/README_files/figure-commonmark/unnamed-chunk-22-1.png and b/README_files/figure-commonmark/unnamed-chunk-22-1.png differ
diff --git a/data-raw/README.md b/data-raw/README.md
new file mode 100644
index 0000000..4127652
--- /dev/null
+++ b/data-raw/README.md
@@ -0,0 +1,20 @@
+# Test Data Replication
+
+The scripts in this folder reproduce the test data sets written to file
+in the `./data` folder.
+
+## Penguins
+
+The objects `penguins_*.rda` are derived from the `penguins` dataset in
+the `palmerpenguins` package, available in its original form on CRAN.
+
+## American Community Survey
+
+The objects `acs_*.rda` are derived from IPUMS extracts from the 2019
+American Community Survey (ACS). Citation is available in the `acs_conf`
+docstring. To replicate the raw data, you need an IPUMS API key, which
+can be requested for free at https://account.ipums.org/api_keys and set
+using
+
+ ipumsr::set_ipums_api_key(api_key)
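+
+Once a key is set, the datasets can be rebuilt by sourcing the scripts from
+the package root (the ACS synthesis script assumes `acs_conf` has already
+been built), for example:
+
+    source("data-raw/acs_conf.R")
+    source("data-raw/acs_synth.R")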
diff --git a/data-raw/acs_conf.R b/data-raw/acs_conf.R
new file mode 100644
index 0000000..b6cbcad
--- /dev/null
+++ b/data-raw/acs_conf.R
@@ -0,0 +1,90 @@
+library(ipumsr)
+
+set.seed(20240726)
+
+# IPUMS API extract definition ---------------------------------------------
+
+message(
+ "This script assumes you have an IPUMS API key already set up within your
+ global environment. If not, use ipumsr::set_ipums_api_key() to do so."
+)
+
+acs_extract_request <- ipumsr::define_extract_micro(
+ description = "Example 2019 Nebraska ACS Extract",
+ collection = "usa",
+ samples = c("us2019a"),
+ variables = list(
+ var_spec("STATEFIP", case_selections = c("31")),
+ var_spec("COUNTYFIP"),
+ var_spec("GQ"),
+ var_spec("SEX"),
+ var_spec("MARST"),
+ var_spec("HCOVANY"),
+ var_spec("EMPSTAT"),
+ var_spec("CLASSWKR"),
+ var_spec("AGE"),
+ var_spec("FAMSIZE"),
+ var_spec("TRANTIME"),
+ var_spec("INCTOT")
+ ),
+ data_structure = "rectangular"
+)
+submitted_extract <- ipumsr::submit_extract(acs_extract_request)
+downloadable_extract <- ipumsr::wait_for_extract(submitted_extract)
+path_to_data_files <- ipumsr::download_extract(downloadable_extract)
+
+data <- ipumsr::read_ipums_micro(path_to_data_files)
+
+# Gold-standard dataset pre-processing / cleaning -------------------------
+
+acs <- data %>%
+ dplyr::transmute(
+ county = haven::as_factor(COUNTYFIP) %>%
+ forcats::fct_recode(
+ "Douglas" = "55",
+ "Lancaster" = "109",
+ "Sarpy" = "153",
+ `NA` = "County not identifiable from public-use data (1950-onward)"
+ ),
+ gq = haven::as_factor(GQ) %>%
+ forcats::fct_collapse(
+ "Household" = c("Households under 1970 definition",
+ "Additional households under 1990 definition",
+ "Additional households under 2000 definition"),
+ "Institution" = c("Group quarters--Institutions"),
+ "Other GQ" = c("Other group quarters", "Vacant unit", "Fragment")
+ ),
+ sex = haven::as_factor(SEX) %>%
+ forcats::fct_drop(only = "Missing/blank"),
+ marst = haven::as_factor(MARST) %>%
+ forcats::fct_collapse(
+ "Married" = c("Married, spouse present",
+ "Married, spouse absent"),
+ "Divorced" = c("Divorced"),
+ "Separated" = c("Separated"),
+ "Widowed" = c("Widowed"),
+ "Single" = c("Never married/single")
+ ) %>%
+ forcats::fct_drop(only = "Blank, missing"),
+ hcovany = haven::as_factor(HCOVANY),
+ empstat = haven::as_factor(EMPSTAT) %>%
+ forcats::fct_collapse(
+ "Employed" = c("Employed"),
+ "Unemployed" = c("Unemployed"),
+ "Not in labor force" = c("Not in labor force", "N/A")
+ ) %>%
+ forcats::fct_drop(only = "Unknown/Illegible"),
+ classwkr = haven::as_factor(CLASSWKR) %>%
+ forcats::fct_drop(only = "Unknown"),
+ age = as.double(AGE),
+ famsize = as.double(FAMSIZE),
+ transit_time = as.double(TRANTIME),
+ inctot = dplyr::if_else(INCTOT == 9999999, NA, as.double(INCTOT))
+ ) %>%
+ dplyr::slice_sample(n = 2000)
+
+acs_conf <- acs[1:1000, ]
+acs_holdout <- acs[1001:2000, ]
+
+usethis::use_data(acs_conf, overwrite = TRUE)
+usethis::use_data(acs_holdout, overwrite = TRUE)
diff --git a/data-raw/acs_synth.R b/data-raw/acs_synth.R
new file mode 100644
index 0000000..7c6f6cf
--- /dev/null
+++ b/data-raw/acs_synth.R
@@ -0,0 +1,180 @@
+library(tidymodels)
+library(tidysynthesis)
+
+
+# shared preprocessing ------------------------------------------------------
+
+set.seed(20240726)
+
+# load ACS confidential data
+data(acs_conf)
+
+conf_data <- acs_conf %>%
+ dplyr::select(
+ county, gq, sex, marst, hcovany, empstat, classwkr, age, inctot
+ )
+
+conf_props <- acs_conf %>%
+ dplyr::group_by(county, gq, sex, marst, hcovany, empstat, classwkr,
+ .drop = FALSE) %>%
+ dplyr::tally() %>%
+ dplyr::ungroup() %>%
+ dplyr::mutate(prop = n / nrow(acs_conf))
+
+# lower-risk synthesis ------------------------------------------------------
+
+#'
+#' Create one lower-disclosure-risk synthetic data sample
+#'
+#' @param synth_id Integer, ID to associate with synthetic data replicate
+#'
+sample_lr_synth <- function(synth_id) {
+
+ # lower-risk categorical synthesis: sample from regularized cell frequencies
+ lr_synth <- conf_props %>%
+ dplyr::mutate(
+ lr_n = stats::rmultinom(
+ n = 1,
+ size = nrow(conf_data),
+        # mixture of 95% confidential cell proportions and 5% uniform sample
+        # (rmultinom() rescales prob to sum to 1)
+        prob = conf_props$prop * 0.95 + 0.05 / nrow(conf_props)
+ )[, 1]
+ ) %>%
+ tidyr::uncount(weights = lr_n) %>%
+ dplyr::select(-c(n, prop))
+
+ # use sampled categorical variables as start data
+ schema <- schema(
+ conf_data = conf_data,
+ start_data = lr_synth
+ )
+
+ # synthesize two numeric variables, "age" and "inctot"
+ visit_sequence <- visit_sequence(
+ schema = schema,
+ type = "manual",
+ manual_vars = c("age", "inctot")
+ )
+
+ roadmap <- roadmap(visit_sequence = visit_sequence)
+
+ # use a standard rpart decision tree for each variable
+ rpart_reg <- parsnip::decision_tree(mode = "regression")
+
+ synth_spec <- synth_spec(
+ roadmap = roadmap,
+ synth_algorithms = rpart_reg,
+ recipes = construct_recipes(roadmap = roadmap),
+ predict_methods = sample_rpart
+ )
+
+ presynth <- presynth(
+ roadmap = roadmap,
+ synth_spec = synth_spec
+ )
+
+
+ return(
+
+ # synthesize using tidysynthesis
+ synthesize(presynth)$synthetic_data %>%
+ dplyr::mutate(
+ synth_id = synth_id,
+ # add two-sided geometric row-wise noise to each numeric synthesis
+ age = age + rgeom(n = nrow(acs_conf), prob = 0.5) -
+ rgeom(n = nrow(acs_conf), prob = 0.5),
+ inctot = dplyr::if_else(
+ inctot > 0,
+ round(inctot, -1) + 10 * (
+ rgeom(n = nrow(acs_conf), prob = 0.2) -
+ rgeom(n = nrow(acs_conf), prob = 0.2)
+ ),
+ inctot
+ )
+ )
+
+ )
+
+}
+
+# synthesize and write to package
+acs_lr_synths <- purrr::map(
+ .x = 1:30,
+ .f = ~ sample_lr_synth(.x)
+)
+usethis::use_data(acs_lr_synths, overwrite = TRUE)
+
+
+# higher-risk synthesis ------------------------------------------------------
+
+#'
+#' Create one higher-disclosure-risk synthetic data sample
+#'
+#' @param synth_id Integer, ID to associate with synthetic data replicate
+#'
+sample_hr_synth <- function(synth_id) {
+
+  # starting categoricals: randomly drop 5% of records from the original data
+ hr_cats <- acs_conf %>%
+ dplyr::select(
+ county, gq, sex, marst, hcovany, empstat, classwkr
+ ) %>%
+ dplyr::mutate(
+ keep_ix = (
+ sample(1:nrow(acs_conf)) > round(0.05 * nrow(acs_conf))
+ )
+ ) %>%
+ dplyr::filter(keep_ix == TRUE)
+
+ # use sampled categorical variables as start data
+ schema <- schema(
+ conf_data = conf_data,
+    start_data = hr_cats
+ )
+
+ # synthesize two numeric variables, "age" and "inctot"
+ visit_sequence <- visit_sequence(
+ schema = schema,
+ type = "manual",
+ manual_vars = c("age", "inctot")
+ )
+
+ roadmap <- roadmap(visit_sequence = visit_sequence)
+
+ # define an intentionally overfit decision tree model
+ overfit_rpart_reg <- parsnip::decision_tree(
+ mode = "regression",
+ tree_depth = 30, # large max tree depth
+ min_n = 2 # small terminal node size
+ ) %>%
+ parsnip::set_engine(
+ "rpart", xval = 0 # disable cross-validation for pruning
+ )
+
+ synth_spec <- synth_spec(
+ roadmap = roadmap,
+ synth_algorithms = overfit_rpart_reg,
+ recipes = construct_recipes(roadmap = roadmap),
+ predict_methods = sample_rpart
+ )
+
+ presynth <- presynth(
+ roadmap = roadmap,
+ synth_spec = synth_spec
+ )
+
+ return(
+ # return synthesis result without modification
+ synthesize(presynth)$synthetic_data %>%
+ dplyr::mutate(synth_id = synth_id)
+ )
+
+}
+
+# synthesize and write to package
+acs_hr_synths <- purrr::map(
+ .x = 1:30,
+ .f = ~ sample_hr_synth(.x)
+)
+usethis::use_data(acs_hr_synths, overwrite = TRUE)
+
diff --git a/data/acs_conf.rda b/data/acs_conf.rda
new file mode 100644
index 0000000..ea9fc6a
Binary files /dev/null and b/data/acs_conf.rda differ
diff --git a/data/acs_holdout.rda b/data/acs_holdout.rda
new file mode 100644
index 0000000..cdb1e3a
Binary files /dev/null and b/data/acs_holdout.rda differ
diff --git a/data/acs_hr_synths.rda b/data/acs_hr_synths.rda
new file mode 100644
index 0000000..cb37bb0
Binary files /dev/null and b/data/acs_hr_synths.rda differ
diff --git a/data/acs_lr_synths.rda b/data/acs_lr_synths.rda
new file mode 100644
index 0000000..b2899f6
Binary files /dev/null and b/data/acs_lr_synths.rda differ
diff --git a/man/acs_conf.Rd b/man/acs_conf.Rd
new file mode 100644
index 0000000..427865d
--- /dev/null
+++ b/man/acs_conf.Rd
@@ -0,0 +1,43 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{acs_conf}
+\alias{acs_conf}
+\title{American Community Survey confidential microdata}
+\format{
+\subsection{\code{acs_conf}}{
+
+A data frame with 1,000 rows and 11 columns:
+\describe{
+\item{county}{fct, county}
+\item{gq}{fct, group quarter kind}
+\item{sex}{fct, sex}
+\item{marst}{fct, marital status}
+\item{hcovany}{fct, health insurance status}
+\item{empstat}{fct, employment status}
+\item{classwkr}{fct, employment kind (e.g., self-employed)}
+\item{age}{dbl, age (in years)}
+\item{famsize}{dbl, household/family size}
+\item{transit_time}{dbl, transit time to work (in minutes)}
+\item{inctot}{dbl, annual income}
+}
+}
+}
+\source{
+\url{https://usa.ipums.org/usa/}
+}
+\usage{
+acs_conf
+}
+\description{
+An extract constructed from the 2019 American Community Survey containing a
+random sample of n = 1000 Nebraska respondents.
+}
+\details{
+Original data source:
+Steven Ruggles, Sarah Flood, Matthew Sobek, Daniel Backman, Annie Chen,
+Grace Cooper, Stephanie Richards, Renae Rogers, and Megan Schouweiler.
+IPUMS USA: Version 15.0 [dataset]. Minneapolis, MN: IPUMS, 2024.
+https://doi.org/10.18128/D010.V15.0
+}
+\keyword{datasets}
diff --git a/man/acs_holdout.Rd b/man/acs_holdout.Rd
new file mode 100644
index 0000000..1e55281
--- /dev/null
+++ b/man/acs_holdout.Rd
@@ -0,0 +1,45 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{acs_holdout}
+\alias{acs_holdout}
+\title{American Community Survey holdout microdata}
+\format{
+\subsection{\code{acs_holdout}}{
+
+A data frame with 1,000 rows and 11 columns:
+\describe{
+\item{county}{fct, county}
+\item{gq}{fct, group quarter kind}
+\item{sex}{fct, sex}
+\item{marst}{fct, marital status}
+\item{hcovany}{fct, health insurance status}
+\item{empstat}{fct, employment status}
+\item{classwkr}{fct, employment kind (e.g., self-employed)}
+\item{age}{dbl, age (in years)}
+\item{famsize}{dbl, household/family size}
+\item{transit_time}{dbl, transit time to work (in minutes)}
+\item{inctot}{dbl, annual income}
+}
+}
+}
+\source{
+\url{https://usa.ipums.org/usa/}
+}
+\usage{
+acs_holdout
+}
+\description{
+An extract constructed from the 2019 American Community Survey containing a
+random sample of n = 1000 Nebraska respondents. This sample is distinct from
+\code{acs_conf} and is not used in producing the synthetic data available in this
+package.
+}
+\details{
+Original data source:
+Steven Ruggles, Sarah Flood, Matthew Sobek, Daniel Backman, Annie Chen,
+Grace Cooper, Stephanie Richards, Renae Rogers, and Megan Schouweiler.
+IPUMS USA: Version 15.0 [dataset]. Minneapolis, MN: IPUMS, 2024.
+https://doi.org/10.18128/D010.V15.0
+}
+\keyword{datasets}
diff --git a/man/acs_hr_synths.Rd b/man/acs_hr_synths.Rd
new file mode 100644
index 0000000..2f152b9
--- /dev/null
+++ b/man/acs_hr_synths.Rd
@@ -0,0 +1,50 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{acs_hr_synths}
+\alias{acs_hr_synths}
+\title{American Community Survey higher-risk synthetic data}
+\format{
+\subsection{\code{acs_hr_synths}}{
+
+A list of 30 data frames with 1,000 rows and 11 columns:
+\describe{
+\item{county}{fct, county}
+\item{gq}{fct, group quarter kind}
+\item{sex}{fct, sex}
+\item{marst}{fct, marital status}
+\item{hcovany}{fct, health insurance status}
+\item{empstat}{fct, employment status}
+\item{classwkr}{fct, employment kind (e.g., self-employed)}
+\item{age}{dbl, age (in years)}
+\item{famsize}{dbl, household/family size}
+\item{transit_time}{dbl, transit time to work (in minutes)}
+\item{inctot}{dbl, annual income}
+}
+}
+}
+\source{
+\url{https://usa.ipums.org/usa/}
+}
+\usage{
+acs_hr_synths
+}
+\description{
+A list of 30 samples of partial synthetic data derived from \code{acs_conf},
+generated using models that intentionally overfit to the confidential data.
+These are referred to as "higher-risk" relative to the "lower-risk" synthetic
+data also available in this package; the synthetic data is purely for testing purposes.
+}
+\details{
+Categorical variables are primarily kept "as-is" in this partially synthetic data,
+with a small proportion of categorical records resampled from the data. Numeric
+variables are resampled from decision tree models that are intentionally designed
+to overfit to the confidential data.
+
+Original data source:
+Steven Ruggles, Sarah Flood, Matthew Sobek, Daniel Backman, Annie Chen,
+Grace Cooper, Stephanie Richards, Renae Rogers, and Megan Schouweiler.
+IPUMS USA: Version 15.0 [dataset]. Minneapolis, MN: IPUMS, 2024.
+https://doi.org/10.18128/D010.V15.0
+}
+\keyword{datasets}
diff --git a/man/acs_lr_synths.Rd b/man/acs_lr_synths.Rd
new file mode 100644
index 0000000..af7a8aa
--- /dev/null
+++ b/man/acs_lr_synths.Rd
@@ -0,0 +1,50 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{acs_lr_synths}
+\alias{acs_lr_synths}
+\title{American Community Survey lower-risk synthetic data}
+\format{
+\subsection{\code{acs_lr_synths}}{
+
+A list of 30 data frames with 1,000 rows and 11 columns:
+\describe{
+\item{county}{fct, county}
+\item{gq}{fct, group quarter kind}
+\item{sex}{fct, sex}
+\item{marst}{fct, marital status}
+\item{hcovany}{fct, health insurance status}
+\item{empstat}{fct, employment status}
+\item{classwkr}{fct, employment kind (e.g., self-employed)}
+\item{age}{dbl, age (in years)}
+\item{famsize}{dbl, household/family size}
+\item{transit_time}{dbl, transit time to work (in minutes)}
+\item{inctot}{dbl, annual income}
+}
+}
+}
+\source{
+\url{https://usa.ipums.org/usa/}
+}
+\usage{
+acs_lr_synths
+}
+\description{
+A list of 30 samples of synthetic data derived from \code{acs_conf},
+generated using noise infusion for both categorical and numeric random variables.
+These are referred to as "lower-risk" relative to the "higher-risk" synthetic data
+also available in this package; the synthetic data is purely for testing purposes.
+}
+\details{
+Categorical random variables are resampled from a mixture of the original
+multivariate cell proportions and a uniform distribution. Numeric random
+variables are first modeled using regression trees, and new sampled values
+each have additional discrete two-sided geometric noise added to them.
+
+Original data source:
+Steven Ruggles, Sarah Flood, Matthew Sobek, Daniel Backman, Annie Chen,
+Grace Cooper, Stephanie Richards, Renae Rogers, and Megan Schouweiler.
+IPUMS USA: Version 15.0 [dataset]. Minneapolis, MN: IPUMS, 2024.
+https://doi.org/10.18128/D010.V15.0
+}
+\keyword{datasets}
diff --git a/man/add_pmse.Rd b/man/add_pmse.Rd
index 2a12257..48da543 100644
--- a/man/add_pmse.Rd
+++ b/man/add_pmse.Rd
@@ -23,8 +23,8 @@ Add pMSE to discrimination object
\seealso{
Other Utility metrics:
\code{\link{add_pmse_ratio}()},
-\code{\link{add_propensities_tuned}()},
\code{\link{add_propensities}()},
+\code{\link{add_propensities_tuned}()},
\code{\link{add_specks}()},
\code{\link{discrimination}()},
\code{\link{util_ci_overlap}()},
diff --git a/man/add_pmse_ratio.Rd b/man/add_pmse_ratio.Rd
index fc404bb..be2a13d 100644
--- a/man/add_pmse_ratio.Rd
+++ b/man/add_pmse_ratio.Rd
@@ -27,8 +27,8 @@ Add pMSE ratio to discrimination object
\seealso{
Other Utility metrics:
\code{\link{add_pmse}()},
-\code{\link{add_propensities_tuned}()},
\code{\link{add_propensities}()},
+\code{\link{add_propensities_tuned}()},
\code{\link{add_specks}()},
\code{\link{discrimination}()},
\code{\link{util_ci_overlap}()},
diff --git a/man/add_propensities.Rd b/man/add_propensities.Rd
index f2e2896..aabfd08 100644
--- a/man/add_propensities.Rd
+++ b/man/add_propensities.Rd
@@ -40,8 +40,8 @@ Add propensities for if an observation belongs to the synthetic data
}
\seealso{
Other Utility metrics:
-\code{\link{add_pmse_ratio}()},
\code{\link{add_pmse}()},
+\code{\link{add_pmse_ratio}()},
\code{\link{add_propensities_tuned}()},
\code{\link{add_specks}()},
\code{\link{discrimination}()},
diff --git a/man/add_propensities_tuned.Rd b/man/add_propensities_tuned.Rd
index 9aa5c8d..c510a1f 100644
--- a/man/add_propensities_tuned.Rd
+++ b/man/add_propensities_tuned.Rd
@@ -46,8 +46,8 @@ Add propensities for if an observation belongs to the synthetic data
}
\seealso{
Other Utility metrics:
-\code{\link{add_pmse_ratio}()},
\code{\link{add_pmse}()},
+\code{\link{add_pmse_ratio}()},
\code{\link{add_propensities}()},
\code{\link{add_specks}()},
\code{\link{discrimination}()},
diff --git a/man/add_specks.Rd b/man/add_specks.Rd
index 8383940..86252b9 100644
--- a/man/add_specks.Rd
+++ b/man/add_specks.Rd
@@ -20,10 +20,10 @@ Add SPECKS to discrimination object
}
\seealso{
Other Utility metrics:
-\code{\link{add_pmse_ratio}()},
\code{\link{add_pmse}()},
-\code{\link{add_propensities_tuned}()},
+\code{\link{add_pmse_ratio}()},
\code{\link{add_propensities}()},
+\code{\link{add_propensities_tuned}()},
\code{\link{discrimination}()},
\code{\link{util_ci_overlap}()},
\code{\link{util_ks_distance}()},
diff --git a/man/aggregate_qid_eval.Rd b/man/aggregate_qid_eval.Rd
new file mode 100644
index 0000000..989d31d
--- /dev/null
+++ b/man/aggregate_qid_eval.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/disc_qid_mi.R
+\name{aggregate_qid_eval}
+\alias{aggregate_qid_eval}
+\title{Aggregate and count factor-level quasi-identifiers for eval_data}
+\usage{
+aggregate_qid_eval(eval_data, keys)
+}
+\arguments{
+\item{eval_data}{An \code{eval_data} object.}
+
+\item{keys}{A character vector of quasi-identifier column names}
+}
+\value{
+A data.frame of results
+}
+\description{
+Aggregate and count factor-level quasi-identifiers for eval_data
+}
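+\examples{
+# sketch, assuming the packaged ACS example data and factor-valued keys;
+# the exact output shape depends on the eval_data contents
+ed <- eval_data(acs_conf, acs_lr_synths, holdout_data = acs_holdout)
+aggregate_qid_eval(ed, keys = c("county", "sex"))
+}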
diff --git a/man/co_occurrence.Rd b/man/co_occurrence.Rd
index fe7aad9..4f1de18 100644
--- a/man/co_occurrence.Rd
+++ b/man/co_occurrence.Rd
@@ -4,10 +4,12 @@
\alias{co_occurrence}
\title{Construct a co-occurrence matrix}
\usage{
-co_occurrence(data)
+co_occurrence(data, na.rm = FALSE)
}
\arguments{
\item{data}{A tibble with numeric variables}
+
+\item{na.rm}{a logical indicating whether missing values should be removed.}
}
\value{
A co-occurrence matrix
diff --git a/man/convert_na_to_level.Rd b/man/convert_na_to_level.Rd
new file mode 100644
index 0000000..baeeb02
--- /dev/null
+++ b/man/convert_na_to_level.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/util_na_helper.R
+\name{convert_na_to_level}
+\alias{convert_na_to_level}
+\title{Convert \code{NA} values to \code{"NA"} for categorical variables}
+\usage{
+convert_na_to_level(data)
+}
+\arguments{
+\item{data}{A data frame or tibble}
+}
+\value{
+A data frame or tibble with \code{NA} converted to \code{"NA"}
+}
+\description{
+Convert \code{NA} values to \code{"NA"} for categorical variables
+}
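+\examples{
+# sketch: missing values in factor and character columns become an explicit
+# "NA" level
+convert_na_to_level(data.frame(x = factor(c("a", NA, "b"))))
+}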
diff --git a/man/create_cormat_plot.Rd b/man/create_cormat_plot.Rd
new file mode 100644
index 0000000..6babdbd
--- /dev/null
+++ b/man/create_cormat_plot.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/util_plots.R
+\name{create_cormat_plot}
+\alias{create_cormat_plot}
+\title{Create a correlation heatmap for numeric random variables.}
+\usage{
+create_cormat_plot(data, cor_method = "pearson")
+}
+\arguments{
+\item{data}{A data.frame.}
+
+\item{cor_method}{A correlation method to pass to \verb{stats::cor(., method=)}}
+}
+\value{
+A \code{ggplot2} plot
+}
+\description{
+Create a correlation heatmap for numeric random variables.
+}
diff --git a/man/disc_baseline.Rd b/man/disc_baseline.Rd
new file mode 100644
index 0000000..31cf482
--- /dev/null
+++ b/man/disc_baseline.Rd
@@ -0,0 +1,43 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/disc_baseline.R
+\name{disc_baseline}
+\alias{disc_baseline}
+\title{Compute baseline disclosure risk metrics using confidential data.}
+\usage{
+disc_baseline(
+ eval_data,
+ qid_keys = NULL,
+ sens_keys = NULL,
+ tclose_metric_cat = c("linf", "l1", "l2"),
+ tclose_metric_numeric = c("ks", "wass", "cvm", "ad"),
+ na.rm = FALSE
+)
+}
+\arguments{
+\item{eval_data}{An \code{eval_data} object or a tibble / data.frame corresponding
+to the confidential data.}
+
+\item{qid_keys}{A character vector of quasi-identifying keys. Must be provided
+as \code{factor} type variables. Defaults to all factor variables in \code{eval_data}.}
+
+\item{sens_keys}{An optional character vector of sensitive variables of interest.
+Must be disjoint from \code{qid_keys}, or \code{FALSE} to provide no l-diversity or t-closeness
+metrics. Defaults to the complement of \code{qid_keys}.}
+
+\item{tclose_metric_cat}{A string describing the t-closeness distance metric
+between proportions of categorical variables. One of \code{"l1"} (L1 distance),
+\code{"l2"} (L2 distance), or \code{"linf"} (L-infinity distance), defaults to \code{"linf"}.}
+
+\item{tclose_metric_numeric}{A string describing the t-closeness distance metric
+between numeric variable empirical CDFs. One of \code{"ks"} (Kolmogorov-Smirnov),
+\code{"wass"} (Wasserstein L1 distance), \code{"cvm"} (Cramer-von-Mises distance), or
+\code{"ad"} (Anderson-Darling), defaults, to \code{"ks"}.}
+
+\item{na.rm}{A Boolean. If TRUE, removes \code{NA} values from numeric \code{sens_keys}.}
+}
+\value{
+A \code{baseline_metrics} object.
+}
+\description{
+Compute baseline disclosure risk metrics using confidential data.
+}
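+\examples{
+# sketch: baseline metrics for the packaged confidential ACS extract, with
+# all factor columns treated as quasi-identifiers (the default)
+disc_baseline(acs_conf)
+}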
diff --git a/man/disc_mit.Rd b/man/disc_mit.Rd
index e3554dc..24bd7b1 100644
--- a/man/disc_mit.Rd
+++ b/man/disc_mit.Rd
@@ -2,28 +2,25 @@
% Please edit documentation in R/disc_mit.R
\name{disc_mit}
\alias{disc_mit}
-\title{Run a membership inference test}
+\title{Run a nearest-neighbor membership inference test}
\usage{
-disc_mit(postsynth, data, holdout_data, threshold_percentile = NULL)
+disc_mit(eval_data, threshold_percentile = NULL, summary = TRUE)
}
\arguments{
-\item{postsynth}{A postsynth object or tibble with synthetic data generated from the data input}
-
-\item{data}{A data frame with a subset of the original data}
-
-\item{holdout_data}{A dataframe with observations similar to the original but
-not used to train the synthesizer. The data should have the same variables as
-postsynth.}
+\item{eval_data}{An \code{eval_data} object.}
\item{threshold_percentile}{Distances below the value associated with this
percentile will be predicted as in the training data. If the
threshold_percentile is not provided, the function calculates it with the
-following formula: \code{nrow(data)/(nrow(data) + nrow(holdout_data))}}
+following formula: \code{nrow(conf_data) / (nrow(conf_data) + nrow(holdout_data))}}
+
+\item{summary}{A Boolean. If TRUE, returns summary statistics; if FALSE, returns
+two disaggregated dataframes of individual distances and ROC curve points.}
}
\value{
A list with precision, recall, the confusion matrix, and ROC AUC
}
\description{
-Run a membership inference test
+Run a nearest-neighbor membership inference test
}
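+\examples{
+# sketch, assuming the packaged ACS example data; the test needs a holdout
+# sample in the eval_data object
+ed <- eval_data(acs_conf, acs_lr_synths[[1]], holdout_data = acs_holdout)
+disc_mit(ed)
+}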
\concept{Disclosure risk metrics}
diff --git a/man/discrimination.Rd b/man/discrimination.Rd
index b2dc2cb..d47a3d9 100644
--- a/man/discrimination.Rd
+++ b/man/discrimination.Rd
@@ -19,10 +19,10 @@ Combine synthetic data and data for a discriminant based metric
}
\seealso{
Other Utility metrics:
-\code{\link{add_pmse_ratio}()},
\code{\link{add_pmse}()},
-\code{\link{add_propensities_tuned}()},
+\code{\link{add_pmse_ratio}()},
\code{\link{add_propensities}()},
+\code{\link{add_propensities_tuned}()},
\code{\link{add_specks}()},
\code{\link{util_ci_overlap}()},
\code{\link{util_ks_distance}()},
diff --git a/man/dot-aggregate_qid.Rd b/man/dot-aggregate_qid.Rd
new file mode 100644
index 0000000..0d24941
--- /dev/null
+++ b/man/dot-aggregate_qid.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/disc_dd_helper.R
+\name{.aggregate_qid}
+\alias{.aggregate_qid}
+\title{Aggregate and count factor-level quasi-identifiers.}
+\usage{
+.aggregate_qid(df, keys)
+}
+\arguments{
+\item{df}{A data.frame.}
+
+\item{keys}{A character vector of column names.}
+}
+\value{
+A data.frame of aggregated quasi-identifiers.
+}
+\description{
+Aggregate and count factor-level quasi-identifiers.
+}
diff --git a/man/dot-validate_eval_keys.Rd b/man/dot-validate_eval_keys.Rd
new file mode 100644
index 0000000..fdd333b
--- /dev/null
+++ b/man/dot-validate_eval_keys.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/disc_dd_helper.R
+\name{.validate_eval_keys}
+\alias{.validate_eval_keys}
+\title{Input validation for categorical disclosure risk metrics.}
+\usage{
+.validate_eval_keys(eval_data, keys = NULL)
+}
+\arguments{
+\item{eval_data}{An \code{eval_data} object.}
+
+\item{keys}{A character vector of column names, or NULL to use all column names.}
+}
+\value{
+\code{0} if all validation checks pass.
+}
+\description{
+Input validation for categorical disclosure risk metrics.
+}
diff --git a/man/eval_data.Rd b/man/eval_data.Rd
new file mode 100644
index 0000000..0572d7d
--- /dev/null
+++ b/man/eval_data.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/eval_data.R
+\name{eval_data}
+\alias{eval_data}
+\title{Create evaluation data container}
+\usage{
+eval_data(conf_data, synth_data, holdout_data = NULL)
+}
+\arguments{
+\item{conf_data}{A confidential dataframe}
+
+\item{synth_data}{A single (or list of) dataframe(s) or \code{postsynth} object(s).}
+
+\item{holdout_data}{An optional holdout dataframe containing the same columns
+as the confidential dataframe}
+}
+\value{
+An \code{eval_data} object.
+}
+\description{
+Create evaluation data container
+}
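+\examples{
+# sketch using the packaged ACS example data
+ed <- eval_data(
+  conf_data = acs_conf,
+  synth_data = acs_lr_synths,
+  holdout_data = acs_holdout
+)
+is_eval_data(ed)
+}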
diff --git a/man/is_eval_data.Rd b/man/is_eval_data.Rd
new file mode 100644
index 0000000..844a2f1
--- /dev/null
+++ b/man/is_eval_data.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/eval_data.R
+\name{is_eval_data}
+\alias{is_eval_data}
+\title{Check if object is \code{eval_data}}
+\usage{
+is_eval_data(x)
+}
+\arguments{
+\item{x}{object}
+}
+\value{
+A boolean
+}
+\description{
+Check if object is \code{eval_data}
+}
diff --git a/man/nn_membership_inference.Rd b/man/nn_membership_inference.Rd
new file mode 100644
index 0000000..fe15839
--- /dev/null
+++ b/man/nn_membership_inference.Rd
@@ -0,0 +1,38 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/disc_mit.R
+\name{nn_membership_inference}
+\alias{nn_membership_inference}
+\title{Perform a nearest-neighbor membership inference test on one synthetic dataset}
+\usage{
+nn_membership_inference(
+ synth_data,
+ conf_data,
+ holdout_data,
+ threshold_percentile = NULL,
+ summary = TRUE
+)
+}
+\arguments{
+\item{synth_data}{A dataframe with synthetic data generated from \code{conf_data}}
+
+\item{conf_data}{A data frame with a subset of the original data}
+
+\item{holdout_data}{A dataframe with observations similar to the original but
+not used to train the synthesizer. The data should have the same variables as
+\code{synth_data}.}
+
+\item{threshold_percentile}{Distances below the value associated with this
+percentile will be predicted as in the training data. If the
+threshold_percentile is not provided, the function calculates it with the
+following formula: \code{nrow(conf_data) / (nrow(conf_data) + nrow(holdout_data))}}
+
+\item{summary}{A Boolean. If TRUE, returns summary statistics; if FALSE, returns
+two disaggregated dataframes of individual distances and ROC curve points.}
+}
+\value{
+Either a list with precision, recall, the confusion matrix, and ROC AUC
+or a list with two disaggregated dataframes (if summary = FALSE).
+}
+\description{
+Perform a nearest-neighbor membership inference test on one synthetic dataset
+}
diff --git a/man/plot_categorical_bar.Rd b/man/plot_categorical_bar.Rd
new file mode 100644
index 0000000..a6cf387
--- /dev/null
+++ b/man/plot_categorical_bar.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/util_plots.R
+\name{plot_categorical_bar}
+\alias{plot_categorical_bar}
+\title{Create bar charts for a categorical random variable.}
+\usage{
+plot_categorical_bar(joint_data, var_name, cat1_name = NULL, cat2_name = NULL)
+}
+\arguments{
+\item{joint_data}{A data.frame combining rows from confidential and synthetic
+data, with the column 'source' identifying the two.}
+
+\item{var_name}{Categorical variable name to plot.}
+
+\item{cat1_name}{Optional categorical variable to group by for subplots.}
+
+\item{cat2_name}{Optional categorical variable to group by for subplots.}
+}
+\value{
+A \code{ggplot2} plot
+}
+\description{
+Create bar charts for a categorical random variable.
+}
diff --git a/man/plot_cormat.Rd b/man/plot_cormat.Rd
new file mode 100644
index 0000000..a1d35bc
--- /dev/null
+++ b/man/plot_cormat.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/util_plots.R
+\name{plot_cormat}
+\alias{plot_cormat}
+\title{Create side-by-side correlation heatmaps for numeric random variables.}
+\usage{
+plot_cormat(conf_data, synth_data, cor_method = "pearson")
+}
+\arguments{
+\item{conf_data}{Confidential data}
+
+\item{synth_data}{Synthetic data}
+
+\item{cor_method}{A correlation method to pass to \verb{stats::cor(., method=)}}
+}
+\value{
+A \code{ggplot2} plot
+}
+\description{
+Create side-by-side correlation heatmaps for numeric random variables.
+}
diff --git a/man/plot_numeric_hist_kde.Rd b/man/plot_numeric_hist_kde.Rd
new file mode 100644
index 0000000..19e7bd5
--- /dev/null
+++ b/man/plot_numeric_hist_kde.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/util_plots.R
+\name{plot_numeric_hist_kde}
+\alias{plot_numeric_hist_kde}
+\title{Create a histogram + KDE estimate for a numeric variable.}
+\usage{
+plot_numeric_hist_kde(joint_data, var_name, cat1_name = NULL, cat2_name = NULL)
+}
+\arguments{
+\item{joint_data}{A data.frame combining rows from confidential and synthetic
+data, with the column 'source' identifying the two.}
+
+\item{var_name}{Numeric variable name to plot.}
+
+\item{cat1_name}{Optional categorical variable to group by for subplots.}
+
+\item{cat2_name}{Optional categorical variable to group by for subplots.}
+}
+\value{
+A \code{ggplot2} plot
+}
+\description{
+Create a histogram + KDE estimate for a numeric variable.
+}
diff --git a/man/plot_prob_qid_abs_err.Rd b/man/plot_prob_qid_abs_err.Rd
new file mode 100644
index 0000000..103607a
--- /dev/null
+++ b/man/plot_prob_qid_abs_err.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/disc_qid_mi.R
+\name{plot_prob_qid_abs_err}
+\alias{plot_prob_qid_abs_err}
+\title{Plot absolute error probabilities}
+\usage{
+plot_prob_qid_abs_err(
+ agg_eval_data,
+ keys,
+ max_k = 20,
+ probs = c(0.5, 0.75, 0.9),
+ holdout = FALSE
+)
+}
+\arguments{
+\item{agg_eval_data}{Output from \code{aggregate_qid_eval}}
+
+\item{keys}{A character vector of column names}
+
+\item{max_k}{largest partition selection size}
+
+\item{probs}{Quantiles at which to estimate confidence of QID count}
+
+\item{holdout}{boolean, use data from holdout instead of confidential}
+}
+\value{
+A \code{ggplot2} plot.
+}
+\description{
+Plot absolute error probabilities
+}
diff --git a/man/plot_prob_qid_partition.Rd b/man/plot_prob_qid_partition.Rd
new file mode 100644
index 0000000..f4887d0
--- /dev/null
+++ b/man/plot_prob_qid_partition.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/disc_qid_mi.R
+\name{plot_prob_qid_partition}
+\alias{plot_prob_qid_partition}
+\title{Plot partition selection probabilities}
+\usage{
+plot_prob_qid_partition(agg_eval_data, keys, max_k = 20)
+}
+\arguments{
+\item{agg_eval_data}{Output from \code{aggregate_qid_eval}}
+
+\item{keys}{A character vector of column names}
+
+\item{max_k}{largest partition selection size}
+}
+\value{
+A \code{ggplot2} plot.
+}
+\description{
+Plot partition selection probabilities
+}
diff --git a/man/prep_combined_data_for_na.rm.Rd b/man/prep_combined_data_for_na.rm.Rd
new file mode 100644
index 0000000..ba2d238
--- /dev/null
+++ b/man/prep_combined_data_for_na.rm.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/util_na_helper.R
+\name{prep_combined_data_for_na.rm}
+\alias{prep_combined_data_for_na.rm}
+\title{Check whether na.rm is compatible with univariate utility metrics}
+\usage{
+prep_combined_data_for_na.rm(
+ combined_data,
+ na.rm = FALSE,
+ drop_zeros = FALSE,
+ drop_zeros_exclude = NULL
+)
+}
+\arguments{
+\item{combined_data}{A data frame or tibble}
+
+\item{na.rm}{A boolean for whether to ignore missing values}
+
+\item{drop_zeros}{A boolean for whether to ignore zero values in utility metrics}
+
+\item{drop_zeros_exclude}{An optional set of unquoted columns on which to drop zeros}
+}
+\value{
+A data frame or tibble with missing and/or zero values set to NA
+}
+\description{
+Check whether na.rm is compatible with univariate utility metrics
+}
diff --git a/man/prep_discrete_eval_data.Rd b/man/prep_discrete_eval_data.Rd
new file mode 100644
index 0000000..7667a02
--- /dev/null
+++ b/man/prep_discrete_eval_data.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/disc_dd_helper.R
+\name{prep_discrete_eval_data}
+\alias{prep_discrete_eval_data}
+\title{Discretize numeric columns for disclosure risk metrics}
+\usage{
+prep_discrete_eval_data(eval_data, col_map)
+}
+\arguments{
+\item{eval_data}{An \code{eval_data} object.}
+
+\item{col_map}{A list mapping each column to its discretization parameters,
+either "k" (the total number of categories) or "width" (the width of each
+bin).}
+}
+\value{
+An \code{eval_data} object.
+}
+\description{
+Discretize numeric columns for disclosure risk metrics
+}
+\examples{
+
+prep_discrete_eval_data(
+  eval_data(acs_conf, acs_lr_synths),
+  col_map = list(
+    "age" = list("k" = 10),
+    "inctot" = list("width" = 10000)
+  )
+)
+
+}
diff --git a/man/syntheval-package.Rd b/man/syntheval-package.Rd
index bd9e3b9..1507d3e 100644
--- a/man/syntheval-package.Rd
+++ b/man/syntheval-package.Rd
@@ -18,6 +18,11 @@ Useful links:
\author{
\strong{Maintainer}: Aaron R. Williams \email{awilliams@urban.org} (\href{https://orcid.org/0000-0001-5564-1938}{ORCID})
+Authors:
+\itemize{
+ \item Jeremy Seeman \email{jseeman@urban.org} (\href{https://orcid.org/0000-0003-3526-3209}{ORCID})
+}
+
Other contributors:
\itemize{
\item Gabe Morrison \email{gmorrison@urban.org} [contributor]
diff --git a/man/util_ci_overlap.Rd b/man/util_ci_overlap.Rd
index d538073..81cbb61 100644
--- a/man/util_ci_overlap.Rd
+++ b/man/util_ci_overlap.Rd
@@ -14,18 +14,48 @@ util_ci_overlap(postsynth, data, formula)
\item{formula}{A formula for a linear regression model}
}
\value{
-A list with the regression confidence interval overlap and estimated
-coefficients
+A list of two data frames:
+\itemize{
+\item \code{ci_overlap}: one row per model parameter with utility metrics.
+\itemize{
+\item \code{overlap}: symmetric overlap metric, calculated as the average of the
+interval overlap contained in the synthetic confidence interval and the
+interval overlap contained in the confidential confidence interval.
+\item \code{coef_diff}: synthetic parameter estimate minus confidential parameter estimate.
+\item \code{std_coef_diff}: \code{coef_diff} divided by the standard error for the confidential data.
+\item \code{sign_match}: boolean indicating whether the synthetic and confidential parameter estimates have the same sign.
+\item \code{significance_match}: boolean indicating whether statistical significance
+(p-value less than .05 for the null hypothesis test that the parameter is 0)
+agrees between the confidential and synthetic data.
+\item \code{ss}: boolean indicating whether both \code{sign_match} and \code{significance_match} are true.
+\item \code{sso}: boolean indicating whether \code{sign_match} is true and \code{overlap} is positive.
+}
+\item \code{coef_diff}: one row per model parameter and data source (confidential or
+synthetic) listing parameter estimates, standard errors, test statistics,
+p-values for null hypothesis tests, and 95\% confidence interval bounds.
+}
}
\description{
Regression confidence interval overlap
+}
+\examples{
+conf_data <- mtcars
+synth_data <- mtcars \%>\%
+ dplyr::slice_sample(n = nrow(mtcars) / 2)
+
+util_ci_overlap(
+  postsynth = synth_data,
+  data = conf_data,
+  formula = mpg ~ disp + vs + am
+)
+
}
\seealso{
Other Utility metrics:
-\code{\link{add_pmse_ratio}()},
\code{\link{add_pmse}()},
-\code{\link{add_propensities_tuned}()},
+\code{\link{add_pmse_ratio}()},
\code{\link{add_propensities}()},
+\code{\link{add_propensities_tuned}()},
\code{\link{add_specks}()},
\code{\link{discrimination}()},
\code{\link{util_ks_distance}()},
diff --git a/man/util_co_occurrence.Rd b/man/util_co_occurrence.Rd
index 5662b50..37fbcc5 100644
--- a/man/util_co_occurrence.Rd
+++ b/man/util_co_occurrence.Rd
@@ -4,12 +4,16 @@
\alias{util_co_occurrence}
\title{Calculate the co-occurrence fit metric of a confidential data set.}
\usage{
-util_co_occurrence(postsynth, data)
+util_co_occurrence(postsynth, data, na.rm = FALSE)
}
\arguments{
-\item{postsynth}{A postsynth object from tidysynthesis or a tibble}
+\item{postsynth}{a postsynth object from tidysynthesis or a tibble}
\item{data}{an original (observed) data set.}
+
+\item{na.rm}{a logical indicating whether missing values should be removed.
+Note: values are jointly removed for each pair of variables even if only one
+value is missing.}
}
\value{
A \code{list} of fit metrics:
diff --git a/man/util_corr_fit.Rd b/man/util_corr_fit.Rd
index a4a6982..8044124 100644
--- a/man/util_corr_fit.Rd
+++ b/man/util_corr_fit.Rd
@@ -4,12 +4,17 @@
\alias{util_corr_fit}
\title{Calculate the correlation fit metric of a confidential data set.}
\usage{
-util_corr_fit(postsynth, data)
+util_corr_fit(postsynth, data, use = "everything")
}
\arguments{
\item{postsynth}{A postsynth object from tidysynthesis or a tibble}
\item{data}{an original (observed) data set.}
+
+\item{use}{optional character string giving a method for computing
+covariances in the presence of missing values. This must be (an abbreviation
+of) one of the strings "everything", "all.obs", "complete.obs",
+"na.or.complete", or "pairwise.complete.obs".}
}
\value{
A \code{list} of fit metrics:
diff --git a/man/util_ks_distance.Rd b/man/util_ks_distance.Rd
index dac56bb..85dbf1a 100644
--- a/man/util_ks_distance.Rd
+++ b/man/util_ks_distance.Rd
@@ -5,12 +5,14 @@
\title{Calculate the Kolmogorov-Smirnov distance (D) for each numeric variable in
the synthetic and confidential data}
\usage{
-util_ks_distance(postsynth, data)
+util_ks_distance(postsynth, data, na.rm = FALSE)
}
\arguments{
-\item{postsynth}{A postsynth object or tibble with synthetic data}
+\item{postsynth}{a postsynth object or tibble with synthetic data}
-\item{data}{A data frame with the original data}
+\item{data}{a data frame with the original data}
+
+\item{na.rm}{a logical indicating whether missing values should be removed.}
}
\value{
A tibble with the D and location of the largest distance for each
@@ -22,10 +24,10 @@ the synthetic and confidential data
}
\seealso{
Other Utility metrics:
-\code{\link{add_pmse_ratio}()},
\code{\link{add_pmse}()},
-\code{\link{add_propensities_tuned}()},
+\code{\link{add_pmse_ratio}()},
\code{\link{add_propensities}()},
+\code{\link{add_propensities_tuned}()},
\code{\link{add_specks}()},
\code{\link{discrimination}()},
\code{\link{util_ci_overlap}()},
diff --git a/man/util_moments.Rd b/man/util_moments.Rd
index b7b4df0..96c4e1e 100644
--- a/man/util_moments.Rd
+++ b/man/util_moments.Rd
@@ -11,7 +11,8 @@ util_moments(
group_by = NULL,
drop_zeros = FALSE,
common_vars = TRUE,
- synth_vars = TRUE
+ synth_vars = TRUE,
+ na.rm = FALSE
)
}
\arguments{
@@ -28,6 +29,8 @@ util_moments(
\item{common_vars}{A logical for if only common variables should be kept}
\item{synth_vars}{A logical for if only synthesized variables should be kept}
+
+\item{na.rm}{A logical for ignoring \code{NA} values in computations.}
}
\value{
A \code{tibble} of summary statistics.
diff --git a/man/util_percentiles.Rd b/man/util_percentiles.Rd
index 55c6198..1830f3b 100644
--- a/man/util_percentiles.Rd
+++ b/man/util_percentiles.Rd
@@ -12,7 +12,8 @@ util_percentiles(
group_by = NULL,
drop_zeros = FALSE,
common_vars = TRUE,
- synth_vars = TRUE
+ synth_vars = TRUE,
+ na.rm = FALSE
)
}
\arguments{
@@ -35,6 +36,8 @@ This option will frequently result in an error because quantile() is strict
about missing values.}
\item{synth_vars}{A logical for if only synthesized variables should be kept}
+
+\item{na.rm}{A logical for ignoring \code{NA} values in computations.}
}
\value{
A \code{tibble} of summary statistics.
diff --git a/man/util_proportions.Rd b/man/util_proportions.Rd
index 89df4fd..54bfd86 100644
--- a/man/util_proportions.Rd
+++ b/man/util_proportions.Rd
@@ -10,7 +10,9 @@ util_proportions(
weight_var = NULL,
group_by = NULL,
common_vars = TRUE,
- synth_vars = TRUE
+ synth_vars = TRUE,
+ keep_empty_levels = FALSE,
+ na.rm = FALSE
)
}
\arguments{
@@ -25,6 +27,11 @@ util_proportions(
\item{common_vars}{A logical for if only common variables should be kept}
\item{synth_vars}{A logical for if only synthesized variables should be kept}
+
+\item{keep_empty_levels}{A logical for keeping all class levels in the group_by
+statements, including missing levels.}
+
+\item{na.rm}{A logical for ignoring \code{NA} values in proportion calculations.}
}
\value{
A tibble with variables, classes, and relative frequencies
@@ -34,10 +41,10 @@ Calculate relative frequency tables for categorical variables
}
\seealso{
Other Utility metrics:
-\code{\link{add_pmse_ratio}()},
\code{\link{add_pmse}()},
-\code{\link{add_propensities_tuned}()},
+\code{\link{add_pmse_ratio}()},
\code{\link{add_propensities}()},
+\code{\link{add_propensities_tuned}()},
\code{\link{add_specks}()},
\code{\link{discrimination}()},
\code{\link{util_ci_overlap}()},
diff --git a/man/util_totals.Rd b/man/util_totals.Rd
index bb5212f..2382eb4 100644
--- a/man/util_totals.Rd
+++ b/man/util_totals.Rd
@@ -10,7 +10,8 @@ util_totals(
weight_var = 1,
group_by = NULL,
common_vars = TRUE,
- synth_vars = TRUE
+ synth_vars = TRUE,
+ na.rm = FALSE
)
}
\arguments{
@@ -25,6 +26,8 @@ util_totals(
\item{common_vars}{A logical for if only common variables should be kept}
\item{synth_vars}{A logical for if only synthesized variables should be kept}
+
+\item{na.rm}{A logical for ignoring \code{NA} values in computations.}
}
\value{
A \code{tibble} of totals.
diff --git a/project-standards.md b/project-standards.md
new file mode 100644
index 0000000..79d7cba
--- /dev/null
+++ b/project-standards.md
@@ -0,0 +1,222 @@
+# Project Standards
+
+## Overview
+
+This document explains the project and programming conventions used for `library(syntheval)` and the evaluation of synthetic data. The document is a work-in-progress and should be updated as conventions are created or changed.
+
+## Contents
+
+* [Principles](#principles)
+ - [tidyverse](#tidyverse)
+ - [tidymodels](#tidymodels)
+ - [syntheval](#syntheval)
+* [Project Organization](#project-organization)
+ - [Directories](#directories)
+ - [Documentation](#documentation)
+ - [Development Workflow](#development-workflow)
+ - [Releases](#releases)
+* [Styles](#styles)
+ - [Code Style](#code-style)
+ - [Naming Conventions](#naming-conventions)
+ - [roxygen2](#roxygen2)
+ - [Assertions and In-line Errors](#assertions-and-in-line-errors)
+ - [Tests](#tests)
+
+## Principles
+
+This project is heavily inspired by `library(tidyverse)` and `library(tidymodels)`.
+
+### tidyverse
+
+This project aims to follow the four guiding principles outlined in the [tidy tools manifesto](https://tidyverse.tidyverse.org/articles/manifesto.html):
+
+1. Reuse existing data structures
+2. Compose simple functions with the pipe
+3. Embrace functional programming
+4. Design for humans
+
+A fifth, unlisted tidy tools principle is to build smaller packages that handle discrete tasks instead of large packages that do everything. Our eventual goal is to reflect this design.
+
+### tidymodels
+
+`library(tidymodels)` weds the unified modeling interface of `library(caret)` with tidy principles. [Conventions for R Modeling Packages](https://tidymodels.github.io/model-implementation-principles/index.html) is a draft outline of principles for `library(tidymodels)`. Here are a few important principles:
+
+* All results should be reproducible from run to run.
+* Retain the minimally sufficient objects in the model object.
+* Every class should have a `print` method that gives a concise description of the object (see the sketch below).
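+
+For example, a minimal `print` method might look like the following sketch (hypothetical, and not necessarily the method implemented in this package):
+
+```
+print.eval_data <- function(x, ...) {
+
+  # give a concise, human-readable description of the object
+  cat("eval_data object with", x$n_rep, "synthetic replicate(s)\n")
+
+  # return the object invisibly, per S3 print conventions
+  invisible(x)
+
+}
+```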
+
+### syntheval
+
+* All utility evaluation functions should live in `R/util_*.R` scripts, with `*` naming the function or group of functions defined.
+* All disclosure risk evaluation functions should live in `R/disc_*.R` scripts, with `*` naming the function or group of functions defined.
+* All evaluation metrics should accept an `eval_data` object as the first input (see the example below).
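+
+For example, a sketch of this calling convention using the package's bundled ACS example data:
+
+```
+# bundle confidential, synthetic, and holdout data into one object...
+ed <- eval_data(
+  conf_data = acs_conf,
+  synth_data = acs_lr_synths,
+  holdout_data = acs_holdout
+)
+
+# ...and pass it as the first input to an evaluation metric
+disc_mit(ed)
+```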
+
+## Project Organization
+
+### Directories
+
+Directories add structure to a project and make it possible to turn `syntheval` into `library(syntheval)`.
+
+* `R/` contains R functions as `.R` scripts
+* `man/` contains `.Rd` documentation files. No manual editing should happen in this directory.
+* `tests/` contains unit tests for functions
+
+### Documentation
+
+There are several important places where documentation is captured:
+
+* The README contains information specific to the code base
+* `roxygen2` skeletons contain information specific to functions
+* Some `.R` scripts contain in-line comments clarifying code
+
+Out-of-date and incorrect documentation can be more damaging than no documentation at all. It is important that documentation is updated when changes are made. Check all of the above places after making changes to code.
+
+### Development Workflow
+
+1. Open a [GitHub issue](https://github.com/UrbanInstitute/syntheval/issues).
+2. Check out a new branch named `iss###` that corresponds to the related issue.
+3. Update the code.
+4. Build necessary tests for new code and update existing tests for code changes.
+5. Run `devtools::document()` to update package documentation and the package NAMESPACE.
+6. Build and install the package (with Ctrl-Shift-B if using RStudio).
+7. Run R CMD check (with Ctrl-Shift-E if using RStudio) and resolve any issues.
+8. Push the code and open a Pull Request to the `version#.#.#` branch. Request at least one reviewer for any changes to code or documentation.
+9. Delete the remote branch (and possibly the local branch) when all changes are merged into the master branch.
+10. From time to time, new releases will be moved from `version#.#.#` to `main`. The `main` branch should be stable at all times and updated according to a release schedule.
+
+**Note:** do not use `devtools::load_all(.)`.
+
+**Note:** use `git merge master`, not `git rebase master`, if your Pull Request falls behind the master branch of the repository. This preserves the commit history.
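+
+Outside of RStudio, steps 5 through 7 can be run from the R console:
+
+```
+devtools::document()  # step 5: update documentation and the NAMESPACE
+devtools::install()   # step 6: build and install the package
+devtools::check()     # step 7: run R CMD check
+```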
+
+### Releases
+
+* Major changes should be tracked in `NEWS.md`. `library(parsnip)` is a good [example](https://github.com/tidymodels/parsnip/blob/master/NEWS.md).
+* Changes on the `version#.#.#` branch should be tracked at the top under the header `syntheval (development version)`.
+* We are using [semantic versioning](https://semver.org/).
+
+## Styles
+
+### Code Style
+
+The project follows [the tidyverse style guide](https://style.tidyverse.org/index.html).
+
+One major exception is that all functions should end with an explicit `return()`.
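+
+For example:
+
+```
+# every function ends with an explicit return()
+square <- function(x) {
+
+  result <- x ^ 2
+
+  return(result)
+
+}
+```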
+
+Functions from other packages should be referenced directly with `::` in all production code, including R Markdown reports.
+
+Argument names should be explicitly included in all function calls from `library(syntheval)`. Arguments other than `data` or `x` should be explicitly included for most other function calls.
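+
+For example, a call in a report might look like the following sketch (`postsynth` and `conf_data` are placeholder objects):
+
+```
+# namespace referenced with :: and arguments named explicitly
+moments <- syntheval::util_moments(
+  postsynth = postsynth,
+  data = conf_data,
+  na.rm = TRUE
+)
+```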
+
+The [tidyverse style guide](https://style.tidyverse.org/index.html) is light on details about vertical spacing. Vertical spacing should be used liberally. For example:
+
+```
+if (x > 3) {
+
+ "apple"
+
+} else {
+
+ "orange"
+
+}
+```
+
+This project takes a functional programming approach. Functions should be heavily used. Each function should get its own `.R` script in the `R/` directory.
+
+Functions should be [referentially transparent](https://b-rodrigues.github.io/fput/fprog.html#fprog_intro). Values and data should always be explicitly passed to the function through function arguments so that a function always returns the same output for a given set of arguments regardless of the environment.
+
+Hard coding of values should be avoided in functions. When possible, values should be parameterized.
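+
+For example, the following sketch passes the data and the cutoff through arguments instead of relying on global objects or hard-coded values:
+
+```
+# referentially transparent: output depends only on the arguments
+count_above <- function(data, var, cutoff = 0) {
+
+  n <- sum(data[[var]] > cutoff, na.rm = TRUE)
+
+  return(n)
+
+}
+```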
+
+The project uses `.Rproj` to manage directory paths. `setwd()` and absolute file paths should never be used.
+
+### Naming Conventions
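+
+Functions and arguments use snake_case, per the tidyverse style guide. Utility metric functions are prefixed with `util_` and disclosure risk metric functions are prefixed with `disc_` (see [syntheval](#syntheval) above).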
+
+### roxygen2
+
+Every function should include a [roxygen2](https://cran.r-project.org/web/packages/roxygen2/vignettes/roxygen2.html) header.
+
+* The first line of the documentation should be a concise description of the function without a full stop.
+* Every argument of the function should be documented with `@param`. Text should be in sentence case and end in a full stop. See the sketch below.
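+
+For example, a header following these conventions might look like this sketch:
+
+```
+#' Calculate the mean of a numeric column
+#'
+#' @param data A data frame or tibble.
+#' @param var A character column name.
+#' @param na.rm A logical for ignoring `NA` values in computations.
+#'
+#' @return A numeric vector of length one.
+col_mean <- function(data, var, na.rm = FALSE) {
+
+  result <- mean(data[[var]], na.rm = na.rm)
+
+  return(result)
+
+}
+```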
+
+### Assertions and In-line Errors
+
+Assertions, things expected to always be true about the code, should be tested in-line. [healthinequality-code](https://github.com/michaelstepner/healthinequality-code/blob/master/code/readme.md#assert-what-youre-expecting-to-be-true) offers some good background.
+
+Functions should contain logical tests that catch glaring errors when they are called. Consider the following example from `visit_sequence()`:
+
+```
+ valid_types <- c("default", "correlation", "proportion", "weighted total",
+ "weighted absolute total")
+
+ if (!type %in% valid_types) {
+ stop(
+ "Error: 'type' argument must be one of: ",
+ paste0(valid_types, collapse = ", ")
+ )
+ }
+```
+
+### Tests
+
+> Whenever you are tempted to type something into a print statement or a debugger expression, write it as a test instead. — Martin Fowler
+
+Every function should include a corresponding test file in `tests/testthat/`.
+
+Use `usethis::use_testthat()` to set up the testing infrastructure and `usethis::use_test()` to create a new test file for `library(syntheval)`. Test files have three layers:
+
+1. **expectations** describe the expected result of a computation
+2. **tests** are collections of expectations related to the same functionality
+3. **files** are groups of related tests
+
+Consider the following example from [R Packages](https://r-pkgs.org/tests.html):
+
+```
+context("String length")
+library(stringr)
+
+test_that("str_length is number of characters", {
+ expect_equal(str_length("a"), 1)
+ expect_equal(str_length("ab"), 2)
+ expect_equal(str_length("abc"), 3)
+})
+
+test_that("str_length of factor is length of level", {
+ expect_equal(str_length(factor("a")), 1)
+ expect_equal(str_length(factor("ab")), 2)
+ expect_equal(str_length(factor("abc")), 3)
+})
+
+test_that("str_length of missing is missing", {
+ expect_equal(str_length(NA), NA_integer_)
+ expect_equal(str_length(c(NA, 1)), c(NA, 1))
+ expect_equal(str_length("NA"), 2)
+})
+```
+
+Our workflow:
+
+1. Every function should have tests. Write tests *before* writing a new function.
+2. Develop code. Add tests as functionality changes.
+3. Always run the tests after building the package with `devtools::test()`.
+
+A few suggestions:
+
+* Always write a test when you discover a bug.
+* Test each behavior once and only once, if possible.
+* Test simple code. Spend even more time testing complex/fragile code.
+
+Tests will focus on whether correct values are returned by a function, whether the return values are of the right class, and whether error messages are thrown when necessary. The test workflow will also catch warnings and errors from all code called in the code base.
+
+Here are common `expect_*()` functions:
+
+* `expect_equal()`
+* `expect_identical()`
+* `expect_match()`
+* `expect_output()`
+* `expect_warning()`
+* `expect_error()`
+* `expect_s3_class()`
+* `expect_true()`
+* `expect_false()`
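+
+For instance, an expectation that an input check fires (adapted from this package's `eval_data()` tests):
+
+```
+test_that("eval_data rejects non-data-frame inputs", {
+
+  expect_error(
+    eval_data(conf_data = "not a dataframe", synth_data = mtcars)
+  )
+
+})
+```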
+
+**Note:** do not use `devtools::load_all(.)` in test files.
+
+Assertions should be used to catch user errors or unexpected results. Tests should be used to catch design errors and errors in the code base.
\ No newline at end of file
diff --git a/tests/testthat/test-disc_baseline.R b/tests/testthat/test-disc_baseline.R
new file mode 100644
index 0000000..894573b
--- /dev/null
+++ b/tests/testthat/test-disc_baseline.R
@@ -0,0 +1,102 @@
+
+test_that("disc_baseline error checking", {
+
+ expect_error(
+ disc_baseline("not a dataframe")
+ )
+
+ # fail on data.frames with no factor columns for QIDs
+ expect_error(
+ disc_baseline(acs_conf[, c("inctot", "age")])
+ )
+
+ # fail on data.frames with non-factor provided QIDs
+ expect_error(
+ disc_baseline(acs_conf, qid_keys = c("inctot", "age"))
+ )
+
+ # fail on overlapping user-specified keys
+ expect_error(
+ disc_baseline(acs_conf, qid_keys = c("hcovany"), sens_keys = c("hcovany"))
+ )
+
+})
+
+test_that("disc_baseline without sensitivity metrics", {
+
+ qid_only <- disc_baseline(
+ eval_data = acs_conf,
+ qid_keys = c("county", "gq"),
+ sens_keys = FALSE
+ )
+
+ expect_equal(nrow(qid_only$qid_metrics), 24)
+
+ expect_identical(
+ names(qid_only$qid_metrics),
+ c("key_id", "county", "gq", "metric", "value")
+ )
+
+ expect_null(qid_only$sens_keys)
+ expect_null(qid_only$sens_metrics)
+
+})
+
+test_that("disc_baseline basic functionality", {
+
+ metrics <- disc_baseline(
+ eval_data = acs_conf[complete.cases(acs_conf), ],
+ qid_keys = c("county", "gq"),
+ sens_keys = c("hcovany", "inctot")
+ )
+
+ expect_equal(
+ names(metrics$sens_metrics),
+ c("key_id", "county", "gq", "raw_n", "prop", "metric", "sens_var", "value")
+ )
+
+ expect_true(
+    all(metrics$sens_metrics$metric %in% c("l_div", "t_close"))
+ )
+
+ # check all distinct l-diversities are integer-valued
+ l_divs <- metrics$sens_metrics %>%
+ dplyr::filter(.data[["metric"]] == "l_div") %>%
+ dplyr::pull("value")
+ expect_true(all(l_divs == as.integer(l_divs)))
+
+ # check all KS t-closeness values are between 0 and 1
+ ks_values <- metrics$sens_metrics %>%
+ dplyr::filter(
+ .data[["sens_var"]] == "inctot" &
+ .data[["metric"]] == "t_close"
+ ) %>%
+ dplyr::pull("value")
+ expect_true(all(ks_values >= 0 & ks_values <= 1))
+
+})
+
+test_that("disc_baseline na.rm functionality", {
+
+ expect_warning(
+ metrics <- disc_baseline(
+ eval_data = acs_conf,
+ qid_keys = c("county", "gq"),
+ sens_keys = c("hcovany", "inctot")
+ )
+ )
+
+ expect_no_warning(
+ metrics_narm <- disc_baseline(
+ eval_data = acs_conf,
+ qid_keys = c("county", "gq"),
+ sens_keys = c("hcovany", "inctot"),
+ na.rm = TRUE
+ )
+ )
+
+ expect_false(
+ identical(metrics$sens_metrics, metrics_narm$sens_metrics)
+ )
+
+})
diff --git a/tests/testthat/test-disc_dd_helper.R b/tests/testthat/test-disc_dd_helper.R
new file mode 100644
index 0000000..4d27476
--- /dev/null
+++ b/tests/testthat/test-disc_dd_helper.R
@@ -0,0 +1,281 @@
+
+# .aggregate_qid tests ---------------------------------------
+
+test_that(".aggregate_qid basic functionality", {
+
+ res <- .aggregate_qid(
+ df = acs_conf,
+ keys = c("county", "gq")
+ )
+
+ # check expected column names
+ expect_identical(
+ names(res),
+ c("key_id", "county", "gq", "raw_n", "prop")
+ )
+
+ # check expected column types
+ expect_identical(
+ purrr::map_chr(.x = res, .f = ~ pillar::type_sum(.x)),
+ c("key_id" = "int",
+ "county" = "fct",
+ "gq" = "fct",
+ "raw_n" = "int",
+ "prop" = "dbl")
+ )
+
+ # check robustness to reordering
+ set.seed(202407)
+ res2 <- .aggregate_qid(
+ df = acs_conf %>%
+ dplyr::slice_sample(n = dim(acs_conf)[1], replace = FALSE),
+ keys = c("county", "gq")
+ )
+
+ expect_identical(res, res2)
+
+ # check inclusion of missing keys
+ res3 <- .aggregate_qid(
+ df = acs_conf %>%
+ dplyr::filter(county == "Douglas" & gq == "Household"),
+ keys = c("county", "gq")
+ )
+
+ expect_identical(
+ res %>%
+ dplyr::select(key_id, county, gq),
+ res3 %>%
+ dplyr::select(key_id, county, gq)
+ )
+
+})
+
+# .validate_eval_keys tests ---------------------------------------
+
+test_that(".validate_eval_keys validates valid eval_data objects", {
+
+ expect_equal(
+ .validate_eval_keys(
+ eval_data(
+ conf_data = acs_conf,
+ synth_data = acs_lr_synths
+ ),
+ keys = c("county", "gq")
+ ),
+ 0
+ )
+
+ expect_equal(
+ .validate_eval_keys(
+ eval_data(
+ conf_data = acs_conf,
+ synth_data = acs_lr_synths[[1]]
+ ),
+ keys = c("county", "gq")
+ ),
+ 0
+ )
+
+ expect_equal(
+ .validate_eval_keys(
+ eval_data(
+ conf_data = acs_conf,
+ synth_data = acs_lr_synths,
+ holdout_data = acs_conf
+ ),
+ keys = c("county", "gq")
+ ),
+ 0
+ )
+
+ expect_equal(
+ .validate_eval_keys(
+ eval_data(
+ conf_data = acs_conf %>%
+ dplyr::select(county, gq),
+ synth_data = acs_lr_synths[[1]] %>%
+ dplyr::select(county, gq)
+ )
+ ),
+ 0
+ )
+
+})
+
+
+test_that(".validate_eval_keys fails when expected", {
+
+ expect_error(
+ .validate_eval_keys(
+ eval_data(
+ conf_data = acs_conf,
+ synth_data = mtcars
+ ),
+ keys = c("county", "gq")
+ )
+ )
+
+ expect_error(
+ .validate_eval_keys(
+ eval_data(
+ conf_data = acs_conf,
+ synth_data = acs_lr_synths,
+ holdout_data = mtcars
+ ),
+ keys = c("county", "gq")
+ )
+ )
+
+ expect_error(
+ .validate_eval_keys(
+ eval_data(
+ conf_data = acs_conf,
+ synth_data = append(acs_lr_synths, list(mtcars))
+ ),
+ keys = c("county", "gq")
+ )
+ )
+
+ expect_error(
+ .validate_eval_keys(
+ eval_data(
+ conf_data = acs_conf,
+ synth_data = acs_lr_synths[[1]] %>%
+ dplyr::mutate(
+ county = forcats::fct_collapse(
+ gq,
+ hi = c("Household", "Institution"))
+ )
+ ),
+ keys = c("county", "gq")
+ )
+ )
+
+})
+
+
+# prep_discrete_eval_data tests ---------------------------------------
+
+
+orig_ed <- eval_data(
+ conf_data = acs_conf,
+ synth_data = acs_lr_synths,
+ holdout_data = acs_conf
+)
+
+disc_ed1 <- prep_discrete_eval_data(
+ orig_ed,
+ col_map = list(
+ "age" = list("k" = 10),
+ "inctot" = list("width" = 10000)
+ )
+)
+
+test_that("prep_discrete_eval_data type functionality", {
+
+ # check column types
+ expect_identical(
+ pillar::type_sum(disc_ed1$conf_data$age), "fct"
+ )
+ expect_identical(
+ pillar::type_sum(disc_ed1$synth_data[[1]]$age), "fct"
+ )
+ expect_identical(
+ pillar::type_sum(disc_ed1$holdout_data$age), "fct"
+ )
+
+ expect_identical(
+ pillar::type_sum(disc_ed1$conf_data$inctot), "fct"
+ )
+ expect_identical(
+ pillar::type_sum(disc_ed1$synth_data[[1]]$inctot), "fct"
+ )
+ expect_identical(
+ pillar::type_sum(disc_ed1$holdout_data$inctot), "fct"
+ )
+
+ # check identical factor level mappings
+ expect_identical(
+ levels(disc_ed1$conf_data$age),
+ levels(disc_ed1$synth_data[[1]]$age)
+ )
+ expect_identical(
+ levels(disc_ed1$conf_data$age),
+ levels(disc_ed1$holdout_data$age)
+ )
+
+ expect_identical(
+ levels(disc_ed1$conf_data$inctot),
+ levels(disc_ed1$synth_data[[1]]$inctot)
+ )
+ expect_identical(
+ levels(disc_ed1$conf_data$inctot),
+ levels(disc_ed1$holdout_data$inctot)
+ )
+
+ # check that first level is NA
+ expect_identical(
+ levels(disc_ed1$holdout_data$age)[1],
+ NA_character_
+ )
+
+ expect_identical(
+ levels(disc_ed1$holdout_data$inctot)[1],
+ NA_character_
+ )
+
+})
+
+
+test_that("prep_discrete_eval_data boundary construction", {
+
+ # check discretization logic
+ expect_equal(length(levels(disc_ed1$conf_data$age)), 11)
+
+ # check that first interval includes the smallest value
+ first_boundary <- stringr::str_extract(
+ levels(disc_ed1$conf_data$inctot), "(?<=\\[).*?(?=\\])"
+ ) %>%
+ stringr::str_split_fixed(",", n = 2)
+
+ expect_equal(
+ as.numeric(first_boundary[2, 2]) - as.numeric(first_boundary[2, 1]),
+ 10000
+ )
+
+ # extract boundaries for all except the first window (left-open intervals)
+ boundaries <- stringr::str_extract(
+ levels(disc_ed1$conf_data$inctot), "(?<=\\().*(?=\\])"
+ ) %>%
+ stringr::str_split_fixed(",", n = 2)
+
+ # all except the first two rows (NA) and last boundary...
+ boundaries <- boundaries[
+ 3:(length(levels(disc_ed1$conf_data$inctot)) - 1),
+ ] %>%
+ base::apply(2, as.numeric)
+
+ # ...should be equal width
+ expect_equal(
+ unique(round(boundaries[2, ] - boundaries[1, ], -4)),
+ 10000
+ )
+
+})
+
+test_that("prep_discrete_eval_data boundary application", {
+
+ # ensure each value gets mapped to a non-trivial factor level
+ expect_false(
+ NA %in% unique(disc_ed1$conf_data$age)
+ )
+ expect_false(
+    NA %in% unique(disc_ed1$synth_data[[1]]$age)
+ )
+ expect_false(
+ NA %in% unique(disc_ed1$holdout_data$age)
+ )
+
+})
+
+
diff --git a/tests/testthat/test-disc_mit.R b/tests/testthat/test-disc_mit.R
index c025f49..0f87d51 100644
--- a/tests/testthat/test-disc_mit.R
+++ b/tests/testthat/test-disc_mit.R
@@ -27,9 +27,11 @@ class(postsynth) <- "postsynth"
test_that("Perfect training match and perfect holdout lack of match", {
test1 <- disc_mit(
- postsynth = postsynth,
- data = data1,
- holdout_data = data2
+ eval_data(
+ conf_data = data1,
+ synth_data = postsynth,
+ holdout_data = data2
+ )
)
@@ -53,9 +55,11 @@ test_that("Perfect training match and perfect holdout lack of match", {
test_that("Perfect training lack of match and perfect holdout match", {
test2 <- disc_mit(
- postsynth = postsynth,
- data = data2,
- holdout_data = data1
+ eval_data(
+ conf_data = data2,
+ synth_data = postsynth,
+ holdout_data = data1
+ )
)
@@ -79,9 +83,11 @@ test_that("Perfect training lack of match and perfect holdout match", {
test_that("Identical training and holdout data", {
test3 <- disc_mit(
- postsynth = postsynth,
- data = data3,
- holdout_data = data3
+ eval_data(
+ conf_data = data3,
+ synth_data = postsynth,
+ holdout_data = data3
+ )
)
expect_equal(
@@ -101,31 +107,98 @@ test_that("Identical training and holdout data", {
})
+test_that("Disaggregation returns tibble", {
+
+ test4 <- disc_mit(
+ eval_data(
+ conf_data = data3,
+ synth_data = postsynth,
+ holdout_data = data3
+ ),
+ summary = FALSE
+ )
+
+ expect_s3_class(test4$results, "tbl")
+ expect_identical(names(test4$results),
+ c("source","a", "b", "distance",
+ "pseudo_probability", "prediction"))
+
+ expect_s3_class(test4$roc, "tbl")
+ expect_identical(names(test4$roc),
+ c(".threshold", "specificity", "sensitivity"))
+
+})
+
+test_that("disc_mit() multiple synthesis", {
+
+ test5 <- disc_mit(
+ eval_data(
+ conf_data = data3,
+ synth_data = list(data1, data1, data1),
+ holdout_data = data3
+ )
+ )
+
+ expect_equal(
+ test5$precision,
+ 0.5
+ )
+
+ expect_equal(
+ test5$recall,
+ 0.5
+ )
+
+ expect_equal(
+ test5$auc,
+ 0.5
+ )
+
+ test6 <- disc_mit(
+ eval_data(
+ conf_data = data3,
+ synth_data = list(data1, data1, data1),
+ holdout_data = data3
+ ),
+ summary = FALSE
+ )
+
+ expect_s3_class(test6$results, "data.frame")
+ expect_equal(dim(test6$results)[1], 24)
+
+})
+
test_that("disc_mit() input errors ", {
expect_error(
disc_mit(
- postsynth = postsynth,
- data = data3,
- holdout_data = data3,
+ eval_data(
+ conf_data = data3,
+ synth_data = postsynth,
+ holdout_data = data3
+ ),
threshold_percentile = "abc"
)
)
expect_error(
disc_mit(
- postsynth = postsynth,
- data = data3,
- holdout_data = data3,
+ eval_data(
+ conf_data = data3,
+ synth_data = postsynth,
+ holdout_data = data3
+ ),
threshold_percentile = -1
)
)
expect_error(
disc_mit(
- postsynth = postsynth,
- data = data3,
- holdout_data = data3,
+ eval_data(
+ conf_data = data3,
+ synth_data = postsynth,
+ holdout_data = data3
+ ),
threshold_percentile = 1.1
)
)
diff --git a/tests/testthat/test-disc_qid_mi.R b/tests/testthat/test-disc_qid_mi.R
new file mode 100644
index 0000000..b013b61
--- /dev/null
+++ b/tests/testthat/test-disc_qid_mi.R
@@ -0,0 +1,120 @@
+acs_eval <- eval_data(
+ conf_data = acs_conf,
+ synth_data = acs_lr_synths
+)
+
+acs_eval_holdout <- eval_data(
+ conf_data = acs_conf[sample(nrow(acs_conf)), ],
+ synth_data = acs_lr_synths,
+ holdout_data = acs_holdout[sample(nrow(acs_holdout)), ]
+)
+
+qid_keys <- c("county", "gq", "sex", "empstat", "classwkr")
+
+acs_eval_agg <- aggregate_qid_eval(
+ eval_data = acs_eval,
+ keys = qid_keys
+)
+acs_eval_holdout_agg <- aggregate_qid_eval(
+ eval_data = acs_eval_holdout,
+ keys = qid_keys
+)
+
+test_that("aggregate_qid_eval expected errors", {
+
+ # wrong input class
+ expect_error(
+ aggregate_qid_eval("not_eval_data", c("a", "b"))
+ )
+
+ # wrong n_rep
+ expect_error(
+ aggregate_qid_eval(
+ eval_data(
+ conf_data = acs_conf,
+ synth_data = acs_conf
+ )
+ )
+ )
+
+})
+
+test_that("aggregate_qid_eval expected behavior", {
+
+ # expect number of rows is consistent
+ # first, calculate expected number of rows, which should be the product of...
+ expected_rows <- acs_eval[["n_rep"]] * # the number of synthetic data reps...
+ purrr::reduce(
+ # ...times the number of levels in each factor...
+ purrr::map(
+ .x = qid_keys,
+ .f = \(x) {length(levels(acs_conf[[x]]))}
+ ),
+ `*`) # all multiplied together
+
+ # check that this equals the size of the computed result
+ expect_equal(nrow(acs_eval_agg), expected_rows)
+ expect_equal(nrow(acs_eval_holdout_agg), expected_rows)
+
+ # expect key_id definitions are consistent
+ expect_identical(
+ acs_eval_agg %>%
+ dplyr::select(
+ dplyr::all_of(c("key_id", qid_keys))
+ ),
+ acs_eval_holdout_agg %>%
+ dplyr::select(
+ dplyr::all_of(c("key_id", qid_keys))
+ )
+ )
+
+})
+
+test_that("plot_prob_qid_partition correct ggplot2 objects", {
+
+ p1 <- plot_prob_qid_partition(
+ agg_eval_data = acs_eval_agg,
+ keys = qid_keys,
+ max_k = 5
+ )
+
+ # expect plot object
+ expect_true("ggplot" %in% class(p1))
+
+ # expect prob data calculated properly
+ expect_identical(
+ names(p1$data),
+ c(qid_keys, "raw_n_conf", "s_synth", ".group")
+ )
+
+ expect_true(
+ all(p1$data[["s_synth"]] %in% (seq(0, 30) / 30.))
+ )
+
+ expect_true(
+ all(p1$data[["raw_n_conf"]] <= 5)
+ )
+
+})
+
+test_that("plot_prob_qid_abs_err returns correct ggplot2 objects", {
+
+ p2 <- plot_prob_qid_abs_err(
+ agg_eval_data = acs_eval_agg,
+ keys = qid_keys,
+ max_k = 5
+ )
+
+ # expect plot object
+ expect_true("ggplot" %in% class(p2))
+
+ # expect prob data calculated properly
+ expect_true(
+ all(p2$data[["qtile"]] %in% c("50%ile", "75%ile", "90%ile"))
+ )
+
+ expect_true(
+ all(p2$data[["raw_n_conf"]] <= 5)
+ )
+
+})
\ No newline at end of file
diff --git a/tests/testthat/test-eval_data.R b/tests/testthat/test-eval_data.R
new file mode 100644
index 0000000..1e4c032
--- /dev/null
+++ b/tests/testthat/test-eval_data.R
@@ -0,0 +1,52 @@
+
+
+test_that("eval_data input errors", {
+
+ expect_error(
+ eval_data(
+ conf_data = "not a dataframe",
+ synth_data = mtcars
+ )
+ )
+
+ expect_error(
+ eval_data(
+ conf_data = mtcars,
+ synth_data = "not a dataframe"
+ )
+ )
+
+ expect_error(
+ eval_data(
+ conf_data = mtcars,
+ synth_data = mtcars,
+ holdout_data = "not a dataframe"
+ )
+ )
+
+ expect_error(
+ eval_data(
+ conf_data = mtcars,
+ synth_data = list(mtcars, "not a dataframe")
+ )
+ )
+
+})
+
+test_that("eval_data calculates n_rep", {
+
+ rep1_ed <- eval_data(
+ conf_data = mtcars,
+ synth_data = mtcars
+ )
+
+ expect_equal(rep1_ed$n_rep, 1)
+
+ rep3_ed <- eval_data(
+ conf_data = mtcars,
+ synth_data = list(mtcars, mtcars, mtcars)
+ )
+
+ expect_equal(rep3_ed$n_rep, 3)
+
+})
\ No newline at end of file
diff --git a/tests/testthat/test-util_co_occurrence.R b/tests/testthat/test-util_co_occurrence.R
index 006d331..7cdd4b9 100644
--- a/tests/testthat/test-util_co_occurrence.R
+++ b/tests/testthat/test-util_co_occurrence.R
@@ -8,23 +8,6 @@ syn <- list(synthetic_data = data.frame(a = c(1, 0, 0, 0),
b = c(1, 0, 0, 0))) %>%
structure(class = "postsynth")
-
-
-
-
-
-# # difference matrix for tests
-# diff_matrix <- matrix(
-# c(NA, NA, NA,
-# -2, NA, NA,
-# 0, -2, NA),
-# ncol = 3,
-# byrow = TRUE
-# )
-
-# rownames(diff_matrix) <- c("a", "c", "b")
-# colnames(diff_matrix) <- c("a", "c", "b")
-
# test with postsynth
test_that("util_co_occurrence() is correct with identical data ", {
@@ -54,3 +37,22 @@ test_that("util_co_occurrence() is correct with different data ", {
expect_equal(co_occurrence$co_occurrence_difference_mae, mean(abs(-0.25)))
expect_equal(co_occurrence$co_occurrence_difference_rmse, sqrt(mean((-0.25) ^ 2)))
})
+
+test_that("util_co_occurrence() works with NA ", {
+
+ syn <- list(
+ synthetic_data = acs_conf
+ ) %>%
+ structure(class = "postsynth")
+
+ co_occurrence <- util_co_occurrence(
+ postsynth = syn,
+ data = acs_conf,
+ na.rm = TRUE
+ )
+
+ expect_equal(max(co_occurrence$co_occurrence_difference, na.rm = TRUE), 0)
+ expect_equal(co_occurrence$co_occurrence_difference_mae, 0)
+ expect_equal(co_occurrence$co_occurrence_difference_rmse, 0)
+
+})
diff --git a/tests/testthat/test-util_corr_fit.R b/tests/testthat/test-util_corr_fit.R
index 4ff40c8..6fe3162 100644
--- a/tests/testthat/test-util_corr_fit.R
+++ b/tests/testthat/test-util_corr_fit.R
@@ -48,3 +48,22 @@ test_that("util_corr_fit is correct with postsynth ", {
expect_equal(corr$correlation_difference_mae, mean(abs(c(0, -2, -2))))
expect_equal(corr$correlation_difference_rmse, sqrt(mean(c(0, -2, -2) ^ 2)))
})
+
+test_that("util_corr_fit works with NA ", {
+
+ syn <- list(
+ synthetic_data = acs_conf
+ ) %>%
+ structure(class = "postsynth")
+
+ corr <- util_corr_fit(
+ postsynth = syn,
+ data = acs_conf,
+ use = "pairwise.complete.obs"
+ )
+
+ expect_equal(max(corr$correlation_difference, na.rm = TRUE), 0)
+ expect_equal(corr$correlation_fit, 0)
+ expect_equal(corr$correlation_difference_mae, 0)
+ expect_equal(corr$correlation_difference_rmse, 0)
+})
diff --git a/tests/testthat/test-util_ks-distance.R b/tests/testthat/test-util_ks-distance.R
index e14b6ab..40e7c07 100644
--- a/tests/testthat/test-util_ks-distance.R
+++ b/tests/testthat/test-util_ks-distance.R
@@ -1,16 +1,16 @@
df <- data.frame(
- a = c(1, 2, 3, 4),
- b = c(1, 2, 3, 4),
- c = c("a", "a", "b", "b")
+ a = c(NA, 1, 2, 3, 4),
+ b = c(NA, 1, 2, 3, 4),
+ c = c(NA, "a", "a", "b", "b")
)
test_that("KS is 0 ", {
syn <- list(
synthetic_data = data.frame(
- a = c(1, 2, 3, 4),
- b = c(1, 2, 3, 4),
- c = c("a", "a", "b", "b")
+ a = c(1, 2, 3, 4, NA),
+ b = c(1, 2, 3, 4, NA),
+ c = c("a", "a", "b", "b", NA)
),
jth_synthesis_time = data.frame(
variable = factor(c("a", "b"))
@@ -18,19 +18,19 @@ test_that("KS is 0 ", {
) %>%
structure(class = "postsynth")
- D <- util_ks_distance(postsynth = syn, data = df)
+ D <- util_ks_distance(postsynth = syn, data = df, na.rm = TRUE)
expect_equal(D$D, rep(0, 8))
})
-test_that("KS distance if 0.5 ", {
+test_that("KS distance is 0.5 ", {
syn <- list(
synthetic_data = data.frame(
- a = c(3, 4, 5, 6),
- b = c(3, 4, 5, 6),
- c = c("a", "a", "b", "b")
+ a = c(3, 4, 5, 6, NA),
+ b = c(3, 4, 5, 6, NA),
+ c = c("a", "a", "b", "b", NA)
),
jth_synthesis_time = data.frame(
variable = factor(c("a", "b"))
@@ -38,7 +38,7 @@ test_that("KS distance if 0.5 ", {
) %>%
structure(class = "postsynth")
- D <- util_ks_distance(postsynth = syn, data = df)
+ D <- util_ks_distance(postsynth = syn, data = df, na.rm = TRUE)
expect_equal(D$D, rep(0.5, 4))
@@ -48,9 +48,9 @@ test_that("KS distance is 1 ", {
syn <- list(
synthetic_data = data.frame(
- a = c(60, 70, 80, 90),
- b = c(60, 70, 80, 90),
- c = c("a", "a", "b", "b")
+ a = c(60, 70, 80, 90, NA),
+ b = c(60, 70, 80, 90, NA),
+ c = c("a", "a", "b", "b", NA)
),
jth_synthesis_time = data.frame(
variable = factor(c("a", "b"))
@@ -58,8 +58,24 @@ test_that("KS distance is 1 ", {
) %>%
structure(class = "postsynth")
- D <- util_ks_distance(postsynth = syn, data = df)
+ D <- util_ks_distance(postsynth = syn, data = df, na.rm = TRUE)
expect_equal(D$D, c(1, 1))
})
+
+test_that("KS distance works with NA ", {
+
+ syn <- list(
+ synthetic_data = acs_conf
+ ) %>%
+ structure(class = "postsynth")
+
+ D <- util_ks_distance(
+ postsynth = syn,
+ data = acs_conf,
+ na.rm = TRUE)
+
+ expect_equal(max(D$D), 0)
+
+})
diff --git a/tests/testthat/test-util_moments.R b/tests/testthat/test-util_moments.R
index 84be652..3a43524 100644
--- a/tests/testthat/test-util_moments.R
+++ b/tests/testthat/test-util_moments.R
@@ -8,6 +8,15 @@ df <- data.frame(
weight = c(100, 100, 200)
)
+df_na <- data.frame(
+ a = c(1000, 1000, NA),
+ b = c(1200, 800, NA),
+ c = c("1", "1", "2"),
+ d = c(0, 0, 10),
+ e = c("a", "b", "b"),
+ weight = c(100, 100, 200)
+)
+
# test synth (4 of 8 combinations)
syn <- list(
synthetic_data = data.frame(
@@ -24,6 +33,21 @@ syn <- list(
) %>%
structure(class = "postsynth")
+syn_na <- list(
+ synthetic_data = data.frame(
+ a = c(1400, 0, 1000),
+ b = c(1000, 1000, NA),
+ c = c("1", "1", "2"),
+ d = c(20, 10, 0),
+ e = c("a", "b", "b"),
+ weight = c(150, 150, 100)
+ ),
+ jth_synthesis_time = data.frame(
+ variable = factor(c("a", "b", "d", "weight"))
+ )
+) %>%
+ structure(class = "postsynth")
+
# full unweighted - postysynth
test_that("moments full unweighted -- postsynth ", {
@@ -246,7 +270,8 @@ test_that("util_moments() variables selection returns correct dimensions ", {
structure(class = "postsynth")
# are variable names missing ever?
- expect_false(
+ expect_message(
+ expect_false(
util_moments(
postsynth = syn,
data = storms_sub,
@@ -255,45 +280,54 @@ test_that("util_moments() variables selection returns correct dimensions ", {
)$variable %>%
is.na() %>%
all()
+ )
)
+
# are statistics names missing ever?
- expect_false(
- util_moments(
- postsynth = syn,
- data = storms_sub,
- common_vars = FALSE,
- synth_vars = FALSE
- )$statistic %>%
- is.na() %>%
- all()
- )
-
- # 55 rows = all 11 variables times 5 statistics
- expect_equal(
- dim(
+ expect_message(
+ expect_false(
util_moments(
postsynth = syn,
data = storms_sub,
common_vars = FALSE,
synth_vars = FALSE
- )
- ),
- c(55, 6)
+ )$statistic %>%
+ is.na() %>%
+ all()
+ )
+ )
+
+ # 55 rows = all 11 variables times 5 statistics
+ expect_message(
+ expect_equal(
+ dim(
+ util_moments(
+ postsynth = syn,
+ data = storms_sub,
+ common_vars = FALSE,
+ synth_vars = FALSE
+ )
+ ),
+ c(55, 6)
+ )
)
# 50 rows = 10 common variables times 5 statistics
- expect_equal(
- dim(
- util_moments(
- postsynth = syn,
- data = storms_sub,
- common_vars = TRUE,
- synth_vars = FALSE
- )
- ),
- c(50, 6)
+ expect_message(
+ expect_equal(
+ dim(
+ util_moments(
+ postsynth = syn,
+ data = storms_sub,
+ common_vars = TRUE,
+ synth_vars = FALSE
+ )
+ ),
+ c(50, 6)
+ )
)
+
# 15 rows = 3 synthesized numeric variables times 5 statistics
expect_equal(
@@ -321,4 +355,28 @@ test_that("util_moments() variables selection returns correct dimensions ", {
c(15, 6)
)
+})
+
+
+test_that("util_moments na.rm works as expected", {
+ expect_message(
+ res <- util_moments(
+ postsynth = syn_na,
+ data = df_na,
+ na.rm = FALSE
+ )
+ )
+ expect_true(
+ all(is.na(res[res["variable"] == "a", "original"]))
+ )
+
+ res_rm <- util_moments(
+ postsynth = syn_na,
+ data = df_na,
+ na.rm = TRUE
+ )
+
+ expect_true(
+ all(res_rm[2, c("original", "synthetic")] == c(1000, 800))
+ )
})
\ No newline at end of file
diff --git a/tests/testthat/test-util_na_helper.R b/tests/testthat/test-util_na_helper.R
new file mode 100644
index 0000000..f424be1
--- /dev/null
+++ b/tests/testthat/test-util_na_helper.R
@@ -0,0 +1,60 @@
+df_with_nas <- data.frame(
+ a = c(1000, 1000, 1000, NA),
+ b = c(1200, 800, 1000, NA),
+ c = c("1", "1", "2", NA),
+ d = c(0, 0, 10, 0),
+ e = c("a", "b", "b", "b"),
+ weight = c(100, 100, 200, 100)
+)
+
+test_that("prep_combined_data_for_na.rm functionality", {
+
+ # if NAs present, raise warning
+ expect_message(
+ prep_result <- prep_combined_data_for_na.rm(df_with_nas),
+ "Some variables contain missing data: a, b, c"
+ )
+
+ # passing no optional arguments returns same dataframe
+ expect_identical(
+ df_with_nas,
+ prep_result)
+
+ # passing a dataframe with NAs with drop_zeros raises an error
+ expect_error(
+ expect_message(
+ prep_combined_data_for_na.rm(df_with_nas, drop_zeros = TRUE)
+ )
+ )
+
+ expect_no_error(
+ prep_combined_data_for_na.rm(
+ df_with_nas,
+ na.rm = TRUE,
+ drop_zeros = TRUE
+ )
+ )
+
+ # drop_zeros works as expected
+ expect_identical(
+ prep_combined_data_for_na.rm(
+ df_with_nas[, c("weight", "d")],
+ drop_zeros = TRUE
+ )[, "d"],
+ c(NA, NA, 10, NA)
+ )
+
+ # drop_zeros optionally ignores unspecified columns
+ expect_identical(
+ prep_combined_data_for_na.rm(
+ df_with_nas[, c("weight", "d")],
+ drop_zeros = TRUE,
+ drop_zeros_exclude = rlang::sym("d")
+ )[, "d"],
+ c(0, 0, 10, 0)
+ )
+
+})
+
+
+
diff --git a/tests/testthat/test-util_percentiles.R b/tests/testthat/test-util_percentiles.R
index 91f2447..ed28b53 100644
--- a/tests/testthat/test-util_percentiles.R
+++ b/tests/testthat/test-util_percentiles.R
@@ -8,6 +8,15 @@ df <- data.frame(
weight = c(300, 100, 100)
)
+df_na <- data.frame(
+ a = c(1000, 1000, 1000, 1000),
+ b = c(-1200, -800, 1000, NA),
+ c = c("1", "1", "2", "2"),
+ d = c(0, 0, 10, 0),
+ e = c("a", "b", "a", "b"),
+ weight = c(300, 100, 100, 100)
+)
+
# test synth (4 of 8 combinations)
syn <- list(
synthetic_data = data.frame(
@@ -24,6 +33,21 @@ syn <- list(
) %>%
structure(class = "postsynth")
+syn_na <- list(
+ synthetic_data = data.frame(
+ a = c(1400, NA, 1000),
+ b = c(1000, 1000, 1000),
+ c = c("1", "1", "2"),
+ d = c(20, 10, 0),
+ e = c("a", "b", "a"),
+ weight = c(300, 100, 100)
+ ),
+ jth_synthesis_time = data.frame(
+ variable = factor(c("a", "b", "d", "weight"))
+ )
+) %>%
+ structure(class = "postsynth")
+
test_that("unweighted percentiles make sense ", {
test1 <- util_percentiles(
@@ -219,12 +243,14 @@ test_that("util_percentiles() variables selection returns correct dimensions ",
)
# error: quantile doesn't work with missing values
- expect_error(
- util_percentiles(
- postsynth = syn,
- data = storms_sub,
- common_vars = FALSE,
- synth_vars = FALSE
+ expect_message(
+ expect_error(
+ util_percentiles(
+ postsynth = syn,
+ data = storms_sub,
+ common_vars = FALSE,
+ synth_vars = FALSE
+ )
)
)
@@ -267,4 +293,32 @@ test_that("util_percentiles() variables selection returns correct dimensions ",
c(9, 6)
)
+})
+
+test_that("util_percentiles na.rm", {
+
+ # if na.rm = FALSE, raise error for missing values
+ expect_message(
+ expect_error(
+ util_percentiles(
+ postsynth = syn_na,
+ data = df_na,
+ probs = 0.5,
+ na.rm = FALSE
+ )
+ )
+ )
+
+  # if na.rm = TRUE, compute percentiles with missing values removed
+ res <- util_percentiles(
+ postsynth = syn_na,
+ data = df_na,
+ probs = 0.5,
+ na.rm = TRUE
+ )
+
+ expect_true(
+ all(res[1, c("original", "synthetic")] == c(1000, 1200))
+ )
+
})
\ No newline at end of file
diff --git a/tests/testthat/test-util_plots.R b/tests/testthat/test-util_plots.R
new file mode 100644
index 0000000..7fc785d
--- /dev/null
+++ b/tests/testthat/test-util_plots.R
@@ -0,0 +1,83 @@
+conf_df <- data.frame(
+ n1 = c(1., 2., 3., 4.),
+ n2 = c(5., 6., 7., 8.),
+ c1 = factor(c("1", "1", "2", "1")),
+ c2 = factor(c("a", "a", "a", "b"))
+)
+
+synth_df <- data.frame(
+ n1 = c(1., 2., 4., 4.),
+ n2 = c(5., 5., 7., 9.),
+ c1 = factor(c("1", "2", "2", "1")),
+ c2 = factor(c("a", "b", "a", "b"))
+)
+
+joint_data <- dplyr::bind_rows(
+ confidential = conf_df,
+ synthetic = synth_df,
+ .id = "source"
+)
+
+test_that("plot_numeric_hist_kde throws expected errors", {
+
+ expect_error(
+ plot_numeric_hist_kde(joint_data, "c1")
+ )
+
+ expect_error(
+ plot_numeric_hist_kde(joint_data, "n1", "n2")
+ )
+
+ expect_error(
+ plot_numeric_hist_kde(joint_data, "n1", "c1", "n2")
+ )
+
+})
+
+test_that("plot_numeric_hist_kde creates the right ggplot", {
+
+ plot <- plot_numeric_hist_kde(joint_data, "n1")
+ expect_s3_class(plot$layers[[1]]$geom, "GeomBar")
+ expect_s3_class(plot$layers[[2]]$geom, "GeomDensity")
+
+})
+
+test_that("plot_categorical_bar throws expected errors", {
+
+ expect_error(
+ plot_categorical_bar(joint_data, "n1")
+ )
+
+ expect_error(
+ plot_categorical_bar(joint_data, "c1", "n2")
+ )
+
+ expect_error(
+ plot_categorical_bar(joint_data, "c1", "c1", "n2")
+ )
+
+})
+
+test_that("plot_categorical_bar creates the right ggplot", {
+
+ plot <- plot_categorical_bar(joint_data, "c1")
+ expect_s3_class(plot$layers[[1]]$geom, "GeomBar")
+
+})
+
+test_that("create_cormat_plot creates the right ggplot", {
+
+ plot <- create_cormat_plot(synth_df)
+
+ expect_s3_class(plot$layers[[1]]$geom, "GeomTile")
+ expect_s3_class(plot$layers[[2]]$geom, "GeomText")
+
+})
+
+test_that("plot_cormat creates the right ggplot", {
+
+ plot <- plot_cormat(conf_df, synth_df)
+
+ expect_equal(length(plot$grobs), 2)
+
+})
\ No newline at end of file
diff --git a/tests/testthat/test-util_proportions.R b/tests/testthat/test-util_proportions.R
index 7b08d68..616d741 100644
--- a/tests/testthat/test-util_proportions.R
+++ b/tests/testthat/test-util_proportions.R
@@ -6,6 +6,15 @@ df <- data.frame(
weight = c(100, 100, 200)
)
+df_na <- data.frame(
+ a = c(1, 2, 1, 1),
+ b = c("orange", "yellow", "green", NA),
+ c = factor(c("1", "1", "2", NA),
+ levels = c("1", "2", "3", NA),
+ exclude = NULL),
+ weight = c(100, 100, 200, 100)
+)
+
# postsynth data object
syn <- list(
synthetic_data = data.frame(
@@ -20,6 +29,22 @@ syn <- list(
) %>%
structure(class = "postsynth")
+# postsynth data object
+syn_na <- list(
+ synthetic_data = data.frame(
+ a = c(2, 2, 2, NA),
+ b = c("orange", "yellow", "green", "green"),
+ c = factor(c("1", "1", "2", NA),
+ levels = c("1", "2", "3", NA),
+ exclude = NULL),
+ weight = c(150, 150, 100, 100)
+ ),
+ jth_synthesis_time = data.frame(
+ variable = factor(c("b", "c"))
+ )
+) %>%
+ structure(class = "postsynth")
+
# testing variable selection
test_that("testing if proportions only uses fct and chr variables", {
@@ -352,3 +377,41 @@ test_that("util_proportions() variables selection returns correct dimensions ",
})
+test_that("na.rm in levels works as expected", {
+ res <- util_proportions(
+ postsynth = syn_na,
+ data = df_na
+ )
+ expect_identical(
+ res$class,
+ c("NA", "green", "orange", "yellow", "1", "2", NA)
+ )
+
+ res_rm <- util_proportions(
+ postsynth = syn_na,
+ data = df_na,
+ na.rm = TRUE
+ )
+
+ expect_identical(
+ res_rm$class,
+ c("green", "orange", "yellow", "1", "2")
+ )
+
+})
+
+test_that("keep_empty_levels works as expected", {
+
+ res <- util_proportions(
+ postsynth = syn_na,
+ data = df_na,
+ keep_empty_levels = TRUE
+ )
+
+ # expect to show 0 proportion in empty level "3" for variable c
+ expect_true(
+ all(res[7, c("synthetic", "original")] == c(0, 0))
+ )
+
+})
+
diff --git a/tests/testthat/test-util_totals.R b/tests/testthat/test-util_totals.R
index 6fedbb2..d88d5ac 100644
--- a/tests/testthat/test-util_totals.R
+++ b/tests/testthat/test-util_totals.R
@@ -8,6 +8,15 @@ df <- data.frame(
weight = c(100, 100, 200)
)
+df_na <- data.frame(
+ a = c(1000, 1000, 1000, NA),
+ b = c(1200, 800, 1000, NA),
+ c = c("1", "1", "2", "2"),
+ d = c(0, 0, 10, 0),
+ e = c("a", "b", "b", "a"),
+ weight = c(100, 100, 200, 100)
+)
+
# test synth (4 of 8 combinations)
syn <- list(
synthetic_data = data.frame(
@@ -24,6 +33,21 @@ syn <- list(
) %>%
structure(class = "postsynth")
+syn_na <- list(
+ synthetic_data = data.frame(
+ a = c(1400, 0, 1000),
+ b = c(1000, NA, 1000),
+ c = c("1", "1", "2"),
+ d = c(20, 10, 0),
+ e = c("a", "b", "b"),
+ weight = c(150, 150, 100)
+ ),
+ jth_synthesis_time = data.frame(
+ variable = factor(c("a", "b", "d", "weight"))
+ )
+) %>%
+ structure(class = "postsynth")
+
# full unweighted - postysynth
test_that("moments full unweighted -- postsynth ", {
@@ -321,4 +345,28 @@ test_that("util_totals() variables selection returns correct dimensions ", {
c(6, 6)
)
-})
\ No newline at end of file
+})
+
+test_that("util_totals() na.rm works as expected", {
+
+ res <- util_totals(
+ postsynth = syn_na,
+ data = df_na,
+ na.rm = FALSE
+ )
+
+ expect_true(
+ all(is.na(res[1:4, "original"]))
+ )
+
+ res_rm <- util_totals(
+ postsynth = syn_na,
+ data = df_na,
+ na.rm = TRUE
+ )
+
+ expect_true(
+ all(res_rm[2, c("original", "synthetic")] == c(3000, 2400))
+ )
+
+})