Skip to content

Commit

Permalink
Merge pull request #105 from UrbanInstitute/version0.0.4
Browse files Browse the repository at this point in the history
Version0.0.4
  • Loading branch information
jhseeman authored Oct 27, 2024
2 parents 458f549 + 72eabac commit 4b07296
Show file tree
Hide file tree
Showing 88 changed files with 4,342 additions and 351 deletions.
6 changes: 5 additions & 1 deletion .Rbuildignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
^renv$
^renv\.lock$
^syntheval\.Rproj$
^\.Rproj\.user$
^LICENSE\.md$
^README\.Rmd$
^README\.qmd$
^README_files$
^data-raw$
^test\.R$
disriminators.qmd
^project-standards\.md$
53 changes: 51 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,52 @@
.Rproj.user
# History files
.Rhistory
.DS_Store
.Rapp.history

# Mac system file
.DS_Store

# Session Data files
.RData

# Example code in package build process
*-Ex.R

# Output files from R CMD build
/*.tar.gz

# Output files from R CMD check
/*.Rcheck/

# RStudio files
.Rproj.user/

# produced vignettes
vignettes/*.html
vignettes/*.pdf

# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth

# knitr and R markdown default cache directories
/*_cache/
/cache/

# Temporary files created by R markdown
*.utf8.md
*.knit.md

# Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
rsconnect/

# log files
*.log
*\.html

# renv environment files
renv/

# README build files
README_files/

# Plot outputs from unit tests
tests/testthat/Rplots.pdf
19 changes: 16 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
Package: syntheval
Title: A set of tools for evaluating synthetic data utility and disclosure risk
Version: 0.0.3
Version: 0.0.4
Authors@R: c(
person(given = "Aaron R.", family = "Williams",
person(given = "Aaron R.",
family = "Williams",
email = "[email protected]", role = c("aut", "cre"),
comment = c(ORCID = "0000-0001-5564-1938")),
person(given = "Jeremy",
family = "Seeman",
email = "[email protected]",
role = "aut",
comment = c(ORCID = "0000-0003-3526-3209")),
person("Gabe", "Morrison", , "[email protected]", role = "ctb"),
person("Elyse", "McFalls", , "[email protected]", role = "ctb")
)
Expand All @@ -16,24 +22,31 @@ License: AGPL (>= 3)
BugReports: https://github.com/UI-Research/syntheval/issues
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
RoxygenNote: 7.3.2
Suggests:
forcats,
stringr,
testthat (>= 3.0.0)
Config/testthat/edition: 3
Imports:
broom,
dplyr,
ggplot2,
gower,
gridExtra,
Hmisc,
magrittr,
parsnip,
pillar,
purrr,
recipes,
rlang,
rsample,
tibble,
tidyr,
tidyselect,
tune,
twosamples,
workflows,
yardstick
Suggestions:
Expand Down
15 changes: 15 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,14 +1,28 @@
# Generated by roxygen2: do not edit by hand

S3method(print,eval_data)
export("%>%")
export(add_discriminator_auc)
export(add_pmse)
export(add_pmse_ratio)
export(add_propensities)
export(add_propensities_tuned)
export(add_specks)
export(aggregate_qid_eval)
export(convert_na_to_level)
export(create_cormat_plot)
export(disc_baseline)
export(disc_mit)
export(discrimination)
export(eval_data)
export(is_eval_data)
export(plot_categorical_bar)
export(plot_cormat)
export(plot_numeric_hist_kde)
export(plot_prob_qid_abs_err)
export(plot_prob_qid_partition)
export(prep_combined_data_for_na.rm)
export(prep_discrete_eval_data)
export(util_ci_overlap)
export(util_co_occurrence)
export(util_corr_fit)
Expand All @@ -20,4 +34,5 @@ export(util_tails)
export(util_totals)
export(weighted_skewness)
importFrom(magrittr,"%>%")
importFrom(rlang,":=")
importFrom(rlang,.data)
8 changes: 8 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# syntheval 0.0.4

* Add empirical disclosure risk metrics.
* Add comparison visualization utilities.
* Add `na.rm` functionality to most functions to handle `NA` values.
* Add families to roxygen2 headers.
* Ensure that all functions return ungrouped output.

# syntheval 0.0.3

* Add a README with examples.
Expand Down
3 changes: 2 additions & 1 deletion R/add_discriminator_auc.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ add_discriminator_auc <- function(discrimination, split = TRUE) {
dplyr::group_by(.data$.sample) %>%
yardstick::roc_auc(".source_label", ".pred_synthetic") %>%
dplyr::mutate(.sample = factor(.data$.sample, levels = c("training", "testing"))) %>%
dplyr::arrange(.data$.sample)
dplyr::arrange(.data$.sample) %>%
dplyr::ungroup()

} else {

Expand Down
17 changes: 15 additions & 2 deletions R/co_occurence.R
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
#' Construct a co-occurrence matrix
#'
#' @param data A tibble with numeric variables
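#' @param na.rm A logical indicating whether observations with a missing value
#'   in either variable of a pair should be removed before computing the
#'   co-occurrence for that pair.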
#'
#' @return A co-occurrence matrix
#'
co_occurrence <- function(data) {
co_occurrence <- function(data, na.rm = FALSE) {

# create a vector of variable names
data_names <- names(data)
Expand All @@ -19,8 +20,20 @@ co_occurrence <- function(data) {

for (col_name in data_names) {

row_var <- dplyr::pull(data, row_name)
col_var <- dplyr::pull(data, col_name)

if (na.rm) {

# remove missing values
na_lgl <- !is.na(row_var) & !is.na(col_var)
row_var <- row_var[na_lgl]
col_var <- col_var[na_lgl]

}

co_occurence_matrix[row_name, col_name] <-
mean(dplyr::pull(data, row_name) != 0 & dplyr::pull(data, col_name) != 0)
mean(row_var != 0 & col_var != 0)

}

Expand Down
132 changes: 132 additions & 0 deletions R/data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
#' American Community Survey confidential microdata
#'
#' An extract constructed from the 2019 American Community Survey containing a
#' random sample of n = 1,000 Nebraska respondents.
#'
#' Original data source:
#' Steven Ruggles, Sarah Flood, Matthew Sobek, Daniel Backman, Annie Chen,
#' Grace Cooper, Stephanie Richards, Renae Rogers, and Megan Schouweiler.
#' IPUMS USA: Version 15.0 \[dataset\]. Minneapolis, MN: IPUMS, 2024.
#' <https://doi.org/10.18128/D010.V15.0>
#'
#' @format ## `acs_conf`
#' A data frame with 1,000 rows and 11 columns:
#' \describe{
#'   \item{county}{fct, county}
#'   \item{gq}{fct, group quarter kind}
#'   \item{sex}{fct, sex}
#'   \item{marst}{fct, marital status}
#'   \item{hcovany}{fct, health insurance status}
#'   \item{empstat}{fct, employment status}
#'   \item{classwkr}{fct, employment kind (e.g., self-employed)}
#'   \item{age}{dbl, age (in years)}
#'   \item{famsize}{dbl, household/family size}
#'   \item{transit_time}{dbl, transit time to work (in minutes)}
#'   \item{inctot}{dbl, annual income}
#' }
#' @source <https://usa.ipums.org/usa/>
"acs_conf"

#' American Community Survey holdout microdata
#'
#' An extract constructed from the 2019 American Community Survey containing a
#' random sample of n = 1,000 Nebraska respondents. This sample is distinct from
#' `acs_conf` and is not used in producing the synthetic data available in this
#' package.
#'
#' Original data source:
#' Steven Ruggles, Sarah Flood, Matthew Sobek, Daniel Backman, Annie Chen,
#' Grace Cooper, Stephanie Richards, Renae Rogers, and Megan Schouweiler.
#' IPUMS USA: Version 15.0 \[dataset\]. Minneapolis, MN: IPUMS, 2024.
#' <https://doi.org/10.18128/D010.V15.0>
#'
#' @format ## `acs_holdout`
#' A data frame with 1,000 rows and 11 columns:
#' \describe{
#'   \item{county}{fct, county}
#'   \item{gq}{fct, group quarter kind}
#'   \item{sex}{fct, sex}
#'   \item{marst}{fct, marital status}
#'   \item{hcovany}{fct, health insurance status}
#'   \item{empstat}{fct, employment status}
#'   \item{classwkr}{fct, employment kind (e.g., self-employed)}
#'   \item{age}{dbl, age (in years)}
#'   \item{famsize}{dbl, household/family size}
#'   \item{transit_time}{dbl, transit time to work (in minutes)}
#'   \item{inctot}{dbl, annual income}
#' }
#' @source <https://usa.ipums.org/usa/>
"acs_holdout"

#' American Community Survey lower-risk synthetic data
#'
#' A list of 30 samples of synthetic data derived from `acs_conf`, generated
#' using noise infusion for both categorical and numeric random variables.
#' These are referred to as "lower-risk" relative to the "higher-risk"
#' synthetic data also available in this package; the synthetic data is purely
#' for testing purposes.
#'
#' Categorical random variables are selected by resampling from a mixture of the
#' original multivariate cell proportions and a uniform mixture. Numeric random
#' variables are first modelled using regression trees, and new sampled values
#' each have additional discrete two-sided geometric noise added to them.
#'
#' Original data source:
#' Steven Ruggles, Sarah Flood, Matthew Sobek, Daniel Backman, Annie Chen,
#' Grace Cooper, Stephanie Richards, Renae Rogers, and Megan Schouweiler.
#' IPUMS USA: Version 15.0 \[dataset\]. Minneapolis, MN: IPUMS, 2024.
#' <https://doi.org/10.18128/D010.V15.0>
#'
#' @format ## `acs_lr_synths`
#' A list of 30 data frames, each with 1,000 rows and 11 columns:
#' \describe{
#'   \item{county}{fct, county}
#'   \item{gq}{fct, group quarter kind}
#'   \item{sex}{fct, sex}
#'   \item{marst}{fct, marital status}
#'   \item{hcovany}{fct, health insurance status}
#'   \item{empstat}{fct, employment status}
#'   \item{classwkr}{fct, employment kind (e.g., self-employed)}
#'   \item{age}{dbl, age (in years)}
#'   \item{famsize}{dbl, household/family size}
#'   \item{transit_time}{dbl, transit time to work (in minutes)}
#'   \item{inctot}{dbl, annual income}
#' }
#' @source <https://usa.ipums.org/usa/>
"acs_lr_synths"


#' American Community Survey higher-risk synthetic data
#'
#' A list of 30 samples of partial synthetic data derived from `acs_conf`,
#' generated using models that intentionally overfit to the confidential data.
#' These are referred to as "higher-risk" relative to the "lower-risk" synthetic
#' data also available in this package; the synthetic data is purely for
#' testing purposes.
#'
#' Categorical variables are primarily kept "as-is" in this partially synthetic
#' data, with a small proportion of categorical records resampled from the
#' data. Numeric variables are resampled from decision tree models that are
#' intentionally designed to overfit to the confidential data.
#'
#' Original data source:
#' Steven Ruggles, Sarah Flood, Matthew Sobek, Daniel Backman, Annie Chen,
#' Grace Cooper, Stephanie Richards, Renae Rogers, and Megan Schouweiler.
#' IPUMS USA: Version 15.0 \[dataset\]. Minneapolis, MN: IPUMS, 2024.
#' <https://doi.org/10.18128/D010.V15.0>
#'
#' @format ## `acs_hr_synths`
#' A list of 30 data frames, each with 1,000 rows and 11 columns:
#' \describe{
#'   \item{county}{fct, county}
#'   \item{gq}{fct, group quarter kind}
#'   \item{sex}{fct, sex}
#'   \item{marst}{fct, marital status}
#'   \item{hcovany}{fct, health insurance status}
#'   \item{empstat}{fct, employment status}
#'   \item{classwkr}{fct, employment kind (e.g., self-employed)}
#'   \item{age}{dbl, age (in years)}
#'   \item{famsize}{dbl, household/family size}
#'   \item{transit_time}{dbl, transit time to work (in minutes)}
#'   \item{inctot}{dbl, annual income}
#' }
#' @source <https://usa.ipums.org/usa/>
"acs_hr_synths"
Loading

0 comments on commit 4b07296

Please sign in to comment.