diff --git a/R/co_occurence.R b/R/co_occurence.R index 70dbf47..316c49c 100644 --- a/R/co_occurence.R +++ b/R/co_occurence.R @@ -1,10 +1,11 @@ #' Construct a co-occurrence matrix #' #' @param data A tibble with numeric variables +#' @param na.rm a logical indicating whether missing values should be removed. #' #' @return A co-occurrence matrix #' -co_occurrence <- function(data) { +co_occurrence <- function(data, na.rm = FALSE) { # create a vector of variable names data_names <- names(data) @@ -19,8 +20,20 @@ co_occurrence <- function(data) { for (col_name in data_names) { + row_var <- dplyr::pull(data, row_name) + col_var <- dplyr::pull(data, col_name) + + if (na.rm) { + + # remove missing values + na_lgl <- !is.na(row_var) & !is.na(col_var) + row_var <- row_var[na_lgl] + col_var <- col_var[na_lgl] + + } + co_occurence_matrix[row_name, col_name] <- - mean(dplyr::pull(data, row_name) != 0 & dplyr::pull(data, col_name) != 0) + mean(row_var != 0 & col_var != 0) } diff --git a/R/util_co_ocurrence.R b/R/util_co_ocurrence.R index be12018..87ae2e4 100644 --- a/R/util_co_ocurrence.R +++ b/R/util_co_ocurrence.R @@ -1,7 +1,10 @@ #' Calculate the co-occurrence fit metric of a confidential data set. #' -#' @param postsynth A postsynth object from tidysynthesis or a tibble +#' @param postsynth a postsynth object from tidysynthesis or a tibble #' @param data an original (observed) data set. +#' @param na.rm a logical indicating whether missing values should be removed. +#' Note: values are jointly removed for each pair of variables even if only one +#' value is missing. #' #' @return A `list` of fit metrics: #' - `co_occurrence_original`: co-occurrence matrix of the original data. @@ -19,7 +22,7 @@ #' #' @export #' -util_co_occurrence <- function(postsynth, data) { +util_co_occurrence <- function(postsynth, data, na.rm = FALSE) { if (is_postsynth(postsynth)) { @@ -44,7 +47,7 @@ util_co_occurrence <- function(postsynth, data) { co_occurrence_matrix <- x %>% dplyr::select_if(is.numeric) %>% - co_occurrence() + co_occurrence(na.rm = na.rm) # set the values in the upper triangle to zero to avoid double counting co_occurrence_matrix[upper.tri(co_occurrence_matrix, diag = TRUE)] <- NA diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R index 0e393ea..5c21209 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -2,6 +2,10 @@ #' #' @param postsynth A postsynth object from tidysynthesis or a tibble #' @param data an original (observed) data set. +#' @param use optional character string giving a method for computing +#' covariances in the presence of missing values. This must be (an abbreviation +#' of) one of the strings "everything", "all.obs", "complete.obs", +#' "na.or.complete", or "pairwise.complete.obs". #' #' @return A `list` of fit metrics: #' - `correlation_original`: correlation matrix of the original data. @@ -16,7 +20,7 @@ #' #' @export -util_corr_fit <- function(postsynth, data) { +util_corr_fit <- function(postsynth, data, use = "everything") { if (is_postsynth(postsynth)) { @@ -35,13 +39,13 @@ util_corr_fit <- function(postsynth, data) { data <- dplyr::select(data, names(synthetic_data)) # helper function to find a correlation matrix with the upper tri set to zeros - lower_triangle <- function(x) { + lower_triangle <- function(x, use) { # find the linear correlation matrix of numeric variables from a data set correlation_matrix <- x %>% dplyr::select_if(is.numeric) %>% - stats::cor() + stats::cor(use = use) # set the values in the upper triangle to zero to avoid double counting correlation_matrix[upper.tri(correlation_matrix, diag = TRUE)] <- NA @@ -50,10 +54,10 @@ util_corr_fit <- function(postsynth, data) { } # find the lower triangle of the original data linear correlation matrix - original_lt <- lower_triangle(data) + original_lt <- lower_triangle(data, use = use) # find the lower triangle of the synthetic data linear correlation matrix - synthetic_lt <- lower_triangle(synthetic_data) + synthetic_lt <- lower_triangle(synthetic_data, use = use) # compare names if (any(rownames(original_lt) != rownames(synthetic_lt))) { diff --git a/R/util_ks_distance.R b/R/util_ks_distance.R index b2894fc..19a787a 100644 --- a/R/util_ks_distance.R +++ b/R/util_ks_distance.R @@ -1,8 +1,9 @@ #' Calculate the Kolmogorov-Smirnov distance (D) for each numeric variable in #' the synthetic and confidential data #' -#' @param postsynth A postsynth object or tibble with synthetic data -#' @param data A data frame with the original data +#' @param postsynth a postsynth object or tibble with synthetic data +#' @param data a data frame with the original data +#' @param na.rm a logical indicating whether missing values should be removed. #' #' @return A tibble with the D and location of the largest distance for each #' numeric variable @@ -11,7 +12,7 @@ #' #' @export #' -util_ks_distance <- function(postsynth, data) { +util_ks_distance <- function(postsynth, data, na.rm = FALSE) { if ("postsynth" %in% class(postsynth)) { @@ -55,20 +56,31 @@ util_ks_distance <- function(postsynth, data) { names(distances) <- variables for (var in variables) { + var_synth <- dplyr::pull(synthetic_data, var) + var_data <- dplyr::pull(data, var) + + # drop missing values + if (na.rm) { + + var_synth <- var_synth[!is.na(var_synth)] + var_data <- var_data[!is.na(var_data)] + + } + # find the eCDFs for both variables - ecdf_synth <- stats::ecdf(dplyr::pull(synthetic_data, var)) - ecdf_orig <- stats::ecdf(dplyr::pull(data, var)) + ecdf_synth <- stats::ecdf(var_synth) + ecdf_orig <- stats::ecdf(var_data) # calculate the minimum and maximum across both variables - minimum <- min(c(dplyr::pull(synthetic_data, var), dplyr::pull(data, var))) - maximum <- max(c(dplyr::pull(synthetic_data, var), dplyr::pull(data, var))) + minimum <- min(c(var_synth, var_data)) + maximum <- max(c(var_synth, var_data)) # create a grid of values for calculating the distances between the two # eCDFs z <- seq( from = minimum, to = maximum, - length.out = min(nrow(synthetic_data), nrow(data), 10000) + length.out = min(length(var_synth), length(var_data), 10000) ) # for each variable, find D and the location of D diff --git a/man/co_occurrence.Rd b/man/co_occurrence.Rd index fe7aad9..4f1de18 100644 --- a/man/co_occurrence.Rd +++ b/man/co_occurrence.Rd @@ -4,10 +4,12 @@ \alias{co_occurrence} \title{Construct a co-occurrence matrix} \usage{ -co_occurrence(data) +co_occurrence(data, na.rm = FALSE) } \arguments{ \item{data}{A tibble with numeric variables} + +\item{na.rm}{a logical indicating whether missing values should be removed.} } \value{ A co-occurrence matrix diff --git a/man/util_co_occurrence.Rd b/man/util_co_occurrence.Rd index 5662b50..37fbcc5 100644 --- a/man/util_co_occurrence.Rd +++ b/man/util_co_occurrence.Rd @@ -4,12 +4,16 @@ \alias{util_co_occurrence} \title{Calculate the co-occurrence fit metric of a confidential data set.} \usage{ -util_co_occurrence(postsynth, data) +util_co_occurrence(postsynth, data, na.rm = FALSE) } \arguments{ -\item{postsynth}{A postsynth object from tidysynthesis or a tibble} +\item{postsynth}{a postsynth object from tidysynthesis or a tibble} \item{data}{an original (observed) data set.} + +\item{na.rm}{a logical indicating whether missing values should be removed. +Note: values are jointly removed for each pair of variables even if only one +value is missing.} } \value{ A \code{list} of fit metrics: diff --git a/man/util_corr_fit.Rd b/man/util_corr_fit.Rd index a4a6982..8044124 100644 --- a/man/util_corr_fit.Rd +++ b/man/util_corr_fit.Rd @@ -4,12 +4,17 @@ \alias{util_corr_fit} \title{Calculate the correlation fit metric of a confidential data set.} \usage{ -util_corr_fit(postsynth, data) +util_corr_fit(postsynth, data, use = "everything") } \arguments{ \item{postsynth}{A postsynth object from tidysynthesis or a tibble} \item{data}{an original (observed) data set.} + +\item{use}{optional character string giving a method for computing +covariances in the presence of missing values. This must be (an abbreviation +of) one of the strings "everything", "all.obs", "complete.obs", +"na.or.complete", or "pairwise.complete.obs".} } \value{ A \code{list} of fit metrics: diff --git a/man/util_ks_distance.Rd b/man/util_ks_distance.Rd index f47e9ed..85dbf1a 100644 --- a/man/util_ks_distance.Rd +++ b/man/util_ks_distance.Rd @@ -5,12 +5,14 @@ \title{Calculate the Kolmogorov-Smirnov distance (D) for each numeric variable in the synthetic and confidential data} \usage{ -util_ks_distance(postsynth, data) +util_ks_distance(postsynth, data, na.rm = FALSE) } \arguments{ -\item{postsynth}{A postsynth object or tibble with synthetic data} +\item{postsynth}{a postsynth object or tibble with synthetic data} -\item{data}{A data frame with the original data} +\item{data}{a data frame with the original data} + +\item{na.rm}{a logical indicating whether missing values should be removed.} } \value{ A tibble with the D and location of the largest distance for each diff --git a/tests/testthat/test-util_co_occurrence.R b/tests/testthat/test-util_co_occurrence.R index 006d331..7cdd4b9 100644 --- a/tests/testthat/test-util_co_occurrence.R +++ b/tests/testthat/test-util_co_occurrence.R @@ -8,23 +8,6 @@ syn <- list(synthetic_data = data.frame(a = c(1, 0, 0, 0), b = c(1, 0, 0, 0))) %>% structure(class = "postsynth") - - - - - -# # difference matrix for tests -# diff_matrix <- matrix( -# c(NA, NA, NA, -# -2, NA, NA, -# 0, -2, NA), -# ncol = 3, -# byrow = TRUE -# ) - -# rownames(diff_matrix) <- c("a", "c", "b") -# colnames(diff_matrix) <- c("a", "c", "b") - # test with postsynth test_that("util_co_occurrence() is correct with identical data ", { @@ -54,3 +37,22 @@ test_that("util_co_occurrence() is correct with different data ", { expect_equal(co_occurrence$co_occurrence_difference_mae, mean(abs(-0.25))) expect_equal(co_occurrence$co_occurrence_difference_rmse, sqrt(mean((-0.25) ^ 2))) }) + +test_that("util_co_occurrence() works with NA ", { + + syn <- list( + synthetic_data = acs_conf + ) %>% + structure(class = "postsynth") + + co_occurrence <- util_co_occurrence( + postsynth = syn, + data = acs_conf, + na.rm = TRUE + ) + + expect_equal(max(co_occurrence$co_occurrence_difference, na.rm = TRUE), 0) + expect_equal(co_occurrence$co_occurrence_difference_mae, 0) + expect_equal(co_occurrence$co_occurrence_difference_rmse, 0) + +}) diff --git a/tests/testthat/test-util_corr_fit.R b/tests/testthat/test-util_corr_fit.R index 4ff40c8..6fe3162 100644 --- a/tests/testthat/test-util_corr_fit.R +++ b/tests/testthat/test-util_corr_fit.R @@ -48,3 +48,22 @@ test_that("util_corr_fit is correct with postsynth ", { expect_equal(corr$correlation_difference_mae, mean(abs(c(0, -2, -2)))) expect_equal(corr$correlation_difference_rmse, sqrt(mean(c(0, -2, -2) ^ 2))) }) + +test_that("util_corr_fit works with NA ", { + + syn <- list( + synthetic_data = acs_conf + ) %>% + structure(class = "postsynth") + + corr <- util_corr_fit( + postsynth = syn, + data = acs_conf, + use = "pairwise.complete.obs" + ) + + expect_equal(max(corr$correlation_difference, na.rm = TRUE), 0) + expect_equal(corr$correlation_fit, 0) + expect_equal(corr$correlation_difference_mae, 0) + expect_equal(corr$correlation_difference_rmse, 0) +}) diff --git a/tests/testthat/test-util_ks-distance.R b/tests/testthat/test-util_ks-distance.R index e14b6ab..40e7c07 100644 --- a/tests/testthat/test-util_ks-distance.R +++ b/tests/testthat/test-util_ks-distance.R @@ -1,16 +1,16 @@ df <- data.frame( - a = c(1, 2, 3, 4), - b = c(1, 2, 3, 4), - c = c("a", "a", "b", "b") + a = c(NA, 1, 2, 3, 4), + b = c(NA, 1, 2, 3, 4), + c = c(NA, "a", "a", "b", "b") ) test_that("KS is 0 ", { syn <- list( synthetic_data = data.frame( - a = c(1, 2, 3, 4), - b = c(1, 2, 3, 4), - c = c("a", "a", "b", "b") + a = c(1, 2, 3, 4, NA), + b = c(1, 2, 3, 4, NA), + c = c("a", "a", "b", "b", NA) ), jth_synthesis_time = data.frame( variable = factor(c("a", "b")) @@ -18,19 +18,19 @@ test_that("KS is 0 ", { ) %>% structure(class = "postsynth") - D <- util_ks_distance(postsynth = syn, data = df) + D <- util_ks_distance(postsynth = syn, data = df, na.rm = TRUE) expect_equal(D$D, rep(0, 8)) }) -test_that("KS distance if 0.5 ", { +test_that("KS distance is 0.5 ", { syn <- list( synthetic_data = data.frame( - a = c(3, 4, 5, 6), - b = c(3, 4, 5, 6), - c = c("a", "a", "b", "b") + a = c(3, 4, 5, 6, NA), + b = c(3, 4, 5, 6, NA), + c = c("a", "a", "b", "b", NA) ), jth_synthesis_time = data.frame( variable = factor(c("a", "b")) @@ -38,7 +38,7 @@ test_that("KS distance if 0.5 ", { ) %>% structure(class = "postsynth") - D <- util_ks_distance(postsynth = syn, data = df) + D <- util_ks_distance(postsynth = syn, data = df, na.rm = TRUE) expect_equal(D$D, rep(0.5, 4)) @@ -48,9 +48,9 @@ test_that("KS distance is 1 ", { syn <- list( synthetic_data = data.frame( - a = c(60, 70, 80, 90), - b = c(60, 70, 80, 90), - c = c("a", "a", "b", "b") + a = c(60, 70, 80, 90, NA), + b = c(60, 70, 80, 90, NA), + c = c("a", "a", "b", "b", NA) ), jth_synthesis_time = data.frame( variable = factor(c("a", "b")) @@ -58,8 +58,24 @@ test_that("KS distance is 1 ", { ) %>% structure(class = "postsynth") - D <- util_ks_distance(postsynth = syn, data = df) + D <- util_ks_distance(postsynth = syn, data = df, na.rm = TRUE) expect_equal(D$D, c(1, 1)) }) + +test_that("KS distance works with NA ", { + + syn <- list( + synthetic_data = acs_conf + ) %>% + structure(class = "postsynth") + + D <- util_ks_distance( + postsynth = syn, + data = acs_conf, + na.rm = TRUE) + + expect_equal(max(D$D), 0) + +})