Merge pull request #102 from UrbanInstitute/iss030

Iss030
UrbanInstitute · Oct 25, 2024 · 3adc56b · 3adc56b
2 parents 133a122 + 38b722a
commit 3adc56b
Show file tree

Hide file tree

Showing 11 changed files with 140 additions and 58 deletions.
diff --git a/R/co_occurence.R b/R/co_occurence.R
@@ -1,10 +1,11 @@
 #' Construct a co-occurrence matrix
 #'
 #' @param data A tibble with numeric variables
+#' @param na.rm a logical indicating whether missing values should be removed.
 #'
 #' @return A co-occurrence matrix
 #' 
-co_occurrence <- function(data) {
+co_occurrence <- function(data, na.rm = FALSE) {
 
   # create a vector of variable names
   data_names <- names(data)
@@ -19,8 +20,20 @@ co_occurrence <- function(data) {
 
     for (col_name in data_names) {
 
+      row_var <- dplyr::pull(data, row_name)
+      col_var <- dplyr::pull(data, col_name)
+
+      if (na.rm) {
+
+        # remove missing values
+        na_lgl <- !is.na(row_var) & !is.na(col_var)
+        row_var <- row_var[na_lgl]
+        col_var <- col_var[na_lgl]
+
+      } 
+
       co_occurence_matrix[row_name, col_name] <- 
-        mean(dplyr::pull(data, row_name) != 0 & dplyr::pull(data, col_name) != 0)
+        mean(row_var != 0 & col_var != 0)
 
     }
 

diff --git a/R/util_co_ocurrence.R b/R/util_co_ocurrence.R
@@ -1,7 +1,10 @@
 #' Calculate the co-occurrence fit metric of a confidential data set.
 #'
-#' @param postsynth A postsynth object from tidysynthesis or a tibble
+#' @param postsynth a postsynth object from tidysynthesis or a tibble
 #' @param data an original (observed) data set.
+#' @param na.rm a logical indicating whether missing values should be removed. 
+#'  Note: values are jointly removed for each pair of variables even if only one
+#'  value is missing.
 #'
 #' @return A `list` of fit metrics:
 #'  - `co_occurrence_original`: co-occurrence matrix of the original data.
@@ -19,7 +22,7 @@
 #'
 #' @export
 #' 
-util_co_occurrence <- function(postsynth, data) {
+util_co_occurrence <- function(postsynth, data, na.rm = FALSE) {
 
   if (is_postsynth(postsynth)) {
 
@@ -44,7 +47,7 @@ util_co_occurrence <- function(postsynth, data) {
     co_occurrence_matrix <-
       x %>%
       dplyr::select_if(is.numeric) %>%
-      co_occurrence()
+      co_occurrence(na.rm = na.rm)
 
     # set the values in the upper triangle to zero to avoid double counting
     co_occurrence_matrix[upper.tri(co_occurrence_matrix, diag = TRUE)] <- NA

diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R
@@ -2,6 +2,10 @@
 #'
 #' @param postsynth A postsynth object from tidysynthesis or a tibble
 #' @param data an original (observed) data set.
+#' @param use optional character string giving a method for computing 
+#' covariances in the presence of missing values. This must be (an abbreviation 
+#' of) one of the strings "everything", "all.obs", "complete.obs", 
+#' "na.or.complete", or "pairwise.complete.obs".
 #'
 #' @return A `list` of fit metrics:
 #'  - `correlation_original`: correlation matrix of the original data.
@@ -16,7 +20,7 @@
 #'
 #' @export
 
-util_corr_fit <- function(postsynth, data) {
+util_corr_fit <- function(postsynth, data, use = "everything") {
 
   if (is_postsynth(postsynth)) {
 
@@ -35,13 +39,13 @@ util_corr_fit <- function(postsynth, data) {
   data <- dplyr::select(data, names(synthetic_data))
 
   # helper function to find a correlation matrix with the upper tri set to zeros
-  lower_triangle <- function(x) {
+  lower_triangle <- function(x, use) {
 
     # find the linear correlation matrix of numeric variables from a data set
     correlation_matrix <-
       x %>%
       dplyr::select_if(is.numeric) %>%
-      stats::cor()
+      stats::cor(use = use)
 
     # set the values in the upper triangle to zero to avoid double counting
     correlation_matrix[upper.tri(correlation_matrix, diag = TRUE)] <- NA
@@ -50,10 +54,10 @@ util_corr_fit <- function(postsynth, data) {
   }
 
   # find the lower triangle of the original data linear correlation matrix
-  original_lt <- lower_triangle(data)
+  original_lt <- lower_triangle(data, use = use)
 
   # find the lower triangle of the synthetic data linear correlation matrix
-  synthetic_lt <- lower_triangle(synthetic_data)
+  synthetic_lt <- lower_triangle(synthetic_data, use = use)
 
   # compare names
   if (any(rownames(original_lt) != rownames(synthetic_lt))) {

diff --git a/R/util_ks_distance.R b/R/util_ks_distance.R
@@ -1,8 +1,9 @@
 #' Calculate the Kolmogorov-Smirnov distance (D) for each numeric variable in 
 #' the synthetic and confidential data
 #'
-#' @param postsynth A postsynth object or tibble with synthetic data
-#' @param data A data frame with the original data
+#' @param postsynth a postsynth object or tibble with synthetic data
+#' @param data a data frame with the original data
+#' @param na.rm a logical indicating whether missing values should be removed.
 #'
 #' @return A tibble with the D and location of the largest distance for each 
 #' numeric variable
@@ -11,7 +12,7 @@
 #' 
 #' @export
 #'
-util_ks_distance <- function(postsynth, data) {
+util_ks_distance <- function(postsynth, data, na.rm = FALSE) {
 
   if ("postsynth" %in% class(postsynth)) {
 
@@ -55,20 +56,31 @@ util_ks_distance <- function(postsynth, data) {
   names(distances) <- variables
   for (var in variables) {
 
+    var_synth <- dplyr::pull(synthetic_data, var)
+    var_data <- dplyr::pull(data, var)
+
+    # drop missing values
+    if (na.rm) {
+
+      var_synth <- var_synth[!is.na(var_synth)]
+      var_data <- var_data[!is.na(var_data)]
+
+    }
+
     # find the eCDFs for both variables
-    ecdf_synth <- stats::ecdf(dplyr::pull(synthetic_data, var))
-    ecdf_orig <- stats::ecdf(dplyr::pull(data, var))
+    ecdf_synth <- stats::ecdf(var_synth)
+    ecdf_orig <- stats::ecdf(var_data)
 
     # calculate the minimum and maximum across both variables
-    minimum <- min(c(dplyr::pull(synthetic_data, var), dplyr::pull(data, var)))
-    maximum <- max(c(dplyr::pull(synthetic_data, var), dplyr::pull(data, var)))
+    minimum <- min(c(var_synth, var_data))
+    maximum <- max(c(var_synth, var_data))
 
     # create a grid of values for calculating the distances between the two
     # eCDFs
     z <- seq(
       from = minimum, 
       to = maximum,
-      length.out = min(nrow(synthetic_data), nrow(data), 10000)
+      length.out = min(length(var_synth), length(var_data), 10000)
     )
 
     # for each variable, find D and the location of D

diff --git a/man/co_occurrence.Rd b/man/co_occurrence.Rd
diff --git a/man/util_co_occurrence.Rd b/man/util_co_occurrence.Rd
diff --git a/man/util_corr_fit.Rd b/man/util_corr_fit.Rd
diff --git a/man/util_ks_distance.Rd b/man/util_ks_distance.Rd
diff --git a/tests/testthat/test-util_co_occurrence.R b/tests/testthat/test-util_co_occurrence.R
@@ -8,23 +8,6 @@ syn <- list(synthetic_data = data.frame(a = c(1, 0, 0, 0),
                                         b = c(1, 0, 0, 0))) %>%
   structure(class = "postsynth")
 
-
-
-
-
-
-# # difference matrix for tests
-# diff_matrix <- matrix(
-#   c(NA, NA, NA,
-#     -2, NA, NA,
-#     0, -2, NA),
-#   ncol = 3,
-#   byrow = TRUE
-# ) 
-
-# rownames(diff_matrix) <- c("a", "c", "b")
-# colnames(diff_matrix) <- c("a", "c", "b")
-
 # test with postsynth
 test_that("util_co_occurrence() is correct with identical data ", {
 
@@ -54,3 +37,22 @@ test_that("util_co_occurrence() is correct with different data ", {
   expect_equal(co_occurrence$co_occurrence_difference_mae, mean(abs(-0.25)))
   expect_equal(co_occurrence$co_occurrence_difference_rmse, sqrt(mean((-0.25) ^ 2)))
 })
+
+test_that("util_co_occurrence() works with NA ", {
+
+  syn <- list(
+    synthetic_data = acs_conf
+  ) %>%
+    structure(class = "postsynth")
+
+  co_occurrence <- util_co_occurrence(
+    postsynth = syn, 
+    data = acs_conf,
+    na.rm = TRUE
+  )
+
+  expect_equal(max(co_occurrence$co_occurrence_difference, na.rm = TRUE), 0)
+  expect_equal(co_occurrence$co_occurrence_difference_mae, 0)
+  expect_equal(co_occurrence$co_occurrence_difference_rmse, 0)
+
+})
diff --git a/tests/testthat/test-util_corr_fit.R b/tests/testthat/test-util_corr_fit.R
@@ -48,3 +48,22 @@ test_that("util_corr_fit is correct with postsynth ", {
   expect_equal(corr$correlation_difference_mae, mean(abs(c(0, -2, -2))))
   expect_equal(corr$correlation_difference_rmse, sqrt(mean(c(0, -2, -2) ^ 2)))
 })
+
+test_that("util_corr_fit works with NA ", {
+
+  syn <- list(
+    synthetic_data = acs_conf
+  ) %>%
+    structure(class = "postsynth")
+
+  corr <- util_corr_fit(
+    postsynth = syn, 
+    data = acs_conf,
+    use = "pairwise.complete.obs"
+  )
+
+  expect_equal(max(corr$correlation_difference, na.rm = TRUE), 0)
+  expect_equal(corr$correlation_fit, 0)
+  expect_equal(corr$correlation_difference_mae, 0)
+  expect_equal(corr$correlation_difference_rmse, 0)
+})