Skip to content

Commit

Permalink
Merge pull request #102 from UrbanInstitute/iss030
Browse files Browse the repository at this point in the history
Iss030
  • Loading branch information
awunderground authored Oct 25, 2024
2 parents 133a122 + 38b722a commit 3adc56b
Show file tree
Hide file tree
Showing 11 changed files with 140 additions and 58 deletions.
17 changes: 15 additions & 2 deletions R/co_occurence.R
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
#' Construct a co-occurrence matrix
#'
#' @param data A tibble with numeric variables
#' @param na.rm a logical indicating whether missing values should be removed.
#'
#' @return A co-occurrence matrix
#'
co_occurrence <- function(data) {
co_occurrence <- function(data, na.rm = FALSE) {

# create a vector of variable names
data_names <- names(data)
Expand All @@ -19,8 +20,20 @@ co_occurrence <- function(data) {

for (col_name in data_names) {

row_var <- dplyr::pull(data, row_name)
col_var <- dplyr::pull(data, col_name)

if (na.rm) {

# remove missing values
na_lgl <- !is.na(row_var) & !is.na(col_var)
row_var <- row_var[na_lgl]
col_var <- col_var[na_lgl]

}

co_occurence_matrix[row_name, col_name] <-
mean(dplyr::pull(data, row_name) != 0 & dplyr::pull(data, col_name) != 0)
mean(row_var != 0 & col_var != 0)

}

Expand Down
9 changes: 6 additions & 3 deletions R/util_co_ocurrence.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
#' Calculate the co-occurrence fit metric of a confidential data set.
#'
#' @param postsynth A postsynth object from tidysynthesis or a tibble
#' @param postsynth a postsynth object from tidysynthesis or a tibble
#' @param data an original (observed) data set.
#' @param na.rm a logical indicating whether missing values should be removed.
#' Note: values are jointly removed for each pair of variables even if only one
#' value is missing.
#'
#' @return A `list` of fit metrics:
#' - `co_occurrence_original`: co-occurrence matrix of the original data.
Expand All @@ -19,7 +22,7 @@
#'
#' @export
#'
util_co_occurrence <- function(postsynth, data) {
util_co_occurrence <- function(postsynth, data, na.rm = FALSE) {

if (is_postsynth(postsynth)) {

Expand All @@ -44,7 +47,7 @@ util_co_occurrence <- function(postsynth, data) {
co_occurrence_matrix <-
x %>%
dplyr::select_if(is.numeric) %>%
co_occurrence()
co_occurrence(na.rm = na.rm)

# set the values in the upper triangle to zero to avoid double counting
co_occurrence_matrix[upper.tri(co_occurrence_matrix, diag = TRUE)] <- NA
Expand Down
14 changes: 9 additions & 5 deletions R/util_corr_fit.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
#'
#' @param postsynth A postsynth object from tidysynthesis or a tibble
#' @param data an original (observed) data set.
#' @param use optional character string giving a method for computing
#' covariances in the presence of missing values. This must be (an abbreviation
#' of) one of the strings "everything", "all.obs", "complete.obs",
#' "na.or.complete", or "pairwise.complete.obs".
#'
#' @return A `list` of fit metrics:
#' - `correlation_original`: correlation matrix of the original data.
Expand All @@ -16,7 +20,7 @@
#'
#' @export

util_corr_fit <- function(postsynth, data) {
util_corr_fit <- function(postsynth, data, use = "everything") {

if (is_postsynth(postsynth)) {

Expand All @@ -35,13 +39,13 @@ util_corr_fit <- function(postsynth, data) {
data <- dplyr::select(data, names(synthetic_data))

# helper function to find a correlation matrix with the upper tri set to zeros
lower_triangle <- function(x) {
lower_triangle <- function(x, use) {

# find the linear correlation matrix of numeric variables from a data set
correlation_matrix <-
x %>%
dplyr::select_if(is.numeric) %>%
stats::cor()
stats::cor(use = use)

# set the values in the upper triangle to zero to avoid double counting
correlation_matrix[upper.tri(correlation_matrix, diag = TRUE)] <- NA
Expand All @@ -50,10 +54,10 @@ util_corr_fit <- function(postsynth, data) {
}

# find the lower triangle of the original data linear correlation matrix
original_lt <- lower_triangle(data)
original_lt <- lower_triangle(data, use = use)

# find the lower triangle of the synthetic data linear correlation matrix
synthetic_lt <- lower_triangle(synthetic_data)
synthetic_lt <- lower_triangle(synthetic_data, use = use)

# compare names
if (any(rownames(original_lt) != rownames(synthetic_lt))) {
Expand Down
28 changes: 20 additions & 8 deletions R/util_ks_distance.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#' Calculate the Kolmogorov-Smirnov distance (D) for each numeric variable in
#' the synthetic and confidential data
#'
#' @param postsynth A postsynth object or tibble with synthetic data
#' @param data A data frame with the original data
#' @param postsynth a postsynth object or tibble with synthetic data
#' @param data a data frame with the original data
#' @param na.rm a logical indicating whether missing values should be removed.
#'
#' @return A tibble with the D and location of the largest distance for each
#' numeric variable
Expand All @@ -11,7 +12,7 @@
#'
#' @export
#'
util_ks_distance <- function(postsynth, data) {
util_ks_distance <- function(postsynth, data, na.rm = FALSE) {

if ("postsynth" %in% class(postsynth)) {

Expand Down Expand Up @@ -55,20 +56,31 @@ util_ks_distance <- function(postsynth, data) {
names(distances) <- variables
for (var in variables) {

var_synth <- dplyr::pull(synthetic_data, var)
var_data <- dplyr::pull(data, var)

# drop missing values
if (na.rm) {

var_synth <- var_synth[!is.na(var_synth)]
var_data <- var_data[!is.na(var_data)]

}

# find the eCDFs for both variables
ecdf_synth <- stats::ecdf(dplyr::pull(synthetic_data, var))
ecdf_orig <- stats::ecdf(dplyr::pull(data, var))
ecdf_synth <- stats::ecdf(var_synth)
ecdf_orig <- stats::ecdf(var_data)

# calculate the minimum and maximum across both variables
minimum <- min(c(dplyr::pull(synthetic_data, var), dplyr::pull(data, var)))
maximum <- max(c(dplyr::pull(synthetic_data, var), dplyr::pull(data, var)))
minimum <- min(c(var_synth, var_data))
maximum <- max(c(var_synth, var_data))

# create a grid of values for calculating the distances between the two
# eCDFs
z <- seq(
from = minimum,
to = maximum,
length.out = min(nrow(synthetic_data), nrow(data), 10000)
length.out = min(length(var_synth), length(var_data), 10000)
)

# for each variable, find D and the location of D
Expand Down
4 changes: 3 additions & 1 deletion man/co_occurrence.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 6 additions & 2 deletions man/util_co_occurrence.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 6 additions & 1 deletion man/util_corr_fit.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 5 additions & 3 deletions man/util_ks_distance.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

36 changes: 19 additions & 17 deletions tests/testthat/test-util_co_occurrence.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,6 @@ syn <- list(synthetic_data = data.frame(a = c(1, 0, 0, 0),
b = c(1, 0, 0, 0))) %>%
structure(class = "postsynth")






# # difference matrix for tests
# diff_matrix <- matrix(
# c(NA, NA, NA,
# -2, NA, NA,
# 0, -2, NA),
# ncol = 3,
# byrow = TRUE
# )

# rownames(diff_matrix) <- c("a", "c", "b")
# colnames(diff_matrix) <- c("a", "c", "b")

# test with postsynth
test_that("util_co_occurrence() is correct with identical data ", {

Expand Down Expand Up @@ -54,3 +37,22 @@ test_that("util_co_occurrence() is correct with different data ", {
expect_equal(co_occurrence$co_occurrence_difference_mae, mean(abs(-0.25)))
expect_equal(co_occurrence$co_occurrence_difference_rmse, sqrt(mean((-0.25) ^ 2)))
})

test_that("util_co_occurrence() works with NA ", {

syn <- list(
synthetic_data = acs_conf
) %>%
structure(class = "postsynth")

co_occurrence <- util_co_occurrence(
postsynth = syn,
data = acs_conf,
na.rm = TRUE
)

expect_equal(max(co_occurrence$co_occurrence_difference, na.rm = TRUE), 0)
expect_equal(co_occurrence$co_occurrence_difference_mae, 0)
expect_equal(co_occurrence$co_occurrence_difference_rmse, 0)

})
19 changes: 19 additions & 0 deletions tests/testthat/test-util_corr_fit.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,22 @@ test_that("util_corr_fit is correct with postsynth ", {
expect_equal(corr$correlation_difference_mae, mean(abs(c(0, -2, -2))))
expect_equal(corr$correlation_difference_rmse, sqrt(mean(c(0, -2, -2) ^ 2)))
})

test_that("util_corr_fit works with NA ", {

syn <- list(
synthetic_data = acs_conf
) %>%
structure(class = "postsynth")

corr <- util_corr_fit(
postsynth = syn,
data = acs_conf,
use = "pairwise.complete.obs"
)

expect_equal(max(corr$correlation_difference, na.rm = TRUE), 0)
expect_equal(corr$correlation_fit, 0)
expect_equal(corr$correlation_difference_mae, 0)
expect_equal(corr$correlation_difference_rmse, 0)
})
Loading

0 comments on commit 3adc56b

Please sign in to comment.