diff --git a/DESCRIPTION b/DESCRIPTION index 65345b2e67..d8f5cca3b6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: admiral Title: ADaM in R Asset Library -Version: 1.2.0.9005 +Version: 1.2.0.9006 Authors@R: c( person("Ben", "Straub", , "ben.x.straub@gsk.com", role = c("aut", "cre")), person("Stefan", "Bundfuss", role = "aut", diff --git a/NEWS.md b/NEWS.md index 2cdf5cd8f6..fe5e14d22a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,7 @@ ## Updates of Existing Functions +- The function `extract_duplicate_records()` was updated to consider all variables in the input dataset for the by group if the `by_vars` argument is omitted entirely. (#2644) - In `slice_derivation`, previously the derivation is not called for empty subsets, however this can lead to issues when the input dataset is empty. Now the derivation is called for all subsets. ## Breaking Changes diff --git a/R/duplicates.R b/R/duplicates.R index d6c42a221f..5bc384285c 100644 --- a/R/duplicates.R +++ b/R/duplicates.R @@ -39,6 +39,11 @@ get_duplicates_dataset <- function() { #' @param by_vars Grouping variables #' #' Defines groups of records in which to look for duplicates. +#' If omitted, all variables in the input dataset are used in the by group. +#' +#' **Note:** Omitting `by_vars` will increase the function's run-time, so it is +#' recommended to specify the necessary grouping variables for large datasets +#' whenever possible. 
#' #' `r roxygen_param_by_vars()` #' @@ -55,9 +60,14 @@ get_duplicates_dataset <- function() { #' adsl <- rbind(admiral_adsl[1L, ], admiral_adsl) #' #' extract_duplicate_records(adsl, exprs(USUBJID)) -extract_duplicate_records <- function(dataset, by_vars) { - assert_expr_list(by_vars) - assert_data_frame(dataset, required_vars = extract_vars(by_vars), check_is_grouped = FALSE) +extract_duplicate_records <- function(dataset, by_vars = NULL) { + if (is.null(by_vars)) { + assert_data_frame(dataset, check_is_grouped = FALSE) + by_vars <- exprs(!!!parse_exprs(names(dataset))) + } else { + assert_expr_list(by_vars) + assert_data_frame(dataset, required_vars = extract_vars(by_vars), check_is_grouped = FALSE) + } data_by <- dataset %>% ungroup() %>% diff --git a/man/extract_duplicate_records.Rd b/man/extract_duplicate_records.Rd index 816990253c..abec0f3ce1 100644 --- a/man/extract_duplicate_records.Rd +++ b/man/extract_duplicate_records.Rd @@ -4,7 +4,7 @@ \alias{extract_duplicate_records} \title{Extract Duplicate Records} \usage{ -extract_duplicate_records(dataset, by_vars) +extract_duplicate_records(dataset, by_vars = NULL) } \arguments{ \item{dataset}{Input dataset @@ -14,6 +14,11 @@ The variables specified by the \code{by_vars} argument are expected to be in the \item{by_vars}{Grouping variables Defines groups of records in which to look for duplicates. +If omitted, all variables in the input dataset are used in the by group. + +\strong{Note:} Omitting \code{by_vars} will increase the function's run-time, so it is +recommended to specify the necessary grouping variables for large datasets +whenever possible. \emph{Permitted Values}: list of variables created by \code{exprs()} e.g. 
\code{exprs(USUBJID, VISIT)}}
diff --git a/tests/testthat/_snaps/duplicates.md b/tests/testthat/_snaps/duplicates.md
index 47c9371b14..04bb4d6c63 100644
--- a/tests/testthat/_snaps/duplicates.md
+++ b/tests/testthat/_snaps/duplicates.md
@@ -1,4 +1,4 @@
-# signal_duplicate_records Test 2: dataset of duplicate records can be accessed using `get_duplicates_dataset()`
+# signal_duplicate_records Test 3: dataset of duplicate records can be accessed using `get_duplicates_dataset()`
 
     Code
       get_duplicates_dataset()
diff --git a/tests/testthat/test-duplicates.R b/tests/testthat/test-duplicates.R
index d8715d08cb..1c4d191251 100644
--- a/tests/testthat/test-duplicates.R
+++ b/tests/testthat/test-duplicates.R
@@ -18,9 +18,31 @@ test_that("extract_duplicate_records Test 1: duplicate records are extracted", {
   )
 })
 
+## Test 2: duplicate records for all variables ----
+test_that("extract_duplicate_records Test 2: duplicate records for all variables", {
+  input <- tibble::tribble(
+    ~USUBJID, ~COUNTRY, ~AAGE,
+    "P01", "GER", 22,
+    "P01", "JPN", 34,
+    "P02", "CZE", 41,
+    "P03", "AUS", 39,
+    "P04", "BRA", 21,
+    "P04", "BRA", 21
+  )
+  # Rows 5 and 6 agree in every column, so with `by_vars` omitted they are
+  # the only records expected to be reported as duplicates.
+  expected_output <- input[5:6, ]
+
+  expect_equal(
+    extract_duplicate_records(input),
+    expected_output
+  )
+})
+
+
 # signal_duplicate_records ----
-## Test 2: dataset of duplicate records can be accessed using `get_duplicates_dataset()` ----
-test_that("signal_duplicate_records Test 2: dataset of duplicate records can be accessed using `get_duplicates_dataset()`", { # nolint
+## Test 3: dataset of duplicate records can be accessed using `get_duplicates_dataset()` ----
+test_that("signal_duplicate_records Test 3: dataset of duplicate records can be accessed using `get_duplicates_dataset()`", { # nolint
   input <- tibble::tribble(
     ~USUBJID, ~COUNTRY, ~AAGE,
     "P01", "GER", 22,