Closes #2644: Use all variables for extract_duplicate_records by defa…

…ult (#2651) * Closes #2644: Use all variables for extract_duplicate_records by default * update `by_vars` to use all variables * add test for `by_vars = NULL` * update documentation * update NEWS * Update NEWS.md Co-authored-by: Edoardo Mancini <[email protected]> * Update R/duplicates.R Co-authored-by: Edoardo Mancini <[email protected]> * Update NEWS.md Co-authored-by: Edoardo Mancini <[email protected]> * Update extract_duplicate_records.Rd * add note for `by_vars` in `extract_duplicate_records` * Update R/duplicates.R Co-authored-by: Edoardo Mancini <[email protected]> * update man/extract_duplicate_records.Rd * fixed linter warning `the trailing whitespace is superfluous ` --------- Co-authored-by: ynsec37 <[email protected]> Co-authored-by: Edoardo Mancini <[email protected]>
pharmaverse · Feb 4, 2025 · 9cd0b01 · 9cd0b01
1 parent 6d44b6d
commit 9cd0b01
Show file tree

Hide file tree

Showing 5 changed files with 43 additions and 7 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -4,6 +4,7 @@
 
 ## Updates of Existing Functions
 
+- The function `extract_duplicate_records()` was updated to consider all variables in the input dataset for the by group if the `by_vars` argument is omitted entirely. (#2644)
 - In `slice_derivation`, previously the derivation is not called for empty subsets, however this can lead to issues when the input dataset is empty. Now the derivation is called for all subsets.
 
 ## Breaking Changes

diff --git a/R/duplicates.R b/R/duplicates.R
@@ -39,6 +39,11 @@ get_duplicates_dataset <- function() {
 #' @param by_vars Grouping variables
 #'
 #'  Defines groups of records in which to look for duplicates.
+#'  If omitted, all variables in the input dataset are used in the by group.
+#'
+#'  **Note:** Omitting `by_vars` increases the function's run-time because
+#'  every variable in the dataset is then used for grouping. For large datasets
+#'  it is recommended to specify only the necessary grouping variables.
 #'
 #' `r roxygen_param_by_vars()`
 #'
@@ -55,9 +60,14 @@ get_duplicates_dataset <- function() {
 #' adsl <- rbind(admiral_adsl[1L, ], admiral_adsl)
 #'
 #' extract_duplicate_records(adsl, exprs(USUBJID))
-extract_duplicate_records <- function(dataset, by_vars) {
-  assert_expr_list(by_vars)
-  assert_data_frame(dataset, required_vars = extract_vars(by_vars), check_is_grouped = FALSE)
+extract_duplicate_records <- function(dataset, by_vars = NULL) {
+  if (is.null(by_vars)) {
+    assert_data_frame(dataset, check_is_grouped = FALSE)
+    by_vars <- exprs(!!!parse_exprs(names(dataset)))
+  } else {
+    assert_expr_list(by_vars)
+    assert_data_frame(dataset, required_vars = extract_vars(by_vars), check_is_grouped = FALSE)
+  }
 
   data_by <- dataset %>%
     ungroup() %>%

diff --git a/man/extract_duplicate_records.Rd b/man/extract_duplicate_records.Rd
diff --git a/tests/testthat/_snaps/duplicates.md b/tests/testthat/_snaps/duplicates.md
@@ -1,4 +1,4 @@
-# signal_duplicate_records Test 2: dataset of duplicate records can be accessed using `get_duplicates_dataset()`
+# signal_duplicate_records Test 3: dataset of duplicate records can be accessed using `get_duplicates_dataset()`
 
     Code
       get_duplicates_dataset()

diff --git a/tests/testthat/test-duplicates.R b/tests/testthat/test-duplicates.R
@@ -18,9 +18,29 @@ test_that("extract_duplicate_records Test 1: duplicate records are extracted", {
   )
 })
 
+## Test 2: duplicate records for all variables ----
+test_that("extract_duplicate_records Test 2: duplicate records for all variables", {
+  input <- tibble::tribble(
+    ~USUBJID, ~COUNTRY, ~AAGE,
+    "P01",    "GER",    22,
+    "P01",    "JPN",    34,
+    "P02",    "CZE",    41,
+    "P03",    "AUS",    39,
+    "P04",    "BRA",    21,
+    "P04",    "BRA",    21
+  )
+  expected_output <- input[5:6, ]
+  # `by_vars` omitted: all columns form the by group, so only the identical P04 rows match.
+  expect_equal(
+    extract_duplicate_records(input),
+    expected_output
+  )
+})
+
+
 # signal_duplicate_records ----
-## Test 2: dataset of duplicate records can be accessed using `get_duplicates_dataset()` ----
-test_that("signal_duplicate_records Test 2: dataset of duplicate records can be accessed using `get_duplicates_dataset()`", { # nolint
+## Test 3: dataset of duplicate records can be accessed using `get_duplicates_dataset()` ----
+test_that("signal_duplicate_records Test 3: dataset of duplicate records can be accessed using `get_duplicates_dataset()`", { # nolint
   input <- tibble::tribble(
     ~USUBJID, ~COUNTRY, ~AAGE,
     "P01",    "GER",    22,