Rdatatable · venom1204 · Jan 8, 2025 · Jan 8, 2025 · Jan 14, 2025 · Jan 14, 2025
@@ -11,15 +11,16 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
       by = key(x)
     }
   }
-  x0 = length(x)==0L
-  y0 = length(y)==0L
-  if (x0 || y0) {
-    if (x0 && y0)
+  x0 = length(x) == 0L
+  y0 = length(y) == 0L
+  if (x0 || y0) {  # at least one input has zero columns: warn, naming which one
+    if (x0 && y0) {
       warningf("Neither of the input data.tables to join have columns.")
-    else if (x0)
+    } else if (x0) {
       warningf("Input data.table '%s' has no columns.", "x")
-    else
+    } else {  # x has columns, so y must be the empty input
       warningf("Input data.table '%s' has no columns.", "y")
+    }
   }
   check_duplicate_names(x)
   check_duplicate_names(y)
@@ -28,34 +29,42 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
   nm_y = names(y)
 
   ## set up 'by'/'by.x'/'by.y'
-  if ( (!is.null(by.x) || !is.null(by.y)) && length(by.x)!=length(by.y) )
-    stopf("`by.x` and `by.y` must be of same length.")
+  if ((!is.null(by.x) || !is.null(by.y)) && length(by.x) != length(by.y))
+    stopf("'by.x' and 'by.y' must be of same length.")
   if (!missing(by) && !missing(by.x))
-    warningf("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.")
+    warningf("Supplied both by and 'by.x/by.y.' by argument will be ignored.")
   if (!is.null(by.x)) {
-    if (length(by.x)==0L || !is.character(by.x) || !is.character(by.y))
-      stopf("A non-empty vector of column names is required for `by.x` and `by.y`.")
-    if (!all(by.x %chin% nm_x))
-      stopf("Elements listed in `by.x` must be valid column names in x.")
-    if (!all(by.y %chin% nm_y))
-      stopf("Elements listed in `by.y` must be valid column names in y.")
+    if (length(by.x) == 0L || !is.character(by.x) || !is.character(by.y))
+      stopf("A non-empty vector of column names is required for 'by.x' and 'by.y'.")
+    if (!all(by.x %chin% nm_x)) {  # report exactly which requested 'by.x' columns x lacks
+      missing_in_x = setdiff(by.x, nm_x)
+      stopf("The following columns listed in 'by.x' are missing from x: %s", brackify(missing_in_x))
+    }
+    if (!all(by.y %chin% nm_y)) {  # same check for 'by.y' against y
+      missing_in_y = setdiff(by.y, nm_y)
+      stopf("The following columns listed in 'by.y' are missing from y: %s", brackify(missing_in_y))
+    }
     by = by.x
     names(by) = by.y
   } else {
     if (is.null(by))
       by = intersect(key(x), key(y))
-    if (!length(by))   # was is.null() before PR#5183  changed to !length()
+    if (!length(by)) # was is.null() before PR#5183 changed to !length()
       by = key(x)
     if (!length(by))
       by = intersect(nm_x, nm_y)
     if (length(by) == 0L || !is.character(by))
-      stopf("A non-empty vector of column names for `by` is required.")
-    if (!all(by %chin% intersect(nm_x, nm_y)))
-      stopf("Elements listed in `by` must be valid column names in x and y")
+      stopf("A non-empty vector of column names for by is required.")
+    missing_in_x = setdiff(by, nm_x)  # 'by' columns that x does not have
+    missing_in_y = setdiff(by, nm_y)  # 'by' columns that y does not have
+    if (length(missing_in_x) > 0L || length(missing_in_y) > 0L) {  # one error listing both sides
+      stopf("The following columns are missing:\n%s%s",  # template goes to stopf directly: a pre-formatted message would have '%' in column names re-read as format specs
+        if (length(missing_in_x) > 0L) gettextf(" - From x: %s\n", brackify(missing_in_x)) else "",
+        if (length(missing_in_y) > 0L) gettextf(" - From y: %s\n", brackify(missing_in_y)) else "")
+    }
     by = unname(by)
     by.x = by.y = by
   }
-
   # warn about unused arguments #2587
   if (length(list(...))) {
     ell = as.list(substitute(list(...)))[-1L]
@@ -109,7 +118,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
   }
 
   # Throw warning if there are duplicate column names in 'dt' (i.e. if
-  # `suffixes=c("","")`, to match behaviour in base:::merge.data.frame)
+  # `suffixes=c("","")`, to match behaviour in base:::merge.data.frame)
   resultdupnames = names(dt)[duplicated(names(dt))]
   if (length(resultdupnames)) {
     warningf("column names %s are duplicated in the result", brackify(resultdupnames))

@@ -8563,18 +8563,21 @@ test(1600.2, names(DT1[DT2, .(id1=id1, val=val, bla=sum(z1, na.rm=TRUE)), on="id
 
 # warn when merge empty data.table #597
 DT0 = data.table(NULL)
-DT1 = data.table(a=1)
-test(1601.1, merge(DT1, DT1, by="a"), data.table(a=1, key="a"))
-test(1601.2, merge(DT1, DT0, by="a"),
-     warning="Input data.table 'y' has no columns.",
-     error="Elements listed in `by`")
-test(1601.3, merge(DT0, DT1, by="a"),
-     warning="Input data.table 'x' has no columns.",
-     error="Elements listed in `by`")
-test(1601.4, merge(DT0, DT0, by="a"),
-     warning="Neither of the input data.tables to join have columns.",
-     error="Elements listed in `by`")
-
+DT1 = data.table(a = 1)
+
+test(1601.1, merge(DT1, DT1, by = "a"), data.table(a = 1, key = "a"))
+test(1601.2, 
+     merge(DT1, DT0, by = "a"), 
+     warning = "Input data.table 'y' has no columns.",
+     error = "The following columns are missing:\n - From y: [a]")
+test(1601.3, 
+     merge(DT0, DT1, by = "a"), 
+     warning = "Input data.table 'x' has no columns.",
+     error = "The following columns are missing:\n - From x: [a]")
+test(1601.4, 
+     merge(DT0, DT0, by = "a"), 
+     warning = "Neither of the input data.tables to join have columns.",
+     error = "The following columns are missing:\n - From x: [a]\n - From y: [a]")
 # fix for #1549
 d1 <- data.table(v1=1:2,x=x)
 d2 <- data.table(v1=3:4)
@@ -13546,14 +13549,17 @@ test(1962.016, merge(DT1, DT2, by.x = 'a', by.y = c('a', 'V')),
 test(1962.017, merge(DT1, DT2, by = 'V', by.x = 'a', by.y = 'a'),
      data.table(a = 2:3, V.x = c("a", "a"), V.y = c("b", "b"), key = 'a'),
      warning = 'Supplied both.*argument will be ignored')
-test(1962.018, merge(DT1, DT2, by.x = 'z', by.y = 'a'),
-     error = 'Elements listed in `by.x`')
-test(1962.019, merge(DT1, DT2, by.x = 'a', by.y = 'z'),
-     error = 'Elements listed in `by.y`')
+test(1962.018, 
+     merge(DT1, DT2, by.x = 'z', by.y = 'a'), 
+     error = "The following columns listed in 'by.x' are missing from x: [z]")
+test(1962.019, 
+     merge(DT1, DT2, by.x = 'a', by.y = 'z'), 
+     error = "The following columns listed in 'by.y' are missing from y: [z]")
 test(1962.0201, merge(DT1, DT2, by=character(0L)), ans)  # was error before PR#5183
 test(1962.0202, merge(DT1, DT2, by=NULL),          ans)  # test explicit NULL too as missing() could be used inside merge()
-test(1962.021, merge(DT1, DT2, by = 'z'),
-     error = 'must be valid column names in x and y')
+test(1962.021, {
+  merge(DT1, DT2, by = 'z')
+}, error = 'The following columns are missing:\n - From x: [z]\n - From y: [z]')
 
 ## frank.R
 x = c(1, 1, 2, 5, 4, 3, 4, NA, 6)
@@ -16931,7 +16937,7 @@ test(2144, rbind(DT,list(c=4L,a=7L)), error="Column 1 ['c'] of item 2 is missing
 A = data.table(A='a')
 B = data.table(B='b')
 test(2145.1, A[B, on=character(0)], error = "'on' argument should be a named atomic vector")
-test(2145.2, merge(A, B, by=character(0)                                     ), error = "non-empty vector of column names for `by` is required.")
+test(2145.2, merge(A, B, by=character(0)                                     ), error = "A non-empty vector of column names for by is required.")
 test(2145.3, merge(A, B,                 by.x=character(0), by.y=character(0)), error = "non-empty vector of column names is required")
 # Also shouldn't crash when using internal functions
 test(2145.4, bmerge(A, B, integer(), integer(), 0, c(FALSE, TRUE), NA, 'all', integer(), FALSE), error = 'icols and xcols must be non-empty')
@@ -18014,7 +18020,7 @@ test(2230.4, setDF(merge(DT, y, by="k2", incomparables=c(1, NA, 4, 5))), merge(x
 test(2230.5, setDF(merge(DT, y, by="k2", incomparables=c(NA, 3, 4, 5))), merge(x, y, by="k2", incomparables=c(NA,3,4,5)))
 test(2230.6, merge(DT, y, by="k2", unk=1), merge(DT, y, by="k2"), warning="Unknown argument 'unk' has been passed.")
 test(2230.7, merge(DT, y, by="k2", NULL, NULL, FALSE, FALSE, FALSE, TRUE, c(".x", ".y"), TRUE, getOption("datatable.allow.cartesian"), NULL, 1L),
-             merge(DT, y, by="k2"), warning=c("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.", "Passed 1 unknown and unnamed arguments."))
+             merge(DT, y, by="k2"), warning=c("Supplied both by and 'by.x/by.y.' by argument will be ignored.", "Passed 1 unknown and unnamed arguments."))
 
 # weighted.mean GForce optimized, #3977
 old = options(datatable.optimize=1L)