Rdatatable · venom1204 · Jan 8, 2025 · Jan 8, 2025 · Jan 14, 2025 · Jan 14, 2025
@@ -35,10 +35,16 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
   if (!is.null(by.x)) {
     if (length(by.x)==0L || !is.character(by.x) || !is.character(by.y))
       stopf("A non-empty vector of column names is required for `by.x` and `by.y`.")
-    if (!all(by.x %chin% nm_x))
-      stopf("Elements listed in `by.x` must be valid column names in x.")
-    if (!all(by.y %chin% nm_y))
-      stopf("Elements listed in `by.y` must be valid column names in y.")
+    if (!all(by.x %chin% nm_x)) {
+      missing_in_x <- setdiff(by.x, nm_x)
+      stopf("The following columns listed in `by.x` are missing from `x`: %s",
+            toString(missing_in_x))
+    }
+    if (!all(by.y %chin% nm_y)) {
+      missing_in_y <- setdiff(by.y, nm_y)
+      stopf("The following columns listed in `by.y` are missing from `y`: %s",
+            toString(missing_in_y))
+    }
     by = by.x
     names(by) = by.y
   } else {
@@ -50,8 +56,13 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
       by = intersect(nm_x, nm_y)
     if (length(by) == 0L || !is.character(by))
       stopf("A non-empty vector of column names for `by` is required.")
-    if (!all(by %chin% intersect(nm_x, nm_y)))
-      stopf("Elements listed in `by` must be valid column names in x and y")
+    missing_in_x <- setdiff(by, nm_x)
+    missing_in_y <- setdiff(by, nm_y)
+    if (length(missing_in_x) > 0 || length(missing_in_y) > 0) {
+      stopf("The following columns are missing:\n%s%s",
+            if (length(missing_in_x) > 0) sprintf(" - From `x`: %s\n", toString(missing_in_x)) else "",
+            if (length(missing_in_y) > 0) sprintf(" - From `y`: %s\n", toString(missing_in_y)) else "")
+    }
     by = unname(by)
     by.x = by.y = by
   }
@@ -109,7 +120,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
   }
 
   # Throw warning if there are duplicate column names in 'dt' (i.e. if
-  # `suffixes=c("","")`, to match behaviour in base:::merge.data.frame)
+  # `suffixes=c("","")`, to match behaviour in base:::merge.data.frame)
   resultdupnames = names(dt)[duplicated(names(dt))]
   if (length(resultdupnames)) {
     warningf("column names %s are duplicated in the result", brackify(resultdupnames))

@@ -8563,16 +8563,24 @@ test(1600.2, names(DT1[DT2, .(id1=id1, val=val, bla=sum(z1, na.rm=TRUE)), on="id
 # warn when merge empty data.table #597
 DT0 = data.table(NULL)
 DT1 = data.table(a=1)
+
+# 1601.1: self-merge on the existing column 'a' succeeds; result is keyed by 'a'
 test(1601.1, merge(DT1, DT1, by="a"), data.table(a=1, key="a"))
+
+# 1601.2: 'y' has no columns -> warning, then error naming 'a' as missing from `y`
 test(1601.2, merge(DT1, DT0, by="a"),
      warning="Input data.table 'y' has no columns.",
-     error="Elements listed in `by`")
+     error="The following columns are missing:\n - From `y`: a")
+
+# 1601.3: 'x' has no columns -> warning, then error naming 'a' as missing from `x`
 test(1601.3, merge(DT0, DT1, by="a"),
      warning="Input data.table 'x' has no columns.",
-     error="Elements listed in `by`")
+     error="The following columns are missing:\n - From `x`: a")
+
+# 1601.4: neither input has columns -> warning, then error naming 'a' as missing from both
 test(1601.4, merge(DT0, DT0, by="a"),
      warning="Neither of the input data.tables to join have columns.",
-     error="Elements listed in `by`")
+     error="The following columns are missing:\n - From `x`: a\n - From `y`: a")
 
 # fix for #1549
 d1 <- data.table(v1=1:2,x=x)
@@ -13520,14 +13528,28 @@ test(1962.016, merge(DT1, DT2, by.x = 'a', by.y = c('a', 'V')),
 test(1962.017, merge(DT1, DT2, by = 'V', by.x = 'a', by.y = 'a'),
      data.table(a = 2:3, V.x = c("a", "a"), V.y = c("b", "b"), key = 'a'),
      warning = 'Supplied both.*argument will be ignored')
-test(1962.018, merge(DT1, DT2, by.x = 'z', by.y = 'a'),
-     error = 'Elements listed in `by.x`')
-test(1962.019, merge(DT1, DT2, by.x = 'a', by.y = 'z'),
-     error = 'Elements listed in `by.y`')
+# `by.x` names a column that `x` lacks. The error has to come from merge()
+# itself -- a stop() inside the test expression would pass without ever
+# exercising merge() -- and it has to name the missing column.
+test(1962.018,
+     merge(DT1, DT2, by.x = 'z', by.y = 'a'),
+     error = 'The following columns listed in `by.x` are missing from `x`: z')
+
+# `by.y` names a column that `y` lacks. As above, merge() must be the one
+# raising the error (no guard stop() in the test expression), and the
+# message must name the missing column.
+test(1962.019,
+     merge(DT1, DT2, by.x = 'a', by.y = 'z'),
+     error = 'The following columns listed in `by.y` are missing from `y`: z')
+
 test(1962.0201, merge(DT1, DT2, by=character(0L)), ans)  # was error before PR#5183
 test(1962.0202, merge(DT1, DT2, by=NULL),          ans)  # test explicit NULL too as missing() could be used inside merge()
-test(1962.021, merge(DT1, DT2, by = 'z'),
-     error = 'must be valid column names in x and y')
+# `by` names a column found in neither table. Let merge() raise the error
+# (a stop() inside the test expression would bypass it) and check that the
+# message reports the column as missing from both `x` and `y`.
+test(1962.021,
+     merge(DT1, DT2, by = 'z'),
+     error = 'The following columns are missing:\n - From `x`: z\n - From `y`: z')
 
 ## frank.R
 x = c(1, 1, 2, 5, 4, 3, 4, NA, 6)