Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: improve merge.data.table error messages for missing keys (#6556) #6713

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
51 changes: 30 additions & 21 deletions R/merge.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,16 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
by = key(x)
}
}
x0 = length(x)==0L
y0 = length(y)==0L
if (x0 || y0) {
if (x0 && y0)
x0 = length(x) == 0L
y0 = length(y) == 0L
if (x0 || y0){
if (x0 && y0) {
warningf("Neither of the input data.tables to join have columns.")
else if (x0)
} else if (x0) {
warningf("Input data.table '%s' has no columns.", "x")
else
} else if (y0) {
warningf("Input data.table '%s' has no columns.", "y")
}
}
check_duplicate_names(x)
check_duplicate_names(y)
Expand All @@ -28,34 +29,42 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
nm_y = names(y)

## set up 'by'/'by.x'/'by.y'
if ( (!is.null(by.x) || !is.null(by.y)) && length(by.x)!=length(by.y) )
stopf("`by.x` and `by.y` must be of same length.")
if ((!is.null(by.x) || !is.null(by.y)) && length(by.x) != length(by.y))
stopf("'by.x' and 'by.y' must be of same length.")
if (!missing(by) && !missing(by.x))
warningf("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.")
warningf("Supplied both by and 'by.x/by.y.' by argument will be ignored.")
if (!is.null(by.x)) {
if (length(by.x)==0L || !is.character(by.x) || !is.character(by.y))
stopf("A non-empty vector of column names is required for `by.x` and `by.y`.")
if (!all(by.x %chin% nm_x))
stopf("Elements listed in `by.x` must be valid column names in x.")
if (!all(by.y %chin% nm_y))
stopf("Elements listed in `by.y` must be valid column names in y.")
if (length(by.x) == 0L || !is.character(by.x) || !is.character(by.y))
stopf("A non-empty vector of column names is required for 'by.x' and 'by.y'.")
if (!all(by.x %chin% nm_x)) {
missing_in_x <- setdiff(by.x, nm_x)
stopf("The following columns listed in 'by.x' are missing from x: %s", brackify(missing_in_x))
}
if (!all(by.y %chin% nm_y)) {
missing_in_y <- setdiff(by.y, nm_y)
stopf("The following columns listed in 'by.y' are missing from y: %s", brackify(missing_in_y))
}
by = by.x
names(by) = by.y
} else {
if (is.null(by))
by = intersect(key(x), key(y))
if (!length(by)) # was is.null() before PR#5183 changed to !length()
if (!length(by)) # was is.null() before PR#5183 changed to !length()
by = key(x)
if (!length(by))
by = intersect(nm_x, nm_y)
if (length(by) == 0L || !is.character(by))
stopf("A non-empty vector of column names for `by` is required.")
if (!all(by %chin% intersect(nm_x, nm_y)))
stopf("Elements listed in `by` must be valid column names in x and y")
stopf("A non-empty vector of column names for by is required.")
missing_in_x <- setdiff(by, nm_x)
missing_in_y <- setdiff(by, nm_y)
if (length(missing_in_x) > 0 || length(missing_in_y) > 0) {
stopf(gettextf("The following columns are missing:\n%s%s",
if (length(missing_in_x) > 0) gettextf(" - From x: %s\n", brackify(missing_in_x)) else "",
if (length(missing_in_y) > 0) gettextf(" - From y: %s\n", brackify(missing_in_y)) else ""))
}
by = unname(by)
by.x = by.y = by
}

# warn about unused arguments #2587
if (length(list(...))) {
ell = as.list(substitute(list(...)))[-1L]
Expand Down Expand Up @@ -109,7 +118,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
}

# Throw warning if there are duplicate column names in 'dt' (i.e. if
# `suffixes=c("","")`, to match behaviour in base:::merge.data.frame)
# `suffixes=c("","")`, to match behaviour in base:::merge.data.frame)
resultdupnames = names(dt)[duplicated(names(dt))]
if (length(resultdupnames)) {
warningf("column names %s are duplicated in the result", brackify(resultdupnames))
Expand Down
46 changes: 26 additions & 20 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -8563,18 +8563,21 @@ test(1600.2, names(DT1[DT2, .(id1=id1, val=val, bla=sum(z1, na.rm=TRUE)), on="id

# warn when merge empty data.table #597
DT0 = data.table(NULL)
DT1 = data.table(a=1)
test(1601.1, merge(DT1, DT1, by="a"), data.table(a=1, key="a"))
test(1601.2, merge(DT1, DT0, by="a"),
warning="Input data.table 'y' has no columns.",
error="Elements listed in `by`")
test(1601.3, merge(DT0, DT1, by="a"),
warning="Input data.table 'x' has no columns.",
error="Elements listed in `by`")
test(1601.4, merge(DT0, DT0, by="a"),
warning="Neither of the input data.tables to join have columns.",
error="Elements listed in `by`")

DT1 = data.table(a = 1)

test(1601.1, merge(DT1, DT1, by = "a"), data.table(a = 1, key = "a"))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You shouldn't need comments like this unless the test is really obscure. You definitely don't need to write 'Test 1601.1' when that is very obvious from the next line.

Such comments very easily fall out of sync with the actual code as it changes over time. See for example https://swimm.io/learn/code-collaboration/comments-in-code-best-practices-and-mistakes-to-avoid.

If the test case's purpose is not obvious from the written code, often that's a sign that the test is poorly designed -- typically we should strive for the purpose of the test to be immediately apparent, only rarely needing small clarifying comments.

test(1601.2,
merge(DT1, DT0, by = "a"),
warning = "Input data.table 'y' has no columns.",
error = "The following columns are missing:\n - From y: [a]")
test(1601.3,
merge(DT0, DT1, by = "a"),
warning = "Input data.table 'x' has no columns.",
error = "The following columns are missing:\n - From x: [a]")
test(1601.4,
merge(DT0, DT0, by = "a"),
warning = "Neither of the input data.tables to join have columns.",
error = "The following columns are missing:\n - From x: [a]\n - From y: [a]")
# fix for #1549
d1 <- data.table(v1=1:2,x=x)
d2 <- data.table(v1=3:4)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please remove newlines in error messages

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove, or add? I find the current output hard to read:

The following columns are missing: - From x: a

I would find this much more readable (possibly indenting the second line):

The following columns are missing:
- From x: a

At a higher level, I wonder if translation would be easier if we instead structured the message like so:

The following columns are missing from x: ...
The following columns are missing from y: ...

Copy link
Contributor

@aitap aitap Jan 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The longer sentence structure would definitely be easier to translate.

Expand Down Expand Up @@ -13546,14 +13549,17 @@ test(1962.016, merge(DT1, DT2, by.x = 'a', by.y = c('a', 'V')),
test(1962.017, merge(DT1, DT2, by = 'V', by.x = 'a', by.y = 'a'),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please remove backticks, which are for markdown, not error messages

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi Toby, sorry, I disagree.

The backticks serve to highlight that this is a code object, and not a plain English word. Without them, a reader can easily be confused into thinking there's some grammatical mistake "in by", or otherwise struggle to parse the message they're given.

Of course, we could choose some other convention (e.g. single or double quotes), and we should try to pick one and stick to it throughout the codebase... but that's a separate issue.

Personally, these days I am using `arg=` for function arguments: the backticks highlight that (1) it's code, and the trailing = highlights that (2) it's a keyword argument.

data.table(a = 2:3, V.x = c("a", "a"), V.y = c("b", "b"), key = 'a'),
warning = 'Supplied both.*argument will be ignored')
test(1962.018, merge(DT1, DT2, by.x = 'z', by.y = 'a'),
error = 'Elements listed in `by.x`')
test(1962.019, merge(DT1, DT2, by.x = 'a', by.y = 'z'),
error = 'Elements listed in `by.y`')
test(1962.018,
merge(DT1, DT2, by.x = 'z', by.y = 'a'),
error = "The following columns listed in 'by.x' are missing from x: [z]")
test(1962.019,
merge(DT1, DT2, by.x = 'a', by.y = 'z'),
error = "The following columns listed in 'by.y' are missing from y: [z]")
test(1962.0201, merge(DT1, DT2, by=character(0L)), ans) # was error before PR#5183
test(1962.0202, merge(DT1, DT2, by=NULL), ans) # test explicit NULL too as missing() could be used inside merge()
test(1962.021, merge(DT1, DT2, by = 'z'),
error = 'must be valid column names in x and y')
test(1962.021, {
merge(DT1, DT2, by = 'z')
}, error = 'The following columns are missing:\n - From x: [z]\n - From y: [z]')

## frank.R
x = c(1, 1, 2, 5, 4, 3, 4, NA, 6)
Expand Down Expand Up @@ -16931,7 +16937,7 @@ test(2144, rbind(DT,list(c=4L,a=7L)), error="Column 1 ['c'] of item 2 is missing
A = data.table(A='a')
B = data.table(B='b')
test(2145.1, A[B, on=character(0)], error = "'on' argument should be a named atomic vector")
test(2145.2, merge(A, B, by=character(0) ), error = "non-empty vector of column names for `by` is required.")
test(2145.2, merge(A, B, by=character(0) ), error = "A non-empty vector of column names for by is required.")
test(2145.3, merge(A, B, by.x=character(0), by.y=character(0)), error = "non-empty vector of column names is required")
# Also shouldn't crash when using internal functions
test(2145.4, bmerge(A, B, integer(), integer(), 0, c(FALSE, TRUE), NA, 'all', integer(), FALSE), error = 'icols and xcols must be non-empty')
Expand Down Expand Up @@ -18014,7 +18020,7 @@ test(2230.4, setDF(merge(DT, y, by="k2", incomparables=c(1, NA, 4, 5))), merge(x
test(2230.5, setDF(merge(DT, y, by="k2", incomparables=c(NA, 3, 4, 5))), merge(x, y, by="k2", incomparables=c(NA,3,4,5)))
test(2230.6, merge(DT, y, by="k2", unk=1), merge(DT, y, by="k2"), warning="Unknown argument 'unk' has been passed.")
test(2230.7, merge(DT, y, by="k2", NULL, NULL, FALSE, FALSE, FALSE, TRUE, c(".x", ".y"), TRUE, getOption("datatable.allow.cartesian"), NULL, 1L),
merge(DT, y, by="k2"), warning=c("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.", "Passed 1 unknown and unnamed arguments."))
merge(DT, y, by="k2"), warning=c("Supplied both by and 'by.x/by.y.' by argument will be ignored.", "Passed 1 unknown and unnamed arguments."))

# weighted.mean GForce optimized, #3977
old = options(datatable.optimize=1L)
Expand Down
Loading