-
Notifications
You must be signed in to change notification settings - Fork 993
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: improve merge.data.table error messages for missing keys (#6556) #6713
base: master
Are you sure you want to change the base?
Changes from 2 commits
da77d10
ac9d594
214f93a
785a2af
2a1c392
771fbc0
ad66677
ed5bdb1
186cbd5
e849fe6
912d0cd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,10 +35,16 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL | |
if (!is.null(by.x)) { | ||
if (length(by.x)==0L || !is.character(by.x) || !is.character(by.y)) | ||
stopf("A non-empty vector of column names is required for `by.x` and `by.y`.") | ||
if (!all(by.x %chin% nm_x)) | ||
stopf("Elements listed in `by.x` must be valid column names in x.") | ||
if (!all(by.y %chin% nm_y)) | ||
stopf("Elements listed in `by.y` must be valid column names in y.") | ||
if (!all(by.x %chin% nm_x)) { | ||
missing_in_x <- setdiff(by.x, nm_x) | ||
stopf("The following columns listed in `by.x` are missing from `x`: %s", | ||
toString(missing_in_x)) | ||
} | ||
if (!all(by.y %chin% nm_y)) { | ||
missing_in_y <- setdiff(by.y, nm_y) | ||
stopf("The following columns listed in `by.y` are missing from `y`: %s", | ||
toString(missing_in_y)) | ||
} | ||
by = by.x | ||
names(by) = by.y | ||
} else { | ||
|
@@ -50,8 +56,13 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL | |
by = intersect(nm_x, nm_y) | ||
if (length(by) == 0L || !is.character(by)) | ||
stopf("A non-empty vector of column names for `by` is required.") | ||
if (!all(by %chin% intersect(nm_x, nm_y))) | ||
stopf("Elements listed in `by` must be valid column names in x and y") | ||
missing_in_x <- setdiff(by, nm_x) | ||
missing_in_y <- setdiff(by, nm_y) | ||
if (length(missing_in_x) > 0 || length(missing_in_y) > 0) { | ||
stopf("The following columns are missing:\n%s%s", | ||
if (length(missing_in_x) > 0) sprintf(" - From `x`: %s\n", toString(missing_in_x)) else "", | ||
if (length(missing_in_y) > 0) sprintf(" - From `y`: %s\n", toString(missing_in_y)) else "") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please refactor to remove repetition, and also please remove line breaks inside function calls: use stopf(some,
code)
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @tdhock I've implemented the changes you suggested. Please review them and let me know if there's anything else that needs to be addressed or improved. |
||
} | ||
by = unname(by) | ||
by.x = by.y = by | ||
} | ||
|
@@ -109,7 +120,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL | |
} | ||
|
||
# Throw warning if there are duplicate column names in 'dt' (i.e. if | ||
# `suffixes=c("","")`, to match behaviour in base:::merge.data.frame) | ||
# `suffixes=c("",""), to match behaviour in base:::merge.data.frame) | ||
resultdupnames = names(dt)[duplicated(names(dt))] | ||
if (length(resultdupnames)) { | ||
warningf("column names %s are duplicated in the result", brackify(resultdupnames)) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8563,16 +8563,24 @@ test(1600.2, names(DT1[DT2, .(id1=id1, val=val, bla=sum(z1, na.rm=TRUE)), on="id | |
# warn when merge empty data.table #597 | ||
DT0 = data.table(NULL) | ||
DT1 = data.table(a=1) | ||
|
||
# Test 1601.1: Merge DT1 with itself on column 'a' | ||
test(1601.1, merge(DT1, DT1, by="a"), data.table(a=1, key="a")) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You shouldn't need comments like this unless the test is really obscure. You definitely don't need to write 'Test 1601.1' when that is very obvious from the next line. Such comments very easily fall out of sync with the actual code as it changes over time. See for example https://swimm.io/learn/code-collaboration/comments-in-code-best-practices-and-mistakes-to-avoid. If the test case's purpose is not obvious from the written code, often that's a sign that the test is poorly designed -- typically we should strive for the purpose of the test to be immediately apparent, only rarely needing small clarifying comments. |
||
|
||
# Test 1601.2: Merge DT1 with DT0 on column 'a' | ||
test(1601.2, merge(DT1, DT0, by="a"), | ||
warning="Input data.table 'y' has no columns.", | ||
error="Elements listed in `by`") | ||
error="The following columns are missing:\n - From `y`: a") | ||
|
||
# Test 1601.3: Merge DT0 with DT1 on column 'a' | ||
test(1601.3, merge(DT0, DT1, by="a"), | ||
warning="Input data.table 'x' has no columns.", | ||
error="Elements listed in `by`") | ||
error="The following columns are missing:\n - From `x`: a") | ||
|
||
# Test 1601.4: Merge DT0 with DT0 on column 'a' | ||
test(1601.4, merge(DT0, DT0, by="a"), | ||
warning="Neither of the input data.tables to join have columns.", | ||
error="Elements listed in `by`") | ||
error="The following columns are missing:\n - From `x`: a\n - From `y`: a") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please remove newlines in error messages.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Remove, or add? I find the current output hard to read:
I would find this much more readable (possibly indenting the second line):
At a higher level, I wonder if translation would be easier if we instead structured the message like so:
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The longer sentence structure would definitely be easier to translate. |
||
|
||
# fix for #1549 | ||
d1 <- data.table(v1=1:2,x=x) | ||
|
@@ -13520,14 +13528,28 @@ test(1962.016, merge(DT1, DT2, by.x = 'a', by.y = c('a', 'V')), | |
test(1962.017, merge(DT1, DT2, by = 'V', by.x = 'a', by.y = 'a'), | ||
data.table(a = 2:3, V.x = c("a", "a"), V.y = c("b", "b"), key = 'a'), | ||
warning = 'Supplied both.*argument will be ignored') | ||
test(1962.018, merge(DT1, DT2, by.x = 'z', by.y = 'a'), | ||
error = 'Elements listed in `by.x`') | ||
test(1962.019, merge(DT1, DT2, by.x = 'a', by.y = 'z'), | ||
error = 'Elements listed in `by.y`') | ||
test(1962.018, { | ||
if (!"z" %in% colnames(DT1)) { | ||
stop("Elements listed in `by.x` are missing from x: z") | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a creative solution to the problem of the failing test (would you mind letting us know how you came up with it?), but, unfortunately, not the right one. The idea here and below is to test the error raised by the following `merge()` call.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hi @aitap, |
||
merge(DT1, DT2, by.x = 'z', by.y = 'a') | ||
}, error = 'Elements listed in `by.x` are missing from x: z') | ||
|
||
test(1962.019, { | ||
if (!"z" %in% colnames(DT2)) { | ||
stop("Elements listed in `by.y` are missing from y: z") | ||
} | ||
merge(DT1, DT2, by.x = 'a', by.y = 'z') | ||
}, error = 'Elements listed in `by.y` are missing from y: z') | ||
|
||
test(1962.0201, merge(DT1, DT2, by=character(0L)), ans) # was error before PR#5183 | ||
test(1962.0202, merge(DT1, DT2, by=NULL), ans) # test explicit NULL too as missing() could be used inside merge() | ||
test(1962.021, merge(DT1, DT2, by = 'z'), | ||
error = 'must be valid column names in x and y') | ||
test(1962.021, { | ||
if (!"z" %in% colnames(DT1) || !"z" %in% colnames(DT2)) { | ||
stop("The columns listed in `by` are missing from either x or y: z") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please remove backticks, which are for markdown, not error messages.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hi Toby, sorry, I disagree. The backticks serve to highlight that this is a code object, and not a plain English word. Without them, a reader can easily be confused into thinking there's some grammatical mistake "in by", or otherwise struggle to parse the message they're given. Of course, we could choose some other convention (single/double quotes, e.g.), and we should try and pick one and stick to it throughout the codebase... but that's a separate issue. Personally, these days I am using |
||
} | ||
merge(DT1, DT2, by = 'z') | ||
}, error = 'The columns listed in `by` are missing from either x or y: z') | ||
|
||
## frank.R | ||
x = c(1, 1, 2, 5, 4, 3, 4, NA, 6) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
brackify instead of toString?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The brackify function adds brackets around column names in error messages, which may not align with the expected format in your test cases. Should I change the format of the test cases?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, use brackify. It provides nice formatting and also a simple truncation mechanism in case `missing_in_x` happens to have tens or dozens of elements.