diff --git a/NEWS.md b/NEWS.md index 14972081d..b27e0af56 100644 --- a/NEWS.md +++ b/NEWS.md @@ -131,6 +131,8 @@ rowwiseDT( 18. `as.data.table()` method for `data.frame`s (especially those with extended classes) is more consistent with `as.data.frame()` with respect to rention of attributes, [#5699](https://github.com/Rdatatable/data.table/issues/5699). Thanks @jangorecki for the report and fix. +19. Grouped queries on keyed tables no longer return an incorrectly keyed result if the _ad hoc_ `by=` list has some function call (in particular, a function which happens to return a strictly decreasing function of the keys), e.g. `by=.(a = rev(a))`, [#5583](https://github.com/Rdatatable/data.table/issues/5583). Thanks @AbrJA for the report and @MichaelChirico for the fix. + ## NOTES 1. There is a new vignette on joins! See `vignette("datatable-joins")`. Thanks to Angel Feliz for authoring it! Feedback welcome. This vignette has been highly requested since 2017: [#2181](https://github.com/Rdatatable/data.table/issues/2181). diff --git a/R/data.table.R b/R/data.table.R index 909121852..3449d492b 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2014,8 +2014,8 @@ replace_dot_alias = function(e) { if (verbose) {last.started.at=proc.time();catf("setkey() afterwards for keyby=.EACHI ... ");flush.console()} setkeyv(ans,names(ans)[seq_along(byval)]) if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} - } else if (keyby || (haskey(x) && bysameorder && (byjoin || (length(allbyvars) && identical(allbyvars,head(key(x),length(allbyvars))))))) { - setattr(ans,"sorted",names(ans)[seq_along(grpcols)]) + } else if (.by_result_is_keyable(x, keyby, bysameorder, byjoin, allbyvars, bysub)) { + setattr(ans, "sorted", names(ans)[seq_along(grpcols)]) } setalloccol(ans) # TODO: overallocate in dogroups in the first place and remove this line } @@ -3051,6 +3051,21 @@ rleidv = function(x, cols=seq_along(x), prefix=NULL) { ids } +.by_result_is_keyable = function(x, keyby, bysameorder, byjoin, byvars, bysub) { + if (keyby) return(TRUE) + k = key(x) + if (is.null(k)) return(FALSE) # haskey(x) but saving 'k' for below + if (!bysameorder) return(FALSE) + if (byjoin) return(TRUE) + if (!length(byvars)) return(FALSE) + if (!identical(byvars, head(k, length(byvars)))) return(FALSE) # match key exactly, in order + # For #5583, we also ensure there are no function calls in by (which might break sortedness) + if (is.name(bysub)) return(TRUE) + if (identical(bysub[[1L]], quote(list))) bysub = bysub[-1L] + if (length(all.names(bysub)) > length(byvars)) return(FALSE) + TRUE +} + .is_withFALSE_range = function(e, x, root=root_name(e), vars=all.vars(e)) { if (root != ":") return(FALSE) if (!length(vars)) return(TRUE) # e.g. 1:10 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d4513936a..b63475b43 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -20757,3 +20757,10 @@ as.data.frame.tbl = function(x) { x } test(2302, attr(as.data.table(y), "t1"), attr(as.data.frame(y), "t1")) + +# by=foo(KEY) does not retain key (no way to guarantee monotonic transformation), #5583 +DT = data.table(a=1:2, key='a') +test(2303.1, DT[, .N, by=.(b=rev(a))], data.table(b=2:1, N=1L)) +test(2303.2, DT[, .(N=1L), by=.(b=rev(a))], data.table(b=2:1, N=1L)) # ensure no interaction with GForce +DT = data.table(a=2:3, b=1:0, key=c('a', 'b')) +test(2303.3, DT[, .N, by=.(ab=a^b, d=c(1L, 1L))], data.table(ab=c(2, 1), d=1L, N=1L))