From 4c5f1e7420a7852fffac488ffc89b735754465f0 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Mon, 20 May 2024 19:39:53 +0200 Subject: [PATCH] Gforce grouping var class (#5568) * copy classes to grouping vars * add tests * add different optimization levels to test * add news * add output * fix news * fix typo * add NEWS info and tests about attributes * hone NEWS * hone comment * Reframe test annotation * tweak test * Second call site --------- Co-authored-by: Michael Chirico Co-authored-by: Michael Chirico --- NEWS.md | 4 +++- R/data.table.R | 2 +- inst/tests/tests.Rraw | 9 +++++++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index af2a55747..0cb593a9b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -40,7 +40,7 @@ 12. `setDT` is faster for data with many columns, thanks @MichaelChirico for reporting and fixing the issue, [#5426](https://github.com/Rdatatable/data.table/issues/5426). -13. `dcast`gains `value.var.in.dots`, `value.var.in.LHSdots` and `value.var.in.RHSdots` arguments, [#5824](https://github.com/Rdatatable/data.table/issues/5824). This allows the `value.var` variable(s) in `dcast` to be represented by `...` in the formula (if not otherwise mentioned). Thanks to @iago-pssjd for the report and PR. +13. `dcast` gains `value.var.in.dots`, `value.var.in.LHSdots` and `value.var.in.RHSdots` arguments, [#5824](https://github.com/Rdatatable/data.table/issues/5824). This allows the `value.var` variable(s) in `dcast` to be represented by `...` in the formula (if not otherwise mentioned). Thanks to @iago-pssjd for the report and PR. 14. `fread` loads `.bgz` files directly, [#5461](https://github.com/Rdatatable/data.table/issues/5461). Thanks to @TMRHarrison for the request with proposed fix, and Benjamin Schwendinger for the PR. @@ -62,6 +62,8 @@ 8. Adding a list column to an empty `data.table` works consistently with other column types, [#5738](https://github.com/Rdatatable/data.table/issues/5738). Thanks to Benjamin Schwendinger for the report and the fix. +9. In `DT[,j,by]`, `by` retains its attributes (e.g. class) when `j` is GForce optimized, [#5567](https://github.com/Rdatatable/data.table/issues/5567). Thanks to @danwwilson for the report, and @ben-schwen for the PR. + ## NOTES 1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1. diff --git a/R/data.table.R b/R/data.table.R index 5773bca73..7975d2a3a 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1842,7 +1842,7 @@ replace_dot_alias = function(e) { if (use.I) assign(".I", seq_len(nrow(x)), thisEnv) ans = gforce(thisEnv, jsub, o__, f__, len__, irows) # irows needed for #971. gi = if (length(o__)) o__[f__] else f__ - g = lapply(grpcols, function(i) groups[[i]][gi]) + g = lapply(grpcols, function(i) .Call(CsubsetVector, groups[[i]], gi)) # use CsubsetVector instead of [ to preserve attributes #5567 # returns all rows instead of one per group nrow_funs = c("gshift") diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a1c68149b..6263b833d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18585,3 +18585,12 @@ test(2262.5, null.data.table()[, c("a","b") := list(1:2, 3:4)], dt3) test(2262.6, set(null.data.table(), j=c("a","b"), value=list(1:2, 3:4)), dt3) test(2262.7, data.table(a=1, b=2)[, c("a", "b") := list(NULL, NULL)], null.data.table()) test(2262.8, data.table(a=1, b=2)[, c("a", "b") := list(NULL)], null.data.table()) + +# GForce retains attributes in by arguments #5567 +dt = data.table(a=letters[1:4], b=structure(1:4, class = c("class_b", "integer"), att=1), c=structure(c(1L,2L,1L,2L), class = c("class_c", "integer"))) +test(2263.1, options=list(datatable.verbose=TRUE, datatable.optimize=0L), dt[, .N, b], data.table(b=dt$b, N=1L), output="GForce FALSE") +test(2263.2, options=list(datatable.verbose=TRUE, datatable.optimize=0L), dt[, .N, .(b,c)], data.table(b=dt$b, c=dt$c, N=1L), output="GForce FALSE") +test(2263.3, options=list(datatable.verbose=TRUE, datatable.optimize=0L), names(attributes(dt[, .N, b]$b)), c("class", "att"), output="GForce FALSE") +test(2263.4, options=list(datatable.verbose=TRUE, datatable.optimize=Inf), dt[, .N, b], data.table(b=dt$b, N=1L), output="GForce optimized j to") +test(2263.5, options=list(datatable.verbose=TRUE, datatable.optimize=Inf), dt[, .N, .(b,c)], data.table(b=dt$b, c=dt$c, N=1L), output="GForce optimized j to") +test(2263.6, options=list(datatable.verbose=TRUE, datatable.optimize=Inf), names(attributes(dt[, .N, b]$b)), c("class", "att"), output="GForce optimized j to")