From e67e8e0afe2ade3c3a1bc927465383114b2eae03 Mon Sep 17 00:00:00 2001 From: pwwang <1188067+pwwang@users.noreply.github.com> Date: Tue, 15 Mar 2022 21:01:44 -0500 Subject: [PATCH] =?UTF-8?q?=F0=9F=A9=B9=20Make=20`base.ntile()`=20labels?= =?UTF-8?q?=201-based=20(#92)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 🩹 Make `base.ntile()` labels 1-based * 🚑 Allow `base.paste/paste0()` to work with grouped data * Update CHANGLOG --- datar/base/string.py | 46 +++++++++++++++++++---------- datar/dplyr/_rank.py | 4 +-- datar/dplyr/rank.py | 6 ++-- docs/CHANGELOG.md | 2 ++ tests/base/test_string.py | 10 ++++++- tests/dplyr/test_mutate_windowed.py | 4 +-- tests/dplyr/test_rank.py | 45 ++++++++++++++-------------- 7 files changed, 71 insertions(+), 46 deletions(-) diff --git a/datar/base/string.py b/datar/base/string.py index 720c4e7f..5b82733a 100644 --- a/datar/base/string.py +++ b/datar/base/string.py @@ -4,12 +4,13 @@ import numpy as np import pandas as pd from pandas import Series +from pandas.core.base import PandasObject from pandas.core.groupby import SeriesGroupBy from pandas.api.types import is_string_dtype, is_scalar from pipda import register_func -from ..core.tibble import TibbleRowwise +from ..core.tibble import TibbleGrouped, TibbleRowwise from ..core.contexts import Context from ..core.factory import func_factory, dispatching from ..core.utils import ( @@ -21,7 +22,6 @@ from .casting import _as_type from .testing import _register_type_testing from .logical import as_logical -from .seq import lengths def _recycle_value(value, size, name=None): @@ -396,6 +396,9 @@ def _nchar_scalar(x, retn, allow_na, keep_na, na_len): # paste and paste0 -------------------- +_is_empty = lambda x: ( + (is_scalar(x) and not x) or (not is_scalar(x) and len(x) == 0) +) @register_func(None, context=Context.EVAL) @@ -411,24 +414,35 @@ def paste(*args, sep=" ", collapse=None): A single string if collapse is given, otherwise an array of strings. """ if len(args) == 1 and isinstance(args[0], TibbleRowwise): - return args[0].apply( - lambda row: paste(*row, sep=sep, collapse=collapse), axis=1 + out = args[0].apply( + lambda row: row.astype(str).str.cat(sep=sep), axis=1 ) + return collapse.join(out) if collapse else out - maxlen = max(regcall(lengths, args)) - args = zip( - *( - _recycle_value(arg, maxlen, f"{i}th value") - for i, arg in enumerate(args) - ) - ) + from ..tibble import tibble - args = [as_character(arg, _na="NA") for arg in args] - out = [sep.join(arg) for arg in args] - if collapse is not None: - return collapse.join(out) + if all(_is_empty(arg) for arg in args): + df = tibble(*args, _name_repair="minimal") + else: + df = tibble( + *("" if _is_empty(arg) else arg for arg in args), + _name_repair="minimal", + ) - return np.array(out, dtype=object) + if not isinstance(df, TibbleGrouped): + out = df.apply(lambda col: col.astype(str).str.cat(sep=sep), axis=1) + if collapse: + return collapse.join(out) + if any(isinstance(x, PandasObject) for x in args): + return out + return np.array(out, dtype=object) + + out = df.apply( + lambda row: row.astype(str).str.cat(sep=sep), axis=1 + ).groupby(df._datar["grouped"].grouper) + if collapse: + out = out.agg(lambda x: x.str.cat(sep=collapse)) + return out @register_func(None, context=Context.EVAL) diff --git a/datar/dplyr/_rank.py b/datar/dplyr/_rank.py index d8dcc64b..cf8da431 100644 --- a/datar/dplyr/_rank.py +++ b/datar/dplyr/_rank.py @@ -123,7 +123,7 @@ def _(x, n): return Categorical([np.nan] * x.size) n = min(n, x.size) - return pd.cut(x, n, labels=range(n)) + return pd.cut(x, n, labels=np.arange(n) + 1) @_ntile.register(GroupBy) @@ -132,7 +132,7 @@ def _(x, n): lambda grup: pd.cut( grup, min(n, len(grup)), - labels=range(min(n, len(grup))), + labels=np.arange(min(n, len(grup))) + 1, ) ) diff --git a/datar/dplyr/rank.py b/datar/dplyr/rank.py index 0415b029..413d297b 100644 --- a/datar/dplyr/rank.py +++ b/datar/dplyr/rank.py @@ -43,13 +43,13 @@ def ntile( """A rough rank, which breaks the input vector into `n` buckets. Note: - The output tiles are 0-based. + The output tiles are 1-based. The result is slightly different from dplyr's ntile. >>> ntile(c(1,2,NA,1,0,NA), 2) # dplyr >>> # 1 2 NA 2 1 NA >>> ntile([1,2,NA,1,0,NA], n=2) # datar - >>> # [0, 1, NA, 0, 0, NA] - >>> # Categories (2, int64): [0 < 1] + >>> # [1, 2, NA, 1, 1, NA] + >>> # Categories (2, int64): [1 < 2] """ if isinstance(x, int) and n is None: n = x diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index dc72e971..56c3cb89 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -5,12 +5,14 @@ - ✨ Allow `forcats.fct_inorder()` to work with groupby data - ✨ Allow `base.rep()`'s arguments `length` and `each` to work with grouped data - ✨ Allow `base.c()` to work with grouped data +- ✨ Allow `base.paste()`/`base.paste0()` to work with grouped data - 🐛 Force `&/|` operators to return boolean data - 🚑 Fix `base.diff()` not keep empty groups - 🐛 Fix recycling non-ordered grouped data - 🩹 Fix `dplyr.count()/tally()`'s warning about the new name - 🚑 Make `dplyr.n()` return groupoed data - 🐛 Make `dplyr.slice()` work better with rows/indices from grouped data +- 🩹 Make `dplyr.ntile()` labels 1-based - ✨ Add `datar.attrgetter()`, `datar.pd_str()`, `datar.pd_cat()` and `datar.pd_dt()` ## 0.6.2 diff --git a/tests/base/test_string.py b/tests/base/test_string.py index 7c6f2ef8..bc701cea 100644 --- a/tests/base/test_string.py +++ b/tests/base/test_string.py @@ -135,7 +135,15 @@ def test_paste(): assert_iterable_equal(out, []) out = paste0([], ["a"]) - assert_iterable_equal(out, ["NAa"]) + assert_iterable_equal(out, ["a"]) + + df = tibble(x=[1, 2, 3], y=[4, 5, 5]) + out = paste(df) + assert_iterable_equal(out, ["1 4", "2 5", "3 5"]) + + gf = df.group_by("y") + out = paste0(gf, collapse="|") + assert_iterable_equal(out, ['14', '25|35']) def test_sprintf(): diff --git a/tests/dplyr/test_mutate_windowed.py b/tests/dplyr/test_mutate_windowed.py index 38310285..e97e6a6f 100644 --- a/tests/dplyr/test_mutate_windowed.py +++ b/tests/dplyr/test_mutate_windowed.py @@ -181,7 +181,7 @@ def test_rank_functions_deal_correctly_with_na(): assert res.min_rank[[0, 1, 3, 4]].tolist() == c(2, 4, 2, 1) assert res.dense_rank[[0, 1, 3, 4]].tolist() == c(2, 3, 2, 1) assert res.cume_dist[[0, 1, 3, 4]].tolist() == c(0.75, 1.0, 0.75, 0.25) - assert res.ntile[[0, 1, 3, 4]].tolist() == c(0, 1, 0, 0) + assert res.ntile[[0, 1, 3, 4]].tolist() == c(1, 2, 1, 1) assert res.row_number[[0, 1, 3, 4]].tolist() == c(2, 4, 3, 1) data = tibble(x=rep(c(1, 2, NA, 1, 0, NA), 2), g=rep([1, 2], each=6)) @@ -224,7 +224,7 @@ def test_rank_functions_deal_correctly_with_na(): ) assert ( res.ntile.obj[[0, 1, 3, 4, 6, 7, 9, 10]].tolist() - == rep(c(0, 1, 0, 0), 2).tolist() + == rep(c(1, 2, 1, 1), 2).tolist() ) assert ( res.row_number.obj[[0, 1, 3, 4, 6, 7, 9, 10]].tolist() diff --git a/tests/dplyr/test_rank.py b/tests/dplyr/test_rank.py index 0f78d114..aee7fc61 100644 --- a/tests/dplyr/test_rank.py +++ b/tests/dplyr/test_rank.py @@ -1,5 +1,6 @@ # tests grabbed from: # https://github.com/tidyverse/dplyr/blob/master/tests/testthat/test-rank.r +import numpy as np import pytest from datar import f @@ -29,16 +30,16 @@ def ntile_h(x, n): def test_ntile_ignores_number_of_nas(): x = c(1, 2, 3, NA, NA) out = ntile(x, 3) - assert_iterable_equal(out, [0, 1, 2, NA, NA]) + assert_iterable_equal(out, [1, 2, 3, NA, NA]) out = ntile_h(x, 3) - assert_iterable_equal(out, [0, 1, 2, NA, NA]) + assert_iterable_equal(out, [1, 2, 3, NA, NA]) x1 = c(1, 1, 1, NA, NA, NA) out = ntile(x1, n=1) - assert_iterable_equal(out, [0, 0, 0, NA, NA, NA]) + assert_iterable_equal(out, [1, 1, 1, NA, NA, NA]) out = ntile_h(x1, 1) - assert_iterable_equal(out, [0, 0, 0, NA, NA, NA]) + assert_iterable_equal(out, [1, 1, 1, NA, NA, NA]) def test_ntile_always_returns_an_integer(): @@ -107,20 +108,20 @@ def test_lead_lag_inside_mutates_handles_expressions_as_value_for_default(): def test_ntile_puts_large_groups_first(): - assert_iterable_equal(ntile(range(1), n=5), [0]) - assert_iterable_equal(ntile(range(2), n=5), list(range(2))) - assert_iterable_equal(ntile(range(3), n=5), list(range(3))) - assert_iterable_equal(ntile(range(4), n=5), list(range(4))) - assert_iterable_equal(ntile(range(5), n=5), list(range(5))) - assert_iterable_equal(ntile(range(6), n=5), c(0, range(5))) - assert_iterable_equal(ntile(range(1), n=7), [0]) - assert_iterable_equal(ntile(range(2), n=7), list(range(2))) - assert_iterable_equal(ntile(range(3), n=7), list(range(3))) - assert_iterable_equal(ntile(range(4), n=7), list(range(4))) - assert_iterable_equal(ntile(range(5), n=7), list(range(5))) - assert_iterable_equal(ntile(range(6), n=7), list(range(6))) - assert_iterable_equal(ntile(range(7), n=7), list(range(7))) - assert_iterable_equal(ntile(range(8), n=7), c(0, range(7))) + assert_iterable_equal(ntile(range(1), n=5), [1]) + assert_iterable_equal(ntile(range(2), n=5), np.arange(2) + 1) + assert_iterable_equal(ntile(range(3), n=5), np.arange(3) + 1) + assert_iterable_equal(ntile(range(4), n=5), np.arange(4) + 1) + assert_iterable_equal(ntile(range(5), n=5), np.arange(5) + 1) + assert_iterable_equal(ntile(range(6), n=5), c(1, np.arange(5) + 1)) + assert_iterable_equal(ntile(range(1), n=7), [1]) + assert_iterable_equal(ntile(range(2), n=7), np.arange(2) + 1) + assert_iterable_equal(ntile(range(3), n=7), np.arange(3) + 1) + assert_iterable_equal(ntile(range(4), n=7), np.arange(4) + 1) + assert_iterable_equal(ntile(range(5), n=7), np.arange(5) + 1) + assert_iterable_equal(ntile(range(6), n=7), np.arange(6) + 1) + assert_iterable_equal(ntile(range(7), n=7), np.arange(7) + 1) + assert_iterable_equal(ntile(range(8), n=7), c(1, np.arange(7) + 1)) def test_plain_arrays(): @@ -129,9 +130,9 @@ def test_plain_arrays(): out = row_number([1, 1, 2]) assert_iterable_equal(out, [1, 2, 3]) out = ntile(1, 1) - assert_iterable_equal(out, [0]) + assert_iterable_equal(out, [1]) out = ntile((i for i in range(1)), 1) - assert_iterable_equal(out, [0]) + assert_iterable_equal(out, [1]) out = cume_dist(1) assert_iterable_equal(out, [1]) out = cume_dist([]) @@ -155,11 +156,11 @@ def test_row_number_with_groups(): def test_ntile_with_groups(): df = tibble(x=f[1:9], y=[1] * 4 + [2] * 4) out = ntile(df.x, 2) - assert out.tolist() == [0, 0, 0, 0, 1, 1, 1, 1] + assert out.tolist() == [1, 1, 1, 1, 2, 2, 2, 2] df = df.groupby("y") out = ntile(df.x, 2) - assert out.tolist() == [0, 0, 1, 1, 0, 0, 1, 1] + assert out.tolist() == [1, 1, 2, 2, 1, 1, 2, 2] def test_min_rank_with_groups():