diff --git a/datar/__init__.py b/datar/__init__.py
index b9af686d..3b3d9061 100644
--- a/datar/__init__.py
+++ b/datar/__init__.py
@@ -30,7 +30,7 @@
 )
 
 __all__ = ("f", "get_versions")
-__version__ = "0.6.1"
+__version__ = "0.6.2"
 
 
 def get_versions(prnt: bool = True) -> _VersionsTuple:
diff --git a/datar/all.py b/datar/all.py
index e39c5d7e..bbbfd1e8 100644
--- a/datar/all.py
+++ b/datar/all.py
@@ -14,6 +14,7 @@
 from .dplyr import _warn as _
 from .tibble import *
 from .tidyr import *
+from .base import rank  # overwrite dplyr.rank
 
 _builtin_names = _base_builtin_names.copy()
 _builtin_names.update(_dplyr_builtin_names)
diff --git a/datar/base/__init__.py b/datar/base/__init__.py
index 736d8e2d..6d041b46 100644
--- a/datar/base/__init__.py
+++ b/datar/base/__init__.py
@@ -59,6 +59,7 @@
 from .funs import (
     cut,
     data_context,
+    diff,
     expandgrid,
     identity,
     make_unique,
diff --git a/datar/base/funs.py b/datar/base/funs.py
index 7067cf81..b32428cb 100644
--- a/datar/base/funs.py
+++ b/datar/base/funs.py
@@ -12,6 +12,7 @@
 from ..core.middlewares import WithDataEnv
 from ..core.contexts import Context
+from ..core.factory import func_factory
 from ..core.tibble import Tibble
 from ..core.utils import arg_match, name_of
 from ..core.names import repair_names
@@ -66,6 +67,38 @@ def cut(
     )
 
 
+@func_factory("agg", "x")
+def diff(x, lag: int = 1, differences: int = 1):
+    """Calculates suitably lagged and iterated differences.
+
+    If the data is a vector of length n and differences = 1, then the computed
+    result is equal to the successive differences
+    `x[lag:] - x[:-lag]`.
+
+    Examples:
+        >>> rv = [52, 21, 10, 11, 19]
+        >>> data = diff(rv)
+        >>> # -31 -11 1 8
+        >>> # rv[1:] - rv[:-1]
+        >>> # rv[1:]  [21, 10, 11, 19]
+        >>> # rv[:-1] [52, 21, 10, 11]
+
+    Args:
+        x: The data
+        lag: The lag to use. Could be negative.
+            It always calculates `x[lag:] - x[:-lag]` even when `lag` is negative
+        differences: The order of the difference
+
+    Returns:
+        An array of `x[lag:] - x[:-lag]`.
+        If `differences > 1`, the rule applies `differences` times on `x`
+    """
+    x = x.values
+    for _ in range(differences):
+        x = x[lag:] - x[:-lag]
+    return x
+
+
 @register_func(None, context=Context.EVAL)
 def identity(x):
     """Return whatever passed in
diff --git a/datar/core/broadcast.py b/datar/core/broadcast.py
index 83ee6a1d..2bbe3e8c 100644
--- a/datar/core/broadcast.py
+++ b/datar/core/broadcast.py
@@ -668,7 +668,9 @@ def init_tibble_from(value, name: str) -> Tibble:
 
 @init_tibble_from.register(Series)
 def _(value: Series, name: str) -> Tibble:
-    name = name or value.name
+    # Deprecation warning: None will be used as series name in the future
+    # So use 0 as default here
+    name = name or value.name or 0
     return Tibble(value.to_frame(name=name), copy=False)
 
 
diff --git a/datar/core/operator.py b/datar/core/operator.py
index f224f2e7..eb4b80ef 100644
--- a/datar/core/operator.py
+++ b/datar/core/operator.py
@@ -14,8 +14,15 @@
 def _binop(op, left, right, fill_false=False):
     left, right, grouper, is_rowwise = broadcast2(left, right)
     if fill_false:
-        left = Series(left).fillna(False).values
-        right = Series(right).fillna(False).values
+        if isinstance(left, Series):
+            left = left.fillna(False)
+        else:
+            left = Series(left).fillna(False).values
+
+        if isinstance(right, Series):
+            right = right.fillna(False)
+        else:
+            right = Series(right).fillna(False).values
 
     out = op(left, right)
     if grouper:
diff --git a/datar/dplyr/relocate.py b/datar/dplyr/relocate.py
index 7bad01e3..6dbd5172 100644
--- a/datar/dplyr/relocate.py
+++ b/datar/dplyr/relocate.py
@@ -52,6 +52,7 @@ def relocate(
         *args,
         **kwargs,
         _group_vars=gvars,
+        _missing_gvars_inform=False,
     )
 
     to_move = list(to_move)
@@ -65,6 +66,7 @@ def relocate(
             all_columns,
             _before,
             _group_vars=[],
+            _missing_gvars_inform=False,
         )[0]
     )
     if where not in to_move:
@@ -76,6 +78,7 @@ def relocate(
             all_columns,
             _after,
             _group_vars=[],
+            _missing_gvars_inform=False,
         )[0]
     )
     if where not in to_move:
diff --git a/datar/dplyr/rename.py b/datar/dplyr/rename.py
index 40cf6fc0..fbc0e5e6 100644
--- a/datar/dplyr/rename.py
+++ b/datar/dplyr/rename.py
@@ -29,6 +29,7 @@ def rename(_data, **kwargs):
     selected, new_names = _eval_select(
         all_columns,
         _group_vars=gvars,
+        _missing_gvars_inform=False,
         **kwargs,
     )
 
diff --git a/datar/dplyr/select.py b/datar/dplyr/select.py
index e3c1335d..08e13782 100644
--- a/datar/dplyr/select.py
+++ b/datar/dplyr/select.py
@@ -66,6 +66,7 @@ def _eval_select(
     _all_columns: Index,
     *args: Any,
     _group_vars: Sequence[str],
+    _missing_gvars_inform: bool = True,
     **kwargs: Any,
 ) -> Tuple[Sequence[int], Mapping[str, str]]:
     """Evaluate selections to get locations
@@ -79,9 +80,10 @@
         *kwargs.values(),
     )
 
-    missing = regcall(setdiff, _group_vars, _all_columns[selected_idx])
-    if len(missing) > 0:
-        logger.info("Adding missing grouping variables: %s", missing)
+    if _missing_gvars_inform:
+        missing = regcall(setdiff, _group_vars, _all_columns[selected_idx])
+        if len(missing) > 0:
+            logger.info("Adding missing grouping variables: %s", missing)
 
     selected_idx = regcall(
         union,
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 3d60cdcd..21841a39 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -1,3 +1,12 @@
+## 0.6.2
+
+- 🚑 Fix #87 boolean operator losing index
+- 🚑 Fix false alarm from `rename()`/`relocate()` for missing grouping variables (#89)
+- ✨ Add `base.diff()`
+- 📝 [doc] Update/Fix doc for case_when (#87)
+- 📝 [doc] Fix links in reference map
+- 📝 [doc] Update docs for `dplyr.base`
+
 ## 0.6.1
 
 - 🐛 Fix `rep(df, n)` producing a
nested df diff --git a/docs/notebooks/base-arithmetic.ipynb b/docs/notebooks/base-arithmetic.ipynb new file mode 100644 index 00000000..bd5d2e8c --- /dev/null +++ b/docs/notebooks/base-arithmetic.ipynb @@ -0,0 +1,1521 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Diot({'warn_builtin_names': False}, diot_transform= at 0x7fad66f3b790>)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/html": [ + "
Try this notebook on binder.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # sum " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Sum of the input.\n", + "\n", + "##### Args:\n", + "  `x`: The input \n", + "  `na_rm`: Exclude the NAs. If `x` is SeriesGroupBy object, this is always \n", + "    True, and you might want to use `f.x.sum(min_count=...)` to control \n", + "    NA produces \n", + "    And also unlike the function in `R`. It defaults to `True` rather \n", + "    than `False` \n", + "\n", + "##### Returns:\n", + "  The sum of the input \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # prod " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Product of the input.\n", + "\n", + "##### Args:\n", + "  `x`: The input \n", + "  `na_rm`: Exclude the NAs. If `x` is SeriesGroupBy object, this is always \n", + "    True, and you might want to use `f.x.prod(min_count=...)` to control \n", + "    NA produces \n", + "    And also unlike the function in `R`. It defaults to `True` rather \n", + "    than `False` \n", + "\n", + "##### Returns:\n", + "  The prod of the input \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # mean " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Mean of the input.\n", + "\n", + "##### Args:\n", + "  `x`: The input \n", + "  `na_rm`: Exclude the NAs. If `x` is SeriesGroupBy object, this is always \n", + "    True. \n", + "    And also unlike the function in `R`. It defaults to `True` rather \n", + "    than `False` \n", + "\n", + "##### Returns:\n", + "  The mean of the input \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # median " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Median of the input.\n", + "\n", + "##### Args:\n", + "  `x`: The input \n", + "  `na_rm`: Exclude the NAs. If `x` is SeriesGroupBy object, this is always \n", + "    True. \n", + "    And also unlike the function in `R`. It defaults to `True` rather \n", + "    than `False` \n", + "\n", + "##### Returns:\n", + "  The median of the input \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # min " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Min of the input.\n", + "\n", + "##### Args:\n", + "  `x`: The input \n", + "  `na_rm`: Exclude the NAs. If `x` is SeriesGroupBy object, this is always \n", + "    True, and you might want to use `f.x.min(min_count=...)` to control \n", + "    NA produces \n", + "    And also unlike the function in `R`. 
It defaults to `True` rather \n", + "    than `False` \n", + "\n", + "##### Returns:\n", + "  The min of the input \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # max " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Max of the input.\n", + "\n", + "##### Args:\n", + "  `x`: The input \n", + "  `na_rm`: Exclude the NAs. If `x` is SeriesGroupBy object, this is always \n", + "    True, and you might want to use `f.x.max(min_count=...)` to control \n", + "    NA produces \n", + "    And also unlike the function in `R`. It defaults to `True` rather \n", + "    than `False` \n", + "\n", + "##### Returns:\n", + "  The max of the input \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # var " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Variance of the input.\n", + "\n", + "##### Args:\n", + "  `x`: The input \n", + "  `na_rm`: Exclude the NAs. If `x` is SeriesGroupBy object, this is always \n", + "    True \n", + "    And also unlike the function in `R`. It defaults to `True` rather \n", + "    than `False` \n", + "\n", + "  `ddof`: Delta Degrees of Freedom \n", + "\n", + "##### Returns:\n", + "  The variance of the input \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # pmin " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Get the min value rowwisely\n", + "\n", + "##### Args:\n", + "  `*x`: The iterables. Elements will be recycled to the max length \n", + "  `na_rm`: Whether ignore the NAs \n", + "\n", + "##### Returns:\n", + "  The rowwise min of `*x` \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # pmax " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Get the max value rowwisely\n", + "\n", + "##### Args:\n", + "  `*x`: The iterables. Elements will be recycled to the max length \n", + "  `na_rm`: Whether ignore the NAs \n", + "\n", + "##### Returns:\n", + "  The rowwise max of `*x` \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # round_ " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Rounding a number\n", + "\n", + "##### Args:\n", + "  `x`: The input \n", + "  `ndigits`: number of digits to keep. Must be positional argument. 
\n", + "\n", + "##### Returns:\n", + "  The rounded input \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # sqrt " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Get the square root of a number/numbers\n", + "\n", + "##### Args:\n", + "  `x`: The input \n", + "\n", + "##### Returns:\n", + "  The square root of the input \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # absolute " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Get the absolute value of a number/numbers\n", + "\n", + "##### Args:\n", + "  `x`: The input \n", + "\n", + "##### Returns:\n", + "  The absolute values of the input \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # sign " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Get the signs of the corresponding elements of x\n", + "\n", + "##### Args:\n", + "  `x`: The input \n", + "\n", + "##### Returns:\n", + "  The signs of the corresponding elements of x \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # trunc " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Get the integers truncated for each element in x\n", + "\n", + "##### Args:\n", + "  `x`: The input \n", + "\n", + "##### Returns:\n", + "  The ingeters of elements in x being truncated \n", + "  Note the dtype is still float. 
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # ceiling " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Get the ceiling integer of a number/numbers\n", + "\n", + "##### Args:\n", + "  `x`: The input \n", + "\n", + "##### Returns:\n", + "  The ceiling integer of the input \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # floor " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Get the floor integer of a number/numbers\n", + "\n", + "##### Args:\n", + "  `x`: The input \n", + "\n", + "##### Returns:\n", + "  The floor integer of the input \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # signif " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Rounds the values in its first argument to the specified number of\n", + "significant digits \n", + "\n", + "##### Args:\n", + "  `x`: A numeric vector or scalar \n", + "  `digits`: integer indicating the number of significant digits to be used \n", + "\n", + "##### Returns:\n", + "  The rounded values for each element in x \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # log " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Computes logarithms, by default natural logarithm\n", + "\n", + "##### Args:\n", + "  `x`: A numeric scalar or vector \n", + "  `base`: The base of the logarithm \n", + "\n", + "##### Returns:\n", + "  The value of the logarithm if x is scalar, otherwise element-wise \n", + "  logarithm of elements in x \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # exp " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Calculates the power of natural number\n", + "\n", + "##### Args:\n", + "  `x`: A numeric scalar or vector \n", + "\n", + "##### Returns:\n", + "  Power of natural number of element-wise power of natural number for x \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # log2 " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Computes logarithms with base 2\n", + "\n", + "##### Args:\n", + "  `x`: A numeric scalar or vector \n", + "\n", + "##### Returns:\n", + "  The value of log2 if x is scalar, otherwise element-wise \n", + "  log2 of elements in x \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # log10 " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Computes logarithms with base 10\n", + "\n", + "##### Args:\n", + "  `x`: A numeric scalar or vector \n", + "\n", + "##### 
Returns:\n", + "  The value of log10 if x is scalar, otherwise element-wise \n", + "  log10 of elements in x \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # log1p " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Computes log(1+x)\n", + "\n", + "##### Args:\n", + "  `x`: A numeric scalar or vector \n", + "\n", + "##### Returns:\n", + "  The value of log(1+x) if x is scalar, otherwise element-wise \n", + "  log(1+x) of elements in x \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # cov " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Compute pairwise covariance of dataframe columns,\n", + "or between two variables \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # _scale " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Scaling and Centering of a numeric data frame\n", + "\n", + "See Details in `?scale` in `R` \n", + "\n", + "##### Args:\n", + "  `x`: The numeric data frame to scale \n", + "  `center`: either a logical value or numeric-alike vector of length \n", + "    equal to the number of columns of `x` \n", + "\n", + "  `scale`: either a logical value or a numeric-alike vector of length \n", + "    equal to the number of columns of `x`. \n", + "\n", + "##### Returns:\n", + "  The centered, scaled data frame \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # col_sums " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Calculate sum of a data frame by column\n", + "\n", + "##### Args:\n", + "  `x`: The data frame \n", + "  `na_rm`: Specifies how to handle missing values in `x`. \n", + "\n", + "##### Returns:\n", + "  The sums by column. \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # row_sums " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Calculate sum of a data frame by row\n", + "\n", + "##### Args:\n", + "  `x`: The data frame \n", + "  `na_rm`: Specifies how to handle missing values in `x`. \n", + "\n", + "##### Returns:\n", + "  The sums by row. \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # col_means " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Calculate mean of a data frame by column\n", + "\n", + "##### Args:\n", + "  `x`: The data frame \n", + "  `na_rm`: Specifies how to handle missing values in `x`. \n", + "\n", + "##### Returns:\n", + "  The means by column. 
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # row_means " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Calculate mean of a data frame by row\n", + "\n", + "##### Args:\n", + "  `x`: The data frame \n", + "  `na_rm`: Specifies how to handle missing values in `x`. \n", + "\n", + "##### Returns:\n", + "  The means by row. \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # col_sds " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Calculate stdev of a data frame by column\n", + "\n", + "##### Args:\n", + "  `x`: The data frame \n", + "  `ddof`: Delta Degrees of Freedom. \n", + "  `na_rm`: Specifies how to handle missing values in `x`. \n", + "\n", + "##### Returns:\n", + "  The stdevs by column. \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # row_sds " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Calculate stdev of a data frame by row\n", + "\n", + "##### Args:\n", + "  `x`: The data frame \n", + "  `ddof`: Delta Degrees of Freedom. \n", + "  `na_rm`: Specifies how to handle missing values in `x`. \n", + "\n", + "##### Returns:\n", + "  The stdevs by row. \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # col_medians " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Calculate median of a data frame by column\n", + "\n", + "##### Args:\n", + "  `x`: The data frame \n", + "  `na_rm`: Specifies how to handle missing values in `x`. \n", + "\n", + "##### Returns:\n", + "  The medians by column. \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # row_medians " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Calculate median of a data frame by row\n", + "\n", + "##### Args:\n", + "  `x`: The data frame \n", + "  `na_rm`: Specifies how to handle missing values in `x`. \n", + "\n", + "##### Returns:\n", + "  The medians by row. \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # quantile " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### produces sample quantiles corresponding to the given probabilities.\n", + "\n", + "##### Args:\n", + "  `x`: The data to sample \n", + "  `probs`: numeric vector of probabilities with values in [0,1] \n", + "  `na_rm`: if true, any ‘NA’ and ‘NaN’'s are removed from ‘x’ \n", + "    before the quantiles are computed. \n", + "\n", + "  `quantile`: {'linear', 'lower', 'higher', 'midpoint', 'nearest'} \n", + "    This optional parameter specifies the interpolation method to use, \n", + "    when the desired quantile lies between two data points i and j. 
\n", + "    fractional part of the index surrounded by i and j. \n", + "\n", + "    - lower: i.\n", + "\n", + "    - higher: j.\n", + "\n", + "    - nearest: i or j whichever is nearest.\n", + "\n", + "    - midpoint: (i + j) / 2.\n", + "\n", + "##### Returns:\n", + "  An array of quantile values \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # std " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Get standard deviation of the input\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # weighted_mean " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Calculate weighted mean\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from datar import options\n", + "options(warn_builtin_names=False)\n", + "\n", + "from datar.all import *\n", + "\n", + "%run nb_helpers.py\n", + "debug_kwargs = {'prefix': '\\n', 'sep': f'\\n{\"-\" * 20}\\n'}\n", + "nb_header(\n", + " sum, prod, mean, median, min, max, var, pmin, pmax,\n", + " round, sqrt, abs, sign, trunc, ceiling, floor, signif,\n", + " log, exp, log2, log10, log1p, cov, scale, col_sums,\n", + " row_sums, col_means, row_means, col_sds, row_sds,\n", + " col_medians, row_medians, quantile, sd, weighted_mean\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "sum([1,2,4,6])\n", + "--------------------\n", + "13\n", + "\n", + "prod([1,2,4,6])\n", + "--------------------\n", + "48\n", + "\n", + "mean([1,2,4,6])\n", + "--------------------\n", + "3.25\n", + "\n", + "median([1,2,4,6])\n", + "--------------------\n", + "3.0\n", + "\n", + "min([1,2,4,6])\n", + "--------------------\n", + "1\n", + "\n", + "max([1,2,4,6])\n", + "--------------------\n", + "6\n", + "\n", + "var([1,2,4,6])\n", + "--------------------\n", + "4.916666666666667\n", + "\n", + "pmin([1,4], [2,3])\n", + "--------------------\n", + "0 1\n", + "1 3\n", + "dtype: int64\n", + "\n", + "pmax([1,4], [2,3])\n", + "--------------------\n", + "0 2\n", + "1 4\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "\n", + "debug(\n", + " sum([1,2,4,6]),\n", + " prod([1,2,4,6]),\n", + " mean([1,2,4,6]),\n", + " median([1,2,4,6]),\n", + " min([1,2,4,6]),\n", + " max([1,2,4,6]),\n", + " var([1,2,4,6]),\n", + " pmin([1,4], [2,3]),\n", + " pmax([1,4], [2,3]),\n", + " **debug_kwargs\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "round([1.4, 1.5])\n", + "--------------------\n", + "array([1., 2.])\n", + "\n", + "sqrt([1.1, 2.1])\n", + "--------------------\n", + "array([1.04880885, 1.44913767])\n", + "\n", + "abs([1, -1])\n", + "--------------------\n", + "array([1, 1])\n", + "\n", + "sign([10, -10])\n", + "--------------------\n", + "array([ 1, -1])\n", + "\n", + "trunc([1.1, 2.1])\n", + "--------------------\n", + "array([1., 2.])\n", + "\n", + "ceiling([1.1, 2.1])\n", + "--------------------\n", + "array([2., 3.])\n", + "\n", + "floor([1.1, 2.1])\n", + "--------------------\n", + "array([1., 2.])\n", + "\n", + "signif(3.14567e-10, 3)\n", + 
"--------------------\n", + "array([3.15e-10])\n" + ] + } + ], + "source": [ + "debug(\n", + " round([1.4, 1.5]),\n", + " sqrt([1.1, 2.1]),\n", + " abs([1, -1]),\n", + " sign([10, -10]),\n", + " trunc([1.1, 2.1]),\n", + " ceiling([1.1, 2.1]),\n", + " floor([1.1, 2.1]),\n", + " signif(3.14567e-10, 3),\n", + " **debug_kwargs\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "log(exp(2))\n", + "--------------------\n", + "array([2.])\n", + "\n", + "exp(2)\n", + "--------------------\n", + "array([7.3890561])\n", + "\n", + "log2(4)\n", + "--------------------\n", + "array([2.])\n", + "\n", + "log10(100)\n", + "--------------------\n", + "array([2.])\n", + "\n", + "log1p(exp(1)-1)\n", + "--------------------\n", + "array([1.])\n", + "\n", + "cov([1, 2, 3], [3, 2, 1])\n", + "--------------------\n", + "-1.0\n", + "\n", + "scale([1, 2, 3])\n", + "--------------------\n", + "0 -1.0\n", + "1 0.0\n", + "2 1.0\n", + "Name: scaled, dtype: float64\n" + ] + } + ], + "source": [ + "debug(\n", + " log(exp(2)),\n", + " exp(2),\n", + " log2(4),\n", + " log10(100),\n", + " log1p(exp(1)-1),\n", + " cov([1, 2, 3], [3, 2, 1]),\n", + " scale([1, 2, 3]),\n", + " **debug_kwargs\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "df\n", + "--------------------\n", + " v1 v2 v3\n", + " \n", + "0 -0.527205 -0.563083 -1.030619\n", + "1 -0.652748 -0.620141 -0.410482\n", + "2 -1.464778 1.274939 0.505686\n", + "3 0.505498 -0.103748 0.805596\n", + "4 1.431942 -0.759830 -0.755049\n", + "\n", + "col_sums(df)\n", + "--------------------\n", + "v1 -0.707291\n", + "v2 -0.771863\n", + "v3 -0.884868\n", + "dtype: float64\n", + "\n", + "row_sums(df)\n", + "--------------------\n", + "0 -2.120907\n", + "1 -1.683371\n", + "2 0.315848\n", + "3 1.207346\n", + "4 -0.082938\n", + "dtype: float64\n", + "\n", + "col_means(df)\n", + "--------------------\n", + "v1 -0.141458\n", + "v2 -0.154373\n", + "v3 -0.176974\n", + "dtype: float64\n", + "\n", + "row_means(df)\n", + "--------------------\n", + "0 -0.706969\n", + "1 -0.561124\n", + "2 0.105283\n", + "3 0.402449\n", + "4 -0.027646\n", + "dtype: float64\n", + "\n", + "col_sds(df)\n", + "--------------------\n", + "v1 1.124226\n", + "v2 0.836071\n", + "v3 0.798260\n", + "dtype: float64\n", + "\n", + "row_sds(df)\n", + "--------------------\n", + "0 0.280863\n", + "1 0.131475\n", + "2 1.413066\n", + "3 0.463348\n", + "4 1.264042\n", + "dtype: float64\n", + "\n", + "col_medians(df)\n", + "--------------------\n", + "v1 -0.527205\n", + "v2 -0.563083\n", + "v3 -0.410482\n", + "dtype: float64\n", + "\n", + "row_medians(df)\n", + "--------------------\n", + "0 -0.563083\n", + "1 -0.620141\n", + "2 0.505686\n", + "3 0.505498\n", + "4 -0.755049\n", + "dtype: float64\n" + ] + } + ], + "source": [ + "# column and row stats\n", + "\n", + "df = tribble(f.v1, f.v2, f.v3, *rnorm(15))\n", + "debug(\n", + " df,\n", + " col_sums(df),\n", + " row_sums(df),\n", + " col_means(df),\n", + " row_means(df),\n", + " col_sds(df),\n", + " row_sds(df),\n", + " col_medians(df),\n", + " row_medians(df),\n", + " **debug_kwargs,\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "quantile([1, 2, 3, 4, 5])\n", + "--------------------\n", + "0.00 
1.0\n", + "0.25 2.0\n", + "0.50 3.0\n", + "0.75 4.0\n", + "1.00 5.0\n", + "Name: x, dtype: float64\n", + "\n", + "quantile([1, 2, 3, 4, 5], [0, 1])\n", + "--------------------\n", + "0.0 1.0\n", + "1.0 5.0\n", + "Name: x, dtype: float64\n", + "\n", + "sd([1, 2, 3, 4, 5])\n", + "--------------------\n", + "1.5811388300841898\n", + "\n", + "weighted_mean([1, 2, 3, 4, 5])\n", + "--------------------\n", + "3.0\n", + "\n", + "weighted_mean([1, 2, 3, 4, 5], [5, 4, 3, 2, 1])\n", + "--------------------\n", + "2.3333333333333335\n" + ] + } + ], + "source": [ + "debug(\n", + " quantile([1, 2, 3, 4, 5]),\n", + " quantile([1, 2, 3, 4, 5], [0, 1]),\n", + " sd([1, 2, 3, 4, 5]),\n", + " weighted_mean([1, 2, 3, 4, 5]),\n", + " weighted_mean([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]),\n", + " **debug_kwargs,\n", + ")" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "9ed5c94d10bf621c6841991b7e31ffd0f3c8de8ec4167710459737a50edc58e4" + }, + "kernelspec": { + "display_name": "Python 3.9.5 ('base')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/notebooks/base-funs.ipynb b/docs/notebooks/base-funs.ipynb new file mode 100644 index 00000000..6b8dd737 --- /dev/null +++ b/docs/notebooks/base-funs.ipynb @@ -0,0 +1,545 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Try this notebook on binder.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # cut " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Divides the range of x into intervals and codes the values in x\n", + "according to which interval they fall. The leftmost interval corresponds \n", + "to level one, the next leftmost to level two and so on. \n", + "\n", + "##### Args:\n", + "  `x`: a numeric vector which is to be converted to a factor by cutting. \n", + "  `breaks`: either a numeric vector of two or more unique cut points or \n", + "    a single number (greater than or equal to 2) giving the number of \n", + "    intervals into which x is to be cut. \n", + "\n", + "  `labels`: labels for the levels of the resulting category. By default, \n", + "    labels are constructed using \"(a,b]\" interval notation. \n", + "    If labels = False, simple integer codes are returned instead \n", + "    of a factor. \n", + "\n", + "  `include_lowest`: bool, indicating if an ‘x[i]` equal to the lowest \n", + "    (or highest, for right = FALSE) ‘breaks’ value should be included. \n", + "\n", + "  `right`: bool, indicating if the intervals should be closed on the right \n", + "    (and open on the left) or vice versa. \n", + "\n", + "  precision:integer which is used when labels are not given. It determines \n", + "    the precision used in formatting the break numbers. Note, this \n", + "    argument is different from R's API, which is dig.lab. \n", + "  `ordered_result`: bool, should the result be an ordered categorical? \n", + "\n", + "##### Returns:\n", + "  A categorical object with the cuts \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # diff " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Calculates suitably lagged and iterated differences.\n", + "\n", + "If the data is a vector of length n and differences = 1, then the computed \n", + "result is equal to the successive differences \n", + "`x[lag:] – x[:-lag]`. \n", + "\n", + "##### Examples:\n", + "  >>> rv = [52, 21, 10, 11, 19] \n", + "  >>> data = diff(rv) \n", + "  >>> # -31 -11 1 8 \n", + "  >>> # rv[1:] - rv[:-1] \n", + "  >>> # rv[1:] [21, 10, 11, 19] \n", + "  >>> # rv[:-1] [52, 21, 10, 11] \n", + "\n", + "##### Args:\n", + "  `x`: The data \n", + "  `lag`: The lag to use. Could be negative. \n", + "    It always calculates `x[lag:] - x[:-lag]` even when `lag` is negative \n", + "\n", + "  `differences`: The order of the difference \n", + "\n", + "##### Returns:\n", + "  An array of `x[lag:] – x[:-lag]`. 
\n", + "  If `differences > 1`, the rule applies `differences` times on `x` \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # identity " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Return whatever passed in\n", + "\n", + "Expression objects are evaluated using parent context \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # expandgrid " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Expand all combinations into a dataframe. R's `expand.grid()`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # outer " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Compute the outer product of two vectors.\n", + "\n", + "##### Args:\n", + "  `x`: The first vector \n", + "  `y`: The second vector \n", + "  `fun`: The function to handle how the result of the elements from \n", + "    the first and second vectors should be computed. \n", + "    The function has to be vectorized at the second argument, and \n", + "    return the same shape as y. \n", + "\n", + "##### Returns:\n", + "  The data frame of the outer product of x and y \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # make_names " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Make names available as columns and can be accessed by `df.`\n", + "\n", + "The names will be transformed using `python-slugify` with \n", + "`lowercase=False` and `separator=\"_\"`. When the first character is \n", + "a digit, preface it with \"_\". \n", + "\n", + "If `unique` is True, the results will be fed into \n", + "`datar.core.names.repair_names(names, \"unique\")` \n", + "\n", + "##### Args:\n", + "  `names`: The names \n", + "    if it is scalar, will make it into a list. \n", + "    Then all elements will be converted into strings \n", + "\n", + "  `unique`: Whether to make the names unique \n", + "\n", + "##### Returns:\n", + "  Converted names \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # make_unique " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Make the names unique.\n", + "\n", + "It's a shortcut for `make_names(names, unique=True)` \n", + "\n", + "##### Args:\n", + "  `names`: The names \n", + "    if it is scalar, will make it into a list. 
\n", + "    Then all elements will be converted into strings \n", + "\n", + "##### Returns:\n", + "  Converted names \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # rank " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Returns the sample ranks of the values in a vector.\n", + "\n", + "##### Args:\n", + "  `x`: A numeric vector \n", + "  `na_last`: for controlling the treatment of `NA`s. If `True`, missing \n", + "    values in the data are put last; if `False`, they are put \n", + "    first; if `\"keep\"` they are kept with rank `NA`. \n", + "\n", + "  `ties_method`: a character string specifying how ties are treated \n", + "    One of `average`, `first`, `dense`, `max`, and `min` \n", + "    Note that the ties_method candidates are different than the ones \n", + "    from R, because we are using `pandas.Series.rank()`. See \n", + "    https://pandas.pydata.org/docs/reference/api/pandas.Series.rank.html \n", + "\n", + "##### Returns:\n", + "  A numeric rank vector of the same length as `x` \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### # data_context " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "##### Evaluate verbs, functions in the\n", + "possibly modifying (a copy of) the original data. \n", + "\n", + "It mimic the `with` function in R, but you have to write it in a python way, \n", + "which is using the `with` statement. And you have to use it with `as`, since \n", + "we need the value returned by `__enter__`. \n", + "\n", + "##### Args:\n", + "  `data`: The data \n", + "  `func`: A function that is registered by \n", + "    `pipda.register_verb` or `pipda.register_func`. 
\n", + "\n", + "  `*args`: Arguments for func \n", + "  `**kwargs`: Keyword arguments for func \n", + "\n", + "##### Returns:\n", + "  The original or modified data \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from datar import options\n", + "options(warn_builtin_names=False)\n", + "\n", + "from datar.all import *\n", + "\n", + "%run nb_helpers.py\n", + "debug_kwargs = {'prefix': '\\n', 'sep': f'\\n{\"-\" * 20}\\n'}\n", + "nb_header(\n", + " cut, diff, identity, expandgrid, outer, \n", + " make_names, make_unique, rank, data_context\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2022-03-11 21:23:06][datar][WARNING] New names:\n", + "[2022-03-11 21:23:06][datar][WARNING] * '_1' -> '__0'\n", + "[2022-03-11 21:23:06][datar][WARNING] * '_1' -> '__1'\n", + "[2022-03-11 21:23:06][datar][WARNING] * '_1' -> '__2'\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "cut(seq(1,10), 3)\n", + "--------------------\n", + "[(0.99, 4.0], (0.99, 4.0], (0.99, 4.0], (0.99, 4.0], (4.0, 7.0], (4.0, 7.0], (4.0, 7.0], (7.0, 10.0], (7.0, 10.0], (7.0, 10.0]]\n", + "Categories (3, interval[float64, right]): [(0.99, 4.0] < (4.0, 7.0] < (7.0, 10.0]]\n", + "\n", + "diff([1, 2, 3])\n", + "--------------------\n", + "array([1, 1])\n", + "\n", + "identity(1.23)\n", + "--------------------\n", + "1.23\n", + "\n", + "expandgrid([1,2], [3,4])\n", + "--------------------\n", + " [1, 2] [3, 4]\n", + " \n", + "0 1 3\n", + "1 1 4\n", + "2 2 3\n", + "3 2 4\n", + "\n", + "outer([1,2], [3,4])\n", + "--------------------\n", + " 0 1\n", + " \n", + "0 3 4\n", + "1 6 8\n", + "\n", + "make_names([1, 2, 3])\n", + "--------------------\n", + "['_1', '_2', '_3']\n", + "\n", + "make_unique([1, 1, 1])\n", + "--------------------\n", + "['__0', '__1', '__2']\n", + "\n", + "rank([3, 4, 1, -1])\n", + "--------------------\n", + "array([3., 4., 2., 1.])\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
<int64><int64>
013
114
223
324
\n", + "
\n" + ], + "text/plain": [ + " a b\n", + " \n", + "0 1 3\n", + "1 1 4\n", + "2 2 3\n", + "3 2 4" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "debug(\n", + " cut(seq(1,10), 3), \n", + " diff([1, 2, 3]),\n", + " identity(1.23),\n", + " expandgrid([1,2], [3,4]),\n", + " outer([1,2], [3,4]),\n", + " make_names([1, 2, 3]),\n", + " make_unique([1, 1, 1]),\n", + " rank([3, 4, 1, -1]),\n", + " **debug_kwargs\n", + ")\n", + "with data_context(tibble(a=[1,2], b=[3,4])) as _:\n", + " expandgrid(f.a, f.b)" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "9ed5c94d10bf621c6841991b7e31ffd0f3c8de8ec4167710459737a50edc58e4" + }, + "kernelspec": { + "display_name": "Python 3.9.5 ('base')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/notebooks/base.ipynb b/docs/notebooks/base.ipynb index 5b3b9b13..47c5727c 100644 --- a/docs/notebooks/base.ipynb +++ b/docs/notebooks/base.ipynb @@ -228,247 +228,6 @@ ")" ] }, - { - "cell_type": "code", - "execution_count": 19, - "id": "117f6f03", - "metadata": { - "execution": { - "iopub.execute_input": "2021-07-16T22:27:46.670572Z", - "iopub.status.busy": "2021-07-16T22:27:46.669793Z", - "iopub.status.idle": "2021-07-16T22:27:46.707602Z", - "shell.execute_reply": "2021-07-16T22:27:46.707958Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "cut(seq(1,10), 3)\n", - "--------------------\n", - "[(0.99, 4.0], (0.99, 4.0], (0.99, 4.0], (0.99, 4.0], (4.0, 7.0], (4.0, 7.0], (4.0, 7.0], (7.0, 10.0], (7.0, 10.0], (7.0, 10.0]]\n", - "Categories (3, interval[float64, right]): [(0.99, 4.0] < (4.0, 7.0] < (7.0, 10.0]]\n", - "\n", - "identity(1.23)\n", - "--------------------\n", - "1.23\n", - "\n", - "expandgrid([1,2], [3,4])\n", - "--------------------\n", - " [1, 2] [3, 4]\n", - " \n", - "0 1 3\n", - "1 1 4\n", - "2 2 3\n", - "3 2 4\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
<int64><int64>
013
114
223
324
\n", - "
\n" - ], - "text/plain": [ - " a b\n", - " \n", - "0 1 3\n", - "1 1 4\n", - "2 2 3\n", - "3 2 4" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# funs\n", - "\n", - "debug(\n", - " cut(seq(1,10), 3), \n", - " identity(1.23),\n", - " expandgrid([1,2], [3,4]),\n", - " **debug_kwargs\n", - ")\n", - "with data_context(tibble(a=[1,2], b=[3,4])) as _:\n", - " expandgrid(f.a, f.b)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "2ccd25e3", - "metadata": { - "execution": { - "iopub.execute_input": "2021-07-16T22:27:46.747621Z", - "iopub.status.busy": "2021-07-16T22:27:46.747048Z", - "iopub.status.idle": "2021-07-16T22:27:46.927565Z", - "shell.execute_reply": "2021-07-16T22:27:46.926614Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "mean(arr)\n", - "--------------------\n", - "3.25\n", - "\n", - "median(arr)\n", - "--------------------\n", - "3.0\n", - "\n", - "min(arr)\n", - "--------------------\n", - "1\n", - "\n", - "max(arr)\n", - "--------------------\n", - "6\n", - "\n", - "sum(arr)\n", - "--------------------\n", - "13\n", - "\n", - "abs([1, -1])\n", - "--------------------\n", - "0 1\n", - "1 1\n", - "Name: x, dtype: int64\n", - "\n", - "round([1.4, 1.5])\n", - "--------------------\n", - "0 1.0\n", - "1 2.0\n", - "Name: x, dtype: float64\n", - "\n", - "all([True, False])\n", - "--------------------\n", - "False\n", - "\n", - "any([True, False])\n", - "--------------------\n", - "True\n", - "\n", - "pmin([1,4], [2,3])\n", - "--------------------\n", - "0 1\n", - "1 3\n", - "dtype: int64\n", - "\n", - "pmax([1,4], [2,3])\n", - "--------------------\n", - "0 2\n", - "1 4\n", - "dtype: int64\n", - "\n", - "var(arr)\n", - "--------------------\n", - "4.916666666666667\n", - "\n", - "ceiling([1.1, 2.1])\n", - "--------------------\n", - "0 2.0\n", - "1 3.0\n", - "Name: x, dtype: float64\n", - "\n", - "floor([1.1, 2.1])\n", - "--------------------\n", - "0 1.0\n", - "1 2.0\n", - "Name: x, dtype: float64\n", - "\n", - "sqrt([1.1, 2.1])\n", - "--------------------\n", - "0 1.048809\n", - "1 1.449138\n", - "Name: x, dtype: float64\n", - "\n", - "cov([1,2,3], [3,2,1])\n", - "--------------------\n", - "-1.0\n" - ] - } - ], - "source": [ - "# arithmetic\n", - "arr = [1,2,4,6]\n", - "debug(\n", - " mean(arr),\n", - " median(arr),\n", - " min(arr),\n", - " max(arr),\n", - " sum(arr),\n", - " abs([1, -1]),\n", - " round([1.4, 1.5]),\n", - " all([True, False]),\n", - " any([True, False]),\n", - " pmin([1,4], [2,3]),\n", - " pmax([1,4], [2,3]),\n", - " var(arr),\n", - " ceiling([1.1, 2.1]),\n", - " floor([1.1, 2.1]),\n", - " sqrt([1.1, 2.1]),\n", - " cov([1,2,3], [3,2,1]),\n", - " **debug_kwargs\n", - ")" - ] - }, { "cell_type": "code", "execution_count": 8, diff --git a/docs/notebooks/case_when.ipynb b/docs/notebooks/case_when.ipynb index d4b19366..686bb74d 100644 --- a/docs/notebooks/case_when.ipynb +++ b/docs/notebooks/case_when.ipynb @@ -17,16 +17,16 @@ "name": "stderr", "output_type": "stream", "text": [ - "[2022-03-06 00:30:25][datar][WARNING] Builtin name \"min\" has been overriden by datar.\n", - "[2022-03-06 00:30:25][datar][WARNING] Builtin name \"max\" has been overriden by datar.\n", - "[2022-03-06 00:30:25][datar][WARNING] Builtin name \"sum\" has been overriden by datar.\n", - "[2022-03-06 00:30:25][datar][WARNING] Builtin name \"abs\" has been overriden by datar.\n", - "[2022-03-06 00:30:25][datar][WARNING] Builtin name \"round\" has been overriden by 
datar.\n", - "[2022-03-06 00:30:25][datar][WARNING] Builtin name \"all\" has been overriden by datar.\n", - "[2022-03-06 00:30:25][datar][WARNING] Builtin name \"any\" has been overriden by datar.\n", - "[2022-03-06 00:30:25][datar][WARNING] Builtin name \"re\" has been overriden by datar.\n", - "[2022-03-06 00:30:26][datar][WARNING] Builtin name \"filter\" has been overriden by datar.\n", - "[2022-03-06 00:30:26][datar][WARNING] Builtin name \"slice\" has been overriden by datar.\n" + "[2022-03-10 12:56:17][datar][WARNING] Builtin name \"min\" has been overriden by datar.\n", + "[2022-03-10 12:56:17][datar][WARNING] Builtin name \"max\" has been overriden by datar.\n", + "[2022-03-10 12:56:17][datar][WARNING] Builtin name \"sum\" has been overriden by datar.\n", + "[2022-03-10 12:56:17][datar][WARNING] Builtin name \"abs\" has been overriden by datar.\n", + "[2022-03-10 12:56:17][datar][WARNING] Builtin name \"round\" has been overriden by datar.\n", + "[2022-03-10 12:56:17][datar][WARNING] Builtin name \"all\" has been overriden by datar.\n", + "[2022-03-10 12:56:17][datar][WARNING] Builtin name \"any\" has been overriden by datar.\n", + "[2022-03-10 12:56:17][datar][WARNING] Builtin name \"re\" has been overriden by datar.\n", + "[2022-03-10 12:56:17][datar][WARNING] Builtin name \"filter\" has been overriden by datar.\n", + "[2022-03-10 12:56:17][datar][WARNING] Builtin name \"slice\" has been overriden by datar.\n" ] }, { @@ -100,57 +100,12 @@ { "data": { "text/plain": [ - "0 1\n", - "1 2\n", - "2 3\n", - "3 4\n", - "4 fizz\n", - "5 6\n", - "6 buzz\n", - "7 8\n", - "8 9\n", - "9 fizz\n", - "10 11\n", - "11 12\n", - "12 13\n", - "13 buzz\n", - "14 fizz\n", - "15 16\n", - "16 17\n", - "17 18\n", - "18 19\n", - "19 fizz\n", - "20 buzz\n", - "21 22\n", - "22 23\n", - "23 24\n", - "24 fizz\n", - "25 26\n", - "26 27\n", - "27 buzz\n", - "28 29\n", - "29 fizz\n", - "30 31\n", - "31 32\n", - "32 33\n", - "33 34\n", - "34 fizz buzz\n", - "35 36\n", - "36 37\n", - "37 38\n", - "38 39\n", - "39 fizz\n", - "40 41\n", - "41 buzz\n", - "42 43\n", - "43 44\n", - "44 fizz\n", - "45 46\n", - "46 47\n", - "47 48\n", - "48 buzz\n", - "49 fizz\n", - "Name: y, dtype: object" + "array(['1', '2', '3', '4', 'fizz', '6', 'buzz', '8', '9', 'fizz', '11',\n", + " '12', '13', 'buzz', 'fizz', '16', '17', '18', '19', 'fizz', 'buzz',\n", + " '22', '23', '24', 'fizz', '26', '27', 'buzz', '29', 'fizz', '31',\n", + " '32', '33', '34', 'fizz buzz', '36', '37', '38', '39', 'fizz',\n", + " '41', 'buzz', '43', '44', 'fizz', '46', '47', '48', 'buzz', 'fizz'],\n", + " dtype=object)" ] }, "execution_count": 2, @@ -165,7 +120,7 @@ " f.x % 5 == 0, \"fizz\",\n", " f.x % 7 == 0, \"buzz\",\n", " True, as_character(f.x)\n", - ")) >> pull(f.y)" + ")) >> pull(f.y, to=\"array\")" ] }, { @@ -184,57 +139,11 @@ { "data": { "text/plain": [ - "0 1\n", - "1 2\n", - "2 3\n", - "3 4\n", - "4 5\n", - "5 6\n", - "6 7\n", - "7 8\n", - "8 9\n", - "9 10\n", - "10 11\n", - "11 12\n", - "12 13\n", - "13 14\n", - "14 15\n", - "15 16\n", - "16 17\n", - "17 18\n", - "18 19\n", - "19 20\n", - "20 21\n", - "21 22\n", - "22 23\n", - "23 24\n", - "24 25\n", - "25 26\n", - "26 27\n", - "27 28\n", - "28 29\n", - "29 30\n", - "30 31\n", - "31 32\n", - "32 33\n", - "33 34\n", - "34 35\n", - "35 36\n", - "36 37\n", - "37 38\n", - "38 39\n", - "39 40\n", - "40 41\n", - "41 42\n", - "42 43\n", - "43 44\n", - "44 45\n", - "45 46\n", - "46 47\n", - "47 48\n", - "48 49\n", - "49 50\n", - "Name: y, dtype: object" + "array(['1', '2', '3', '4', '5', '6', '7', '8', '9', 
'10', '11', '12',\n", + " '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23',\n", + " '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34',\n", + " '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45',\n", + " '46', '47', '48', '49', '50'], dtype=object)" ] }, "execution_count": 3, @@ -248,7 +157,7 @@ " f.x % 5 == 0, \"fizz\",\n", " f.x % 7 == 0, \"buzz\",\n", " f.x % 35 == 0, \"fizz buzz\"\n", - ")) >> pull(f.y)" + ")) >> pull(f.y, to=\"array\")" ] }, { @@ -267,57 +176,11 @@ { "data": { "text/plain": [ - "0 NaN\n", - "1 NaN\n", - "2 NaN\n", - "3 NaN\n", - "4 fizz\n", - "5 NaN\n", - "6 buzz\n", - "7 NaN\n", - "8 NaN\n", - "9 fizz\n", - "10 NaN\n", - "11 NaN\n", - "12 NaN\n", - "13 buzz\n", - "14 fizz\n", - "15 NaN\n", - "16 NaN\n", - "17 NaN\n", - "18 NaN\n", - "19 fizz\n", - "20 buzz\n", - "21 NaN\n", - "22 NaN\n", - "23 NaN\n", - "24 fizz\n", - "25 NaN\n", - "26 NaN\n", - "27 buzz\n", - "28 NaN\n", - "29 fizz\n", - "30 NaN\n", - "31 NaN\n", - "32 NaN\n", - "33 NaN\n", - "34 fizz\n", - "35 NaN\n", - "36 NaN\n", - "37 NaN\n", - "38 NaN\n", - "39 fizz\n", - "40 NaN\n", - "41 buzz\n", - "42 NaN\n", - "43 NaN\n", - "44 fizz\n", - "45 NaN\n", - "46 NaN\n", - "47 NaN\n", - "48 buzz\n", - "49 fizz\n", - "Name: y, dtype: object" + "array([nan, nan, nan, nan, 'fizz', nan, 'buzz', nan, nan, 'fizz', nan,\n", + " nan, nan, 'buzz', 'fizz', nan, nan, nan, nan, 'fizz', 'buzz', nan,\n", + " nan, nan, 'fizz', nan, nan, 'buzz', nan, 'fizz', nan, nan, nan,\n", + " nan, 'fizz', nan, nan, nan, nan, 'fizz', nan, 'buzz', nan, nan,\n", + " 'fizz', nan, nan, nan, 'buzz', 'fizz'], dtype=object)" ] }, "execution_count": 4, @@ -330,7 +193,7 @@ " f.x % 5 == 0, \"fizz\",\n", " f.x % 7 == 0, \"buzz\",\n", " f.x % 35 == 0, \"fizz buzz\"\n", - ")) >> pull(f.y)" + ")) >> pull(f.y, to=\"array\")" ] }, { @@ -349,57 +212,13 @@ { "data": { "text/plain": [ - "0 1.0\n", - "1 nope\n", - "2 nope\n", - "3 nope\n", - "4 fizz\n", - "5 6.0\n", - "6 buzz\n", - "7 8.0\n", - "8 9.0\n", - "9 fizz\n", - "10 11.0\n", - "11 12.0\n", - "12 13.0\n", - "13 buzz\n", - "14 fizz\n", - "15 16.0\n", - "16 17.0\n", - "17 18.0\n", - "18 19.0\n", - "19 fizz\n", - "20 buzz\n", - "21 22.0\n", - "22 23.0\n", - "23 24.0\n", - "24 fizz\n", - "25 26.0\n", - "26 27.0\n", - "27 buzz\n", - "28 29.0\n", - "29 fizz\n", - "30 31.0\n", - "31 32.0\n", - "32 33.0\n", - "33 34.0\n", - "34 fizz buzz\n", - "35 36.0\n", - "36 37.0\n", - "37 38.0\n", - "38 39.0\n", - "39 fizz\n", - "40 41.0\n", - "41 buzz\n", - "42 43.0\n", - "43 44.0\n", - "44 fizz\n", - "45 46.0\n", - "46 47.0\n", - "47 48.0\n", - "48 buzz\n", - "49 fizz\n", - "Name: y, dtype: object" + "array(['1.0', 'nope', 'nope', 'nope', 'fizz', '6.0', 'buzz', '8.0', '9.0',\n", + " 'fizz', '11.0', '12.0', '13.0', 'buzz', 'fizz', '16.0', '17.0',\n", + " '18.0', '19.0', 'fizz', 'buzz', '22.0', '23.0', '24.0', 'fizz',\n", + " '26.0', '27.0', 'buzz', '29.0', 'fizz', '31.0', '32.0', '33.0',\n", + " '34.0', 'fizz buzz', '36.0', '37.0', '38.0', '39.0', 'fizz',\n", + " '41.0', 'buzz', '43.0', '44.0', 'fizz', '46.0', '47.0', '48.0',\n", + " 'buzz', 'fizz'], dtype=object)" ] }, "execution_count": 5, @@ -416,7 +235,7 @@ " f.x % 7 == 0, \"buzz\",\n", " is_na(f.x), \"nope\",\n", " True, as_character(f.x)\n", - ")) >> pull(f.y)" + ")) >> pull(f.y, to=\"array\")" ] }, { @@ -435,57 +254,13 @@ { "data": { "text/plain": [ - "0 1.0\n", - "1 NaN\n", - "2 NaN\n", - "3 NaN\n", - "4 fizz\n", - "5 6.0\n", - "6 buzz\n", - "7 8.0\n", - "8 9.0\n", - "9 fizz\n", - "10 11.0\n", - "11 12.0\n", - "12 
13.0\n", - "13 buzz\n", - "14 fizz\n", - "15 16.0\n", - "16 17.0\n", - "17 18.0\n", - "18 19.0\n", - "19 fizz\n", - "20 buzz\n", - "21 22.0\n", - "22 23.0\n", - "23 24.0\n", - "24 fizz\n", - "25 26.0\n", - "26 27.0\n", - "27 buzz\n", - "28 29.0\n", - "29 fizz\n", - "30 31.0\n", - "31 32.0\n", - "32 33.0\n", - "33 34.0\n", - "34 NaN\n", - "35 36.0\n", - "36 37.0\n", - "37 38.0\n", - "38 39.0\n", - "39 fizz\n", - "40 41.0\n", - "41 buzz\n", - "42 43.0\n", - "43 44.0\n", - "44 fizz\n", - "45 46.0\n", - "46 47.0\n", - "47 48.0\n", - "48 buzz\n", - "49 fizz\n", - "Name: y, dtype: object" + "array(['1.0', nan, nan, nan, 'fizz', '6.0', 'buzz', '8.0', '9.0', 'fizz',\n", + " '11.0', '12.0', '13.0', 'buzz', 'fizz', '16.0', '17.0', '18.0',\n", + " '19.0', 'fizz', 'buzz', '22.0', '23.0', '24.0', 'fizz', '26.0',\n", + " '27.0', 'buzz', '29.0', 'fizz', '31.0', '32.0', '33.0', '34.0',\n", + " nan, '36.0', '37.0', '38.0', '39.0', 'fizz', '41.0', 'buzz',\n", + " '43.0', '44.0', 'fizz', '46.0', '47.0', '48.0', 'buzz', 'fizz'],\n", + " dtype=object)" ] }, "execution_count": 6, @@ -499,7 +274,7 @@ " f.x % 5 == 0, \"fizz\",\n", " f.x % 7 == 0, \"buzz\",\n", " True, as_character(f.x)\n", - ")) >> pull(f.y)" + ")) >> pull(f.y, to=\"array\")" ] }, { @@ -518,57 +293,10 @@ { "data": { "text/plain": [ - "0 NaN\n", - "1 NaN\n", - "2 NaN\n", - "3 NaN\n", - "4 5.0\n", - "5 NaN\n", - "6 7.0\n", - "7 NaN\n", - "8 NaN\n", - "9 5.0\n", - "10 NaN\n", - "11 NaN\n", - "12 NaN\n", - "13 7.0\n", - "14 5.0\n", - "15 NaN\n", - "16 NaN\n", - "17 NaN\n", - "18 NaN\n", - "19 5.0\n", - "20 7.0\n", - "21 NaN\n", - "22 NaN\n", - "23 NaN\n", - "24 5.0\n", - "25 NaN\n", - "26 NaN\n", - "27 7.0\n", - "28 NaN\n", - "29 5.0\n", - "30 NaN\n", - "31 NaN\n", - "32 NaN\n", - "33 NaN\n", - "34 35.0\n", - "35 NaN\n", - "36 NaN\n", - "37 NaN\n", - "38 NaN\n", - "39 5.0\n", - "40 NaN\n", - "41 7.0\n", - "42 NaN\n", - "43 NaN\n", - "44 5.0\n", - "45 NaN\n", - "46 NaN\n", - "47 NaN\n", - "48 7.0\n", - "49 5.0\n", - "Name: y, dtype: float64" + "array([nan, nan, nan, nan, 5., nan, 7., nan, nan, 5., nan, nan, nan,\n", + " 7., 5., nan, nan, nan, nan, 5., 7., nan, nan, nan, 5., nan,\n", + " nan, 7., nan, 5., nan, nan, nan, nan, 35., nan, nan, nan, nan,\n", + " 5., nan, 7., nan, nan, 5., nan, nan, nan, 7., 5.])" ] }, "execution_count": 7, @@ -582,7 +310,7 @@ " f.x % 5 == 0, 5,\n", " f.x % 7 == 0, 7,\n", " True, NA)\n", - ") >> pull(f.y)" + ") >> pull(f.y, to=\"array\")" ] }, { @@ -609,16 +337,8 @@ { "data": { "text/plain": [ - "0 -2.000000\n", - "1 -1.500000\n", - "2 -1.000000\n", - "3 -0.500000\n", - "4 0.000000\n", - "5 0.707107\n", - "6 1.000000\n", - "7 1.224745\n", - "8 1.414214\n", - "Name: y, dtype: float64" + "array([-2. , -1.5 , -1. , -0.5 , 0. ,\n", + " 0.70710678, 1. 
, 1.22474487, 1.41421356])" ] }, "execution_count": 8, @@ -631,12 +351,12 @@ "df >> mutate(y=case_when(\n", " f.x >= 0, sqrt(f.x),\n", " True, f.x\n", - ")) >> pull(f.y)" + ")) >> pull(f.y, to=\"array\")" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "id": "intermediate-edmonton", "metadata": { "execution": { @@ -670,6 +390,7 @@ " \n", " name\n", " height\n", + " mass\n", " gender\n", " species\n", " type\n", @@ -680,6 +401,7 @@ " \n", " <object>\n", " <float64>\n", + " <float64>\n", " <object>\n", " <object>\n", " <object>\n", @@ -688,6 +410,7 @@ " 0\n", " Luke Skywalker\n", " 172.0\n", + " 77.0\n", " masculine\n", " Human\n", " other\n", @@ -696,6 +419,7 @@ " 1\n", " C-3PO\n", " 167.0\n", + " 75.0\n", " masculine\n", " Droid\n", " robot\n", @@ -704,6 +428,7 @@ " 2\n", " R2-D2\n", " 96.0\n", + " 32.0\n", " masculine\n", " Droid\n", " robot\n", @@ -712,6 +437,7 @@ " 3\n", " Darth Vader\n", " 202.0\n", + " 136.0\n", " masculine\n", " Human\n", " large\n", @@ -723,11 +449,13 @@ " ...\n", " ...\n", " ...\n", + " ...\n", " \n", " \n", " 4\n", " Leia Organa\n", " 150.0\n", + " 49.0\n", " feminine\n", " Human\n", " other\n", @@ -736,6 +464,7 @@ " 82\n", " Rey\n", " NaN\n", + " NaN\n", " feminine\n", " Human\n", " other\n", @@ -744,6 +473,7 @@ " 83\n", " Poe Dameron\n", " NaN\n", + " NaN\n", " masculine\n", " Human\n", " other\n", @@ -752,6 +482,7 @@ " 84\n", " BB8\n", " NaN\n", + " NaN\n", " masculine\n", " Droid\n", " robot\n", @@ -762,50 +493,52 @@ " NaN\n", " NaN\n", " NaN\n", + " NaN\n", " other\n", " \n", " \n", " 86\n", " Padmé Amidala\n", " 165.0\n", + " 45.0\n", " feminine\n", " Human\n", " other\n", " \n", " \n", "\n", - "

87 rows × 5 columns\n",
+      "87 rows × 6 columns
\n", "\n" ], "text/plain": [ - " name height gender species type\n", - " \n", - "0 Luke Skywalker 172.0 masculine Human other\n", - "1 C-3PO 167.0 masculine Droid robot\n", - "2 R2-D2 96.0 masculine Droid robot\n", - "3 Darth Vader 202.0 masculine Human large\n", - ".. ... ... ... ... ...\n", - "4 Leia Organa 150.0 feminine Human other\n", - "82 Rey NaN feminine Human other\n", - "83 Poe Dameron NaN masculine Human other\n", - "84 BB8 NaN masculine Droid robot\n", - "85 Captain Phasma NaN NaN NaN other\n", - "86 Padmé Amidala 165.0 feminine Human other\n", + " name height mass gender species type\n", + " \n", + "0 Luke Skywalker 172.0 77.0 masculine Human other\n", + "1 C-3PO 167.0 75.0 masculine Droid robot\n", + "2 R2-D2 96.0 32.0 masculine Droid robot\n", + "3 Darth Vader 202.0 136.0 masculine Human large\n", + ".. ... ... ... ... ... ...\n", + "4 Leia Organa 150.0 49.0 feminine Human other\n", + "82 Rey NaN NaN feminine Human other\n", + "83 Poe Dameron NaN NaN masculine Human other\n", + "84 BB8 NaN NaN masculine Droid robot\n", + "85 Captain Phasma NaN NaN NaN NaN other\n", + "86 Padmé Amidala 165.0 45.0 feminine Human other\n", "\n", - "[87 rows x 5 columns]" + "[87 rows x 6 columns]" ] }, - "execution_count": 9, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "starwars >> \\\n", - " select(f[f.name:f.mass], f.gender, f.species) >> \\\n", + " select(f[f.name:f.hair_color], f.gender, f.species) >> \\\n", " mutate(\n", " type = case_when(\n", - " f.height > 200 or f.mass > 200, \"large\",\n", + " (f.height > 200) | (f.mass > 200), \"large\",\n", " f.species == \"Droid\" , \"robot\",\n", " True , \"other\"\n", " )\n", @@ -828,18 +561,19 @@ { "data": { "text/plain": [ - "0 other\n", - "1 robot\n", - "2 robot\n", - "3 large\n", - "4 other\n", - " ... 
\n", - "82 other\n", - "83 other\n", - "84 robot\n", - "85 other\n", - "86 other\n", - "Name: type, Length: 87, dtype: object" + "array(['other', 'robot', 'robot', 'large', 'other', 'other', 'other',\n", + " 'robot', 'other', 'other', 'other', 'other', 'large', 'other',\n", + " 'other', 'large', 'other', 'other', 'other', 'other', 'other',\n", + " 'robot', 'other', 'other', 'other', 'other', 'other', 'other',\n", + " 'other', 'other', 'other', 'other', 'other', 'other', 'large',\n", + " 'large', 'other', 'other', 'other', 'other', 'other', 'other',\n", + " 'other', 'other', 'other', 'other', 'other', 'other', 'other',\n", + " 'other', 'other', 'other', 'other', 'large', 'other', 'other',\n", + " 'other', 'other', 'other', 'other', 'other', 'other', 'other',\n", + " 'other', 'other', 'other', 'other', 'other', 'large', 'large',\n", + " 'other', 'other', 'robot', 'other', 'other', 'other', 'large',\n", + " 'large', 'other', 'other', 'large', 'other', 'other', 'other',\n", + " 'robot', 'other', 'other'], dtype=object)" ] }, "execution_count": 10, @@ -850,11 +584,11 @@ "source": [ "starwars >> \\\n", " mutate(type=case_when(\n", - " f.height > 200 or f.mass > 200, \"large\",\n", + " (f.height > 200) | (f.mass > 200), \"large\",\n", " f.species == \"Droid\", \"robot\",\n", " True, \"other\"\n", " )) >> \\\n", - " pull(f.type)" + " pull(f.type, to=\"array\")" ] }, { diff --git a/docs/reference-maps/base.md b/docs/reference-maps/base.md index 867b6ddf..764e69bc 100644 --- a/docs/reference-maps/base.md +++ b/docs/reference-maps/base.md @@ -11,7 +11,7 @@ ## Reference of `datar.base` -See [here](../stats.md) for APIs ported from `r-stats` and [here](../utils.md) for APIs ported from `r-utils` +See [here](../stats) for APIs ported from `r-stats` and [here](../utils) for APIs ported from `r-utils` **Legend:** @@ -43,29 +43,40 @@ See [here](../stats.md) for APIs ported from `r-stats` and [here](../utils.md) f ### Arithmetic functions |API|Description|Notebook example| |---|---|---:| -|[`mean()`][8]|Calculate the mean of the values|[:material-notebook:][4]| -|[`median()`][9]|Calculate the median of the values|[:material-notebook:][4]| -|[`min()`][10]|Calculate the min of the values|[:material-notebook:][4]| -|[`max()`][11]|Calculate the max of the values|[:material-notebook:][4]| -|[`pmin()`][12]|Calculate the min of the values rowwisely|[:material-notebook:][4]| -|[`pmax()`][13]|Calculate the max of the values rowwisely|[:material-notebook:][4]| -|[`sum()`][14]|Calculate the sum of the values|[:material-notebook:][4]| -|[`abs()`][15]|Calculate the absolute values of the values|[:material-notebook:][4]| -|[`round()`][16]|Round the numbers|[:material-notebook:][4]| -|[`var()`][17]|Calculate the variance of the values|[:material-notebook:][4]| -|[`ceiling()`][18]|Get the ceiling integers of the numbers|[:material-notebook:][4]| -|[`floor()`][19]|Get the floor integers of the numbers|[:material-notebook:][4]| -|[`sqrt()`][20]|Get the square root of the numbers|[:material-notebook:][4]| -|[`cov()`][21]|Calculate the covariance of the values|[:material-notebook:][4]| -|[`prod()`][117]|Calculate Product of the input|| -|[`sign()`][118]|Get the signs of the corresponding elements of x|| -|[`signif()`][125]|Rounds the values in its first argument to the specified number of significant digits|| -|[`trunc()`][119]|Get the integers truncated for each element in x|| -|[`exp()`][120]|Calculates the power of natural number|| -|[`log()`][121]|Computes logarithms, by default natural logarithm|| 
-|[`log2()`][122]|Computes logarithms with base 2|| -|[`log10()`][123]|Computes logarithms with base 10|| -|[`log1p()`][124]|Computes log(1+x)|| +|[`mean()`][8]|Calculate the mean of the values|[:material-notebook:][151]| +|[`median()`][9]|Calculate the median of the values|[:material-notebook:][151]| +|[`min()`][10]|Calculate the min of the values|[:material-notebook:][151]| +|[`max()`][11]|Calculate the max of the values|[:material-notebook:][151]| +|[`pmin()`][12]|Calculate the min of the values rowwisely|[:material-notebook:][151]| +|[`pmax()`][13]|Calculate the max of the values rowwisely|[:material-notebook:][151]| +|[`sum()`][14]|Calculate the sum of the values|[:material-notebook:][151]| +|[`abs()`][15]|Calculate the absolute values of the values|[:material-notebook:][151]| +|[`round()`][16]|Round the numbers|[:material-notebook:][151]| +|[`var()`][17]|Calculate the variance of the values|[:material-notebook:][151]| +|[`ceiling()`][18]|Get the ceiling integers of the numbers|[:material-notebook:][151]| +|[`floor()`][19]|Get the floor integers of the numbers|[:material-notebook:][151]| +|[`sqrt()`][20]|Get the square root of the numbers|[:material-notebook:][151]| +|[`cov()`][21]|Calculate the covariance of the values|[:material-notebook:][151]| +|[`prod()`][117]|Calculate Product of the input|[:material-notebook:][151]| +|[`sign()`][118]|Get the signs of the corresponding elements of x|[:material-notebook:][151]| +|[`signif()`][125]|Rounds the values in its first argument to the specified number of significant digits|[:material-notebook:][151]| +|[`trunc()`][119]|Get the integers truncated for each element in x|[:material-notebook:][151]| +|[`exp()`][120]|Calculates the power of natural number|[:material-notebook:][151]| +|[`log()`][121]|Computes logarithms, by default natural logarithm|[:material-notebook:][151]| +|[`log2()`][122]|Computes logarithms with base 2|[:material-notebook:][151]| +|[`log10()`][123]|Computes logarithms with base 10|[:material-notebook:][151]| +|[`log1p()`][124]|Computes log(1+x)|[:material-notebook:][151]| +|[`quantile()`][152]|Produces sample quantiles corresponding to the given probabilities.|[:material-notebook:][151]| +|[`sd()`, `std()`][153]|Computes the standard deviation of the values|[:material-notebook:][151]| +|[`weighted_mean()`][154]|Computes the weighted mean of the values|[:material-notebook:][151]| +|[`col_sums()`][155]|Computes column sums of a dataframe|[:material-notebook:][151]| +|[`row_sums()`][156]|Computes row sums of a dataframe|[:material-notebook:][151]| +|[`col_means()`][157]|Computes column means of a dataframe|[:material-notebook:][151]| +|[`row_means()`][158]|Computes row means of a dataframe|[:material-notebook:][151]| +|[`col_sds()`][159]|Computes column sds of a dataframe|[:material-notebook:][151]| +|[`row_sds()`][160]|Computes row sds of a dataframe|[:material-notebook:][151]| +|[`col_medians()`][161]|Computes column medians of a dataframe|[:material-notebook:][151]| +|[`row_medians()`][162]|Computes row medians of a dataframe|[:material-notebook:][151]| ### Bessel functions @@ -179,7 +190,7 @@ See [here](../stats.md) for APIs ported from `r-stats` and [here](../utils.md) f |[`sample`][64]|Sample the elements from sequence|[:material-notebook:][4]| |[`length`][65]|Get the length of data|[:material-notebook:][4]| |[`match`][129]|match returns a vector of the positions of (first) matches of its first argument in its second.|| -|[`rank`][143]|Returns the sample ranks of the values in a vector.|| +|[`rank`][143]|Returns the 
sample ranks of the values in a vector.|[:material-notebook:][163]| |[`order`][144]|Returns a permutation which rearranges its first argument into ascending or descending order|| |[`sort`][145]|Sorting or Ordering Vectors|| @@ -240,7 +251,7 @@ See [here](../stats.md) for APIs ported from `r-stats` and [here](../utils.md) f |[`is_integer`][93] [`is_int`][93]|Test if data is integer|[:material-notebook:][4]| |[`is_numeric`][94]|Test if data is numeric|[:material-notebook:][4]| |[`is_atomic`][95]|Test is data is atomic|[:material-notebook:][4]| -|[`is_element`][96] [`is_in`][96]|Test if value is an element of an array (R's `%in`)|[:material-notebook:][4]| +|[`is_element, `is_in`][96]|Test if value is an element of an array (R's `%in`)|[:material-notebook:][4]| ### Trigonometric and hyper bolic functions @@ -274,16 +285,18 @@ See [here](../stats.md) for APIs ported from `r-stats` and [here](../utils.md) f |API|Description|Notebook example| |---|---|---:| -|[`cut`][113]|Convert Numeric to Factor|[:material-notebook:][4]| -|[`identity`][114]|Identity Function|[:material-notebook:][4]| -|[`expandgrid`][115]|Create a Data Frame from All Combinations of Factor Variables|[:material-notebook:][4]| +|[`cut`][113]|Convert Numeric to Factor|[:material-notebook:][163]| +|[`diff`][164]|Returns suitably lagged and iterated differences.|[:material-notebook:][163]| +|[`identity`][114]|Identity Function|[:material-notebook:][163]| +|[`expandgrid`][115]|Create a Data Frame from All Combinations of Factor Variables|[:material-notebook:][163]| +|[`outer`][165]|Compute the outer product of two vectors.|[:material-notebook:][163]| |[`max_col`][136]|Find the maximum position for each row of a matrix|| |[`append`][147]|Add elements to a vector.|| |[`complete_cases`][137]|Get a bool array indicating whether the values of rows are complete in a data frame.|| |[`proportions`][147], [`prop_table`][147]|Returns conditional proportions given `margins`|| -|[`make_names`][137]|Make names available as columns and can be accessed by `df.`|| -|[`make_unique`][138]|Make the names unique, alias of `make_names(names, unique=True)`|| -|[**`data_context`**][116]|Mimic R's `with`|[:material-notebook:][4]| +|[`make_names`][137]|Make names available as columns and can be accessed by `df.`|[:material-notebook:][163]| +|[`make_unique`][138]|Make the names unique, alias of `make_names(names, unique=True)`|[:material-notebook:][163]| +|[**`data_context`**][116]|Mimic R's `with`|[:material-notebook:][163]| [1]: ../../api/datar.base.which/#datar.dplyr.which.which @@ -436,3 +449,18 @@ See [here](../stats.md) for APIs ported from `r-stats` and [here](../utils.md) f [148]: ../../api/datar.base.verbs/#datar.base.verbs.proportions [149]: ../../api/datar.base.string/#datar.base.string.trimws [150]: ../../api/datar.base.date/#datar.base.date.as_pd_date +[151]: ../../notebooks/base-arithmetic +[152]: ../../api/datar.base.arithmetic/#datar.base.arithmetic.quantile +[153]: ../../api/datar.base.arithmetic/#datar.base.arithmetic.sd +[154]: ../../api/datar.base.arithmetic/#datar.base.arithmetic.weighted_mean +[155]: ../../api/datar.base.arithmetic/#datar.base.arithmetic.col_sums +[156]: ../../api/datar.base.arithmetic/#datar.base.arithmetic.row_sums +[157]: ../../api/datar.base.arithmetic/#datar.base.arithmetic.col_means +[158]: ../../api/datar.base.arithmetic/#datar.base.arithmetic.row_means +[159]: ../../api/datar.base.arithmetic/#datar.base.arithmetic.col_sds +[160]: ../../api/datar.base.arithmetic/#datar.base.arithmetic.row_sds +[161]: 
../../api/datar.base.arithmetic/#datar.base.arithmetic.col_medians +[162]: ../../api/datar.base.arithmetic/#datar.base.arithmetic.row_medians +[163]: ../../notebooks/base-funs +[164]: ../../api/datar.base.funs/#datar.base.funs.diff +[165]: ../../api/datar.base.funs/#datar.base.funs.outer diff --git a/docs/reference-maps/stats.md b/docs/reference-maps/stats.md index 8268b412..d523d4a7 100644 --- a/docs/reference-maps/stats.md +++ b/docs/reference-maps/stats.md @@ -27,14 +27,8 @@ |[`rnorm()`][1]|Generates random deviates for the normal distribution|| |[`rpois()`][2]|Generates random deviates for the Poisson distribution|| |[`runif()`][3]|Generates random deviates for the uniform distribution|| -|[`quantile()`][4]|produces sample quantiles corresponding to the given probabilities.|| -|[`sd()`][5]|Computes the standard deviation of the values|| -|[`weighted_mean()`][6]|Computes the weighted mean of the values|| [1]: ../../api/datar.base.stats/#datar.base.stats.rnorm [2]: ../../api/datar.base.stats/#datar.base.stats.rpois [3]: ../../api/datar.base.stats/#datar.base.stats.runif -[4]: ../../api/datar.base.stats/#datar.base.stats.quantile -[5]: ../../api/datar.base.stats/#datar.base.stats.sd -[6]: ../../api/datar.base.stats/#datar.base.stats.weighted_mean diff --git a/docs/requirements.txt b/docs/requirements.txt index d499880c..75617edb 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,6 +5,7 @@ pymdown-extensions mkapi-fix mkdocs-jupyter ipykernel +ipython_genutils # to compile readme.ipynb plotnine klib diff --git a/mkdocs.yml b/mkdocs.yml index 94f14296..ecf9b04b 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -62,6 +62,8 @@ nav: 'add_row': 'notebooks/add_row.ipynb' 'arrange': 'notebooks/arrange.ipynb' 'base': 'notebooks/base.ipynb' + 'base-arithmetic': 'notebooks/base-arithmetic.ipynb' + 'base-funs': 'notebooks/base-funs.ipynb' 'between': 'notebooks/between.ipynb' 'bind': 'notebooks/bind.ipynb' 'case_when': 'notebooks/case_when.ipynb' diff --git a/pyproject.toml b/pyproject.toml index e6325319..0a309b9d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "datar" -version = "0.6.1" +version = "0.6.2" description = "Port of dplyr and other related R packages in python, using pipda." authors = ["pwwang "] readme = "README.md" diff --git a/tests/base/test_funs.py b/tests/base/test_funs.py index 2bfd5b0c..1c229a81 100644 --- a/tests/base/test_funs.py +++ b/tests/base/test_funs.py @@ -11,9 +11,10 @@ make_unique, data_context, expandgrid, + diff, rank, ) -from datar.base import table, pi, paste0, rnorm +from datar.base import table, pi, paste0, rnorm, cumsum, seq from ..conftest import assert_iterable_equal @@ -65,6 +66,14 @@ def test_cut(): assert str(ct[0]) == "(0.994, 3.0]" +def test_diff(): + x = cumsum(cumsum(seq(1, 10))) + assert_iterable_equal(diff(x, lag=2), x[2:] - x[:-2]) + assert_iterable_equal(diff(x, lag=2), seq(3, 10)**2) + + assert_iterable_equal(diff(diff(x)), diff(x, differences=2)) + + def test_identity(): assert identity(1) == 1 assert identity(1.23) == 1.23
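
The new `test_diff` above pins down the semantics of `base.diff()`. Below is a standalone, numpy-only sketch of the same checks, using plain numpy stand-ins for `seq()`/`cumsum()`; it illustrates the expected behaviour, not the datar implementation itself.

```python
# Mirror of the assertions in test_diff(), written with plain numpy.
import numpy as np

x = np.cumsum(np.cumsum(np.arange(1, 11)))  # like cumsum(cumsum(seq(1, 10)))

# diff(x, lag=2) is expected to equal x[2:] - x[:-2] ...
lag2 = x[2:] - x[:-2]
print(lag2)                                  # [  9  16  25  36  49  64  81 100]

# ... which is seq(3, 10) ** 2, as the test asserts
assert (lag2 == np.arange(3, 11) ** 2).all()

# diff(x, differences=2) applies the lag-1 rule twice, i.e. diff(diff(x))
d1 = x[1:] - x[:-1]
d2 = d1[1:] - d1[:-1]
print(d2)                                    # [ 3  4  5  6  7  8  9 10]
```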
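The notebook cells above also switch `f.height > 200 or f.mass > 200` to `(f.height > 200) | (f.mass > 200)`. A plain pandas illustration of why elementwise `|` (rather than Python's `or`) is needed for vectorized conditions is sketched below; the values are made up and this bypasses datar's lazy `f` expressions entirely.

```python
# `or` is not elementwise on pandas Series; `|` is.
import pandas as pd

height = pd.Series([172.0, 96.0, 202.0])
mass = pd.Series([77.0, 32.0, 250.0])

cond = (height > 200) | (mass > 200)   # elementwise boolean Series
print(cond.tolist())                   # [False, False, True]

# `height > 200 or mass > 200` would instead raise
# "ValueError: The truth value of a Series is ambiguous ...".
```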