0.3.2 (#34)

* Fix #28 * Fix #26, #29, #31 * Fix #38 * Add `str_dtype` argument to `as_character()` to partially fix #36 * 0.3.2 * Delete grouped2.py
pwwang · Jul 13, 2021 · c95a4b4 · c95a4b4
1 parent 0b68a31
commit c95a4b4
Show file tree

Hide file tree

Showing 19 changed files with 243 additions and 159 deletions.
diff --git a/datar/__init__.py b/datar/__init__.py
@@ -4,4 +4,4 @@
 from .core import _frame_format_patch
 from .core.defaults import f
 
-__version__ = "0.3.1"
+__version__ = "0.3.2"
diff --git a/datar/base/string.py b/datar/base/string.py
@@ -7,7 +7,7 @@
 from pipda import register_func
 
 from ..core.contexts import Context
-from ..core.types import IntOrIter, StringOrIter, is_scalar, is_null
+from ..core.types import Dtype, IntOrIter, StringOrIter, is_scalar, is_null
 from ..core.utils import (
     arg_match,
     get_option,
@@ -30,13 +30,14 @@
 
 
 @register_func(None, context=Context.EVAL)
-def as_character(x: Any, _na: Any = NA) -> StringOrIter:
+def as_character(x: Any, str_dtype: Dtype = str, _na: Any = NA) -> StringOrIter:
     """Convert an object or elements of an iterable into string
 
     Aliases `as_str` and `as_string`
 
     Args:
         x: The object
+        str_dtype: The string dtype to convert to
         _na: How NAs should be casted. Specify NA will keep them unchanged.
             But the dtype will be object then.
 
@@ -45,8 +46,7 @@ def as_character(x: Any, _na: Any = NA) -> StringOrIter:
         When x is iterable, convert elements of it into strings
         Otherwise, convert x to string.
     """
-    return _as_type(x, str, na=_na)
-
+    return _as_type(x, str_dtype, na=_na)
 
 as_str = as_string = as_character
 

diff --git a/datar/core/_frame_format_patch.py b/datar/core/_frame_format_patch.py
@@ -1,3 +1,36 @@
+# BSD 3-Clause License
+
+# Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc.
+# and PyData Development Team
+# All rights reserved.
+
+# Copyright (c) 2011-2021, Open source contributors.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 """Monkey-patch data frame format to
 1. add dtypes next to column names when printing
 2. collapse data frames when they are elements of a parent data frame.
@@ -37,12 +70,7 @@
 
 from .options import add_option
 
-# pylint: disable=c-extension-no-member
-# pylint: disable=invalid-name
-# pylint: disable=too-many-branches
-# pylint: disable=too-many-statements
-# pylint: disable=consider-using-enumerate
-# pylint: disable=too-many-nested-blocks
+# pylint: skip-file
 
 # TODO: patch more formatters
 

diff --git a/datar/core/operator.py b/datar/core/operator.py
@@ -12,6 +12,15 @@
 from .exceptions import DataUnrecyclable
 from .types import BoolOrIter
 
+class DatarOperatorMeta(type):
+    """Allow attributes with '_op_' to pass for operator functions"""
+    def __getattr__(cls, name: str) -> Any:
+        """If name starts with '_op_', let it go self for the real function
+        Otherwise, do regular getattr.
+        """
+        if name.startswith('_op_'):
+            return True
+        return super().__getattr__(name)
 
 @register_operator
 class DatarOperator(Operator):
@@ -30,19 +39,19 @@ def _arithmetize2(self, left: Any, right: Any, op: str) -> Any:
         left, right = _recycle_left_right(left, right)
         return op_func(left, right)
 
-    def invert(self, operand: Any) -> Any:
+    def _op_invert(self, operand: Any) -> Any:
         """Interpretation for ~x"""
-        if isinstance(operand, (slice, str, list, tuple, Collection)):
+        if isinstance(operand, (slice, str, list, tuple)):
             return Inverted(operand)
         return self._arithmetize1(operand, "invert")
 
-    def neg(self, operand: Any) -> Any:
+    def _op_neg(self, operand: Any) -> Any:
         """Interpretation for -x"""
         if isinstance(operand, (slice, list)):
             return Negated(operand)
         return self._arithmetize1(operand, "neg")
 
-    def and_(self, left: Any, right: Any) -> Any:
+    def _op_and_(self, left: Any, right: Any) -> Any:
         """Mimic the & operator in R.
 
         This has to have Expression objects to be involved to work
@@ -63,7 +72,7 @@ def and_(self, left: Any, right: Any) -> Any:
         right = Series(right).fillna(False)
         return left & right
 
-    def or_(self, left: Any, right: Any) -> Any:
+    def _op_or_(self, left: Any, right: Any) -> Any:
         """Mimic the & operator in R.
 
         This has to have Expression objects to be involved to work
@@ -84,9 +93,9 @@ def or_(self, left: Any, right: Any) -> Any:
         return left | right
 
     # pylint: disable=invalid-name
-    def ne(self, left: Any, right: Any) -> BoolOrIter:
+    def _op_ne(self, left: Any, right: Any) -> BoolOrIter:
         """Interpret for left != right"""
-        out = self.eq(left, right)
+        out = self._op_eq(left, right)
         if isinstance(out, (numpy.ndarray, Series)):
             neout = ~out
             # neout[pandas.isna(out)] = numpy.nan
@@ -96,11 +105,11 @@ def ne(self, left: Any, right: Any) -> BoolOrIter:
 
     def __getattr__(self, name: str) -> Any:
         """Other operators"""
-        if not hasattr(operator, name):
-            raise AttributeError
-        attr = partial(self._arithmetize2, op=name)
-        attr.__qualname__ = self._arithmetize2.__qualname__
-        return attr
+        if name.startswith('_op_'):
+            attr = partial(self._arithmetize2, op=name[4:])
+            attr.__qualname__ = self._arithmetize2.__qualname__
+            return attr
+        return super().__getattr__(name)
 
 
 def _recycle_left_right(left: Any, right: Any) -> Tuple:

diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
@@ -1,3 +1,8 @@
+## 0.3.2
+- Adopt `pipda` v0.4.1 to fix `getattr()` failure for operator-connected expressions (#38)
+- Add `str_dtype` argument to `as_character()` to partially fix #36
+- Update license in `core._frame_format_patch` (#28)
+
 ## 0.3.1
 - Adopt `pipda` v0.4.0
 - Change argument `_dtypes` to `dtypes_` for tibble-families

diff --git a/docs/caveats/NAs.md b/docs/caveats/NAs.md
@@ -0,0 +1,22 @@
+
+- dtype
+
+    `NA` in datar is set to `numpy.nan`, which is a float. This causes problems for data of other dtypes, because setting a value to `NA` (a float) in an array of another dtype is not compatible. Unlike R, Python does not have a missing-value type for other dtypes.
+
+    pandas has introduced its own `NA` and some `NA`-compatible dtypes. However, `numpy` is still not aware of them, which causes problems for internal computations.
+
+- string
+
+    When intentionally initializing a string array with `numpy.array(['a', NA])`, the `NA` will be converted to the string `'nan'`. That may not be what we want. To avoid that, use `None` or `NULL` instead:
+
+    ```python
+    >>> numpy.array(['a', None])
+    array(['a', None], dtype=object)
+    ```
+
+    Just pay attention that the dtype falls back to object.
+
+
+- `NaN`
+
+    Since `NA` is already a float, `NaN` here is equivalent to `NA`.
diff --git a/docs/caveats/df_index_colname.md b/docs/caveats/df_index_colname.md
@@ -0,0 +1,17 @@
+
+Most APIs from tidyverse packages ignore/reset the index (row names) of data frames, so do the APIs from `datar`. So when selecting rows, row indices are always used. With most APIs, the indices of the data frames are dropped, so they are actually ranging from 0 to `nrow(df) - 1`.
+
+!!! Note
+
+    When using 1-based indexing (default), 1 selects the first row, even though the first row shows index 0 when it's printed.
+
+`MultiIndex` indices/column names are not supported by the APIs that select or manipulate data frames, and the data frames generated by the APIs will not have `MultiIndex` indices/column names. However, since the result is still a pandas `DataFrame`, you can always do it the pandas way:
+
+```python
+df = tibble(x=1, y=2)
+df2 = df >> mutate(z=f.x+f.y)
+# pandas way to select
+df2.iloc[0, 2] # 3
+# add multiindex to it:
+df.columns = pd.MultiIndex.from_product([df.columns, ['C']])
+```
diff --git a/docs/caveats/grouped.md b/docs/caveats/grouped.md
@@ -0,0 +1,9 @@
+
+`datar` doesn't use `pandas`' `DataFrameGroupBy`/`SeriesGroupBy` classes. Instead, we have our own `DataFrameGroupBy` class, which is actually a subclass of `DataFrame`, with 3 extra properties: `_group_data`, `_group_vars` and `_group_drop`, carrying the grouping data, the grouping variables/columns and whether to drop the non-observable values. This is very similar to `grouped_df` from `dplyr`.
+
+The reasons that we implement this are:
+
+1. Pandas DataFrameGroupBy cannot handle multiple categorical columns as
+        groupby variables with non-observable values
+2. It is very hard to retrieve group indices and data when doing apply
+3. NAs unmatched in grouping variables
diff --git a/docs/caveats/in.md b/docs/caveats/in.md
@@ -0,0 +1,80 @@
+`%in%` in R is a shortcut for `is.element()` to test if the elements are in a container.
+
+```r
+r$> c(1,3,5) %in% 1:4
+[1]  TRUE  TRUE FALSE
+
+r$> is.element(c(1,3,5), 1:4)
+[1]  TRUE  TRUE FALSE
+```
+
+However, `in` in python acts differently:
+
+```python
+>>> import numpy as np
+>>>
+>>> arr = np.array([1,2,3,4])
+>>> elts = np.array([1,3,5])
+>>>
+>>> elts in arr
+/.../bin/bpython:1: DeprecationWarning: elementwise comparison failed; this will raise an error in the future.
+  #!/.../bin/python
+False
+>>> [1,2] in [1,2,3]
+False
+```
+
+It simply tests if the element on the left side of `in` is equal to any of the elements on the right side, regardless of whether the element on the left side is scalar or not.
+
+Yes, we can redefine this behavior by writing our own `__contains__()` method for the object on the right side. For example:
+
+```python
+>>> class MyList(list):
+...     def __contains__(self, key):
+...         # Just an example to let it return the reversed result
+...         return not super().__contains__(key)
+...
+>>> 1 in MyList([1,2,3])
+False
+>>> 4 in MyList([1,2,3])
+True
+```
+
+But the problem is that the result of `__contains__()` is forced to be a scalar bool by python. In this sense, we cannot let `x in y` be evaluated as a bool array or even a pipda `Expression` object.
+```python
+>>> class MyList(list):
+...     def __contains__(self, key):
+...         # Just an example
+...         return [True, False, True] # logically True in python
+...
+>>> 1 in MyList([1,2,3])
+True
+>>> 4 in MyList([1,2,3])
+True
+```
+
+So instead, we ported `is.element()` from R:
+
+```python
+>>> import numpy as np
+>>> from datar.base import is_element
+>>>
+>>> arr = np.array([1,2,3,4])
+>>> elts = np.array([1,3,5])
+>>>
+>>> is_element(elts, arr)
+>>> is_element(elts, arr)
+array([ True,  True, False])
+```
+
+Also, as @rleyvasal pointed out in https://github.com/pwwang/datar/issues/31#issuecomment-877499212,
+
+if the left element is a pandas `Series`, you can use its `isin()` method:
+```python
+>>> import pandas as pd
+>>> pd.Series(elts).isin(arr)
+0     True
+1     True
+2    False
+dtype: bool
+```
diff --git a/docs/indexing.md → docs/caveats/indexing.md b/docs/indexing.md → docs/caveats/indexing.md
@@ -20,6 +20,8 @@ In `R`, negative indexes mean removal. However, here negative indexes are still
 selection, as `-1` for the last column, `-2` for the second last, etc. It is
 the same for both 0-based and 1-based indexing.
 
+If you want to do negative selection, use tilde `~` instead of `-`.
+
 ## Temporary index base change
 
 For example:

diff --git a/docs/caveats/list.md b/docs/caveats/list.md
@@ -0,0 +1,10 @@
+
+R's list is actually a name-value pair container. When there is a need for it, we use python's dict instead, since python's list doesn't support names.
+
+For example:
+```python
+>>> names({'a':1}, 'x')
+{'x': 1}
+```
+
+We have `base.c()` to mimic `c()` in R, which will concatenate and flatten anything passed into it. Unlike `list()` in python, it accepts multiple arguments. So that you can do `c(1,2,3)`, but you cannot do `list(1,2,3)` in python.
diff --git a/docs/caveats/nested_data_frames.md b/docs/caveats/nested_data_frames.md
@@ -0,0 +1,18 @@
+
+pandas DataFrame doesn't support nested data frames. However, some R packages do, especially `tidyr`.
+
+Here we use fake nested data frames:
+
+```python
+>>> df = tibble(x=1, y=tibble(a=2, b=3))
+>>> df
+        x     y$a     y$b
+  <int64> <int64> <int64>
+0       1       2       3
+```
+
+Now `df` is a fake nested data frame, with an inner data frame as column `y` in `df`.
+
+!!! Warning
+
+    For APIs from `tidyr` that tidy nested data frames, this is fully supported, but pay attention when you operate on them the pandas way. For other APIs, this feature is still experimental.
diff --git a/docs/caveats/ptypes.md b/docs/caveats/ptypes.md
@@ -0,0 +1,2 @@
+
+Unlike some APIs from `tidyverse` packages that use a data frame as a `ptypes` template, here we use dtypes directly or a dict with name-dtype pairs for the columns.
diff --git a/docs/caveats/tibble_vs_dataframe.md b/docs/caveats/tibble_vs_dataframe.md
@@ -0,0 +1,6 @@
+
+`datar` introduced `tibble` package as well.
+
+However, unlike in R, where `tidyverse`'s `tibble` is a different class from base R's `data.frame`, the data frame created by `datar.tibble.tibble()` and family is actually a pandas `DataFrame`. It's just a wrapper around the constructor.
+
+So you can do anything you do using pandas API after creation.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@

		Unlike some APIs from `tidyverse` packages that use a data frame as a `ptypes` template, here we use dtypes directly or a dict with name-dtype pairs for the columns.