Skip to content

Commit

Permalink
0.3.2 (#34)
Browse files Browse the repository at this point in the history
* Fix #28

* Fix #26, #29, #31

* Fix #38

* Add `str_dtype` argument to `as_character()` to partially fix #36

* 0.3.2

* Delete grouped2.py
  • Loading branch information
pwwang authored Jul 13, 2021
1 parent 0b68a31 commit c95a4b4
Show file tree
Hide file tree
Showing 19 changed files with 243 additions and 159 deletions.
2 changes: 1 addition & 1 deletion datar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
from .core import _frame_format_patch
from .core.defaults import f

__version__ = "0.3.1"
__version__ = "0.3.2"
8 changes: 4 additions & 4 deletions datar/base/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pipda import register_func

from ..core.contexts import Context
from ..core.types import IntOrIter, StringOrIter, is_scalar, is_null
from ..core.types import Dtype, IntOrIter, StringOrIter, is_scalar, is_null
from ..core.utils import (
arg_match,
get_option,
Expand All @@ -30,13 +30,14 @@


@register_func(None, context=Context.EVAL)
def as_character(x: Any, _na: Any = NA) -> StringOrIter:
def as_character(x: Any, str_dtype: Dtype = str, _na: Any = NA) -> StringOrIter:
"""Convert an object or elements of an iterable into string
Aliases `as_str` and `as_string`
Args:
x: The object
str_dtype: The string dtype to convert to
_na: How NAs should be casted. Specify NA will keep them unchanged.
But the dtype will be object then.
Expand All @@ -45,8 +46,7 @@ def as_character(x: Any, _na: Any = NA) -> StringOrIter:
When x is iterable, convert elements of it into strings
Otherwise, convert x to string.
"""
return _as_type(x, str, na=_na)

return _as_type(x, str_dtype, na=_na)

as_str = as_string = as_character

Expand Down
40 changes: 34 additions & 6 deletions datar/core/_frame_format_patch.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,36 @@
# BSD 3-Clause License

# Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc.
# and PyData Development Team
# All rights reserved.

# Copyright (c) 2011-2021, Open source contributors.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:

# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.

# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.

# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""Monkey-patch data frame format to
1. add dtypes next to column names when printing
2. collapse data frames when they are elements of a parent data frame.
Expand Down Expand Up @@ -37,12 +70,7 @@

from .options import add_option

# pylint: disable=c-extension-no-member
# pylint: disable=invalid-name
# pylint: disable=too-many-branches
# pylint: disable=too-many-statements
# pylint: disable=consider-using-enumerate
# pylint: disable=too-many-nested-blocks
# pylint: skip-file

# TODO: patch more formatters

Expand Down
33 changes: 21 additions & 12 deletions datar/core/operator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,15 @@
from .exceptions import DataUnrecyclable
from .types import BoolOrIter

class DatarOperatorMeta(type):
    """Allow attributes with '_op_' to pass for operator functions"""

    def __getattr__(cls, name: str) -> Any:
        """Fallback lookup for class attributes not found the regular way.

        Args:
            name: The name of the attribute being looked up on the class

        Returns:
            True if name starts with '_op_', signalling that the operator
            function is available.

        Raises:
            AttributeError: If name does not start with '_op_'.
        """
        if name.startswith('_op_'):
            return True
        # `type` defines no `__getattr__`, so delegating to
        # `super().__getattr__(name)` would only fail with a misleading
        # message about the 'super' object. Raise the regular error instead.
        raise AttributeError(
            f"type object {cls.__name__!r} has no attribute {name!r}"
        )

@register_operator
class DatarOperator(Operator):
Expand All @@ -30,19 +39,19 @@ def _arithmetize2(self, left: Any, right: Any, op: str) -> Any:
left, right = _recycle_left_right(left, right)
return op_func(left, right)

def invert(self, operand: Any) -> Any:
def _op_invert(self, operand: Any) -> Any:
"""Interpretation for ~x"""
if isinstance(operand, (slice, str, list, tuple, Collection)):
if isinstance(operand, (slice, str, list, tuple)):
return Inverted(operand)
return self._arithmetize1(operand, "invert")

def neg(self, operand: Any) -> Any:
def _op_neg(self, operand: Any) -> Any:
"""Interpretation for -x"""
if isinstance(operand, (slice, list)):
return Negated(operand)
return self._arithmetize1(operand, "neg")

def and_(self, left: Any, right: Any) -> Any:
def _op_and_(self, left: Any, right: Any) -> Any:
"""Mimic the & operator in R.
This has to have Expression objects to be involved to work
Expand All @@ -63,7 +72,7 @@ def and_(self, left: Any, right: Any) -> Any:
right = Series(right).fillna(False)
return left & right

def or_(self, left: Any, right: Any) -> Any:
def _op_or_(self, left: Any, right: Any) -> Any:
"""Mimic the | operator in R.
This has to have Expression objects to be involved to work
Expand All @@ -84,9 +93,9 @@ def or_(self, left: Any, right: Any) -> Any:
return left | right

# pylint: disable=invalid-name
def ne(self, left: Any, right: Any) -> BoolOrIter:
def _op_ne(self, left: Any, right: Any) -> BoolOrIter:
"""Interpret for left != right"""
out = self.eq(left, right)
out = self._op_eq(left, right)
if isinstance(out, (numpy.ndarray, Series)):
neout = ~out
# neout[pandas.isna(out)] = numpy.nan
Expand All @@ -96,11 +105,11 @@ def ne(self, left: Any, right: Any) -> BoolOrIter:

def __getattr__(self, name: str) -> Any:
"""Other operators"""
if not hasattr(operator, name):
raise AttributeError
attr = partial(self._arithmetize2, op=name)
attr.__qualname__ = self._arithmetize2.__qualname__
return attr
if name.startswith('_op_'):
attr = partial(self._arithmetize2, op=name[4:])
attr.__qualname__ = self._arithmetize2.__qualname__
return attr
return super().__getattr__(name)


def _recycle_left_right(left: Any, right: Any) -> Tuple:
Expand Down
5 changes: 5 additions & 0 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 0.3.2
- Adopt `pipda` v0.4.1 to fix `getattr()` failure for operator-connected expressions (#38)
- Add `str_dtype` argument to `as_character()` to partially fix #36
- Update license in `core._frame_format_patch` (#28)

## 0.3.1
- Adopt `pipda` v0.4.0
- Change argument `_dtypes` to `dtypes_` for tibble-families
Expand Down
22 changes: 22 additions & 0 deletions docs/caveats/NAs.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@

- dtype

`NA` in datar is set to `numpy.nan`, which is a float. This causes problems for data of other dtypes, because setting a value to NA (float) in an array with another dtype is not compatible. Unlike R, python does not have missing value types for other dtypes.

pandas has introduced its own `NA` and some `NA` compatible dtypes. However, `numpy` is still not aware of it, which causes problems for internal computations.

- string

When initializing a string array intentionally: `numpy.array(['a', NA])`, the `NA` will be converted to a string `'nan'`. That may not be what we want sometimes. To avoid that, use `None` or `NULL` instead:

```python
>>> numpy.array(['a', None])
array(['a', None], dtype=object)
```

Just pay attention that the dtype falls back to object.


- `NaN`

Since `NA` is already a float, `NaN` here is equivalent to `NA`.
17 changes: 17 additions & 0 deletions docs/caveats/df_index_colname.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@

Most APIs from tidyverse packages ignore/reset the index (row names) of data frames, so do the APIs from `datar`. So when selecting rows, row indices are always used. With most APIs, the indices of the data frames are dropped, so they are actually ranging from 0 to `nrow(df) - 1`.

!!! Note

when using 1-based indexing (default), 1 selects the first row. Even though the first row shows index 0 when it's printed.

No `MultiIndex` indices/column names are supported for the APIs to select or manipulate data frames and the data frames generated by the APIs will not have `MultiIndex` indices/column names. However, since it's still pandas DataFrame, you can always do it in pandas way:

```python
df = tibble(x=1, y=2)
df2 = df >> mutate(z=f.x+f.y)
# pandas way to select
df2.iloc[0, 2] # 3
# add multiindex to it:
df.columns = pd.MultiIndex.from_product([df.columns, ['C']])
```
9 changes: 9 additions & 0 deletions docs/caveats/grouped.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@

`datar` doesn't use `pandas`' `DataFrameGroupBy`/`SeriesGroupBy` classes. Instead, we have our own `DataFrameGroupBy` class, which is actually a subclass of `DataFrame`, with 3 extra properties: `_group_data`, `_group_vars` and `_group_drop`, carrying the grouping data, grouping variables/columns and whether to drop the non-observable values. This is very similar to `grouped_df` from `dplyr`.

The reasons that we implement this are:

1. Pandas DataFrameGroupBy cannot handle multiple categorical columns as
groupby variables with non-observable values
2. It is very hard to retrieve group indices and data when doing apply
3. NAs unmatched in grouping variables
80 changes: 80 additions & 0 deletions docs/caveats/in.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
`%in%` in R is a shortcut for `is.element()` to test if the elements are in a container.

```r
r$> c(1,3,5) %in% 1:4
[1] TRUE TRUE FALSE

r$> is.element(c(1,3,5), 1:4)
[1] TRUE TRUE FALSE
```

However, `in` in python acts differently:

```python
>>> import numpy as np
>>>
>>> arr = np.array([1,2,3,4])
>>> elts = np.array([1,3,5])
>>>
>>> elts in arr
/.../bin/bpython:1: DeprecationWarning: elementwise comparison failed; this will raise an error in the future.
#!/.../bin/python
False
>>> [1,2] in [1,2,3]
False
```

It simply tests if the element on the left side of `in` is equal to any of the elements in the right side. Regardless of whether the element on the left side is scalar or not.

Yes, we can redefine this behavior by writing our own `__contains__()` method for the object on the right side. For example:

```python
>>> class MyList(list):
... def __contains__(self, key):
... # Just an example to let it return the reversed result
... return not super().__contains__(key)
...
>>> 1 in MyList([1,2,3])
False
>>> 4 in MyList([1,2,3])
True
```

But the problem is that the result of `__contains__()` is forced to be a scalar bool by python. In this sense, we cannot let `x in y` be evaluated as a bool array or even a pipda `Expression` object.
```python
>>> class MyList(list):
... def __contains__(self, key):
... # Just an example
... return [True, False, True] # logically True in python
...
>>> 1 in MyList([1,2,3])
True
>>> 4 in MyList([1,2,3])
True
```

So instead, we ported `is.element()` from R:

```python
>>> import numpy as np
>>> from datar.base import is_element
>>>
>>> arr = np.array([1,2,3,4])
>>> elts = np.array([1,3,5])
>>>
>>> is_element(elts, arr)
array([ True, True, False])
```

Also, as @rleyvasal pointed out in https://github.com/pwwang/datar/issues/31#issuecomment-877499212,

if the left element is a pandas `Series`, you can use its `isin()` method directly:
```python
>>> import pandas as pd
>>> pd.Series(elts).isin(arr)
0 True
1 True
2 False
dtype: bool
```
2 changes: 2 additions & 0 deletions docs/indexing.md → docs/caveats/indexing.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ In `R`, negative indexes mean removal. However, here negative indexes are still
selection, as `-1` for the last column, `-2` for the second last, etc. It is
the same for both 0-based and 1-based indexing.

If you want to do negative selection, use tilde `~` instead of `-`.

## Temporary index base change

For example:
Expand Down
10 changes: 10 additions & 0 deletions docs/caveats/list.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@

R's list is actually a name-value pair container. When there is a need for it, we use python's dict instead, since python's list doesn't support names.

For example:
```python
>>> names({'a':1}, 'x')
{'x': 1}
```

We have `base.c()` to mimic `c()` in R, which will concatenate and flatten anything passed into it. Unlike `list()` in python, it accepts multiple arguments. So that you can do `c(1,2,3)`, but you cannot do `list(1,2,3)` in python.
18 changes: 18 additions & 0 deletions docs/caveats/nested_data_frames.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@

pandas DataFrame doesn't support nested data frames. However, some R packages do, especially `tidyr`.

Here we use fake nested data frames:

```python
>>> df = tibble(x=1, y=tibble(a=2, b=3))
>>> df
x y$a y$b
<int64> <int64> <int64>
0 1 2 3
```

Now `df` is a fake nested data frame, with an inner data frame as column `y` in `df`.

!!! Warning

For APIs from `tidyr` that tidy nested data frames, this is fully supported, but just pay attention when you operate on it in the pandas way. For other APIs, this feature is still experimental.
2 changes: 2 additions & 0 deletions docs/caveats/ptypes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

Unlike some APIs from `tidyverse` packages that use a data frame as a `ptypes` template, here we use dtypes directly or a dict with name-dtype pairs for the columns.
6 changes: 6 additions & 0 deletions docs/caveats/tibble_vs_dataframe.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@

`datar` introduced `tibble` package as well.

However, unlike in R, where `tidyverse`'s `tibble` is a different class than the `data.frame` from base R, the data frame created by `datar.tibble.tibble()` and family is actually a pandas `DataFrame`. It's just a wrapper around the constructor.

So you can do anything you do using pandas API after creation.
Loading

0 comments on commit c95a4b4

Please sign in to comment.