diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 01b7ff61..8adae48d 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -37,15 +37,15 @@ jobs: - name: Deploy docs run: | mkdocs gh-deploy --clean --force - if: success() && github.ref == 'refs/heads/master' + # if: success() && github.ref == 'refs/heads/master' fix-index: needs: docs runs-on: ubuntu-latest - if: github.ref == 'refs/heads/master' + # if: github.ref == 'refs/heads/master' strategy: matrix: - python-version: [3.8] + python-version: [3.9] steps: - uses: actions/checkout@v2 with: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d41295df..7e71eca1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -55,7 +55,7 @@ repos: files: ^tests/.+$|^datar/.+$ - id: notebooks name: Executing all notebooks - entry: jupyter nbconvert --to notebook --execute + entry: jupyter nbconvert --output-dir /tmp --to notebook --execute language: system pass_filenames: true files: ^docs/.+\.ipynb$ diff --git a/datar/__init__.py b/datar/__init__.py index 84410305..3355ef9f 100644 --- a/datar/__init__.py +++ b/datar/__init__.py @@ -7,7 +7,7 @@ from .core.defaults import f __all__ = ('f', 'get_versions') -__version__ = "0.5.0" +__version__ = "0.5.1" def get_versions( prnt: bool = True diff --git a/datar/base/__init__.py b/datar/base/__init__.py index 2f92c3a7..2e4b0c67 100644 --- a/datar/base/__init__.py +++ b/datar/base/__init__.py @@ -40,7 +40,7 @@ from .complex import arg, as_complex, conj, im, is_complex, mod, re as re_ from .constants import LETTERS, letters, month_abb, month_name, pi from .cum import cummax, cummin, cumprod, cumsum -from .date import as_date +from .date import as_date, as_pd_date from .factor import ( as_categorical, as_factor, @@ -125,6 +125,7 @@ startswith, endswith, strtoi, + trimws, chartr, tolower, toupper, diff --git a/datar/base/date.py b/datar/base/date.py index 1b62ae53..15c1f3c0 100644 --- a/datar/base/date.py +++ 
b/datar/base/date.py @@ -2,16 +2,21 @@ import datetime import functools -from typing import Any, Union, List, Iterable +from typing import TYPE_CHECKING, Any, Iterable, List, Union import numpy -from pandas import Series, DataFrame +import pandas +from pandas import DataFrame, Series from pipda import register_func -from ..core.types import IntType, is_scalar_int, is_scalar from ..core.contexts import Context +from ..core.types import IntType, is_scalar, is_scalar_int from .na import NA +if TYPE_CHECKING: # pragma: no cover + # pylint: disable=ungrouped-imports + from pandas import DatetimeIndex, Timestamp + # pylint: disable=invalid-name # pylint: disable=redefined-builtin @@ -144,7 +149,7 @@ def as_date( optional: bool = False, tz: Union[IntType, datetime.timedelta] = 0, origin: Any = None, -) -> Iterable[datetime.date]: +) -> Union[Series, "Timestamp", "DatetimeIndex"]: """Convert an object to a datetime.date object See: https://rdrr.io/r/base/as.Date.html @@ -155,6 +160,11 @@ def as_date( the first non-NA element, and give an error if none works. Otherwise, the processing is via strptime try_formats: vector of format strings to try if format is not specified. + Default formats to try: + "%Y-%m-%d" + "%Y/%m/%d" + "%Y-%m-%d %H:%M:%S" + "%Y/%m/%d %H:%M:%S" optional: indicating to return NA (instead of signalling an error) if the format guessing does not succeed. 
origin: a datetime.date/datetime object, or something which can be @@ -168,7 +178,7 @@ def as_date( if not isinstance(x, Series): x = Series([x]) if is_scalar(x) else Series(x) - return x.transform( + out = x.transform( _as_date_dummy, format=format, try_formats=try_formats, @@ -176,3 +186,25 @@ def as_date( tz=tz, origin=origin, ) + return pandas.to_datetime(out) + +@register_func(None, context=Context.EVAL) +def as_pd_date( + arg: Union[int, str, float, datetime.datetime, Iterable], + *args: Any, + **kwargs: Any, +) -> Union[Series, "Timestamp", "DatetimeIndex"]: + """Alias of pandas.to_datetime(), but registered as a function + so that it can be used in verbs. + + See https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html + + Args: + arg: The argument to be converted to datetime + *args: and + **kwargs: Other arguments passing to `pandas.to_datetime()` + + Returns: + Converted datetime + """ + return pandas.to_datetime(arg, *args, **kwargs) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 09f7cf3f..4d6ac031 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -1,3 +1,9 @@ +## 0.5.1 +- Add documentation about "blind" environment (#45, #54, #55) +- Change `base.as_date()` to return pandas datetime types instead of python datetime types (#56) +- Add `base.as_pd_date()` to be an alias of `pandas.to_datetime()` (#56) +- Expose `trimws` to `datar.all` (#58) + ## 0.5.0 Added: diff --git a/docs/caveats/blind.md b/docs/caveats/blind.md new file mode 100644 index 00000000..e0886c8f --- /dev/null +++ b/docs/caveats/blind.md @@ -0,0 +1,89 @@ + +Related issues: [GH#45][1] [GH#54][2] [GH#55][3] + +## Why?
+To make `datar` work in both regular calling and piping calling for verbs: + +```python +# regular calling +num_rows = nrow(df) + +# piping calling +num_rows = df >> nrow() +``` + +we need the source code available to detect the AST node, especially the piping sign (`ast.BinOp(op=ast.RShift)`), so we can preserve the slot of the first argument for the data to pipe in. + +However, the source code is not always available at runtime (e.g. raw python REPL, `exec()`), or there could be some environment that modifies the AST tree (`assert` from `pytest`). We call those environments "blind". + +A quick example to simulate this situation: + +```python +>>> from datar.all import * +>>> df = tibble(a="a") +>>> df >> mutate(A=f.a.str.upper()) + a A + +0 a A +>>> source = "df >> mutate(A=f.a.str.upper())" +>>> exec(source) +/path/to/site-packages/pipda/utils.py:161: UserWarning: Failed to fet +ch the node calling the function, call it with the original function. + warnings.warn( +Traceback (most recent call last): + ... +``` + +## Solutions + +- Try switching to a REPL that maintains the source code (`ipython` instead of raw python REPL, for example) +- Save the code into a file, and run that script with the python interpreter +- Stick with the regular calling: + + ```python + >>> source = "df2 = mutate(df, A=f.a.str.upper())" + >>> exec(source) # you still get a warning, but the code works + /home/pwwang/miniconda3/lib/python3.9/site-packages/pipda/utils.py:161: UserWarning: Failed to fet + ch the node calling the function, call it with the original function. + warnings.warn( + >>> df2 + a A + + 0 a A + ``` + +- Stick with the piping calling: + + ```python + >>> from pipda import options + >>> options.assume_all_piping = True + >>> source = "df2 = df >> mutate(A=f.a.str.upper())" + >>> exec(source) # no warnings, we know we don't need the AST node anymore + >>> df2 + a A + + 0 a A + ``` + + !!!
Note + + Whichever calling mode you are sticking with, you have to stick with it for all verbs, even for those simple ones (e.g. `dim()`, `nrow()`, etc.) + + !!! Tip + + If you wonder whether a python function is registered as a verb or a plain function: + + ```python + >>> mutate.__pipda__ + 'Verb' + >>> nrow.__pipda__ + 'Verb' + >>> as_integer.__pipda__ + 'PlainFunction' + ``` + + + +[1]: https://github.com/pwwang/datar/issues/45 +[2]: https://github.com/pwwang/datar/issues/54 +[3]: https://github.com/pwwang/datar/issues/55 diff --git a/docs/reference-maps/base.md b/docs/reference-maps/base.md index b2c1107e..394d9c7d 100644 --- a/docs/reference-maps/base.md +++ b/docs/reference-maps/base.md @@ -109,6 +109,7 @@ |API|Description|Notebook example| |---|---|---:| |[`as_date`][41]|Cast data to date|[:material-notebook:][4]| +|[**`as_pd_date`**][150]|Alias of `pandas.to_datetime()`|| ### Factor data @@ -220,6 +221,7 @@ |[`chartr`][133]|Replace characters in strings|| |[`tolower`][134]|Transform strings to lower case|| |[`toupper`][135]|Transform strings to upper case|| +|[`trimws`][149]|Remove leading and/or trailing whitespace from character strings.|| ### Table @@ -430,3 +432,5 @@ [146]: ../../api/datar.base.table/#datar.base.table.tabulate [147]: ../../api/datar.base.verbs/#datar.base.verbs.append [148]: ../../api/datar.base.verbs/#datar.base.verbs.proportions +[149]: ../../api/datar.base.string/#datar.base.string.trimws +[150]: ../../api/datar.base.date/#datar.base.date.as_pd_date diff --git a/docs/style.css b/docs/style.css index cece3bfb..099b7766 100644 --- a/docs/style.css +++ b/docs/style.css @@ -1,4 +1,7 @@ +.md-main__inner.md-grid { + max-width: 80%; +} .md-typeset .admonition, .md-typeset details { font-size: .7rem !important; diff --git a/mkdocs.yml b/mkdocs.yml index f03cd40a..c07df8e7 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -58,6 +58,7 @@ nav: 'Grouped/rowwise data frame': 'caveats/grouped.md' 'NAs': 'caveats/NAs.md' 'in vs %in%': 'caveats/in.md' +
'"blind" environment': 'caveats/blind.md' - 'Datasets': 'datasets.md' - 'Advanced usage': 'advanced.md' - 'Examples': @@ -85,6 +86,11 @@ nav: 'fill': 'notebooks/fill.ipynb' 'filter': 'notebooks/filter.ipynb' 'filter-joins': 'notebooks/filter-joins.ipynb' + 'forcats_fct_multi': 'notebooks/forcats_fct_multi.ipynb' + 'forcats_lvl_addrm': 'notebooks/forcats_lvl_addrm.ipynb' + 'forcats_lvl_order': 'notebooks/forcats_lvl_order.ipynb' + 'forcats_lvl_value': 'notebooks/forcats_lvl_value.ipynb' + 'forcats_misc': 'notebooks/forcats_misc.ipynb' 'full_seq': 'notebooks/full_seq.ipynb' 'datar': 'notebooks/datar.ipynb' 'group_by': 'notebooks/group_by.ipynb' diff --git a/pyproject.toml b/pyproject.toml index f57e3c56..1ce94431 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "datar" -version = "0.5.0" +version = "0.5.1" description = "Port of dplyr and other related R packages in python, using pipda." authors = ["pwwang "] readme = "README.md" diff --git a/tests/test_base_date.py b/tests/test_base_date.py index 9a7c6081..4754fa35 100644 --- a/tests/test_base_date.py +++ b/tests/test_base_date.py @@ -1,5 +1,6 @@ import pytest +import pandas from datar.base.date import * from .conftest import assert_iterable_equal @@ -47,7 +48,7 @@ def test_as_date(x, format, try_formats, optional, tz, origin, expected): optional, tz, origin - ), expected) + ), pandas.to_datetime(expected)) def test_as_date_error(): with pytest.raises(ValueError): @@ -57,3 +58,7 @@ def test_as_date_error(): as_date("1990-1-1", "%Y") assert as_date("1990-1-1", "Y", optional=True).isna().all() + +def test_as_pd_date(): + + assert as_pd_date("Sep 16, 2021") == pandas.Timestamp('2021-09-16 00:00:00')