diff --git a/docs/flow/modin/experimental/pandas.rst b/docs/flow/modin/experimental/pandas.rst index 25d9d8f3bcc..d429003c735 100644 --- a/docs/flow/modin/experimental/pandas.rst +++ b/docs/flow/modin/experimental/pandas.rst @@ -13,4 +13,6 @@ Experimental API Reference .. autofunction:: read_csv_glob .. autofunction:: read_custom_text .. autofunction:: read_pickle_distributed +.. autofunction:: read_parquet_glob .. automethod:: modin.pandas.DataFrame.modin::to_pickle_distributed +.. automethod:: modin.pandas.DataFrame.modin::to_parquet_glob diff --git a/docs/supported_apis/dataframe_supported.rst b/docs/supported_apis/dataframe_supported.rst index bcd5a364221..7d29af21265 100644 --- a/docs/supported_apis/dataframe_supported.rst +++ b/docs/supported_apis/dataframe_supported.rst @@ -421,6 +421,8 @@ default to pandas. | | | | ``path`` parameter specifies a directory where one | | | | | file is written per row partition of the Modin | | | | | dataframe. | +| | | | Experimental implementation: | +| | | | DataFrame.modin.to_parquet_glob | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``to_period`` | `to_period`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ diff --git a/docs/supported_apis/io_supported.rst b/docs/supported_apis/io_supported.rst index c29c0792ef6..11f2a99f5e7 100644 --- a/docs/supported_apis/io_supported.rst +++ b/docs/supported_apis/io_supported.rst @@ -46,6 +46,7 @@ default to pandas. | | | passed via ``**kwargs`` are not supported. | | | | ``use_nullable_dtypes`` == True is not supported. 
| | | | | +| | | Experimental implementation: read_parquet_glob | +-------------------+---------------------------------+--------------------------------------------------------+ | `read_json`_ | P | Implemented for ``lines=True`` | +-------------------+---------------------------------+--------------------------------------------------------+ diff --git a/docs/usage_guide/advanced_usage/index.rst b/docs/usage_guide/advanced_usage/index.rst index 66560bebbe8..3151c28cff7 100644 --- a/docs/usage_guide/advanced_usage/index.rst +++ b/docs/usage_guide/advanced_usage/index.rst @@ -30,8 +30,10 @@ Modin also supports these experimental APIs on top of pandas that are under acti - :py:func:`~modin.experimental.pandas.read_csv_glob` -- read multiple files in a directory - :py:func:`~modin.experimental.pandas.read_sql` -- add optional parameters for the database connection - :py:func:`~modin.experimental.pandas.read_custom_text` -- read custom text data from file -- :py:func:`~modin.experimental.pandas.read_pickle_distributed` -- read multiple files in a directory -- :py:meth:`~modin.pandas.DataFrame.modin.to_pickle_distributed` -- write to multiple files in a directory +- :py:func:`~modin.experimental.pandas.read_pickle_distributed` -- read multiple pickle files in a directory +- :py:func:`~modin.experimental.pandas.read_parquet_glob` -- read multiple parquet files in a directory +- :py:meth:`~modin.pandas.DataFrame.modin.to_pickle_distributed` -- write to multiple pickle files in a directory +- :py:meth:`~modin.pandas.DataFrame.modin.to_parquet_glob` -- write to multiple parquet files in a directory DataFrame partitioning API -------------------------- diff --git a/modin/core/execution/dispatching/factories/dispatcher.py b/modin/core/execution/dispatching/factories/dispatcher.py index c4d119921b5..8c4ecfa7ace 100644 --- a/modin/core/execution/dispatching/factories/dispatcher.py +++ b/modin/core/execution/dispatching/factories/dispatcher.py @@ -297,12 +297,12 @@ def 
to_pickle_distributed(cls, *args, **kwargs): return cls.get_factory()._to_pickle_distributed(*args, **kwargs) @classmethod - # @_inherit_docstrings(factories.PandasOnRayFactory._read_parquet_glob) + @_inherit_docstrings(factories.PandasOnRayFactory._read_parquet_glob) def read_parquet_glob(cls, *args, **kwargs): return cls.get_factory()._read_parquet_glob(*args, **kwargs) @classmethod - # @_inherit_docstrings(factories.PandasOnRayFactory._to_parquet_glob) + @_inherit_docstrings(factories.PandasOnRayFactory._to_parquet_glob) def to_parquet_glob(cls, *args, **kwargs): return cls.get_factory()._to_parquet_glob(*args, **kwargs) diff --git a/modin/core/execution/dispatching/factories/factories.py b/modin/core/execution/dispatching/factories/factories.py index 4d8eea02728..91d6273421a 100644 --- a/modin/core/execution/dispatching/factories/factories.py +++ b/modin/core/execution/dispatching/factories/factories.py @@ -517,20 +517,31 @@ def _to_pickle_distributed(cls, *args, **kwargs): return cls.io_cls.to_pickle_distributed(*args, **kwargs) @classmethod - # @_inherit_docstrings(factories.PandasOnRayFactory._read_parquet_glob) - def _read_parquet_glob(cls, *args, **kwargs): - # TODO: add docstring + @doc( + _doc_io_method_raw_template, + source="Parquet files", + params=_doc_io_method_kwargs_params, + ) + def _read_parquet_glob(cls, **kwargs): current_execution = get_current_execution() if current_execution not in supported_executions: raise NotImplementedError( f"`_read_parquet_glob()` is not implemented for {current_execution} execution." ) - return cls.io_cls.read_parquet_glob(*args, **kwargs) + return cls.io_cls.read_parquet_glob(**kwargs) @classmethod - # @_inherit_docstrings(factories.PandasOnRayFactory._to_parquet_glob) def _to_parquet_glob(cls, *args, **kwargs): - # TODO: add docstring + """ + Write query compiler content to several parquet files. + + Parameters + ---------- + *args : args + Arguments to pass to the writer method. 
+ **kwargs : kwargs + Arguments to pass to the writer method. + """ current_execution = get_current_execution() if current_execution not in supported_executions: raise NotImplementedError( diff --git a/modin/experimental/core/io/glob/glob_dispatcher.py b/modin/experimental/core/io/glob/glob_dispatcher.py index 7d003c939e4..5210b6b84e5 100644 --- a/modin/experimental/core/io/glob/glob_dispatcher.py +++ b/modin/experimental/core/io/glob/glob_dispatcher.py @@ -122,7 +122,8 @@ def write(cls, qc, **kwargs): cls.base_write(qc, filepath_or_buffer, **kwargs) return - # just to try + # Be careful, this is a kind of limitation, but at the time of the first implementation, + # getting a name in this way is quite convenient. write_func_name = cls.base_write.__name__ def func(df, **kw): # pragma: no cover diff --git a/modin/experimental/pandas/io.py b/modin/experimental/pandas/io.py index 3f84b908211..3e6fd2de369 100644 --- a/modin/experimental/pandas/io.py +++ b/modin/experimental/pandas/io.py @@ -354,7 +354,7 @@ def to_pickle_distributed( compression: CompressionOptions = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, -): +) -> None: """ Pickle (serialize) object to file. @@ -363,7 +363,7 @@ def to_pickle_distributed( Parameters ---------- - filepath_or_buffer : str, path object or file-like object + filepath_or_buffer : str File path where the pickled object will be stored. compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default: 'infer' A string representing the compression to use in the output file. By @@ -412,8 +412,23 @@ def read_parquet_glob( filesystem=None, filters=None, **kwargs, -): - # TODO: add docstring +) -> DataFrame: # noqa: PR01 + """ + Load a parquet object from the file path, returning a DataFrame. + + This experimental feature provides parallel reading from multiple parquet files which are + defined by glob pattern. 
The files must contain parts of one dataframe, which can be + obtained, for example, by `DataFrame.modin.to_parquet_glob` function. + + Returns + ------- + DataFrame + + Notes + ----- + * Only string type supported for `path` argument. + * The rest of the arguments are the same as for `pandas.read_parquet`. + """ from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher return DataFrame( @@ -434,15 +449,25 @@ def read_parquet_glob( @expanduser_path_arg("path") def to_parquet_glob( self, - path=None, + path, engine="auto", compression="snappy", index=None, partition_cols=None, storage_options: StorageOptions = None, **kwargs, -): - # TODO: add docstring +) -> None: # noqa: PR01 + """ + Write a DataFrame to the binary parquet format. + + This experimental feature provides parallel writing into multiple parquet files which are + defined by glob pattern, otherwise (without glob pattern) default pandas implementation is used. + + Notes + ----- + * Only string type supported for `path` argument. + * The rest of the arguments are the same as for `pandas.DataFrame.to_parquet`. + """ obj = self from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher diff --git a/modin/pandas/accessor.py b/modin/pandas/accessor.py index 65dc54ec5b1..c208d576162 100644 --- a/modin/pandas/accessor.py +++ b/modin/pandas/accessor.py @@ -215,7 +215,7 @@ def to_pickle_distributed( compression: CompressionOptions = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, - ): + ) -> None: """ Pickle (serialize) object to file. @@ -224,7 +224,7 @@ def to_pickle_distributed( Parameters ---------- - filepath_or_buffer : str, path object or file-like object + filepath_or_buffer : str File path where the pickled object will be stored. compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default: 'infer' A string representing the compression to use in the output file. 
By @@ -260,17 +260,37 @@ def to_pickle_distributed( def to_parquet_glob( self, - path=None, + path, engine="auto", compression="snappy", index=None, partition_cols=None, storage_options: StorageOptions = None, **kwargs, - ): - # TODO: add docstring + ) -> None: # noqa: PR01 + """ + Write a DataFrame to the binary parquet format. + + This experimental feature provides parallel writing into multiple parquet files which are + defined by glob pattern, otherwise (without glob pattern) default pandas implementation + is used. + + Returns + ------- + None + + Notes + ----- + * Only string type supported for `path` argument. + * The rest of the arguments are the same as for `pandas.DataFrame.to_parquet`. + """ from modin.experimental.pandas.io import to_parquet_glob + if path is None: + raise NotImplementedError( + "`to_parquet_glob` doesn't support path=None, use `to_parquet` in that case." + ) + to_parquet_glob( self._data, path=path,