diff --git a/.github/workflows/ci-required.yml b/.github/workflows/ci-required.yml index 58b33bbfb22..755199e0ef8 100644 --- a/.github/workflows/ci-required.yml +++ b/.github/workflows/ci-required.yml @@ -66,7 +66,6 @@ jobs: asv_bench/benchmarks/__init__.py asv_bench/benchmarks/io/__init__.py \ asv_bench/benchmarks/scalability/__init__.py \ modin/core/io \ - modin/experimental/core/execution/ray/implementations/pyarrow_on_ray \ modin/pandas/series.py \ modin/core/execution/python \ modin/pandas/dataframe.py \ @@ -90,7 +89,6 @@ jobs: python scripts/doc_checker.py modin/experimental/pandas/io.py \ modin/experimental/pandas/__init__.py - run: python scripts/doc_checker.py modin/core/storage_formats/base - - run: python scripts/doc_checker.py modin/experimental/core/storage_formats/pyarrow - run: python scripts/doc_checker.py modin/core/storage_formats/pandas - run: | python scripts/doc_checker.py \ diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0e0f1944f6c..c51d0b22a98 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -683,36 +683,6 @@ jobs: - run: python -m pytest modin/pandas/test/test_io.py --verbose - uses: ./.github/actions/upload-coverage - test-pyarrow: - needs: [lint-flake8, lint-black-isort] - runs-on: ubuntu-latest - defaults: - run: - shell: bash -l {0} - strategy: - matrix: - python-version: ["3.9"] - env: - MODIN_STORAGE_FORMAT: pyarrow - MODIN_EXPERIMENTAL: "True" - name: test (pyarrow, python ${{matrix.python-version}}) - services: - moto: - image: motoserver/moto - ports: - - 5000:5000 - env: - AWS_ACCESS_KEY_ID: foobar_key - AWS_SECRET_ACCESS_KEY: foobar_secret - steps: - - uses: actions/checkout@v3 - - uses: ./.github/actions/mamba-env - with: - environment-file: environment-dev.yml - python-version: ${{matrix.python-version}} - - run: sudo apt update && sudo apt install -y libhdf5-dev - - run: python -m pytest modin/pandas/test/test_io.py::TestCsv --verbose - test-spreadsheet: needs: [lint-flake8, lint-black-isort] runs-on: ubuntu-latest diff --git a/asv_bench/benchmarks/utils/compatibility.py b/asv_bench/benchmarks/utils/compatibility.py index 2aa27f3d8de..0fa4bf93e68 100644 --- a/asv_bench/benchmarks/utils/compatibility.py +++ b/asv_bench/benchmarks/utils/compatibility.py @@ -47,4 +47,4 @@ assert ASV_USE_IMPL in ("modin", "pandas") assert ASV_DATASET_SIZE in ("big", "small") assert ASV_USE_ENGINE in ("ray", "dask", "python", "native", "unidist") -assert ASV_USE_STORAGE_FORMAT in ("pandas", "hdk", "pyarrow") +assert ASV_USE_STORAGE_FORMAT in ("pandas", "hdk") diff --git a/docs/conf.py b/docs/conf.py index 7993d8cf254..882527b4d56 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -29,7 +29,6 @@ def noop_decorator(*args, **kwargs): for mod_name in ( "cudf", "cupy", - "pyarrow.gandiva", "pyhdk", "pyhdk.hdk", "xgboost", diff --git a/docs/development/architecture.rst b/docs/development/architecture.rst index 4cebbca7345..cdb311d928b 100644 --- a/docs/development/architecture.rst +++ b/docs/development/architecture.rst @@ -56,7 +56,7 @@ For the simplicity the other execution systems - Dask and MPI are omitted and on on a selected storage format and mapping or compiling the Dataframe Algebra DAG to and actual execution sequence. * Storage formats module is responsible for mapping the abstract operation to an actual executor call, e.g. pandas, - PyArrow, custom format. + HDK, custom format. * Orchestration subsystem is responsible for spawning and controlling the actual execution environment for the selected execution. It spawns the actual nodes, fires up the execution environment, e.g. Ray, monitors the state of executors and provides telemetry @@ -228,10 +228,6 @@ documentation page on :doc:`contributing `. - Uses HDK as an engine. - The storage format is `hdk` and the in-memory partition type is a pyarrow Table. When defaulting to pandas, the pandas DataFrame is used. - For more information on the execution path, see the :doc:`HDK on Native ` page. -- :doc:`Pyarrow on Ray ` (experimental) - - Uses the Ray_ execution framework. - - The storage format is `pyarrow` and the in-memory partition type is a pyarrow Table. - - For more information on the execution path, see the :doc:`Pyarrow on Ray ` page. - cuDF on Ray (experimental) - Uses the Ray_ execution framework. - The storage format is `cudf` and the in-memory partition type is a cuDF DataFrame. @@ -252,7 +248,7 @@ following figure illustrates this concept. :align: center Currently, the main in-memory format of each partition is a `pandas DataFrame`_ (:doc:`pandas storage format `). -:doc:`HDK `, :doc:`PyArrow ` +:doc:`HDK ` and cuDF are also supported as experimental in-memory formats in Modin. @@ -333,8 +329,7 @@ details. The documentation covers most modules, with more docs being added every │ │ │ │ └───implementations │ │ │ │ └─── :doc:`hdk_on_native ` │ │ │ ├─── :doc:`storage_formats ` - | │ │ | ├─── :doc:`hdk ` - │ │ │ | └─── :doc:`pyarrow ` + | │ │ | └───:doc:`hdk ` | | | └─── :doc:`io ` │ │ ├─── :doc:`pandas ` │ │ ├─── :doc:`sklearn ` @@ -350,7 +345,6 @@ details. The documentation covers most modules, with more docs being added every └───stress_tests .. _pandas Dataframe: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html -.. _Arrow tables: https://arrow.apache.org/docs/python/generated/pyarrow.Table.html .. _Ray: https://github.com/ray-project/ray .. _Unidist: https://github.com/modin-project/unidist .. _MPI: https://www.mpi-forum.org/ diff --git a/docs/development/index.rst b/docs/development/index.rst index b1e5c3f1212..5e257501857 100644 --- a/docs/development/index.rst +++ b/docs/development/index.rst @@ -12,7 +12,6 @@ Development using_pandas_on_python using_pandas_on_mpi using_hdk - using_pyarrow_on_ray .. meta:: :description lang=en: diff --git a/docs/development/using_pyarrow_on_ray.rst b/docs/development/using_pyarrow_on_ray.rst deleted file mode 100644 index c21da7ec9ae..00000000000 --- a/docs/development/using_pyarrow_on_ray.rst +++ /dev/null @@ -1,4 +0,0 @@ -PyArrow on Ray -============== - -Coming Soon! diff --git a/docs/flow/modin/core/storage_formats/index.rst b/docs/flow/modin/core/storage_formats/index.rst index 833cecc84b0..1d98af0d8dc 100644 --- a/docs/flow/modin/core/storage_formats/index.rst +++ b/docs/flow/modin/core/storage_formats/index.rst @@ -8,9 +8,8 @@ of objects that are stored in the partitions of the selected Core Modin Datafram The base storage format in Modin is pandas. In that format, Modin Dataframe operates with partitions that hold ``pandas.DataFrame`` objects. Pandas is the most natural storage format since high-level DataFrame objects mirror its API, however, Modin's storage formats are not -limited to the objects that conform to pandas API. There are formats that are able to store -``pyarrow.Table`` (:doc:`pyarrow storage format `) or even instances of -SQL-like databases (:doc:`HDK storage format `) +limited to the objects that conform to pandas API. There is format that are able to store +even instances of SQL-like databases (:doc:`HDK storage format `) inside Modin Dataframe's partitions. The storage format + execution engine (Ray, Dask, etc.) form the execution backend. diff --git a/docs/flow/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray.rst b/docs/flow/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray.rst deleted file mode 100644 index de6cb6048ae..00000000000 --- a/docs/flow/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray.rst +++ /dev/null @@ -1,27 +0,0 @@ -:orphan: - -PyArrow-on-Ray Module Description -""""""""""""""""""""""""""""""""" - -High-Level Module Overview -'''''''''''''''''''''''''' - -This module houses experimental functionality with PyArrow storage format and Ray -engine. The biggest difference from core engines is that internally each partition -is represented as ``pyarrow.Table`` put in the ``Ray`` Plasma store. - -Why to Use PyArrow Tables -''''''''''''''''''''''''' - -As it was `mentioned `_ -by the pandas creator, pandas internal architecture is not optimal and sometimes -needs up to ten times more memory than the original dataset size -(note, that pandas rule of thumb: `have 5 to 10 times as much RAM as the size of your -dataset`). In order to fix this issue (or at least to reduce needed memory amount and -needed data copying), ``PyArrow-on-Ray`` module was added. Due to the optimized architecture -of PyArrow Tables, `no additional copies are needed -`_ in some -corner cases, which can significantly improve Modin performance. The downside of this approach -is that PyArrow and pandas do not support the same APIs and some functions/parameters may have -different signatures or output different results, so for now the ``PyArrow-on-Ray`` engine is -under development and marked as experimental. diff --git a/docs/flow/modin/experimental/core/storage_formats/index.rst b/docs/flow/modin/experimental/core/storage_formats/index.rst index 176e499436f..8a5213c1ea8 100644 --- a/docs/flow/modin/experimental/core/storage_formats/index.rst +++ b/docs/flow/modin/experimental/core/storage_formats/index.rst @@ -7,11 +7,9 @@ Experimental storage formats and provides a limited set of functionality: * :doc:`hdk ` -* :doc:`pyarrow ` .. toctree:: :hidden: hdk/index - pyarrow/index diff --git a/docs/flow/modin/experimental/core/storage_formats/pyarrow/index.rst b/docs/flow/modin/experimental/core/storage_formats/pyarrow/index.rst deleted file mode 100644 index ba4ff32ae61..00000000000 --- a/docs/flow/modin/experimental/core/storage_formats/pyarrow/index.rst +++ /dev/null @@ -1,27 +0,0 @@ -PyArrow storage format -"""""""""""""""""""""" - -.. toctree:: - :hidden: - - query_compiler - parsers - -In general, PyArrow storage formats follow the flow of the pandas ones: query compiler contains an instance of Modin Dataframe, -which is internally split into partitions. The main difference is that partitions contain PyArrow tables, -instead of ``pandas.DataFrame``-s like with :doc:`pandas storage format `. To learn more about this approach please -visit :doc:`PyArrowOnRay execution ` section. - - -High-Level Module Overview -'''''''''''''''''''''''''' - -This module houses submodules which are responsible for communication between -the query compiler level and execution implementation level for PyArrow storage format: - -- :doc:`Query compiler ` is responsible for compiling efficient queries for :doc:`PyarrowOnRayDataframe `. -- :doc:`Parsers ` are responsible for parsing data on workers during IO operations. - -.. note:: - Currently the only one available PyArrow storage format factory is ``PyarrowOnRay`` which works - in :doc:`experimental mode ` only. diff --git a/docs/flow/modin/experimental/core/storage_formats/pyarrow/parsers.rst b/docs/flow/modin/experimental/core/storage_formats/pyarrow/parsers.rst deleted file mode 100644 index 62d4af8c74b..00000000000 --- a/docs/flow/modin/experimental/core/storage_formats/pyarrow/parsers.rst +++ /dev/null @@ -1,15 +0,0 @@ -Experimental PyArrow Parsers Module Description -""""""""""""""""""""""""""""""""""""""""""""""" - -This module houses parser classes that are responsible for data parsing on the workers for the PyArrow storage format. -Parsers for PyArrow storage formats follow an interface of :doc:`pandas format parsers `: -parser class of every file format implements ``parse`` method, which parses the specified part -of the file and builds PyArrow tables from the parsed data, based on the specified chunk size and number of splits. -The resulted PyArrow tables will be used as a partitions payload in the :py:class:`~modin.experimental.core.execution.ray.implementations.pyarrow_on_ray.dataframe.dataframe.PyarrowOnRayDataframe`. - -Public API -'''''''''' - -.. automodule:: modin.experimental.core.storage_formats.pyarrow.parsers - :members: - diff --git a/docs/flow/modin/experimental/core/storage_formats/pyarrow/query_compiler.rst b/docs/flow/modin/experimental/core/storage_formats/pyarrow/query_compiler.rst deleted file mode 100644 index e3c7b72ca36..00000000000 --- a/docs/flow/modin/experimental/core/storage_formats/pyarrow/query_compiler.rst +++ /dev/null @@ -1,21 +0,0 @@ -PyarrowQueryCompiler -"""""""""""""""""""" - -:py:class:`~modin.experimental.core.storage_formats.pyarrow.query_compiler.PyarrowQueryCompiler` is responsible for compiling efficient -Dataframe algebra queries for the :doc:`PyarrowOnRayDataframe `, -the frames which are backed by ``pyarrow.Table`` objects. - -Each :py:class:`~modin.experimental.core.storage_formats.pyarrow.query_compiler.PyarrowQueryCompiler` contains an instance of -:py:class:`~modin.experimental.core.execution.ray.implementations.pyarrow_on_ray.dataframe.dataframe.PyarrowOnRayDataframe` which it queries to get the result. - -Public API -'''''''''' - -:py:class:`~modin.experimental.core.storage_formats.pyarrow.query_compiler.PyarrowQueryCompiler` implements common query compilers API -defined by the :py:class:`~modin.core.storage_formats.base.query_compiler.BaseQueryCompiler`. Most functionalities -are inherited from :py:class:`~modin.core.storage_formats.pandas.query_compiler.PandasQueryCompiler`, in the following -section only overridden methods are presented. - -.. autoclass:: modin.experimental.core.storage_formats.pyarrow.query_compiler.PyarrowQueryCompiler - :members: - :show-inheritance: diff --git a/modin/config/envvars.py b/modin/config/envvars.py index 4dabbc20f8a..5ab97abda9b 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -266,7 +266,7 @@ class StorageFormat(EnvironmentVariable, type=str): varname = "MODIN_STORAGE_FORMAT" default = "Pandas" - choices = ("Pandas", "Hdk", "Pyarrow", "Cudf") + choices = ("Pandas", "Hdk", "Cudf") class IsExperimental(EnvironmentVariable, type=bool): diff --git a/modin/core/execution/dispatching/factories/factories.py b/modin/core/execution/dispatching/factories/factories.py index d4e2567099f..f1bc05539c2 100644 --- a/modin/core/execution/dispatching/factories/factories.py +++ b/modin/core/execution/dispatching/factories/factories.py @@ -570,21 +570,6 @@ def prepare(cls): # that have little coverage of implemented functionality or are not stable enough. -@doc(_doc_factory_class, execution_name="experimental PyarrowOnRay") -class ExperimentalPyarrowOnRayFactory(BaseFactory): # pragma: no cover - @classmethod - @doc(_doc_factory_prepare_method, io_module_name="experimental ``PyarrowOnRayIO``") - def prepare(cls): - from modin.experimental.core.execution.ray.implementations.pyarrow_on_ray.io import ( - PyarrowOnRayIO, - ) - - if not IsExperimental.get(): - raise ValueError("'PyarrowOnRay' only works in experimental mode.") - - cls.io_cls = PyarrowOnRayIO - - @doc(_doc_factory_class, execution_name="experimental HdkOnNative") class ExperimentalHdkOnNativeFactory(BaseFactory): @classmethod diff --git a/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/__init__.py b/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/__init__.py deleted file mode 100644 index 16805cc615c..00000000000 --- a/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Experimental functionality related to Ray execution engine and optimized for PyArrow storage format.""" diff --git a/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/dataframe/__init__.py b/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/dataframe/__init__.py deleted file mode 100644 index ec39f99a387..00000000000 --- a/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/dataframe/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Experimental Base Modin Dataframe class optimized for PyArrow on Ray execution.""" diff --git a/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/dataframe/dataframe.py b/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/dataframe/dataframe.py deleted file mode 100644 index 5c139e477ad..00000000000 --- a/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/dataframe/dataframe.py +++ /dev/null @@ -1,74 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -""" -Module contains class ``PyarrowOnRayDataframe``. - -``PyarrowOnRayDataframe`` is a dataframe class with PyArrow storage format and Ray engine. -""" - -from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe - -from ..partitioning.partition_manager import PyarrowOnRayDataframePartitionManager - - -class PyarrowOnRayDataframe(PandasDataframe): - """ - Class for dataframes with PyArrow storage format and Ray engine. - - ``PyarrowOnRayDataframe`` implements interfaces specific for PyArrow and Ray, - other functionality is inherited from the ``PandasDataframe`` class. - - Parameters - ---------- - partitions : np.ndarray - A 2D NumPy array of partitions. - index : sequence - The index for the dataframe. Converted to a ``pandas.Index``. - columns : sequence - The columns object for the dataframe. Converted to a ``pandas.Index``. - row_lengths : list, optional - The length of each partition in the rows. The "height" of - each of the block partitions. Is computed if not provided. - column_widths : list, optional - The width of each partition in the columns. The "width" of - each of the block partitions. Is computed if not provided. - dtypes : pandas.Series, optional - The data types for the dataframe columns. - """ - - _partition_mgr_cls = PyarrowOnRayDataframePartitionManager - - def synchronize_labels(self, axis=None): - """ - Synchronize labels by applying the index object (Index or Columns) to the partitions lazily. - - Parameters - ---------- - axis : {0, 1}, optional - Parameter is deprecated and affects nothing. - """ - self._filter_empties() - - def to_pandas(self): - """ - Convert frame object to a ``pandas.DataFrame``. - - Returns - ------- - pandas.DataFrame - """ - df = super(PyarrowOnRayDataframe, self).to_pandas() - df.index = self.index - df.columns = self.columns - return df diff --git a/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/io/__init__.py b/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/io/__init__.py deleted file mode 100644 index da3f745ec7a..00000000000 --- a/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/io/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Experimental Base IO classes optimized for PyArrow on Ray execution.""" - -from .io import PyarrowOnRayIO - -__all__ = ["PyarrowOnRayIO"] diff --git a/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/io/io.py b/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/io/io.py deleted file mode 100644 index 8dd2df0756c..00000000000 --- a/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/io/io.py +++ /dev/null @@ -1,50 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Module for housing IO classes with PyArrow storage format and Ray engine.""" - -from modin.core.execution.ray.common import RayWrapper -from modin.core.execution.ray.generic.io import RayIO -from modin.core.io import CSVDispatcher -from modin.experimental.core.execution.ray.implementations.pyarrow_on_ray.dataframe.dataframe import ( - PyarrowOnRayDataframe, -) -from modin.experimental.core.execution.ray.implementations.pyarrow_on_ray.partitioning.partition import ( - PyarrowOnRayDataframePartition, -) -from modin.experimental.core.storage_formats.pyarrow import ( - PyarrowCSVParser, - PyarrowQueryCompiler, -) - - -class PyarrowOnRayCSVDispatcher(RayWrapper, PyarrowCSVParser, CSVDispatcher): - """Class handles utils for reading `.csv` files with PyArrow storage format and Ray engine.""" - - frame_cls = PyarrowOnRayDataframe - frame_partition_cls = PyarrowOnRayDataframePartition - query_compiler_cls = PyarrowQueryCompiler - - -class PyarrowOnRayIO(RayIO): - """Class for storing IO functions operated on PyArrow storage format and Ray engine.""" - - frame_cls = PyarrowOnRayDataframe - frame_partition_cls = PyarrowOnRayDataframePartition - query_compiler_cls = PyarrowQueryCompiler - csv_reader = PyarrowOnRayCSVDispatcher - - read_parquet_remote_task = None - read_hdf_remote_task = None - read_feather_remote_task = None - read_sql_remote_task = None diff --git a/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/partitioning/__init__.py b/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/partitioning/__init__.py deleted file mode 100644 index 16805cc615c..00000000000 --- a/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/partitioning/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Experimental functionality related to Ray execution engine and optimized for PyArrow storage format.""" diff --git a/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/partitioning/axis_partition.py b/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/partitioning/axis_partition.py deleted file mode 100644 index 2c8ba0304b8..00000000000 --- a/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/partitioning/axis_partition.py +++ /dev/null @@ -1,305 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""The module defines interface for an axis partition with PyArrow storage format and Ray engine.""" - -import pyarrow -import ray - -from modin.core.dataframe.pandas.partitioning.axis_partition import ( - BaseDataframeAxisPartition, -) - -from .partition import PyarrowOnRayDataframePartition - - -class PyarrowOnRayDataframeAxisPartition(BaseDataframeAxisPartition): - """ - Class defines axis partition interface with PyArrow storage format and Ray engine. - - Inherits functionality from ``BaseDataframeAxisPartition`` class. - - Parameters - ---------- - list_of_blocks : list - List with partition objects to create common axis partition for. - """ - - def __init__(self, list_of_blocks): - assert all( - [len(partition.list_of_blocks) == 1 for partition in list_of_blocks] - ), "Implementation assumes that each partition contains a signle block." - # Unwrap from PandasDataframePartition object for ease of use - self.list_of_blocks = [obj.list_of_blocks[0] for obj in list_of_blocks] - - def apply(self, func, *args, num_splits=None, other_axis_partition=None, **kwargs): - """ - Apply func to the object in the Plasma store. - - Parameters - ---------- - func : callable or ray.ObjectRef - The function to apply. - *args : iterable - Positional arguments to pass with `func`. - num_splits : int, optional - The number of times to split the resulting object. - other_axis_partition : PyarrowOnRayDataframeAxisPartition, optional - Another ``PyarrowOnRayDataframeAxisPartition`` object to apply to - `func` with this one. - **kwargs : dict - Additional keyward arguments to pass with `func`. - - Returns - ------- - list - List with ``PyarrowOnRayDataframePartition`` objects. - - Notes - ----- - See notes in Parent class about this method. - """ - if num_splits is None: - num_splits = len(self.list_of_blocks) - - if other_axis_partition is not None: - return [ - PyarrowOnRayDataframePartition(obj) - for obj in deploy_ray_func_between_two_axis_partitions.options( - num_returns=num_splits - ).remote( - self.axis, - func, - args, - kwargs, - num_splits, - len(self.list_of_blocks), - *(self.list_of_blocks + other_axis_partition.list_of_blocks), - ) - ] - - return [ - PyarrowOnRayDataframePartition(obj) - for obj in deploy_ray_axis_func.options(num_returns=num_splits).remote( - self.axis, - func, - args, - kwargs, - num_splits, - *self.list_of_blocks, - ) - ] - - -class PyarrowOnRayDataframeColumnPartition(PyarrowOnRayDataframeAxisPartition): - """ - The column partition implementation for PyArrow storage format and Ray engine. - - All of the implementation for this class is in the ``PyarrowOnRayDataframeAxisPartition`` - parent class, and this class defines the axis to perform the computation over. - - Parameters - ---------- - list_of_blocks : list - List with partition objects to create common axis partition. - """ - - axis = 0 - - -class PyarrowOnRayDataframeRowPartition(PyarrowOnRayDataframeAxisPartition): - """ - The row partition implementation for PyArrow storage format and Ray engine. - - All of the implementation for this class is in the ``PyarrowOnRayDataframeAxisPartition`` - parent class, and this class defines the axis to perform the computation over. - - Parameters - ---------- - list_of_blocks : list - List with partition objects to create common axis partition. - """ - - axis = 1 - - -def concat_arrow_table_partitions(axis, partitions): - """ - Concatenate given `partitions` in a single table. - - Parameters - ---------- - axis : {0, 1} - The axis to concatenate over. - partitions : array-like - Array with partitions for concatenating. - - Returns - ------- - pyarrow.Table - ``pyarrow.Table`` constructed from the passed partitions. - """ - if axis == 0: - table = pyarrow.Table.from_batches( - [part.to_batches(part.num_rows)[0] for part in partitions] - ) - else: - table = partitions[0].drop([partitions[0].columns[-1].name]) - for obj in partitions[1:]: - i = 0 - for col in obj.itercolumns(): - if i < obj.num_columns - 1: - table = table.append_column(col) - i += 1 - table = table.append_column(partitions[0].columns[-1]) - return table - - -def split_arrow_table_result(axis, result, num_partitions, num_splits, metadata): - """ - Split ``pyarrow.Table`` according to the passed parameters. - - Parameters - ---------- - axis : {0, 1} - The axis to perform the function along. - result : pyarrow.Table - Resulting table to split. - num_partitions : int - Number of partitions that `result` was constructed from. - num_splits : int - The number of splits to return. - metadata : dict - Dictionary with ``pyarrow.Table`` metadata. - - Returns - ------- - list - List of PyArrow Tables. - """ - chunksize = ( - num_splits // num_partitions - if num_splits % num_partitions == 0 - else num_splits // num_partitions + 1 - ) - if axis == 0: - return [ - pyarrow.Table.from_batches([part]) for part in result.to_batches(chunksize) - ] - else: - return [ - result.drop( - [ - result.columns[i].name - for i in range(result.num_columns) - if i >= n * chunksize or i < (n - 1) * chunksize - ] - ) - .append_column(result.columns[-1]) - .replace_schema_metadata(metadata=metadata) - for n in range(1, num_splits) - ] + [ - result.drop( - [ - result.columns[i].name - for i in range(result.num_columns) - if i < (num_splits - 1) * chunksize - ] - ).replace_schema_metadata(metadata=metadata) - ] - - -@ray.remote -def deploy_ray_axis_func(axis, func, f_args, f_kwargs, num_splits, *partitions): - """ - Deploy a function along a full axis in Ray. - - Parameters - ---------- - axis : {0, 1} - The axis to perform the function along. - func : callable - The function to perform. - f_args : list or tuple - Positional arguments to pass to ``func``. - f_kwargs : dict - Keyword arguments to pass to ``func``. - num_splits : int - The number of splits to return. - *partitions : array-like - All partitions that make up the full axis (row or column). - - Returns - ------- - list - List of PyArrow Tables. - """ - table = concat_arrow_table_partitions(axis, partitions) - try: - result = func(table, *f_args, **f_kwargs) - except Exception: - result = pyarrow.Table.from_pandas(func(table.to_pandas(), *f_args, **f_kwargs)) - return split_arrow_table_result( - axis, result, len(partitions), num_splits, table.schema.metadata - ) - - -@ray.remote -def deploy_ray_func_between_two_axis_partitions( - axis, - func, - f_args, - f_kwargs, - num_splits, - len_of_left, - *partitions, -): - """ - Deploy a function along a full axis between two data sets in Ray. - - Parameters - ---------- - axis : {0, 1} - The axis to perform the function along. - func : callable - The function to perform. - f_args : list or tuple - Positional arguments to pass to ``func``. - f_kwargs : dict - Keyword arguments to pass to ``func``. - num_splits : int - The number of splits to return. - len_of_left : int - The number of values in `partitions` that belong to the left data set. - *partitions : array-like - All partitions that make up the full axis (row or column) - for both data sets. - - Returns - ------- - list - List of PyArrow Tables. - """ - lt_table = concat_arrow_table_partitions(axis, partitions[:len_of_left]) - rt_table = concat_arrow_table_partitions(axis, partitions[len_of_left:]) - try: - result = func(lt_table, rt_table, *f_args, **f_kwargs) - except Exception: - lt_frame = lt_table.from_pandas() - rt_frame = rt_table.from_pandas() - result = pyarrow.Table.from_pandas( - func(lt_frame, rt_frame, *f_args, **f_kwargs) - ) - return split_arrow_table_result( - axis, result, len(result.num_rows), num_splits, result.schema.metadata - ) diff --git a/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/partitioning/partition.py b/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/partitioning/partition.py deleted file mode 100644 index 3bd094498e6..00000000000 --- a/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/partitioning/partition.py +++ /dev/null @@ -1,83 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""The module defines interface for a partition with PyArrow storage format and Ray engine.""" - -import pyarrow - -from modin.core.execution.ray.common import RayWrapper -from modin.core.execution.ray.implementations.pandas_on_ray.partitioning import ( - PandasOnRayDataframePartition, -) - - -class PyarrowOnRayDataframePartition(PandasOnRayDataframePartition): - """ - Class provides partition interface specific for PyArrow storage format and Ray engine. - - Inherits functionality from the ``PandasOnRayDataframePartition`` class. - - Parameters - ---------- - data : ray.ObjectRef - A reference to ``pyarrow.Table`` that needs to be wrapped with this class. - length : ray.ObjectRef or int, optional - Length or reference to it of wrapped ``pyarrow.Table``. - width : ray.ObjectRef or int, optional - Width or reference to it of wrapped ``pyarrow.Table``. - ip : ray.ObjectRef or str, optional - Node IP address or reference to it that holds wrapped ``pyarrow.Table``. - call_queue : list, optional - Call queue that needs to be executed on wrapped ``pyarrow.Table``. - """ - - @classmethod - def put(cls, obj): - """ - Put an object in the Plasma store and wrap it in this object. - - Parameters - ---------- - obj : object - The object to be put. - - Returns - ------- - PyarrowOnRayDataframePartition - A ``PyarrowOnRayDataframePartition`` object. - """ - return PyarrowOnRayDataframePartition( - RayWrapper.put(pyarrow.Table.from_pandas(obj)) - ) - - @classmethod - def _length_extraction_fn(cls): - """ - Return the callable that extracts the number of rows from the given ``pyarrow.Table``. - - Returns - ------- - callable - """ - return lambda table: table.num_rows - - @classmethod - def _width_extraction_fn(cls): - """ - Return the callable that extracts the number of columns from the given ``pyarrow.Table``. - - Returns - ------- - callable - """ - return lambda table: table.num_columns - (1 if "index" in table.columns else 0) diff --git a/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/partitioning/partition_manager.py b/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/partitioning/partition_manager.py deleted file mode 100644 index 4c45e9096fd..00000000000 --- a/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/partitioning/partition_manager.py +++ /dev/null @@ -1,38 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Module houses class for tracking partitions with PyArrow storage format and Ray engine.""" - -from modin.core.execution.ray.generic.partitioning import ( - GenericRayDataframePartitionManager, -) - -from .axis_partition import ( - PyarrowOnRayDataframeColumnPartition, - PyarrowOnRayDataframeRowPartition, -) -from .partition import PyarrowOnRayDataframePartition - - -class PyarrowOnRayDataframePartitionManager(GenericRayDataframePartitionManager): - """ - Class for tracking partitions with PyArrow storage format and Ray engine. - - Inherits all functionality from ``GenericRayDataframePartitionManager`` and ``PandasDataframePartitionManager`` base - classes. - """ - - # This object uses PyarrowOnRayDataframePartition objects as the underlying store. - _partition_class = PyarrowOnRayDataframePartition - _column_partitions_class = PyarrowOnRayDataframeColumnPartition - _row_partition_class = PyarrowOnRayDataframeRowPartition diff --git a/modin/experimental/core/storage_formats/pyarrow/__init__.py b/modin/experimental/core/storage_formats/pyarrow/__init__.py deleted file mode 100644 index 12bae94625f..00000000000 --- a/modin/experimental/core/storage_formats/pyarrow/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Experimental Modin functionality specific to PyArrow storage format.""" - -from .parsers import PyarrowCSVParser -from .query_compiler import PyarrowQueryCompiler - -__all__ = ["PyarrowQueryCompiler", "PyarrowCSVParser"] diff --git a/modin/experimental/core/storage_formats/pyarrow/parsers.py b/modin/experimental/core/storage_formats/pyarrow/parsers.py deleted file mode 100644 index ecefd2f63a5..00000000000 --- a/modin/experimental/core/storage_formats/pyarrow/parsers.py +++ /dev/null @@ -1,77 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Module houses Modin parser classes, that are used for data parsing on the workers.""" - -from io import BytesIO - -import pandas - -from modin.core.storage_formats.pandas.utils import compute_chunksize - - -class PyarrowCSVParser: - """Class for handling CSV files on the workers using PyArrow storage format.""" - - def parse(self, fname, num_splits, start, end, header, **kwargs): - """ - Parse CSV file into PyArrow tables. - - Parameters - ---------- - fname : str - Name of the CSV file to parse. - num_splits : int - Number of partitions to split the resulted PyArrow table into. - start : int - Position in the specified file to start parsing from. - end : int - Position in the specified file to end parsing at. - header : str - Header line that will be interpret as the first line of the parsed CSV file. - **kwargs : kwargs - Serves the compatibility purpose. Does not affect the result. - - Returns - ------- - list - List with split parse results and it's metadata: - - - First `num_split` elements are PyArrow tables, representing the corresponding chunk. - - Next element is the number of rows in the parsed table. - - Last element is the pandas Series, containing the data-types for each column of the parsed table. - """ - import pyarrow as pa - import pyarrow.csv as csv - - with open(fname, "rb") as bio: - # The header line for the CSV file - first_line = bio.readline() - bio.seek(start) - to_read = header + first_line + bio.read(end - start) - - table = csv.read_csv( - BytesIO(to_read), parse_options=csv.ParseOptions(header_rows=1) - ) - chunksize = compute_chunksize(table.num_columns, num_splits) - chunks = [ - pa.Table.from_arrays(table.columns[chunksize * i : chunksize * (i + 1)]) - for i in range(num_splits) - ] - return chunks + [ - table.num_rows, - pandas.Series( - [t.to_pandas_dtype() for t in table.schema.types], - index=table.schema.names, - ), - ] diff --git a/modin/experimental/core/storage_formats/pyarrow/query_compiler.py b/modin/experimental/core/storage_formats/pyarrow/query_compiler.py deleted file mode 100644 index 75e42e5a6c0..00000000000 --- a/modin/experimental/core/storage_formats/pyarrow/query_compiler.py +++ /dev/null @@ -1,95 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -""" -Module contains ``PyarrowQueryCompiler`` class. - -``PyarrowQueryCompiler`` is responsible for compiling efficient DataFrame algebra -queries for the ``PyarrowOnRayDataframe``. -""" - -import pandas - -from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler -from modin.utils import _inherit_docstrings - - -class FakeSeries: - """ - Series metadata class. - - Parameters - ---------- - dtype : dtype - Data-type of the represented Series. - """ - - def __init__(self, dtype): - self.dtype = dtype - - -@_inherit_docstrings(PandasQueryCompiler) -class PyarrowQueryCompiler(PandasQueryCompiler): - """ - Query compiler for the PyArrow storage format. - - This class translates common query compiler API into the DataFrame Algebra - queries, that is supposed to be executed by - :py:class:`~modin.experimental.core.execution.ray.implementations.pyarrow_on_ray.dataframe.dataframe.PyarrowOnRayDataframe`. - - Parameters - ---------- - modin_frame : PyarrowOnRayDataframe - Modin Frame to query with the compiled queries. - shape_hint : {"row", "column", None}, default: None - Shape hint for frames known to be a column or a row, otherwise None. - """ - - def _compute_index(self, axis, data_object, compute_diff=True): - """ - Compute index labels of the passed Modin Frame along specified axis. - - Parameters - ---------- - axis : {0, 1} - Axis to compute index labels along. 0 is for index and 1 is for column. - data_object : PyarrowOnRayDataframe - Modin Frame object to build indices from. - compute_diff : bool, default: True - Whether to cut the resulted indices to a subset of the self indices. - - Returns - ------- - pandas.Index - """ - - def arrow_index_extraction(table, axis): - """Extract index labels from the passed pyarrow table the along specified axis.""" - if not axis: - return pandas.Index(table.column(table.num_columns - 1)) - else: - try: - return pandas.Index(table.columns) - except AttributeError: - return [] - - index_obj = self.index if not axis else self.columns - old_blocks = self.data if compute_diff else None - # FIXME: `PandasDataframe.get_indices` was deprecated, this call should be - # replaced either by `PandasDataframe._compute_axis_label` or by `PandasDataframe.axes`. - new_indices, _ = data_object.get_indices( - axis=axis, - index_func=lambda df: arrow_index_extraction(df, axis), - old_blocks=old_blocks, - ) - return index_obj[new_indices] if compute_diff else new_indices diff --git a/modin/test/test_executions_api.py b/modin/test/test_executions_api.py index 35013132eea..f1cd635232b 100644 --- a/modin/test/test_executions_api.py +++ b/modin/test/test_executions_api.py @@ -14,10 +14,9 @@ import pytest from modin.core.storage_formats import BaseQueryCompiler, PandasQueryCompiler -from modin.experimental.core.storage_formats.pyarrow import PyarrowQueryCompiler BASE_EXECUTION = BaseQueryCompiler -EXECUTIONS = [PandasQueryCompiler, PyarrowQueryCompiler] +EXECUTIONS = [PandasQueryCompiler] def test_base_abstract_methods(): diff --git a/scripts/doc_checker.py b/scripts/doc_checker.py index 0979274d973..70db787b0b3 100644 --- a/scripts/doc_checker.py +++ b/scripts/doc_checker.py @@ -514,7 +514,6 @@ def monkeypatch(*args, **kwargs): pandas.util.cache_readonly = property # We are mocking packages we don't need for docs checking in order to avoid import errors - sys.modules["pyarrow.gandiva"] = Mock() sys.modules["sqlalchemy"] = Mock() modin.utils.instancer = functools.wraps(modin.utils.instancer)(lambda cls: cls) diff --git a/setup.cfg b/setup.cfg index 55de39100b6..3acc554836f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -58,9 +58,6 @@ omit = modin/experimental/core/execution/native/implementations/hdk_on_native/test/* # Plotting is not tested modin/pandas/plotting.py - # Skip Gandiva because it is experimental - modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/* - modin/core/storage_formats/pyarrow/* # Skip CUDF tests modin/storage_formats/cudf/* modin/core/execution/ray/implementations/cudf_on_ray/*