diff --git a/Makefile b/Makefile index 06ff9cb..90b49e8 100644 --- a/Makefile +++ b/Makefile @@ -32,7 +32,7 @@ dev-setup: install-test-requirements install-doc-requirements pre-commit install dev-install: - maturin develop -E pandas + maturin develop -E pandas,polars prod-install: ./prod_install.sh diff --git a/prod_install.sh b/prod_install.sh index bf93273..cca17b6 100755 --- a/prod_install.sh +++ b/prod_install.sh @@ -1,4 +1,4 @@ #!/bin/bash -xe rm -rf target/wheels/ maturin build --release -pip install --force-reinstall "$(echo target/wheels/*.whl)[pandas]" +pip install --force-reinstall "$(echo target/wheels/*.whl)[pandas, polars]" diff --git a/pyproject.toml b/pyproject.toml index 0b47044..528cc20 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,8 @@ dependencies = [ [project.optional-dependencies] pandas = ["pandas>=1.4.4,<1.6"] +polars = ["polars>=0.16.14,<0.17"] + [project.urls] "Source Code" = "https://github.com/ToucanToco/fastexcel" diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py index 6b54e31..29f073a 100644 --- a/python/fastexcel/__init__.py +++ b/python/fastexcel/__init__.py @@ -4,6 +4,7 @@ import pandas as pd import pyarrow as pa +import polars as pl from ._fastexcel import __version__, _ExcelReader, _ExcelSheet from ._fastexcel import read_excel as _read_excel @@ -47,6 +48,15 @@ def to_pandas(self) -> "pd.DataFrame": # We know for sure that the sheet will yield exactly one RecordBatch return self.to_arrow().to_pandas() + def to_polars(self) -> pl.DataFrame | pl.Series: + """Converts the sheet to a Polars `DataFrame`. + + Requires the `polars` extra to be installed. + """ + # We know for sure that the sheet will yield exactly one RecordBatch + batch = self.to_arrow() + return pl.from_arrow(data=pa.Table.from_batches([batch])) + def __repr__(self) -> str: return self._sheet.__repr__() diff --git a/python/tests/test_fastexcel.py b/python/tests/test_fastexcel.py index b1c9d0c..dafcc96 100644 --- a/python/tests/test_fastexcel.py +++ b/python/tests/test_fastexcel.py @@ -2,8 +2,10 @@ from os.path import join as path_join import pytest -from pandas import DataFrame, Timestamp -from pandas.testing import assert_frame_equal +import pandas as pd +import polars as pl +from polars.testing import assert_frame_equal as pl_assert_frame_equal +from pandas.testing import assert_frame_equal as pd_assert_frame_equal import fastexcel @@ -23,10 +25,15 @@ def test_single_sheet_to_pandas(): assert sheet_by_name.height == sheet_by_idx.height == 2 assert sheet_by_name.width == sheet_by_idx.width == 2 - expected = DataFrame({"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}) + expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]} - assert_frame_equal(sheet_by_name.to_pandas(), expected) - assert_frame_equal(sheet_by_idx.to_pandas(), expected) + pd_expected = pd.DataFrame(expected) + pd_assert_frame_equal(sheet_by_name.to_pandas(), pd_expected) + pd_assert_frame_equal(sheet_by_idx.to_pandas(), pd_expected) + + pl_expected = pl.DataFrame(expected) + pl_assert_frame_equal(sheet_by_name.to_polars(), pl_expected) + pl_assert_frame_equal(sheet_by_idx.to_polars(), pl_expected) def test_single_sheet_with_types_to_pandas(): @@ -40,36 +47,67 @@ def test_single_sheet_with_types_to_pandas(): assert sheet.height == sheet.total_height == 3 assert sheet.width == 4 - assert_frame_equal( + pd_assert_frame_equal( sheet.to_pandas(), - DataFrame( + pd.DataFrame( { "__UNNAMED__0": [0.0, 1.0, 2.0], "bools": [True, False, True], - "dates": [Timestamp("2022-03-02 05:43:04")] * 3, + "dates": [pd.Timestamp("2022-03-02 05:43:04")] * 3, "floats": [12.35, 42.69, 1234567], } ), ) + pl_assert_frame_equal( + sheet.to_polars(), + pl.DataFrame( + { + "__UNNAMED__0": [0.0, 1.0, 2.0], + "bools": [True, False, True], + "dates": ["2022-03-02 05:43:04"] * 3, + "floats": [12.35, 42.69, 1234567], + } + ).with_columns(pl.col("dates").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")), + ) + def test_multiple_sheets_to_pandas(): excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx")) assert excel_reader.sheet_names == ["January", "February", "With unnamed columns"] - assert_frame_equal( + pd_assert_frame_equal( excel_reader.load_sheet_by_idx(0).to_pandas(), - DataFrame({"Month": [1.0], "Year": [2019.0]}), + pd.DataFrame({"Month": [1.0], "Year": [2019.0]}), ) - - assert_frame_equal( + pd_assert_frame_equal( excel_reader.load_sheet_by_idx(1).to_pandas(), - DataFrame({"Month": [2.0, 3.0, 4.0], "Year": [2019.0, 2021.0, 2022.0]}), + pd.DataFrame({"Month": [2.0, 3.0, 4.0], "Year": [2019.0, 2021.0, 2022.0]}), ) - - assert_frame_equal( + pd_assert_frame_equal( excel_reader.load_sheet_by_name("With unnamed columns").to_pandas(), - DataFrame( + pd.DataFrame( + { + "col1": [2.0, 3.0], + "__UNNAMED__1": [1.5, 2.5], + "col3": ["hello", "world"], + "__UNNAMED__3": [-5.0, -6.0], + "col5": ["a", "b"], + } + ), + ) + + pl_assert_frame_equal( + excel_reader.load_sheet_by_idx(0).to_polars(), + pl.DataFrame({"Month": [1.0], "Year": [2019.0]}), + ) + pl_assert_frame_equal( + excel_reader.load_sheet_by_idx(1).to_polars(), + pl.DataFrame({"Month": [2.0, 3.0, 4.0], "Year": [2019.0, 2021.0, 2022.0]}), + ) + pl_assert_frame_equal( + excel_reader.load_sheet_by_name("With unnamed columns").to_polars(), + pl.DataFrame( { "col1": [2.0, 3.0], "__UNNAMED__1": [1.5, 2.5], @@ -94,10 +132,15 @@ def test_sheets_with_header_line_diff_from_zero(): assert sheet_by_name.height == sheet_by_idx.height == 2 assert sheet_by_name.width == sheet_by_idx.width == 2 - expected = DataFrame({"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}) + expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]} + + pd_expected = pd.DataFrame(expected) + pd_assert_frame_equal(sheet_by_name.to_pandas(), pd_expected) + pd_assert_frame_equal(sheet_by_idx.to_pandas(), pd_expected) - assert_frame_equal(sheet_by_name.to_pandas(), expected) - assert_frame_equal(sheet_by_idx.to_pandas(), expected) + pl_expected = pl.DataFrame(expected) + pl_assert_frame_equal(sheet_by_name.to_polars(), pl_expected) + pl_assert_frame_equal(sheet_by_idx.to_polars(), pl_expected) def test_sheets_with_no_header(): @@ -113,16 +156,19 @@ def test_sheets_with_no_header(): assert sheet_by_name.height == sheet_by_idx.height == 2 assert sheet_by_name.width == sheet_by_idx.width == 3 - expected = DataFrame( - { - "__UNNAMED__0": [1.0, 2.0], - "__UNNAMED__1": [3.0, 4.0], - "__UNNAMED__2": [5.0, 6.0], - } - ) + expected = { + "__UNNAMED__0": [1.0, 2.0], + "__UNNAMED__1": [3.0, 4.0], + "__UNNAMED__2": [5.0, 6.0], + } + + pd_expected = pd.DataFrame(expected) + pd_assert_frame_equal(sheet_by_name.to_pandas(), pd_expected) + pd_assert_frame_equal(sheet_by_idx.to_pandas(), pd_expected) - assert_frame_equal(sheet_by_name.to_pandas(), expected) - assert_frame_equal(sheet_by_idx.to_pandas(), expected) + pl_expected = pl.DataFrame(expected) + pl_assert_frame_equal(sheet_by_name.to_polars(), pl_expected) + pl_assert_frame_equal(sheet_by_idx.to_polars(), pl_expected) def test_sheets_with_empty_rows_before_header(): @@ -138,10 +184,15 @@ def test_sheets_with_empty_rows_before_header(): assert sheet_by_name.height == sheet_by_idx.height == 2 assert sheet_by_name.width == sheet_by_idx.width == 2 - expected = DataFrame({"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}) + expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]} - assert_frame_equal(sheet_by_name.to_pandas(), expected) - assert_frame_equal(sheet_by_idx.to_pandas(), expected) + pd_expected = pd.DataFrame(expected) + pd_assert_frame_equal(sheet_by_name.to_pandas(), pd_expected) + pd_assert_frame_equal(sheet_by_idx.to_pandas(), pd_expected) + + pl_expected = pl.DataFrame(expected) + pl_assert_frame_equal(sheet_by_name.to_polars(), pl_expected) + pl_assert_frame_equal(sheet_by_idx.to_polars(), pl_expected) def test_sheets_with_custom_headers(): @@ -161,10 +212,15 @@ def test_sheets_with_custom_headers(): assert sheet_by_name.height == sheet_by_idx.height == 2 assert sheet_by_name.width == sheet_by_idx.width == 3 - expected = DataFrame({"foo": [1.0, 2.0], "bar": [3.0, 4.0], "baz": [5.0, 6.0]}) + expected = {"foo": [1.0, 2.0], "bar": [3.0, 4.0], "baz": [5.0, 6.0]} + + pd_expected = pd.DataFrame(expected) + pd_assert_frame_equal(sheet_by_name.to_pandas(), pd_expected) + pd_assert_frame_equal(sheet_by_idx.to_pandas(), pd_expected) - assert_frame_equal(sheet_by_name.to_pandas(), expected) - assert_frame_equal(sheet_by_idx.to_pandas(), expected) + pl_expected = pl.DataFrame(expected) + pl_assert_frame_equal(sheet_by_name.to_polars(), pl_expected) + pl_assert_frame_equal(sheet_by_idx.to_polars(), pl_expected) def test_sheets_with_skipping_headers(): @@ -182,12 +238,16 @@ def test_sheets_with_skipping_headers(): assert sheet_by_name.height == sheet_by_idx.height == 2 assert sheet_by_name.width == sheet_by_idx.width == 3 - expected = DataFrame( - {"Bugs": [1.0, 2.0], "__UNNAMED__1": [3.0, 4.0], "__UNNAMED__2": [5.0, 6.0]} - ) + expected = {"Bugs": [1.0, 2.0], "__UNNAMED__1": [3.0, 4.0], "__UNNAMED__2": [5.0, 6.0]} + - assert_frame_equal(sheet_by_name.to_pandas(), expected) - assert_frame_equal(sheet_by_idx.to_pandas(), expected) + pd_expected = pd.DataFrame(expected) + pd_assert_frame_equal(sheet_by_name.to_pandas(), pd_expected) + pd_assert_frame_equal(sheet_by_idx.to_pandas(), pd_expected) + + pl_expected = pl.DataFrame(expected) + pl_assert_frame_equal(sheet_by_name.to_polars(), pl_expected) + pl_assert_frame_equal(sheet_by_idx.to_polars(), pl_expected) def test_sheet_with_pagination(): @@ -202,18 +262,30 @@ def test_sheet_with_pagination(): assert sheet.total_height == 3 assert sheet.width == 4 - assert_frame_equal( + pd_assert_frame_equal( sheet.to_pandas(), - DataFrame( + pd.DataFrame( { "__UNNAMED__0": [1.0], "bools": [False], - "dates": [Timestamp("2022-03-02 05:43:04")], + "dates": [pd.Timestamp("2022-03-02 05:43:04")], "floats": [42.69], } ), ) + pl_assert_frame_equal( + sheet.to_polars(), + pl.DataFrame( + { + "__UNNAMED__0": [1.0], + "bools": [False], + "dates": ["2022-03-02 05:43:04"], + "floats": [42.69], + } + ).with_columns(pl.col("dates").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")), + ) + def test_sheet_with_skip_rows(): excel_reader = fastexcel.read_excel( @@ -226,18 +298,30 @@ def test_sheet_with_skip_rows(): assert sheet.height == 2 assert sheet.width == 4 - assert_frame_equal( + pd_assert_frame_equal( sheet.to_pandas(), - DataFrame( + pd.DataFrame( { "__UNNAMED__0": [1.0, 2.0], "bools": [False, True], - "dates": [Timestamp("2022-03-02 05:43:04")] * 2, + "dates": [pd.Timestamp("2022-03-02 05:43:04")] * 2, "floats": [42.69, 1234567], } ), ) + pl_assert_frame_equal( + sheet.to_polars(), + pl.DataFrame( + { + "__UNNAMED__0": [1.0, 2.0], + "bools": [False, True], + "dates": ["2022-03-02 05:43:04"] * 2, + "floats": [42.69, 1234567], + } + ).with_columns(pl.col("dates").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")), + ) + def test_sheet_with_n_rows(): excel_reader = fastexcel.read_excel( @@ -250,18 +334,30 @@ def test_sheet_with_n_rows(): assert sheet.height == 1 assert sheet.width == 4 - assert_frame_equal( + pd_assert_frame_equal( sheet.to_pandas(), - DataFrame( + pd.DataFrame( { "__UNNAMED__0": [0.0], "bools": [True], - "dates": [Timestamp("2022-03-02 05:43:04")], + "dates": [pd.Timestamp("2022-03-02 05:43:04")], "floats": [12.35], } ), ) + pl_assert_frame_equal( + sheet.to_polars(), + pl.DataFrame( + { + "__UNNAMED__0": [0.0], + "bools": [True], + "dates": ["2022-03-02 05:43:04"], + "floats": [12.35], + } + ).with_columns(pl.col("dates").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")), + ) + def test_sheet_with_pagination_and_without_headers(): excel_reader = fastexcel.read_excel( @@ -280,18 +376,30 @@ def test_sheet_with_pagination_and_without_headers(): assert sheet.height == 1 assert sheet.width == 4 - assert_frame_equal( + pd_assert_frame_equal( sheet.to_pandas(), - DataFrame( + pd.DataFrame( { "This": [0.0], "Is": [True], - "Amazing": [Timestamp("2022-03-02 05:43:04")], + "Amazing": [pd.Timestamp("2022-03-02 05:43:04")], "Stuff": [12.35], } ), ) + pl_assert_frame_equal( + sheet.to_polars(), + pl.DataFrame( + { + "This": [0.0], + "Is": [True], + "Amazing": ["2022-03-02 05:43:04"], + "Stuff": [12.35], + } + ).with_columns(pl.col("Amazing").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")), + ) + def test_sheet_with_pagination_out_of_bound(): excel_reader = fastexcel.read_excel( @@ -318,14 +426,26 @@ def test_sheet_with_pagination_out_of_bound(): assert sheet.height == 3 assert sheet.width == 4 - assert_frame_equal( + pd_assert_frame_equal( sheet.to_pandas(), - DataFrame( + pd.DataFrame( { "This": [0.0, 1.0, 2.0], "Is": [True, False, True], - "Amazing": [Timestamp("2022-03-02 05:43:04")] * 3, + "Amazing": [pd.Timestamp("2022-03-02 05:43:04")] * 3, "Stuff": [12.35, 42.69, 1234567], } ), ) + + pl_assert_frame_equal( + sheet.to_polars(), + pl.DataFrame( + { + "This": [0.0, 1.0, 2.0], + "Is": [True, False, True], + "Amazing": ["2022-03-02 05:43:04"] * 3, + "Stuff": [12.35, 42.69, 1234567], + } + ).with_columns(pl.col("Amazing").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")), + )