diff --git a/.gitignore b/.gitignore index 9ed43c0..3a99b8d 100644 --- a/.gitignore +++ b/.gitignore @@ -9,5 +9,6 @@ __pycache__ .python-version .venv docs +.vscode .idea .benchmarks diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py index 4c519a4..fc65be3 100644 --- a/python/fastexcel/__init__.py +++ b/python/fastexcel/__init__.py @@ -3,12 +3,11 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from pathlib import Path - import pandas as pd import polars as pl from os.path import expanduser +from pathlib import Path import pyarrow as pa @@ -238,12 +237,14 @@ def __repr__(self) -> str: return self._reader.__repr__() -def read_excel(path: Path | str) -> ExcelReader: +def read_excel(source: Path | str | bytes) -> ExcelReader: """Opens and loads an excel file. - :param path: The path to the file + :param source: The path to a file or its content as bytes """ - return ExcelReader(_read_excel(expanduser(path))) + if isinstance(source, (str, Path)): + source = expanduser(source) + return ExcelReader(_read_excel(source)) __all__ = ( diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi index 01fd90d..0e27e59 100644 --- a/python/fastexcel/_fastexcel.pyi +++ b/python/fastexcel/_fastexcel.pyi @@ -55,7 +55,7 @@ class _ExcelReader: @property def sheet_names(self) -> list[str]: ... 
-def read_excel(path: str) -> _ExcelReader: +def read_excel(source: str | bytes) -> _ExcelReader: """Reads an excel file and returns an ExcelReader""" __version__: str diff --git a/python/tests/test_errors.py b/python/tests/test_errors.py index 4897158..fffb4db 100644 --- a/python/tests/test_errors.py +++ b/python/tests/test_errors.py @@ -5,6 +5,12 @@ from utils import path_for_fixture +def test_read_excel_bad_type() -> None: + expected_message = "source must be a string or bytes" + with pytest.raises(fastexcel.InvalidParametersError, match=expected_message): + fastexcel.read_excel(42) # type: ignore[arg-type] + + def test_does_not_exist() -> None: expected_message = """calamine error: Cannot detect file format Context: diff --git a/python/tests/test_fastexcel.py b/python/tests/test_fastexcel.py index f59caaf..4c1a9a1 100644 --- a/python/tests/test_fastexcel.py +++ b/python/tests/test_fastexcel.py @@ -9,7 +9,7 @@ from utils import path_for_fixture -def test_single_sheet_to_pandas(): +def test_single_sheet(): excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx")) assert excel_reader.sheet_names == ["January"] sheet_by_name = excel_reader.load_sheet("January") @@ -31,7 +31,30 @@ def test_single_sheet_to_pandas(): pl_assert_frame_equal(sheet_by_idx.to_polars(), pl_expected) -def test_single_sheet_with_types_to_pandas(): +def test_single_sheet_bytes(): + with open(path_for_fixture("fixture-single-sheet.xlsx"), "rb") as f: + excel_reader = fastexcel.read_excel(f.read()) + assert excel_reader.sheet_names == ["January"] + sheet_by_name = excel_reader.load_sheet("January") + sheet_by_idx = excel_reader.load_sheet(0) + + # Metadata + assert sheet_by_name.name == sheet_by_idx.name == "January" + assert sheet_by_name.height == sheet_by_idx.height == 2 + assert sheet_by_name.width == sheet_by_idx.width == 2 + + expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]} + + pd_expected = pd.DataFrame(expected) + 
pd_assert_frame_equal(sheet_by_name.to_pandas(), pd_expected) + pd_assert_frame_equal(sheet_by_idx.to_pandas(), pd_expected) + + pl_expected = pl.DataFrame(expected) + pl_assert_frame_equal(sheet_by_name.to_polars(), pl_expected) + pl_assert_frame_equal(sheet_by_idx.to_polars(), pl_expected) + + +def test_single_sheet_with_types(): excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx")) assert excel_reader.sheet_names == ["Sheet1"] @@ -67,7 +90,7 @@ def test_single_sheet_with_types_to_pandas(): ) -def test_multiple_sheets_to_pandas(): +def test_multiple_sheets(): excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx")) assert excel_reader.sheet_names == ["January", "February", "With unnamed columns"] diff --git a/src/lib.rs b/src/lib.rs index 1537816..a6b3686 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,12 +8,22 @@ use types::{ExcelReader, ExcelSheet}; /// Reads an excel file and returns an object allowing to access its sheets and a bit of metadata #[pyfunction] -fn read_excel(path: &str) -> PyResult<ExcelReader> { +fn read_excel(source: &PyAny) -> PyResult<ExcelReader> { use py_errors::IntoPyResult; - ExcelReader::try_from_path(path) - .with_context(|| format!("could not load excel file at {path}")) - .into_pyresult() + if let Ok(path) = source.extract::<&str>() { + ExcelReader::try_from_path(path) + .with_context(|| format!("could not load excel file at {path}")) + .into_pyresult() + } else if let Ok(bytes) = source.extract::<&[u8]>() { + ExcelReader::try_from(bytes) + .with_context(|| "could not load excel file for those bytes") + .into_pyresult() + } else { + Err(py_errors::InvalidParametersError::new_err( + "source must be a string or bytes", + )) + } } // Taken from pydantic-core: diff --git a/src/types/excelreader.rs b/src/types/excelreader.rs index 6a15ce5..d1c6f20 100644 --- a/src/types/excelreader.rs +++ b/src/types/excelreader.rs @@ -1,10 +1,16 @@ -use std::{fs::File, io::BufReader}; +use std::{ + fs::File, + 
io::{BufReader, Cursor}, +}; -use calamine::{open_workbook_auto, Reader, Sheets}; +use calamine::{ + open_workbook_auto, open_workbook_auto_from_rs, Data, Error, Range, Reader, Sheets, +}; use pyo3::{pyclass, pymethods, PyAny, PyResult}; use crate::error::{ - py_errors::IntoPyResult, ErrorContext, FastExcelErrorKind, FastExcelResult, IdxOrName, + py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult, + IdxOrName, }; use super::{ @@ -12,12 +18,41 @@ use super::{ ExcelSheet, }; +enum ExcelSheets { + File(Sheets<BufReader<File>>), + Bytes(Sheets<Cursor<Vec<u8>>>), +} + +impl ExcelSheets { + fn worksheet_range(&mut self, name: &str) -> Result<Range<Data>, Error> { + match self { + Self::File(sheets) => sheets.worksheet_range(name), + Self::Bytes(sheets) => sheets.worksheet_range(name), + } + } + + fn worksheet_range_at(&mut self, idx: usize) -> Option<Result<Range<Data>, Error>> { + match self { + Self::File(sheets) => sheets.worksheet_range_at(idx), + Self::Bytes(sheets) => sheets.worksheet_range_at(idx), + } + } + + #[allow(dead_code)] + fn sheet_names(&self) -> Vec<String> { + match self { + Self::File(sheets) => sheets.sheet_names(), + Self::Bytes(sheets) => sheets.sheet_names(), + } + } +} + #[pyclass(name = "_ExcelReader")] pub(crate) struct ExcelReader { - sheets: Sheets<BufReader<File>>, + sheets: ExcelSheets, #[pyo3(get)] sheet_names: Vec<String>, - path: String, + source: String, } impl ExcelReader { @@ -29,9 +64,26 @@ impl ExcelReader { .with_context(|| format!("Could not open workbook at {path}"))?; let sheet_names = sheets.sheet_names().to_owned(); Ok(Self { - sheets, + sheets: ExcelSheets::File(sheets), + sheet_names, + source: path.to_owned(), + }) + } +} + +impl TryFrom<&[u8]> for ExcelReader { + type Error = FastExcelError; + + fn try_from(bytes: &[u8]) -> Result<Self, Self::Error> { + let cursor = Cursor::new(bytes.to_vec()); + let sheets = open_workbook_auto_from_rs(cursor) + .map_err(|err| FastExcelErrorKind::CalamineError(err).into()) + .with_context(|| "Could not open workbook from bytes")?; + let sheet_names = 
sheets.sheet_names().to_owned(); + Ok(Self { + sheets: ExcelSheets::Bytes(sheets), sheet_names, - path: path.to_owned(), + source: "bytes".to_owned(), }) } } @@ -39,7 +91,7 @@ impl ExcelReader { #[pymethods] impl ExcelReader { pub fn __repr__(&self) -> String { - format!("ExcelReader<{}>", &self.path) + format!("ExcelReader<{}>", &self.source) } #[pyo3(signature = (