From 24c8a06e71805f56c3b9ec70d70e13badc589257 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 26 Jan 2025 22:46:04 +0100 Subject: [PATCH] Throw a specific and clear error --- pyogrio/geopandas.py | 10 +++++++++- pyogrio/tests/test_geopandas_io.py | 15 +++++++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 11672b25..d1f0dc9d 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -289,7 +289,15 @@ def read_dataframe( kwargs = {"self_destruct": True} if arrow_to_pandas_kwargs is not None: kwargs.update(arrow_to_pandas_kwargs) - df = table.to_pandas(**kwargs) + + try: + df = table.to_pandas(**kwargs) + except UnicodeDecodeError as ex: + # Arrow does not support reading data in a non-UTF-8 encoding + raise DataSourceError( + "The file being read is not encoded in UTF-8; please use_arrow=False" + ) from ex + del table if fid_as_index: diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index e079b7a5..3d85d1f1 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -130,13 +130,24 @@ def test_read_csv_encoding(tmp_path, encoding, arrow): reason="test requires non-UTF-8 default platform", ) def test_read_csv_platform_encoding(tmp_path, use_arrow): - """verify that read defaults to platform encoding; only works on Windows (CP1252)""" + """Verify that read defaults to platform encoding; only works on Windows (CP1252). + + When use_arrow=True, reading a non-UTF-8 file fails. 
+ """ csv_path = tmp_path / "test.csv" with open(csv_path, "w", encoding=locale.getpreferredencoding()) as csv: csv.write("näme,city\n") csv.write("Wilhelm Röntgen,Zürich\n") - df = read_dataframe(csv_path, use_arrow=use_arrow) + if use_arrow: + handler = pytest.raises( + DataSourceError, match="The file being read is not encoded in UTF-8" + ) + else: + handler = contextlib.nullcontext() + + with handler: + df = read_dataframe(csv_path, use_arrow=use_arrow) assert len(df) == 1 assert df.columns.tolist() == ["näme", "city"]