From aaf88180978066cb35ae77e2050e63c6bb62a6a8 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 17 Oct 2024 22:00:29 +0200 Subject: [PATCH 01/26] ENH: deal properly with naive datetimes with arrow --- pyogrio/tests/test_geopandas_io.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 0675c197..953d259a 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1,5 +1,6 @@ import contextlib import locale +import time import warnings from datetime import datetime from io import BytesIO @@ -351,6 +352,29 @@ def test_read_write_datetime_tz_with_nulls(tmp_path, use_arrow): assert_geodataframe_equal(df, result) +@pytest.mark.filterwarnings( + "ignore: Non-conformant content for record 1 in column dates" +) +@pytest.mark.requires_arrow_write_api +def test_read_write_datetime_no_tz(tmp_path, use_arrow): + dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00"] + if PANDAS_GE_20: + dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") + else: + dates = pd.to_datetime(dates_raw) + df = gp.GeoDataFrame( + {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, + crs="EPSG:4326", + ) + fpath = tmp_path / "test.gpkg" + write_dataframe(df, fpath, use_arrow=use_arrow) + result = read_dataframe(fpath, use_arrow=use_arrow) + if use_arrow: + # with Arrow, the datetimes are always read as UTC + df["dates"] = df["dates"].dt.tz_localize(time.timezone).dt.tz_convert("UTC") + assert_geodataframe_equal(df, result) + + def test_read_null_values(tmp_path, use_arrow): filename = tmp_path / "test_null_values_no_geometry.gpkg" From 3e463a19e383319f3148ad6f176854b68b858a74 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Fri, 18 Oct 2024 18:38:43 +0200 Subject: [PATCH 02/26] Add more testcases, also for tz datetimes --- pyogrio/tests/test_geopandas_io.py | 52 ++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 
14 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 953d259a..91e2c416 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -7,6 +7,7 @@ from zipfile import ZipFile import numpy as np +import pytz from pyogrio import ( __gdal_version__, @@ -299,7 +300,7 @@ def test_read_datetime_tz(datetime_tz_file, tmp_path, use_arrow): write_dataframe(df, fpath, use_arrow=use_arrow) df_read = read_dataframe(fpath, use_arrow=use_arrow) if use_arrow: - # with Arrow, the datetimes are always read as UTC + # with Arrow, the datetimes are always read as UTC for .gpkg expected = expected.dt.tz_convert("UTC") assert_series_equal(df_read.datetime_col, expected) @@ -329,49 +330,72 @@ def test_write_datetime_mixed_offset(tmp_path, use_arrow): assert_series_equal(result["dates"], utc_col) +@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @pytest.mark.filterwarnings( "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_read_write_datetime_tz_with_nulls(tmp_path, use_arrow): - dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", pd.NaT] +def test_read_write_datetime_no_tz(tmp_path, ext, use_arrow): + dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00"] if PANDAS_GE_20: dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") else: dates = pd.to_datetime(dates_raw) df = gp.GeoDataFrame( - {"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]}, + {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, crs="EPSG:4326", ) - fpath = tmp_path / "test.gpkg" + fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) - if use_arrow: - # with Arrow, the datetimes are always read as UTC - df["dates"] = df["dates"].dt.tz_convert("UTC") + if use_arrow and ext == ".gpkg": + # for GPKG with Arrow, 
the datetime is written as naive datetime with the + # correct times, but when read the naive time is assumed to be UTC, which + # changes the effective time so this seems wrong. + df["dates"] = df["dates"].dt.tz_localize(time.timezone).dt.tz_convert("UTC") assert_geodataframe_equal(df, result) +@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @pytest.mark.filterwarnings( "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_read_write_datetime_no_tz(tmp_path, use_arrow): - dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00"] +def test_read_write_datetime_tz_with_nulls(tmp_path, ext, use_arrow): + dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", pd.NaT] if PANDAS_GE_20: dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") else: dates = pd.to_datetime(dates_raw) df = gp.GeoDataFrame( - {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, + {"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]}, crs="EPSG:4326", ) - fpath = tmp_path / "test.gpkg" + fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) if use_arrow: - # with Arrow, the datetimes are always read as UTC - df["dates"] = df["dates"].dt.tz_localize(time.timezone).dt.tz_convert("UTC") + if ext == ".fgb": + # when FlatGeoBuffer is read with Arrow, for datetimes with equal timezone, + # a column type with the appropriate minutes offset is returned. + # REMARK: For .fgb, the timezone is just dropped when reading or writing!!! + # -> 2020-01-01T09:00:00.123-05:00 becomes 2020-01-01T09:00:00.123 + df["dates"] = df["dates"].dt.tz_localize(tz=None) + elif ext in (".geojson", ".geojsonl"): + # when GeoJSON is read with Arrow, for datetimes with equal timezone, a + # column type with the appropriate minutes offset is returned. 
+ # REMARK: for .geojson, the data is written fine, but when reading it goes + # wrong: 2020-01-01T09:00:00.123-05:00 becomes 2020-01-01T04:00:00.123-05:00 + df["dates"] = ( + df["dates"] + .dt.tz_localize(tz=None) + .dt.tz_localize(tz="UTC") + .dt.tz_convert(pytz.FixedOffset(-300)) + ) + elif ext == ".gpkg": + # when GPKG is read with Arrow, datetimes with timezone are converted to + # UTC. + df["dates"] = df["dates"].dt.tz_convert("UTC") assert_geodataframe_equal(df, result) From c18ab22b5e16bf43777a706f720131ad3857f4fc Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Fri, 17 Jan 2025 09:09:25 +0100 Subject: [PATCH 03/26] Use datetime_as_string for reading with arrow --- pyogrio/_io.pyx | 10 +++++++- pyogrio/geopandas.py | 14 +++++++---- pyogrio/raw.py | 8 ++++++ pyogrio/tests/test_geopandas_io.py | 39 ++---------------------------- 4 files changed, 28 insertions(+), 43 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index d7334838..574c88fa 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1430,6 +1430,7 @@ def ogr_open_arrow( int return_fids=False, int batch_size=0, use_pyarrow=False, + datetime_as_string=False, ): cdef int err = 0 @@ -1624,6 +1625,12 @@ def ogr_open_arrow( "GEOARROW".encode('UTF-8') ) + # Read DateTime fields as strings, as the Arrow DateTime column type is + # quite limited regarding support for mixed timezones,... 
+ IF CTE_GDAL_VERSION >= (3, 11, 0): + if datetime_as_string: + options = CSLSetNameValue(options, "DATETIME_AS_STRING", "YES") + # make sure layer is read from beginning OGR_L_ResetReading(ogr_layer) @@ -1649,6 +1656,7 @@ def ogr_open_arrow( 'crs': crs, 'encoding': encoding, 'fields': fields[:,2], # return only names + "dtypes": fields[:,3], 'geometry_type': geometry_type, 'geometry_name': geometry_name, 'fid_column': fid_column, @@ -2552,7 +2560,7 @@ def ogr_write_arrow( object path_or_fp, str layer, str driver, - object arrow_obj, + obje ct arrow_obj, str crs, str geometry_type, str geometry_name, diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 11672b25..62f2b532 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -257,11 +257,9 @@ def read_dataframe( read_func = read_arrow if use_arrow else read gdal_force_2d = False if use_arrow else force_2d - if not use_arrow: - # For arrow, datetimes are read as is. - # For numpy IO, datetimes are read as string values to preserve timezone info - # as numpy does not directly support timezones. - kwargs["datetime_as_string"] = True + + # Always read datetimes are as string values to preserve (mixed) timezone info + # as numpy does not directly support timezones and arrow support is also limited. 
result = read_func( path_or_buffer, layer=layer, @@ -278,6 +276,7 @@ def read_dataframe( sql=sql, sql_dialect=sql_dialect, return_fids=fid_as_index, + datetime_as_string=True, **kwargs, ) @@ -292,6 +291,11 @@ def read_dataframe( df = table.to_pandas(**kwargs) del table + # convert datetime columns that were read as string to datetime + for dtype, column in zip(meta["dtypes"], meta["fields"]): + if dtype is not None and dtype.startswith("datetime"): + df[column] = _try_parse_datetime(df[column]) + if fid_as_index: df = df.set_index(meta["fid_column"]) df.index.names = ["fid"] diff --git a/pyogrio/raw.py b/pyogrio/raw.py index 0f0c3063..09bd5aa2 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -233,6 +233,7 @@ def read_arrow( sql=None, sql_dialect=None, return_fids=False, + datetime_as_string=False, **kwargs, ): """Read OGR data source into a pyarrow Table. @@ -303,6 +304,7 @@ def read_arrow( skip_features=gdal_skip_features, batch_size=batch_size, use_pyarrow=True, + datetime_as_string=datetime_as_string, **kwargs, ) as source: meta, reader = source @@ -358,6 +360,7 @@ def open_arrow( return_fids=False, batch_size=65_536, use_pyarrow=False, + datetime_as_string=False, **kwargs, ): """Open OGR data source as a stream of Arrow record batches. @@ -386,6 +389,9 @@ def open_arrow( ArrowStream object. In the default case, this stream object needs to be passed to another library supporting the Arrow PyCapsule Protocol to consume the stream of data. + datetime_as_string : bool, optional (default: False) + If True, will return datetime dtypes as detected by GDAL as strings, + as arrow doesn't support e.g. mixed timezones. 
Examples -------- @@ -423,6 +429,7 @@ def open_arrow( Meta is: { "crs": "", "fields": , + "dtypes": "encoding": "", "geometry_type": "", "geometry_name": "", @@ -453,6 +460,7 @@ def open_arrow( dataset_kwargs=dataset_kwargs, batch_size=batch_size, use_pyarrow=use_pyarrow, + datetime_as_string=datetime_as_string, ) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index f2373526..543370c3 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1,13 +1,11 @@ import contextlib import locale -import time import warnings from datetime import datetime from io import BytesIO from zipfile import ZipFile import numpy as np -import pytz from pyogrio import ( __gdal_version__, @@ -316,9 +314,6 @@ def test_read_datetime_tz(datetime_tz_file, tmp_path, use_arrow): fpath = tmp_path / "test.gpkg" write_dataframe(df, fpath, use_arrow=use_arrow) df_read = read_dataframe(fpath, use_arrow=use_arrow) - if use_arrow: - # with Arrow, the datetimes are always read as UTC for .gpkg - expected = expected.dt.tz_convert("UTC") assert_series_equal(df_read.datetime_col, expected) @@ -348,9 +343,6 @@ def test_write_datetime_mixed_offset(tmp_path, use_arrow): @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) -@pytest.mark.filterwarnings( - "ignore: Non-conformant content for record 1 in column dates" -) @pytest.mark.requires_arrow_write_api def test_read_write_datetime_no_tz(tmp_path, ext, use_arrow): dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00"] @@ -359,17 +351,11 @@ def test_read_write_datetime_no_tz(tmp_path, ext, use_arrow): else: dates = pd.to_datetime(dates_raw) df = gp.GeoDataFrame( - {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, - crs="EPSG:4326", + {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, crs="EPSG:4326" ) fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) - if 
use_arrow and ext == ".gpkg": - # for GPKG with Arrow, the datetime is written as naive datetime with the - # correct times, but when read the naive time is assumed to be UTC, which - # changes the effective time so this seems wrong. - df["dates"] = df["dates"].dt.tz_localize(time.timezone).dt.tz_convert("UTC") assert_geodataframe_equal(df, result) @@ -391,28 +377,7 @@ def test_read_write_datetime_tz_with_nulls(tmp_path, ext, use_arrow): fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) - if use_arrow: - if ext == ".fgb": - # when FlatGeoBuffer is read with Arrow, for datetimes with equal timezone, - # a column type with the appropriate minutes offset is returned. - # REMARK: For .fgb, the timezone is just dropped when reading or writing!!! - # -> 2020-01-01T09:00:00.123-05:00 becomes 2020-01-01T09:00:00.123 - df["dates"] = df["dates"].dt.tz_localize(tz=None) - elif ext in (".geojson", ".geojsonl"): - # when GeoJSON is read with Arrow, for datetimes with equal timezone, a - # column type with the appropriate minutes offset is returned. - # REMARK: for .geojson, the data is written fine, but when reading it goes - # wrong: 2020-01-01T09:00:00.123-05:00 becomes 2020-01-01T04:00:00.123-05:00 - df["dates"] = ( - df["dates"] - .dt.tz_localize(tz=None) - .dt.tz_localize(tz="UTC") - .dt.tz_convert(pytz.FixedOffset(-300)) - ) - elif ext == ".gpkg": - # when GPKG is read with Arrow, datetimes with timezone are converted to - # UTC. 
- df["dates"] = df["dates"].dt.tz_convert("UTC") + assert_geodataframe_equal(df, result) From 597855f72936d421fd3606cc0e3d541584261cd0 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Fri, 17 Jan 2025 09:23:59 +0100 Subject: [PATCH 04/26] Update _io.pyx --- pyogrio/_io.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 574c88fa..cd8e17e2 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -2560,7 +2560,7 @@ def ogr_write_arrow( object path_or_fp, str layer, str driver, - obje ct arrow_obj, + object arrow_obj, str crs, str geometry_type, str geometry_name, From fa4b86e489d895ccfe68116c45a440d481f2b83b Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Fri, 17 Jan 2025 09:37:09 +0100 Subject: [PATCH 05/26] Skip tests where appropriate --- pyogrio/tests/test_geopandas_io.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 543370c3..750c8ca5 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -298,6 +298,9 @@ def test_read_datetime(datetime_file, use_arrow): @pytest.mark.filterwarnings("ignore: Non-conformant content for record 1 in column ") @pytest.mark.requires_arrow_write_api def test_read_datetime_tz(datetime_tz_file, tmp_path, use_arrow): + if use_arrow and __gdal_version__ < (3, 11, 0): + pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") + df = read_dataframe(datetime_tz_file) # Make the index non-consecutive to test this case as well. 
Added for issue # https://github.com/geopandas/pyogrio/issues/324 @@ -345,6 +348,9 @@ def test_write_datetime_mixed_offset(tmp_path, use_arrow): @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @pytest.mark.requires_arrow_write_api def test_read_write_datetime_no_tz(tmp_path, ext, use_arrow): + if use_arrow and ext == ".gpkg" and __gdal_version__ < (3, 11, 0): + pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") + dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00"] if PANDAS_GE_20: dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") @@ -365,6 +371,9 @@ def test_read_write_datetime_no_tz(tmp_path, ext, use_arrow): ) @pytest.mark.requires_arrow_write_api def test_read_write_datetime_tz_with_nulls(tmp_path, ext, use_arrow): + if use_arrow and __gdal_version__ < (3, 11, 0): + pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") + dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", pd.NaT] if PANDAS_GE_20: dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") From 0e41ae4f7f5ef1a13cca443929d90825b1c86199 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Fri, 17 Jan 2025 21:32:24 +0100 Subject: [PATCH 06/26] Improve support for mixed and naive datetimes --- pyogrio/geopandas.py | 94 +++++++++++++++++++++++++++--- pyogrio/tests/test_geopandas_io.py | 49 +++++++++++----- 2 files changed, 123 insertions(+), 20 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 62f2b532..f209c191 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -39,6 +39,7 @@ def _try_parse_datetime(ser): datetime_kwargs = {"format": "ISO8601", "errors": "ignore"} else: datetime_kwargs = {"yearfirst": True} + with warnings.catch_warnings(): warnings.filterwarnings( "ignore", @@ -51,12 +52,6 @@ def _try_parse_datetime(ser): res = pd.to_datetime(ser, **datetime_kwargs) except Exception: res = ser - # if object dtype, try parse as utc instead - if res.dtype 
== "object": - try: - res = pd.to_datetime(ser, utc=True, **datetime_kwargs) - except Exception: - pass if res.dtype != "object": # GDAL only supports ms precision, convert outputs to match. @@ -66,6 +61,7 @@ def _try_parse_datetime(ser): res = res.dt.as_unit("ms") else: res = res.dt.round(freq="ms") + return res @@ -486,6 +482,8 @@ def write_dataframe( gdal_tz_offsets = {} for name in fields: col = df[name] + values = None + if isinstance(col.dtype, pd.DatetimeTZDtype): # Deal with datetimes with timezones by passing down timezone separately # pass down naive datetime @@ -500,8 +498,20 @@ def write_dataframe( # Convert each row offset to a signed multiple of 15m and add to GMT value gdal_offset_representation = tz_offset // pd.Timedelta("15m") + 100 gdal_tz_offsets[name] = gdal_offset_representation.values - else: + + elif col.dtype == "object": + # Column of Timestamp objects, also split in naive datetime and tz offset + col_na = df[col.notna()][name] + if len(col_na) and all(isinstance(x, pd.Timestamp) for x in col_na): + tz_offset = col.apply(lambda x: None if pd.isna(x) else x.utcoffset()) + gdal_offset_repr = tz_offset // pd.Timedelta("15m") + 100 + gdal_tz_offsets[name] = gdal_offset_repr.values + naive = col.apply(lambda x: None if pd.isna(x) else x.tz_localize(None)) + values = naive.values + + if values is None: values = col.values + if isinstance(values, pd.api.extensions.ExtensionArray): from pandas.arrays import BooleanArray, FloatingArray, IntegerArray @@ -624,8 +634,33 @@ def write_dataframe( df = pd.DataFrame(df, copy=False) df[geometry_column] = geometry + # Convert all datetime columns to isoformat strings, to avoid mixed timezone + # information getting lost. 
+ datetime_cols = [] + for name, dtype in df.dtypes.items(): + col = df[name] + if dtype == "object": + # When all non-NA values are Timestamps, treat as datetime column + col_na = df[col.notna()][name] + if len(col_na) and all(isinstance(x, pd.Timestamp) for x in col_na): + df[name] = col.apply( + lambda x: None if pd.isna(x) else x.isoformat() + ) + datetime_cols.append(name) + elif isinstance(dtype, pd.DatetimeTZDtype): + # Also for regular datetime columns with timezone mixed timezones are + # possible when thera is a difference between summer and winter time. + df[name] = col.apply(lambda x: None if pd.isna(x) else x.isoformat()) + datetime_cols.append(name) + table = pa.Table.from_pandas(df, preserve_index=False) + # Add metadata to datetime columns so GDAL knows they are datetimes. + for datetime_col in datetime_cols: + table = _add_column_metadata( + table, column_metadata={datetime_col: {"GDAL:OGR:type": "DateTime"}} + ) + if geometry_column is not None: # ensure that the geometry column is binary (for all-null geometries, # this could be a wrong type) @@ -685,3 +720,48 @@ def write_dataframe( gdal_tz_offsets=gdal_tz_offsets, **kwargs, ) + + +def _add_column_metadata(table, column_metadata: dict = {}): + """Add or update column-level metadata to an arrow table. + + Parameters + ---------- + table : pyarrow.Table + The table to add the column metadata to. + column_metadata : dict + A dictionary with column metadata in the form + { + "column_1": {"some": "data"}, + "column_2": {"more": "stuff"}, + } + + Returns + ------- + pyarrow.Table: table with the updated column metadata. 
+ """ + import pyarrow as pa + + if not column_metadata: + return table + + # Create updated column fields with new metadata + fields = [] + for col in table.schema.names: + if col in column_metadata: + # Add/update column metadata + metadata = table.field(col).metadata or {} + for key, value in column_metadata[col].items(): + metadata[key] = value + # Update field with updated metadata + fields.append(table.field(col).with_metadata(metadata)) + else: + fields.append(table.field(col)) + + # Create new schema with the updated field metadata + schema = pa.schema(fields, metadata=table.schema.metadata) + + # Build new table with updated schema (shouldn't copy data) + table = table.cast(schema) + + return table diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 750c8ca5..5a517cf5 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -324,14 +324,13 @@ def test_read_datetime_tz(datetime_tz_file, tmp_path, use_arrow): "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_write_datetime_mixed_offset(tmp_path, use_arrow): +def test_write_datetime_localized_mixed_offset(tmp_path, use_arrow): + """Test with localized dates across a different summer/winter timezone offset.""" # Australian Summer Time AEDT (GMT+11), Standard Time AEST (GMT+10) dates = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111"] naive_col = pd.Series(pd.to_datetime(dates), name="dates") localised_col = naive_col.dt.tz_localize("Australia/Sydney") - utc_col = localised_col.dt.tz_convert("UTC") - if PANDAS_GE_20: - utc_col = utc_col.dt.as_unit("ms") + localised_ts_col = localised_col.map(pd.Timestamp.isoformat).map(pd.Timestamp) df = gp.GeoDataFrame( {"dates": localised_col, "geometry": [Point(1, 1), Point(1, 1)]}, @@ -340,9 +339,30 @@ def test_write_datetime_mixed_offset(tmp_path, use_arrow): fpath = tmp_path / "test.gpkg" write_dataframe(df, fpath, 
use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) + # GDAL tz only encodes offsets, not timezones - # check multiple offsets are read as utc datetime instead of string values - assert_series_equal(result["dates"], utc_col) + assert_series_equal(result["dates"], localised_ts_col) + + +@pytest.mark.filterwarnings( + "ignore: Non-conformant content for record 1 in column dates" +) +@pytest.mark.requires_arrow_write_api +def test_write_datetime_mixed_offsets(tmp_path, use_arrow): + """Test with dates with mixed timezone offsets.""" + # Pandas datetime64 column types doesn't support mixed timezone offsets, so this + # list converts to pandas.Timestamp objects instead. + dates = ["2023-01-01 11:00:01.111+01:00", "2023-06-01 10:00:01.111+05:00"] + offset_col = pd.Series(pd.to_datetime(dates), name="dates") + df = gp.GeoDataFrame( + {"dates": offset_col, "geometry": [Point(1, 1), Point(1, 1)]}, + crs="EPSG:4326", + ) + fpath = tmp_path / "test.gpkg" + write_dataframe(df, fpath, use_arrow=use_arrow) + result = read_dataframe(fpath, use_arrow=use_arrow) + + assert_series_equal(result["dates"], offset_col) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @@ -370,15 +390,18 @@ def test_read_write_datetime_no_tz(tmp_path, ext, use_arrow): "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_read_write_datetime_tz_with_nulls(tmp_path, ext, use_arrow): +def test_read_write_datetime_timestamp_with_nulls(tmp_path, ext, use_arrow): if use_arrow and __gdal_version__ < (3, 11, 0): pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") - dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", pd.NaT] - if PANDAS_GE_20: - dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") - else: - dates = pd.to_datetime(dates_raw) + dates_raw = [ + pd.Timestamp("2020-01-01T09:00:00.123-05:00"), + pd.Timestamp("2020-01-01T10:00:00-05:00"), + None, + ] + 
dates = pd.Series(dates_raw, dtype="O") + dates_expected = pd.Series(pd.to_datetime(dates_raw).as_unit("ms"), name="dates") + df = gp.GeoDataFrame( {"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]}, crs="EPSG:4326", @@ -387,7 +410,7 @@ def test_read_write_datetime_tz_with_nulls(tmp_path, ext, use_arrow): write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) - assert_geodataframe_equal(df, result) + assert_series_equal(result.dates, dates_expected) def test_read_null_values(tmp_path, use_arrow): From 1378ace45939f35d56973a8539d4ccb7d55a6ebc Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Fri, 17 Jan 2025 22:42:35 +0100 Subject: [PATCH 07/26] Skip use_arrow tests with old gdal versions --- pyogrio/tests/test_geopandas_io.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 5a517cf5..0c94bf7a 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -326,6 +326,9 @@ def test_read_datetime_tz(datetime_tz_file, tmp_path, use_arrow): @pytest.mark.requires_arrow_write_api def test_write_datetime_localized_mixed_offset(tmp_path, use_arrow): """Test with localized dates across a different summer/winter timezone offset.""" + if use_arrow and __gdal_version__ < (3, 11, 0): + pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") + # Australian Summer Time AEDT (GMT+11), Standard Time AEST (GMT+10) dates = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111"] naive_col = pd.Series(pd.to_datetime(dates), name="dates") @@ -350,6 +353,9 @@ def test_write_datetime_localized_mixed_offset(tmp_path, use_arrow): @pytest.mark.requires_arrow_write_api def test_write_datetime_mixed_offsets(tmp_path, use_arrow): """Test with dates with mixed timezone offsets.""" + if use_arrow and __gdal_version__ < (3, 11, 0): + pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") + # Pandas 
datetime64 column types doesn't support mixed timezone offsets, so this # list converts to pandas.Timestamp objects instead. dates = ["2023-01-01 11:00:01.111+01:00", "2023-06-01 10:00:01.111+05:00"] From 0f1ab272aac3a436e3052fe939ada857bc8c023c Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Fri, 17 Jan 2025 23:06:14 +0100 Subject: [PATCH 08/26] Take in account pandas version --- pyogrio/tests/test_geopandas_io.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 0c94bf7a..6630ed80 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -406,7 +406,12 @@ def test_read_write_datetime_timestamp_with_nulls(tmp_path, ext, use_arrow): None, ] dates = pd.Series(dates_raw, dtype="O") - dates_expected = pd.Series(pd.to_datetime(dates_raw).as_unit("ms"), name="dates") + + if PANDAS_GE_20: + expected = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") + else: + expected = pd.to_datetime(dates_raw) + expected = pd.Series(expected, name="dates") df = gp.GeoDataFrame( {"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]}, @@ -416,7 +421,7 @@ def test_read_write_datetime_timestamp_with_nulls(tmp_path, ext, use_arrow): write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) - assert_series_equal(result.dates, dates_expected) + assert_series_equal(result.dates, expected) def test_read_null_values(tmp_path, use_arrow): From 6f78c68f5d65a95643e2316cbdf578884ea71ffa Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sat, 18 Jan 2025 00:45:31 +0100 Subject: [PATCH 09/26] Update test_geopandas_io.py --- pyogrio/tests/test_geopandas_io.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 6630ed80..3684156d 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ 
-368,6 +368,7 @@ def test_write_datetime_mixed_offsets(tmp_path, use_arrow): write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) + assert result["dates"][0] == offset_col[0] assert_series_equal(result["dates"], offset_col) From 336d0d80476f64489d70cee6b8191ea50b3886db Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sat, 18 Jan 2025 02:08:51 +0100 Subject: [PATCH 10/26] Also support columns with datetime objects --- pyogrio/geopandas.py | 13 ++++++++++++- pyogrio/tests/test_geopandas_io.py | 25 +++++++++++++++++++------ 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index f209c191..5b2151e3 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -2,6 +2,7 @@ import os import warnings +from datetime import datetime import numpy as np @@ -508,6 +509,14 @@ def write_dataframe( gdal_tz_offsets[name] = gdal_offset_repr.values naive = col.apply(lambda x: None if pd.isna(x) else x.tz_localize(None)) values = naive.values + elif len(col_na) and all(isinstance(x, datetime) for x in col_na): + tz_offset = col.apply(lambda x: None if pd.isna(x) else x.utcoffset()) + gdal_offset_repr = tz_offset // pd.Timedelta("15m") + 100 + gdal_tz_offsets[name] = gdal_offset_repr.values + naive = col.apply( + lambda x: None if pd.isna(x) else x.replace(tzinfo=None) + ) + values = naive.values if values is None: values = col.values @@ -642,7 +651,9 @@ def write_dataframe( if dtype == "object": # When all non-NA values are Timestamps, treat as datetime column col_na = df[col.notna()][name] - if len(col_na) and all(isinstance(x, pd.Timestamp) for x in col_na): + if len(col_na) and all( + isinstance(x, (pd.Timestamp, datetime)) for x in col_na + ): df[name] = col.apply( lambda x: None if pd.isna(x) else x.isoformat() ) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 3684156d..d81cec3d 100644 --- a/pyogrio/tests/test_geopandas_io.py 
+++ b/pyogrio/tests/test_geopandas_io.py @@ -393,19 +393,29 @@ def test_read_write_datetime_no_tz(tmp_path, ext, use_arrow): @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) +@pytest.mark.parametrize( + "dates_raw", + [ + ( + pd.Timestamp("2020-01-01T09:00:00.123-05:00"), + pd.Timestamp("2020-01-01T10:00:00-05:00"), + None, + ), + ( + datetime.fromisoformat("2020-01-01T09:00:00.123-05:00"), + datetime.fromisoformat("2020-01-01T10:00:00-05:00"), + None, + ), + ], +) @pytest.mark.filterwarnings( "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_read_write_datetime_timestamp_with_nulls(tmp_path, ext, use_arrow): +def test_read_write_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_arrow): if use_arrow and __gdal_version__ < (3, 11, 0): pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") - dates_raw = [ - pd.Timestamp("2020-01-01T09:00:00.123-05:00"), - pd.Timestamp("2020-01-01T10:00:00-05:00"), - None, - ] dates = pd.Series(dates_raw, dtype="O") if PANDAS_GE_20: @@ -422,6 +432,9 @@ def test_read_write_datetime_timestamp_with_nulls(tmp_path, ext, use_arrow): write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) + # With some older versions, the offset is represented slightly differently + if str(result.dates.dtype) == "datetime64[ns, pytz.FixedOffset(-300)]": + result.dates = result.dates.astype(expected.dtype) assert_series_equal(result.dates, expected) From 3035a1166061e91dd10474d8bdca76186dcadc7f Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sat, 18 Jan 2025 09:00:05 +0100 Subject: [PATCH 11/26] Rename some test functions for consistency --- pyogrio/tests/test_geopandas_io.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index d81cec3d..98bc7657 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ 
b/pyogrio/tests/test_geopandas_io.py @@ -297,7 +297,7 @@ def test_read_datetime(datetime_file, use_arrow): @pytest.mark.filterwarnings("ignore: Non-conformant content for record 1 in column ") @pytest.mark.requires_arrow_write_api -def test_read_datetime_tz(datetime_tz_file, tmp_path, use_arrow): +def test_write_datetime_tz(datetime_tz_file, tmp_path, use_arrow): if use_arrow and __gdal_version__ < (3, 11, 0): pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") @@ -374,7 +374,8 @@ def test_write_datetime_mixed_offsets(tmp_path, use_arrow): @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @pytest.mark.requires_arrow_write_api -def test_read_write_datetime_no_tz(tmp_path, ext, use_arrow): +def test_write_datetime_no_tz(tmp_path, ext, use_arrow): + """Test writing/reading a datetime column without timezone information.""" if use_arrow and ext == ".gpkg" and __gdal_version__ < (3, 11, 0): pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") @@ -412,7 +413,7 @@ def test_read_write_datetime_no_tz(tmp_path, ext, use_arrow): "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_read_write_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_arrow): +def test_write_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_arrow): if use_arrow and __gdal_version__ < (3, 11, 0): pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") From 9efdc091b915a7c4cbf348a87e7063d79d898d37 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sat, 18 Jan 2025 09:18:15 +0100 Subject: [PATCH 12/26] Avoid warning in test --- pyogrio/tests/test_geopandas_io.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 98bc7657..50fc758e 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -356,20 +356,23 @@ def 
test_write_datetime_mixed_offsets(tmp_path, use_arrow): if use_arrow and __gdal_version__ < (3, 11, 0): pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") - # Pandas datetime64 column types doesn't support mixed timezone offsets, so this - # list converts to pandas.Timestamp objects instead. - dates = ["2023-01-01 11:00:01.111+01:00", "2023-06-01 10:00:01.111+05:00"] - offset_col = pd.Series(pd.to_datetime(dates), name="dates") + # Pandas datetime64 column types doesn't support mixed timezone offsets, so + # it needs to be a list of pandas.Timestamp objects instead. + dates = [ + pd.Timestamp("2023-01-01 11:00:01.111+01:00"), + pd.Timestamp("2023-06-01 10:00:01.111+05:00"), + ] + expected = pd.Series(dates, name="dates") + df = gp.GeoDataFrame( - {"dates": offset_col, "geometry": [Point(1, 1), Point(1, 1)]}, + {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, crs="EPSG:4326", ) fpath = tmp_path / "test.gpkg" write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) - assert result["dates"][0] == offset_col[0] - assert_series_equal(result["dates"], offset_col) + assert_series_equal(result["dates"], expected) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) From eb80e0872bd4d15a60b2397108bd4850275ab05e Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sat, 18 Jan 2025 09:35:47 +0100 Subject: [PATCH 13/26] Improve inline comment --- pyogrio/geopandas.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 5b2151e3..e708b65f 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -255,8 +255,9 @@ def read_dataframe( read_func = read_arrow if use_arrow else read gdal_force_2d = False if use_arrow else force_2d - # Always read datetimes are as string values to preserve (mixed) timezone info - # as numpy does not directly support timezones and arrow support is also limited. 
+ # Always read datetimes as string values to preserve (mixed) timezone info + # as numpy does not directly support timezones and arrow datetime columns + # don't support mixed timezones. result = read_func( path_or_buffer, layer=layer, From d50b2d04da7b14e1e20792788549c1dea8561f52 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sat, 18 Jan 2025 12:25:22 +0100 Subject: [PATCH 14/26] Update CHANGES.md --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index 117751cd..4af6563a 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -5,6 +5,7 @@ ### Improvements - Capture all errors logged by gdal when opening a file fails (#495). +- Improve support for datetime columns with mixed or naive times (#486). ### Bug fixes From 1efa5bfb1d09e1e20eac6d9e9e2983e64152715f Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 20 Jan 2025 11:12:49 +0100 Subject: [PATCH 15/26] Simplify code --- pyogrio/geopandas.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 19793113..4728b4c4 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -600,7 +600,7 @@ def write_dataframe( datetime_cols.append(name) elif isinstance(dtype, pd.DatetimeTZDtype): # Also for regular datetime columns with timezone mixed timezones are - # possible when thera is a difference between summer and winter time. + # possible when there is a difference between summer and winter time. df[name] = col.apply(lambda x: None if pd.isna(x) else x.isoformat()) datetime_cols.append(name) @@ -678,15 +678,11 @@ def write_dataframe( gdal_tz_offsets[name] = gdal_offset_representation.values elif col.dtype == "object": - # Column of Timestamp objects, also split in naive datetime and tz offset + # Column of Timestamp/datetime objects, split in naive datetime and tz.
col_na = df[col.notna()][name] - if len(col_na) and all(isinstance(x, pd.Timestamp) for x in col_na): - tz_offset = col.apply(lambda x: None if pd.isna(x) else x.utcoffset()) - gdal_offset_repr = tz_offset // pd.Timedelta("15m") + 100 - gdal_tz_offsets[name] = gdal_offset_repr.values - naive = col.apply(lambda x: None if pd.isna(x) else x.tz_localize(None)) - values = naive.values - elif len(col_na) and all(isinstance(x, datetime) for x in col_na): + if len(col_na) and all( + isinstance(x, (pd.Timestamp, datetime)) for x in col_na + ): tz_offset = col.apply(lambda x: None if pd.isna(x) else x.utcoffset()) gdal_offset_repr = tz_offset // pd.Timedelta("15m") + 100 gdal_tz_offsets[name] = gdal_offset_repr.values From 0032839d22ba7d4157809e39d31ec7fe77ecd0af Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 20 Jan 2025 17:25:48 +0100 Subject: [PATCH 16/26] Don't cast UTC data to string when writing --- pyogrio/geopandas.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 4728b4c4..d3cf011e 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -598,9 +598,9 @@ def write_dataframe( lambda x: None if pd.isna(x) else x.isoformat() ) datetime_cols.append(name) - elif isinstance(dtype, pd.DatetimeTZDtype): - # Also for regular datetime columns with timezone mixed timezones are - # possible when there is a difference between summer and winter time. + elif isinstance(dtype, pd.DatetimeTZDtype) and str(dtype.tz) != "UTC": + # When a timezone has daylight saving time the offsets can also be + # different. UTC doesn't have this issue. df[name] = col.apply(lambda x: None if pd.isna(x) else x.isoformat()) datetime_cols.append(name) From 9d2bfce2bafdc9bc31d1fbfc32ef01a2436bf1b0 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 20 Jan 2025 17:26:58 +0100 Subject: [PATCH 17/26] Various improvements to tests - Test result < GDAL 3.11 instead of skipping - Add UTC test - ... 
--- pyogrio/tests/test_geopandas_io.py | 183 ++++++++++++++++++++--------- 1 file changed, 126 insertions(+), 57 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 50fc758e..be191a4c 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -6,6 +6,7 @@ from zipfile import ZipFile import numpy as np +from pandas.api.types import is_datetime64_dtype from pyogrio import ( __gdal_version__, @@ -295,93 +296,117 @@ def test_read_datetime(datetime_file, use_arrow): assert df.col.dtype.name == "datetime64[ns]" +@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @pytest.mark.filterwarnings("ignore: Non-conformant content for record 1 in column ") @pytest.mark.requires_arrow_write_api -def test_write_datetime_tz(datetime_tz_file, tmp_path, use_arrow): - if use_arrow and __gdal_version__ < (3, 11, 0): - pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") +def test_write_read_datetime_tz(tmp_path, ext, use_arrow): + """Write and read file with all equal timezones. + + This should result in the result being in pandas datetime64 dtype column. + """ + dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"] + if PANDAS_GE_20: + dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") + else: + dates = pd.to_datetime(dates_raw) - df = read_dataframe(datetime_tz_file) # Make the index non-consecutive to test this case as well. 
Added for issue # https://github.com/geopandas/pyogrio/issues/324 - df = df.set_index(np.array([0, 2])) - raw_expected = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"] + df = gp.GeoDataFrame( + {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, + index=[0, 2], + crs="EPSG:4326", + ) + assert isinstance(df.dates.dtype, pd.DatetimeTZDtype) - if PANDAS_GE_20: - expected = pd.to_datetime(raw_expected, format="ISO8601").as_unit("ms") - else: - expected = pd.to_datetime(raw_expected) - expected = pd.Series(expected, name="datetime_col") - assert_series_equal(df.datetime_col, expected, check_index=False) - # test write and read round trips - fpath = tmp_path / "test.gpkg" + fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) - df_read = read_dataframe(fpath, use_arrow=use_arrow) - assert_series_equal(df_read.datetime_col, expected) + result = read_dataframe(fpath, use_arrow=use_arrow) + + # With some older versions, the offset is represented slightly differently + if str(result.dates.dtype).endswith(", pytz.FixedOffset(-300)]"): + result.dates = result.dates.astype(df.dates.dtype) + + if use_arrow and __gdal_version__ < (3, 11, 0): + if ext in (".fgb", ".gpkg"): + # With GDAL < 3.11 with arrow, datetime columns are written as string type + # columns + df.dates = df.dates.map( + lambda x: x.isoformat() if x is not pd.NaT else pd.NaT + ) + + assert_series_equal(result.dates, df.dates, check_index=False) @pytest.mark.filterwarnings( "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_write_datetime_localized_mixed_offset(tmp_path, use_arrow): +def test_write_read_datetime_localized_mixed_offset(tmp_path, use_arrow): """Test with localized dates across a different summer/winter timezone offset.""" - if use_arrow and __gdal_version__ < (3, 11, 0): - pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") - # Australian Summer Time AEDT (GMT+11), Standard Time 
AEST (GMT+10) - dates = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111"] - naive_col = pd.Series(pd.to_datetime(dates), name="dates") - localised_col = naive_col.dt.tz_localize("Australia/Sydney") - localised_ts_col = localised_col.map(pd.Timestamp.isoformat).map(pd.Timestamp) + dates_raw = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111"] + dates_naive = pd.Series(pd.to_datetime(dates_raw), name="dates") + dates_local = dates_naive.dt.tz_localize("Australia/Sydney") + dates_local_offsets_str = dates_local.map(pd.Timestamp.isoformat) + dates_exp = dates_local_offsets_str.map(pd.Timestamp) df = gp.GeoDataFrame( - {"dates": localised_col, "geometry": [Point(1, 1), Point(1, 1)]}, + {"dates": dates_local, "geometry": [Point(1, 1), Point(1, 1)]}, crs="EPSG:4326", ) fpath = tmp_path / "test.gpkg" write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) + if use_arrow and __gdal_version__ < (3, 11, 0): + # With GDAL < 3.11 with arrow, datetime columns written as string type columns + dates_exp = dates_local_offsets_str + # GDAL tz only encodes offsets, not timezones - assert_series_equal(result["dates"], localised_ts_col) + assert_series_equal(result["dates"], dates_exp) +@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @pytest.mark.filterwarnings( "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_write_datetime_mixed_offsets(tmp_path, use_arrow): +def test_write_read_datetime_mixed_offsets(tmp_path, ext, use_arrow): """Test with dates with mixed timezone offsets.""" - if use_arrow and __gdal_version__ < (3, 11, 0): - pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") - # Pandas datetime64 column types doesn't support mixed timezone offsets, so # it needs to be a list of pandas.Timestamp objects instead. 
- dates = [ - pd.Timestamp("2023-01-01 11:00:01.111+01:00"), - pd.Timestamp("2023-06-01 10:00:01.111+05:00"), - ] - expected = pd.Series(dates, name="dates") + dates_raw = ["2023-01-01 11:00:01.111+01:00", "2023-06-01 10:00:01.111+05:00"] + dates_ts = list(map(pd.Timestamp, dates_raw)) df = gp.GeoDataFrame( - {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, + {"dates": dates_ts, "geometry": [Point(1, 1), Point(1, 1)]}, crs="EPSG:4326", ) - fpath = tmp_path / "test.gpkg" + fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) - assert_series_equal(result["dates"], expected) + if use_arrow and __gdal_version__ < (3, 11, 0): + if ext in (".geojson", ".geojsonl"): + # With GDAL < 3.11 with arrow, GDAL converts mixed timezone datetimes to UTC + # when read as the arrow datetime column type does not support mixed tz. + if PANDAS_GE_20: + df.dates = pd.to_datetime(dates_ts, utc=True).as_unit("ms") + else: + df.dates = pd.to_datetime(dates_ts, utc=True) + elif ext in (".gpkg", ".fgb"): + # With arrow and GDAL < 3.11, mixed timezone datetimes are written as string + # type columns, so no proper roundtrip possible. 
+ df.dates = df.dates.map(pd.Timestamp.isoformat) + + assert_geodataframe_equal(result, df) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @pytest.mark.requires_arrow_write_api -def test_write_datetime_no_tz(tmp_path, ext, use_arrow): - """Test writing/reading a datetime column without timezone information.""" - if use_arrow and ext == ".gpkg" and __gdal_version__ < (3, 11, 0): - pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") - +def test_write_read_datetime_no_tz(tmp_path, ext, use_arrow): + """Test writing/reading a column with naive datetimes (no timezone information).""" dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00"] if PANDAS_GE_20: dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") @@ -390,10 +415,21 @@ def test_write_datetime_no_tz(tmp_path, ext, use_arrow): df = gp.GeoDataFrame( {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, crs="EPSG:4326" ) + fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) - assert_geodataframe_equal(df, result) + + if use_arrow and ext == ".gpkg" and __gdal_version__ < (3, 11, 0): + # With GDAL < 3.11 with arrow, columns with naive datetimes are written + # correctly, but when read they are wrongly interpreted as being in UTC. + # The reason is complicated, but more info can be found e.g. 
here: + # https://github.com/geopandas/pyogrio/issues/487#issuecomment-2423762807 + assert_series_equal(result.dates, df.dates.dt.tz_localize("UTC")) + pytest.xfail("naive datetimes read wrong in GPKG with GDAL < 3.11 via arrow") + + assert is_datetime64_dtype(result.dates.dtype) + assert_geodataframe_equal(result, df) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @@ -416,30 +452,63 @@ def test_write_datetime_no_tz(tmp_path, ext, use_arrow): "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_write_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_arrow): +def test_write_read_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_arrow): + """Datetime objects with null values and the equal offset are read as datetime64.""" + dates = pd.Series(dates_raw, dtype="O") + df = gp.GeoDataFrame( + {"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]}, + crs="EPSG:4326", + ) + if PANDAS_GE_20: + dates_exp = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") + else: + dates_exp = pd.to_datetime(dates_raw) + exp_df = df.copy() + exp_df.dates = pd.Series(dates_exp, name="dates") + + fpath = tmp_path / f"test{ext}" + write_dataframe(df, fpath, use_arrow=use_arrow) + result = read_dataframe(fpath, use_arrow=use_arrow) + + # With some older versions, the offset is represented slightly differently + if str(result.dates.dtype).endswith(", pytz.FixedOffset(-300)]"): + result.dates = result.dates.astype(exp_df.dates.dtype) + if use_arrow and __gdal_version__ < (3, 11, 0): - pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") + if ext in (".fgb", ".gpkg"): + # With GDAL < 3.11 with arrow, datetime columns are written as string type + # columns + exp_df.dates = exp_df.dates.map( + lambda x: x.isoformat() if x is not pd.NaT else pd.NaT + ) - dates = pd.Series(dates_raw, dtype="O") + assert_geodataframe_equal(result, exp_df) + 
+@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) +@pytest.mark.requires_arrow_write_api +def test_write_read_datetime_utc(tmp_path, ext, use_arrow): + """Test writing/reading a column with UTC datetimes.""" + dates_raw = ["2020-01-01 09:00:00.123Z", "2020-01-01 10:00:00Z"] if PANDAS_GE_20: - expected = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") + dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") else: - expected = pd.to_datetime(dates_raw) - expected = pd.Series(expected, name="dates") - + dates = pd.to_datetime(dates_raw) df = gp.GeoDataFrame( - {"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]}, - crs="EPSG:4326", + {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, crs="EPSG:4326" ) + fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) - # With some older versions, the offset is represented slightly differently - if str(result.dates.dtype) == "datetime64[ns, pytz.FixedOffset(-300)]": - result.dates = result.dates.astype(expected.dtype) - assert_series_equal(result.dates, expected) + if use_arrow and ext == ".fgb" and __gdal_version__ < (3, 11, 0): + # With GDAL < 3.11 with arrow, timezone information is dropped when reading .fgb + assert_series_equal(result.dates, df.dates.dt.tz_localize(None)) + pytest.xfail("UTC datetimes read wrong in .fgb with GDAL < 3.11 via arrow") + + assert isinstance(result.dates.dtype, pd.DatetimeTZDtype) + assert_geodataframe_equal(result, df) def test_read_null_values(tmp_path, use_arrow): From ca9a8ae24003e637a78c7db718c7839275ace597 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 20 Jan 2025 17:58:50 +0100 Subject: [PATCH 18/26] Small fixes to tests --- pyogrio/tests/test_geopandas_io.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index be191a4c..2f0331f2
100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -6,7 +6,6 @@ from zipfile import ZipFile import numpy as np -from pandas.api.types import is_datetime64_dtype from pyogrio import ( __gdal_version__, @@ -40,6 +39,7 @@ import geopandas as gp import pandas as pd from geopandas.array import from_wkt + from pandas.api.types import is_datetime64_dtype import shapely # if geopandas is present, shapely is expected to be present from shapely.geometry import Point @@ -331,9 +331,7 @@ def test_write_read_datetime_tz(tmp_path, ext, use_arrow): if ext in (".fgb", ".gpkg"): # With GDAL < 3.11 with arrow, datetime columns are written as string type # columns - df.dates = df.dates.map( - lambda x: x.isoformat() if x is not pd.NaT else pd.NaT - ) + df.dates = df.dates.map(lambda x: x.isoformat()) assert_series_equal(result.dates, df.dates, check_index=False) @@ -479,7 +477,7 @@ def test_write_read_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_ar # With GDAL < 3.11 with arrow, datetime columns are written as string type # columns exp_df.dates = exp_df.dates.map( - lambda x: x.isoformat() if x is not pd.NaT else pd.NaT + lambda x: x.isoformat() if x is not pd.NaT else None ) assert_geodataframe_equal(result, exp_df) From deb862c2f5dcd6605bc2e972e9bce5bb00ea6b82 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 20 Jan 2025 18:32:53 +0100 Subject: [PATCH 19/26] Xfail some tests where needed --- pyogrio/tests/test_geopandas_io.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 2f0331f2..f3eac4b4 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -304,6 +304,12 @@ def test_write_read_datetime_tz(tmp_path, ext, use_arrow): This should result in the result being in pandas datetime64 dtype column. 
""" + if use_arrow and __gdal_version__ < (3, 10, 0) and ext in (".geojson", ".geojsonl"): + # With GDAL < 3.10 with arrow, the timezone offset was applied to the datetime + # as well as retaining the timezone. + # This was fixed in https://github.com/OSGeo/gdal/pull/11049 + pytest.xfail("Wrong datetimes read in GeoJSON with GDAL < 3.10 via arrow") + dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"] if PANDAS_GE_20: dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") @@ -452,6 +458,12 @@ def test_write_read_datetime_no_tz(tmp_path, ext, use_arrow): @pytest.mark.requires_arrow_write_api def test_write_read_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_arrow): """Datetime objects with null values and the equal offset are read as datetime64.""" + if use_arrow and __gdal_version__ < (3, 10, 0) and ext in (".geojson", ".geojsonl"): + # With GDAL < 3.10 with arrow, the timezone offset was applied to the datetime + # as well as retaining the timezone. 
+ # This was fixed in https://github.com/OSGeo/gdal/pull/11049 + pytest.xfail("Wrong datetimes read in GeoJSON with GDAL < 3.10 via arrow") + dates = pd.Series(dates_raw, dtype="O") df = gp.GeoDataFrame( {"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]}, From e35c356176c965ab5bfddb35927adca97da7dc9b Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Wed, 22 Jan 2025 22:23:39 +0100 Subject: [PATCH 20/26] Make UTC assert more specific --- pyogrio/tests/test_geopandas_io.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index f3eac4b4..0566f59a 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -39,7 +39,6 @@ import geopandas as gp import pandas as pd from geopandas.array import from_wkt - from pandas.api.types import is_datetime64_dtype import shapely # if geopandas is present, shapely is expected to be present from shapely.geometry import Point @@ -432,7 +431,7 @@ def test_write_read_datetime_no_tz(tmp_path, ext, use_arrow): assert_series_equal(result.dates, df.dates.dt.tz_localize("UTC")) pytest.xfail("naive datetimes read wrong in GPKG with GDAL < 3.11 via arrow") - assert is_datetime64_dtype(result.dates.dtype) + assert str(result.dates.dtype) == "datetime64[ms, UTC]" assert_geodataframe_equal(result, df) From 593b2820c5668e16129d6d856bb5a39e765f0793 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Wed, 22 Jan 2025 22:58:01 +0100 Subject: [PATCH 21/26] Revert "Make UTC assert more specific" This reverts commit e35c356176c965ab5bfddb35927adca97da7dc9b. 
--- pyogrio/tests/test_geopandas_io.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 0566f59a..f3eac4b4 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -39,6 +39,7 @@ import geopandas as gp import pandas as pd from geopandas.array import from_wkt + from pandas.api.types import is_datetime64_dtype import shapely # if geopandas is present, shapely is expected to be present from shapely.geometry import Point @@ -431,7 +432,7 @@ def test_write_read_datetime_no_tz(tmp_path, ext, use_arrow): assert_series_equal(result.dates, df.dates.dt.tz_localize("UTC")) pytest.xfail("naive datetimes read wrong in GPKG with GDAL < 3.11 via arrow") - assert str(result.dates.dtype) == "datetime64[ms, UTC]" + assert is_datetime64_dtype(result.dates.dtype) assert_geodataframe_equal(result, df) From 35d8d87239445249bbe37bf204bedc7773e2a4ac Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Wed, 22 Jan 2025 22:58:37 +0100 Subject: [PATCH 22/26] Update test_geopandas_io.py --- pyogrio/tests/test_geopandas_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index f3eac4b4..bf9f70dc 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -517,7 +517,7 @@ def test_write_read_datetime_utc(tmp_path, ext, use_arrow): assert_series_equal(result.dates, df.dates.dt.tz_localize(None)) pytest.xfail("UTC datetimes read wrong in .fgb with GDAL < 3.11 via arrow") - assert isinstance(result.dates.dtype, pd.DatetimeTZDtype) + assert str(result.dates.dtype) == "datetime64[ms, UTC]" assert_geodataframe_equal(result, df) From 41c9da6a5937d8557af10ef354bb605c6844d843 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 23 Jan 2025 02:33:07 +0100 Subject: [PATCH 23/26] Use astype("string") instead of apply Needs to be astype("string")
instead of astype(str) to support nan values --- pyogrio/geopandas.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index d3cf011e..e2128cd5 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -594,14 +594,12 @@ def write_dataframe( if len(col_na) and all( isinstance(x, (pd.Timestamp, datetime)) for x in col_na ): - df[name] = col.apply( - lambda x: None if pd.isna(x) else x.isoformat() - ) + df[name] = col.astype("string") datetime_cols.append(name) elif isinstance(dtype, pd.DatetimeTZDtype) and str(dtype.tz) != "UTC": - # When a timezone has daylight saving time the offsets can also be - # different. UTC doesn't have this issue. - df[name] = col.apply(lambda x: None if pd.isna(x) else x.isoformat()) + # When it is a datetime column with a timezone different than UTC, it + # needs to be converted to string, otherwise the timezone info is lost. + df[name] = col.astype("string") datetime_cols.append(name) table = pa.Table.from_pandas(df, preserve_index=False) From f53af87dc55e8ee94cc6cd44dd8a7a13d1416b78 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 23 Jan 2025 02:33:23 +0100 Subject: [PATCH 24/26] Improve tests --- pyogrio/tests/test_geopandas_io.py | 161 ++++++++++++++++------------- 1 file changed, 92 insertions(+), 69 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index bf9f70dc..015c99c6 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -39,7 +39,7 @@ import geopandas as gp import pandas as pd from geopandas.array import from_wkt - from pandas.api.types import is_datetime64_dtype + from pandas.api.types import is_datetime64_dtype, is_object_dtype import shapely # if geopandas is present, shapely is expected to be present from shapely.geometry import Point @@ -296,6 +296,35 @@ def test_read_datetime(datetime_file, use_arrow): assert df.col.dtype.name == "datetime64[ns]" 
+@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) +@pytest.mark.requires_arrow_write_api +def test_write_read_datetime_no_tz(tmp_path, ext, use_arrow): + """Test writing/reading a column with naive datetimes (no timezone information).""" + dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00", None] + if PANDAS_GE_20: + dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") + else: + dates = pd.to_datetime(dates_raw) + df = gp.GeoDataFrame( + {"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326" + ) + + fpath = tmp_path / f"test{ext}" + write_dataframe(df, fpath, use_arrow=use_arrow) + result = read_dataframe(fpath, use_arrow=use_arrow) + + if use_arrow and ext == ".gpkg" and __gdal_version__ < (3, 11, 0): + # With GDAL < 3.11 with arrow, columns with naive datetimes are written + # correctly, but when read they are wrongly interpreted as being in UTC. + # The reason is complicated, but more info can be found e.g. here: + # https://github.com/geopandas/pyogrio/issues/487#issuecomment-2423762807 + assert_series_equal(result.dates, df.dates.dt.tz_localize("UTC")) + pytest.xfail("naive datetimes read wrong in GPKG with GDAL < 3.11 via arrow") + + assert is_datetime64_dtype(result.dates.dtype) + assert_geodataframe_equal(result, df) + + @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @pytest.mark.filterwarnings("ignore: Non-conformant content for record 1 in column ") @pytest.mark.requires_arrow_write_api @@ -310,7 +339,7 @@ def test_write_read_datetime_tz(tmp_path, ext, use_arrow): # This was fixed in https://github.com/OSGeo/gdal/pull/11049 pytest.xfail("Wrong datetimes read in GeoJSON with GDAL < 3.10 via arrow") - dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"] + dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", None] if PANDAS_GE_20: dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") else: @@ -319,8 +348,8 @@ def 
test_write_read_datetime_tz(tmp_path, ext, use_arrow): # Make the index non-consecutive to test this case as well. Added for issue # https://github.com/geopandas/pyogrio/issues/324 df = gp.GeoDataFrame( - {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, - index=[0, 2], + {"dates": dates, "geometry": [Point(1, 1)] * 3}, + index=[0, 2, 3], crs="EPSG:4326", ) assert isinstance(df.dates.dtype, pd.DatetimeTZDtype) @@ -330,45 +359,58 @@ def test_write_read_datetime_tz(tmp_path, ext, use_arrow): result = read_dataframe(fpath, use_arrow=use_arrow) # With some older versions, the offset is represented slightly differently - if str(result.dates.dtype).endswith(", pytz.FixedOffset(-300)]"): + if result.dates.dtype.name.endswith(", pytz.FixedOffset(-300)]"): result.dates = result.dates.astype(df.dates.dtype) - if use_arrow and __gdal_version__ < (3, 11, 0): - if ext in (".fgb", ".gpkg"): - # With GDAL < 3.11 with arrow, datetime columns are written as string type - # columns - df.dates = df.dates.map(lambda x: x.isoformat()) + if use_arrow and ext in (".fgb", ".gpkg") and __gdal_version__ < (3, 11, 0): + # With GDAL < 3.11 with arrow, datetime columns are written as string type + df_exp = df.copy() + df_exp.dates = df_exp[df_exp.dates.notna()].dates.astype(str) + assert_series_equal(result.dates, df_exp.dates, check_index=False) + pytest.xfail("datetime columns written as string with GDAL < 3.11 via arrow") + assert isinstance(df.dates.dtype, pd.DatetimeTZDtype) assert_series_equal(result.dates, df.dates, check_index=False) +@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @pytest.mark.filterwarnings( "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_write_read_datetime_localized_mixed_offset(tmp_path, use_arrow): +def test_write_read_datetime_tz_localized_mixed_offset(tmp_path, ext, use_arrow): """Test with localized dates across a different summer/winter timezone offset.""" # 
Australian Summer Time AEDT (GMT+11), Standard Time AEST (GMT+10) - dates_raw = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111"] + dates_raw = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111", None] dates_naive = pd.Series(pd.to_datetime(dates_raw), name="dates") dates_local = dates_naive.dt.tz_localize("Australia/Sydney") - dates_local_offsets_str = dates_local.map(pd.Timestamp.isoformat) + dates_local_offsets_str = dates_local.astype("string").astype("O") dates_exp = dates_local_offsets_str.map(pd.Timestamp) df = gp.GeoDataFrame( - {"dates": dates_local, "geometry": [Point(1, 1), Point(1, 1)]}, - crs="EPSG:4326", + {"dates": dates_local, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326" ) - fpath = tmp_path / "test.gpkg" + fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) if use_arrow and __gdal_version__ < (3, 11, 0): - # With GDAL < 3.11 with arrow, datetime columns written as string type columns - dates_exp = dates_local_offsets_str + if ext in (".geojson", ".geojsonl"): + # With GDAL < 3.11 with arrow, GDAL converts mixed timezone datetimes to UTC + # when read as the arrow datetime column type does not support mixed tz. 
+ dates_utc = dates_local.dt.tz_convert("UTC") + if PANDAS_GE_20: + dates_utc = dates_utc.dt.as_unit("ms") + assert_series_equal(result.dates, dates_utc) + pytest.xfail("mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow") + elif ext in (".gpkg", ".fgb"): + # With GDAL < 3.11 with arrow, datetime columns written as string type + assert_series_equal(result.dates, dates_local_offsets_str) + pytest.xfail("datetime columns written as string with GDAL < 3.11 + arrow") # GDAL tz only encodes offsets, not timezones - assert_series_equal(result["dates"], dates_exp) + assert is_object_dtype(result.dates.dtype) + assert_series_equal(result.dates, dates_exp) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @@ -376,16 +418,18 @@ def test_write_read_datetime_localized_mixed_offset(tmp_path, use_arrow): "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_write_read_datetime_mixed_offsets(tmp_path, ext, use_arrow): +def test_write_read_datetime_tz_mixed_offsets(tmp_path, ext, use_arrow): """Test with dates with mixed timezone offsets.""" # Pandas datetime64 column types doesn't support mixed timezone offsets, so # it needs to be a list of pandas.Timestamp objects instead. 
- dates_raw = ["2023-01-01 11:00:01.111+01:00", "2023-06-01 10:00:01.111+05:00"] - dates_ts = list(map(pd.Timestamp, dates_raw)) + dates = [ + pd.Timestamp("2023-01-01 11:00:01.111+01:00"), + pd.Timestamp("2023-06-01 10:00:01.111+05:00"), + None, + ] df = gp.GeoDataFrame( - {"dates": dates_ts, "geometry": [Point(1, 1), Point(1, 1)]}, - crs="EPSG:4326", + {"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326" ) fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) @@ -395,44 +439,21 @@ def test_write_read_datetime_mixed_offsets(tmp_path, ext, use_arrow): if ext in (".geojson", ".geojsonl"): # With GDAL < 3.11 with arrow, GDAL converts mixed timezone datetimes to UTC # when read as the arrow datetime column type does not support mixed tz. + df_exp = df.copy() + df_exp.dates = pd.to_datetime(dates, utc=True) if PANDAS_GE_20: - df.dates = pd.to_datetime(dates_ts, utc=True).as_unit("ms") - else: - df.dates = pd.to_datetime(dates_ts, utc=True) + df_exp.dates = df_exp.dates.dt.as_unit("ms") + assert_geodataframe_equal(result, df_exp) + pytest.xfail("mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow") elif ext in (".gpkg", ".fgb"): # With arrow and GDAL < 3.11, mixed timezone datetimes are written as string # type columns, so no proper roundtrip possible. 
- df.dates = df.dates.map(pd.Timestamp.isoformat) + df_exp = df.copy() + df_exp.dates = df_exp.dates.astype("string").astype("O") + assert_geodataframe_equal(result, df_exp) + pytest.xfail("mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow") - assert_geodataframe_equal(result, df) - - -@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) -@pytest.mark.requires_arrow_write_api -def test_write_read_datetime_no_tz(tmp_path, ext, use_arrow): - """Test writing/reading a column with naive datetimes (no timezone information).""" - dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00"] - if PANDAS_GE_20: - dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") - else: - dates = pd.to_datetime(dates_raw) - df = gp.GeoDataFrame( - {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, crs="EPSG:4326" - ) - - fpath = tmp_path / f"test{ext}" - write_dataframe(df, fpath, use_arrow=use_arrow) - result = read_dataframe(fpath, use_arrow=use_arrow) - - if use_arrow and ext == ".gpkg" and __gdal_version__ < (3, 11, 0): - # With GDAL < 3.11 with arrow, columns with naive datetimes are written - # correctly, but when read they are wrongly interpreted as being in UTC. - # The reason is complicated, but more info can be found e.g. 
here: - # https://github.com/geopandas/pyogrio/issues/487#issuecomment-2423762807 - assert_series_equal(result.dates, df.dates.dt.tz_localize("UTC")) - pytest.xfail("naive datetimes read wrong in GPKG with GDAL < 3.11 via arrow") - - assert is_datetime64_dtype(result.dates.dtype) + assert is_object_dtype(result.dates.dtype) assert_geodataframe_equal(result, df) @@ -456,8 +477,8 @@ def test_write_read_datetime_no_tz(tmp_path, ext, use_arrow): "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_write_read_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_arrow): - """Datetime objects with null values and the equal offset are read as datetime64.""" +def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow): + """Datetime objects with equal offsets are read as datetime64.""" if use_arrow and __gdal_version__ < (3, 10, 0) and ext in (".geojson", ".geojsonl"): # With GDAL < 3.10 with arrow, the timezone offset was applied to the datetime # as well as retaining the timezone. 
@@ -466,9 +487,9 @@ def test_write_read_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_ar dates = pd.Series(dates_raw, dtype="O") df = gp.GeoDataFrame( - {"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]}, - crs="EPSG:4326", + {"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326" ) + if PANDAS_GE_20: dates_exp = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") else: @@ -481,17 +502,18 @@ def test_write_read_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_ar result = read_dataframe(fpath, use_arrow=use_arrow) # With some older versions, the offset is represented slightly differently - if str(result.dates.dtype).endswith(", pytz.FixedOffset(-300)]"): + if result.dates.dtype.name.endswith(", pytz.FixedOffset(-300)]"): result.dates = result.dates.astype(exp_df.dates.dtype) if use_arrow and __gdal_version__ < (3, 11, 0): if ext in (".fgb", ".gpkg"): # With GDAL < 3.11 with arrow, datetime columns are written as string type - # columns - exp_df.dates = exp_df.dates.map( - lambda x: x.isoformat() if x is not pd.NaT else None - ) + exp2_df = exp_df.copy() + exp2_df.dates = exp2_df.dates.astype("string").astype("O") + assert_geodataframe_equal(result, exp2_df) + pytest.xfail("datetime columns written as string with GDAL < 3.11 + arrow") + assert isinstance(result.dates.dtype, pd.DatetimeTZDtype) assert_geodataframe_equal(result, exp_df) @@ -499,14 +521,15 @@ def test_write_read_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_ar @pytest.mark.requires_arrow_write_api def test_write_read_datetime_utc(tmp_path, ext, use_arrow): """Test writing/reading a column with UTC datetimes.""" - dates_raw = ["2020-01-01 09:00:00.123Z", "2020-01-01 10:00:00Z"] + dates_raw = ["2020-01-01 09:00:00.123Z", "2020-01-01 10:00:00Z", None] if PANDAS_GE_20: dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") else: dates = pd.to_datetime(dates_raw) df = gp.GeoDataFrame( - {"dates": dates, "geometry": [Point(1, 1), 
Point(1, 1)]}, crs="EPSG:4326" + {"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326" ) + assert df.dates.dtype.name == "datetime64[ms, UTC]" fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) @@ -517,7 +540,7 @@ def test_write_read_datetime_utc(tmp_path, ext, use_arrow): assert_series_equal(result.dates, df.dates.dt.tz_localize(None)) pytest.xfail("UTC datetimes read wrong in .fgb with GDAL < 3.11 via arrow") - assert str(result.dates.dtype) == "datetime64[ms, UTC]" + assert result.dates.dtype.name == "datetime64[ms, UTC]" assert_geodataframe_equal(result, df) From a8c85b752c27c2132e6ef66a7e78f16bb9e5c023 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 23 Jan 2025 02:46:56 +0100 Subject: [PATCH 25/26] Fix tests for older versions --- pyogrio/tests/test_geopandas_io.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 015c99c6..fbf86cf7 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -385,7 +385,9 @@ def test_write_read_datetime_tz_localized_mixed_offset(tmp_path, ext, use_arrow) dates_naive = pd.Series(pd.to_datetime(dates_raw), name="dates") dates_local = dates_naive.dt.tz_localize("Australia/Sydney") dates_local_offsets_str = dates_local.astype("string").astype("O") - dates_exp = dates_local_offsets_str.map(pd.Timestamp) + dates_exp = dates_local_offsets_str.apply( + lambda x: pd.Timestamp(x) if pd.notna(x) else None + ) df = gp.GeoDataFrame( {"dates": dates_local, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326" @@ -540,7 +542,7 @@ def test_write_read_datetime_utc(tmp_path, ext, use_arrow): assert_series_equal(result.dates, df.dates.dt.tz_localize(None)) pytest.xfail("UTC datetimes read wrong in .fgb with GDAL < 3.11 via arrow") - assert result.dates.dtype.name == "datetime64[ms, UTC]" + assert result.dates.dtype.name in ("datetime64[ms, UTC]", "datetime64[ns, UTC]") 
assert_geodataframe_equal(result, df) From 40ca1a53b2fa352e6ccd800fb4fefdc512c30fe2 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 23 Jan 2025 02:50:00 +0100 Subject: [PATCH 26/26] Update test_geopandas_io.py --- pyogrio/tests/test_geopandas_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index fbf86cf7..7891fa10 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -531,7 +531,7 @@ def test_write_read_datetime_utc(tmp_path, ext, use_arrow): df = gp.GeoDataFrame( {"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326" ) - assert df.dates.dtype.name == "datetime64[ms, UTC]" + assert df.dates.dtype.name in ("datetime64[ms, UTC]", "datetime64[ns, UTC]") fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow)