Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: improve support for datetime columns #486

Open
wants to merge 29 commits into
base: main
Choose a base branch
from
Open
Changes from 2 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
aaf8818
ENH: deal properly with naive datetimes with arrow
theroggy Oct 17, 2024
3e463a1
Add more testcases, also for tz datetimes
theroggy Oct 18, 2024
afdd0c1
Merge remote-tracking branch 'upstream/main' into ENH-deal-properly-w…
theroggy Jan 16, 2025
c18ab22
Use datetime_as_string for reading with arrow
theroggy Jan 17, 2025
597855f
Update _io.pyx
theroggy Jan 17, 2025
fa4b86e
Skip tests where appropriate
theroggy Jan 17, 2025
0e41ae4
Improve support for mixed and naive datetimes
theroggy Jan 17, 2025
1378ace
Skip use_arrow tests with old gdal versions
theroggy Jan 17, 2025
0f1ab27
Take in account pandas version
theroggy Jan 17, 2025
6f78c68
Update test_geopandas_io.py
theroggy Jan 17, 2025
336d0d8
Also support columns with datetime objects
theroggy Jan 18, 2025
3035a11
Rename some test functions for consistency
theroggy Jan 18, 2025
9efdc09
Avoid warning in test
theroggy Jan 18, 2025
eb80e08
Improve inline comment
theroggy Jan 18, 2025
d50b2d0
Update CHANGES.md
theroggy Jan 18, 2025
47aa298
Merge remote-tracking branch 'upstream/main' into ENH-deal-properly-w…
theroggy Jan 19, 2025
1efa5bf
Symplify code
theroggy Jan 20, 2025
0032839
Don't cast UTC data to string when writing
theroggy Jan 20, 2025
9d2bfce
Various improvements to tests
theroggy Jan 20, 2025
ca9a8ae
Smal fixes to tests
theroggy Jan 20, 2025
deb862c
Xfail some tests where needed
theroggy Jan 20, 2025
e35c356
Make UTC assert more specific
theroggy Jan 22, 2025
593b282
Revert "Make UTC assert more specific"
theroggy Jan 22, 2025
35d8d87
Update test_geopandas_io.py
theroggy Jan 22, 2025
41c9da6
Use astype("string") instead of apply
theroggy Jan 23, 2025
f53af87
Improve tests
theroggy Jan 23, 2025
a8c85b7
Fix tests for older versions
theroggy Jan 23, 2025
40ca1a5
Update test_geopandas_io.py
theroggy Jan 23, 2025
fc53d44
Merge remote-tracking branch 'upstream/main' into ENH-deal-properly-w…
theroggy Jan 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 53 additions & 5 deletions pyogrio/tests/test_geopandas_io.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import contextlib
import locale
import time
import warnings
from datetime import datetime
from io import BytesIO
from zipfile import ZipFile

import numpy as np
import pytz

from pyogrio import (
__gdal_version__,
Expand Down Expand Up @@ -298,7 +300,7 @@ def test_read_datetime_tz(datetime_tz_file, tmp_path, use_arrow):
write_dataframe(df, fpath, use_arrow=use_arrow)
df_read = read_dataframe(fpath, use_arrow=use_arrow)
if use_arrow:
# with Arrow, the datetimes are always read as UTC
# with Arrow, the datetimes are always read as UTC for .gpkg
expected = expected.dt.tz_convert("UTC")
assert_series_equal(df_read.datetime_col, expected)

Expand Down Expand Up @@ -328,11 +330,38 @@ def test_write_datetime_mixed_offset(tmp_path, use_arrow):
assert_series_equal(result["dates"], utc_col)


@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
@pytest.mark.filterwarnings(
"ignore: Non-conformant content for record 1 in column dates"
)
theroggy marked this conversation as resolved.
Show resolved Hide resolved
@pytest.mark.requires_arrow_write_api
def test_read_write_datetime_tz_with_nulls(tmp_path, use_arrow):
def test_read_write_datetime_no_tz(tmp_path, ext, use_arrow):
dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00"]
if PANDAS_GE_20:
dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms")
else:
dates = pd.to_datetime(dates_raw)
df = gp.GeoDataFrame(
{"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]},
crs="EPSG:4326",
)
fpath = tmp_path / f"test{ext}"
write_dataframe(df, fpath, use_arrow=use_arrow)
result = read_dataframe(fpath, use_arrow=use_arrow)
if use_arrow and ext == ".gpkg":
# for GPKG with Arrow, the datetime is written as a naive datetime with the
# correct times, but when read back the naive time is assumed to be UTC, which
# changes the effective time, so this seems wrong.
df["dates"] = df["dates"].dt.tz_localize(time.timezone).dt.tz_convert("UTC")
theroggy marked this conversation as resolved.
Show resolved Hide resolved
assert_geodataframe_equal(df, result)


@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
@pytest.mark.filterwarnings(
"ignore: Non-conformant content for record 1 in column dates"
)
@pytest.mark.requires_arrow_write_api
def test_read_write_datetime_tz_with_nulls(tmp_path, ext, use_arrow):
dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", pd.NaT]
if PANDAS_GE_20:
dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms")
Expand All @@ -342,12 +371,31 @@ def test_read_write_datetime_tz_with_nulls(tmp_path, use_arrow):
{"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]},
crs="EPSG:4326",
)
fpath = tmp_path / "test.gpkg"
fpath = tmp_path / f"test{ext}"
write_dataframe(df, fpath, use_arrow=use_arrow)
result = read_dataframe(fpath, use_arrow=use_arrow)
if use_arrow:
# with Arrow, the datetimes are always read as UTC
df["dates"] = df["dates"].dt.tz_convert("UTC")
if ext == ".fgb":
# when FlatGeobuf is read with Arrow, for datetimes with equal timezone,
# a column type with the appropriate minutes offset is returned.
theroggy marked this conversation as resolved.
Show resolved Hide resolved
# REMARK: For .fgb, the timezone is just dropped when reading or writing!!!
# -> 2020-01-01T09:00:00.123-05:00 becomes 2020-01-01T09:00:00.123
df["dates"] = df["dates"].dt.tz_localize(tz=None)
elif ext in (".geojson", ".geojsonl"):
# when GeoJSON is read with Arrow, for datetimes with equal timezone, a
# column type with the appropriate minutes offset is returned.
# REMARK: for .geojson, the data is written fine, but when reading it goes
# wrong: 2020-01-01T09:00:00.123-05:00 becomes 2020-01-01T04:00:00.123-05:00
theroggy marked this conversation as resolved.
Show resolved Hide resolved
df["dates"] = (
df["dates"]
.dt.tz_localize(tz=None)
.dt.tz_localize(tz="UTC")
.dt.tz_convert(pytz.FixedOffset(-300))
)
elif ext == ".gpkg":
# when GPKG is read with Arrow, datetimes with timezone are converted to
# UTC.
df["dates"] = df["dates"].dt.tz_convert("UTC")
assert_geodataframe_equal(df, result)


Expand Down
Loading