Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Arrow reading: generic code path (as used by GeoJSON): fix mis-handling of timezones #11049

Merged
merged 1 commit into from
Oct 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 72 additions & 24 deletions autotest/ogr/ogr_geojson.py
Original file line number Diff line number Diff line change
Expand Up @@ -4605,7 +4605,7 @@ def test_ogr_geojson_arrow_stream_pyarrow_mixed_timezone(tmp_vsimem):


def test_ogr_geojson_arrow_stream_pyarrow_utc_plus_five(tmp_vsimem):
pytest.importorskip("pyarrow")
# pytest.importorskip("pyarrow")

filename = str(
tmp_vsimem / "test_ogr_geojson_arrow_stream_pyarrow_utc_plus_five.geojson"
Expand All @@ -4621,22 +4621,37 @@ def test_ogr_geojson_arrow_stream_pyarrow_utc_plus_five(tmp_vsimem):
lyr.CreateFeature(f)
ds = None

try:
import pyarrow # NOQA

has_pyarrow = True
except ImportError:
has_pyarrow = False
if has_pyarrow:
ds = ogr.Open(filename)
lyr = ds.GetLayer(0)
stream = lyr.GetArrowStreamAsPyArrow()
assert stream.schema.field("datetime").type.tz == "+05:00"
values = []
for batch in stream:
for x in batch.field("datetime"):
values.append(x.value)
assert values == [1653982496789, 1653986096789]

mem_ds = ogr.GetDriverByName("Memory").CreateDataSource("")
mem_lyr = mem_ds.CreateLayer("test", geom_type=ogr.wkbPoint)
ds = ogr.Open(filename)
lyr = ds.GetLayer(0)
stream = lyr.GetArrowStreamAsPyArrow()
assert stream.schema.field("datetime").type.tz == "+05:00"
values = []
for batch in stream:
for x in batch.field("datetime"):
values.append(x.value)
assert values == [1654000496789, 1654004096789]
mem_lyr.WriteArrow(lyr)

f = mem_lyr.GetNextFeature()
assert f["datetime"] == "2022/05/31 12:34:56.789+05"


###############################################################################


def test_ogr_geojson_arrow_stream_pyarrow_utc_minus_five(tmp_vsimem):
pytest.importorskip("pyarrow")

filename = str(
tmp_vsimem / "test_ogr_geojson_arrow_stream_pyarrow_utc_minus_five.geojson"
Expand All @@ -4652,22 +4667,37 @@ def test_ogr_geojson_arrow_stream_pyarrow_utc_minus_five(tmp_vsimem):
lyr.CreateFeature(f)
ds = None

try:
import pyarrow # NOQA

has_pyarrow = True
except ImportError:
has_pyarrow = False
if has_pyarrow:
ds = ogr.Open(filename)
lyr = ds.GetLayer(0)
stream = lyr.GetArrowStreamAsPyArrow()
assert stream.schema.field("datetime").type.tz == "-05:00"
values = []
for batch in stream:
for x in batch.field("datetime"):
values.append(x.value)
assert values == [1654018496789, 1654022096789]

mem_ds = ogr.GetDriverByName("Memory").CreateDataSource("")
mem_lyr = mem_ds.CreateLayer("test", geom_type=ogr.wkbPoint)
ds = ogr.Open(filename)
lyr = ds.GetLayer(0)
stream = lyr.GetArrowStreamAsPyArrow()
assert stream.schema.field("datetime").type.tz == "-05:00"
values = []
for batch in stream:
for x in batch.field("datetime"):
values.append(x.value)
assert values == [1654000496789, 1654004096789]
mem_lyr.WriteArrow(lyr)

f = mem_lyr.GetNextFeature()
assert f["datetime"] == "2022/05/31 12:34:56.789-05"


###############################################################################


def test_ogr_geojson_arrow_stream_pyarrow_unknown_timezone(tmp_vsimem):
pytest.importorskip("pyarrow")

filename = str(
tmp_vsimem / "test_ogr_geojson_arrow_stream_pyarrow_unknown_timezone.geojson"
Expand All @@ -4683,15 +4713,33 @@ def test_ogr_geojson_arrow_stream_pyarrow_unknown_timezone(tmp_vsimem):
lyr.CreateFeature(f)
ds = None

try:
import pyarrow # NOQA

has_pyarrow = True
except ImportError:
has_pyarrow = False
if has_pyarrow:
ds = ogr.Open(filename)
lyr = ds.GetLayer(0)
stream = lyr.GetArrowStreamAsPyArrow()
assert stream.schema.field("datetime").type.tz is None
values = []
for batch in stream:
for x in batch.field("datetime"):
values.append(x.value)
assert values == [1654000496789, 1654004096789]

mem_ds = ogr.GetDriverByName("Memory").CreateDataSource("")
mem_lyr = mem_ds.CreateLayer("test", geom_type=ogr.wkbPoint)
ds = ogr.Open(filename)
lyr = ds.GetLayer(0)
stream = lyr.GetArrowStreamAsPyArrow()
assert stream.schema.field("datetime").type.tz is None
values = []
for batch in stream:
for x in batch.field("datetime"):
values.append(x.value)
assert values == [1654000496789, 1654004096789]
mem_lyr.WriteArrow(lyr)

f = mem_lyr.GetNextFeature()
# We have lost the timezone info here, as there's no way in Arrow to
# have a mixed of with and without timezone in a single column
assert f["datetime"] == "2022/05/31 12:34:56.789"


###############################################################################
Expand Down
11 changes: 1 addition & 10 deletions ogr/ogrsf_frmts/generic/ogrlayerarrow.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1780,17 +1780,8 @@ FillDateTimeArray(struct ArrowArray *psChild,
auto nVal =
CPLYMDHMSToUnixTime(&brokenDown) * 1000 +
(static_cast<int>(psRawField->Date.Second * 1000 + 0.5) % 1000);
if (nFieldTZFlag > OGR_TZFLAG_MIXED_TZ &&
if (nFieldTZFlag >= OGR_TZFLAG_MIXED_TZ &&
psRawField->Date.TZFlag > OGR_TZFLAG_MIXED_TZ)
{
// Convert for psRawField->Date.TZFlag to nFieldTZFlag
const int TZOffset =
(psRawField->Date.TZFlag - nFieldTZFlag) * 15;
const int TZOffsetMS = TZOffset * 60 * 1000;
nVal -= TZOffsetMS;
}
else if (nFieldTZFlag == OGR_TZFLAG_MIXED_TZ &&
psRawField->Date.TZFlag > OGR_TZFLAG_MIXED_TZ)
{
// Convert for psRawField->Date.TZFlag to UTC
const int TZOffset =
Expand Down
Loading