Skip to content

Commit

Permalink
942 fix test prevent dtype int (#943)
Browse files Browse the repository at this point in the history
* added extra dtype assertions to test_download_era5

* simplified prevent_dtype_int() and merged with dfmt.preprocess_ERA5()

* removed prevent_dtype_int() also in merge_meteofiles()
  • Loading branch information
veenstrajelmer authored Aug 9, 2024
1 parent abb1b0f commit ed77eb0
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 71 deletions.
47 changes: 10 additions & 37 deletions dfm_tools/xarray_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import pandas as pd
import logging
import numpy as np
import netCDF4
from dfm_tools.errors import OutOfRangeError

__all__ = [
Expand Down Expand Up @@ -109,8 +108,16 @@ def preprocess_ERA5(ds):
if 'valid_time' in ds.coords:
ds = ds.rename({'valid_time':'time'})

# prevent incorrect scaling/offset when merging files
prevent_dtype_int(ds)
# Prevent writing to (incorrectly scaled) int, since it might mess up mfdataset (https://github.com/Deltares/dfm_tools/issues/239)
# By dropping scaling/offset encoding and converting to float32 (will result in a larger dataset)
# ERA5 datasets retrieved with the new CDS-beta are zipped float32 instead of scaled int, so this is only needed for backwards compatibility with old files.
for var in ds.data_vars:
if not set(['dtype','scale_factor','add_offset']).issubset(ds[var].encoding.keys()):
continue
# the _FillValue will still be -32767 (int default), but this is no issue for float32
ds[var].encoding.pop('scale_factor')
ds[var].encoding.pop('add_offset')
ds[var].encoding["dtype"] = "float32"

return ds

Expand All @@ -124,37 +131,6 @@ def preprocess_woa(ds):
return ds


def prevent_dtype_int(ds, zlib:bool = True):
"""
Prevent writing to int, since it might mess up mfdataset (https://github.com/Deltares/dfm_tools/issues/239)
Since floats are used instead of ints, the disksize of the dataset will be larger
zlib=True decreases the filesize again (approx same as int filesize), but writing this file is slower
"""

for var in ds.data_vars:
var_encoding = ds[var].encoding
if not set(['dtype','scale_factor','add_offset']).issubset(ds[var].encoding.keys()):
continue

if 'int' not in str(var_encoding['dtype']):
continue

# prevent incorrectly scaled integers by dropping scaling/offset encoding
ds[var].encoding.pop('scale_factor')
ds[var].encoding.pop('add_offset')

# remove non-convention attribute
if 'missing_value' in ds[var].encoding.keys():
ds[var].encoding.pop('missing_value')

# reduce filesize with float32 instead of int and compression
ds[var].encoding["dtype"] = "float32"
if '_FillValue' in ds[var].encoding.keys():
float32_fillvalue = netCDF4.default_fillvals['f4']
ds[var].encoding['_FillValue'] = float32_fillvalue
ds[var].encoding["zlib"] = zlib


def merge_meteofiles(file_nc:str, preprocess=None,
time_slice:slice = slice(None,None),
add_global_overlap:bool = False, zerostart:bool = False,
Expand Down Expand Up @@ -266,9 +242,6 @@ def merge_meteofiles(file_nc:str, preprocess=None,
field_zerostart['time'] = [times_pd.index[0]-dt.timedelta(days=2),times_pd.index[0]-dt.timedelta(days=1)] #TODO: is one zero field not enough? (is replacing first field not also ok? (results in 1hr transition period)
data_xr = xr.concat([field_zerostart,data_xr],dim='time',combine_attrs='no_conflicts') #combine_attrs argument prevents attrs from being dropped

# converting from int16 with scalefac/offset to float32 with zlib, relevant for ERA5
prevent_dtype_int(data_xr)

return data_xr


Expand Down
1 change: 1 addition & 0 deletions docs/whats-new.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
- making `dfmt.open_dataset_extra()` more modular by partly moving it to separate private functions in [#913](https://github.com/Deltares/dfm_tools/pull/913)
- added station_id variable to dataset returned by `dfmt.interp_uds_to_plipoints()` in [#914](https://github.com/Deltares/dfm_tools/pull/914)
- update private functions under `dfmt.download_ERA5()` to CDS-Beta (requires ECMWF apikey instead) in [#925](https://github.com/Deltares/dfm_tools/pull/925)
- simplified prevention of int dtypes in `dfmt.preprocess_ERA5()` in [#943](https://github.com/Deltares/dfm_tools/pull/943)

### Fix
- also apply convert_360to180 to longitude variable in `dfmt.open_dataset_curvilinear()` in [#913](https://github.com/Deltares/dfm_tools/pull/913)
Expand Down
6 changes: 6 additions & 0 deletions tests/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,12 @@ def test_download_era5(file_nc_era5_pattern):
assert ds.sizes[timedim] == 1416
assert ds[timedim].to_pandas().iloc[0] == pd.Timestamp('2010-01-01')
assert ds[timedim].to_pandas().iloc[-1] == pd.Timestamp('2010-02-28 23:00')

# check if there are no integers in the dataset anymore
# this was the case before CDS-beta in https://github.com/Deltares/dfm_tools/issues/239
msl_encoding = ds['msl'].encoding
assert str(msl_encoding['dtype']) == 'float32'
assert 'scale_factor' not in msl_encoding.keys()


@pytest.mark.requiressecrets
Expand Down
34 changes: 0 additions & 34 deletions tests/test_xarray_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,47 +5,13 @@
@author: veenstra
"""


import os
import pytest
import xarray as xr
import numpy as np
from dfm_tools.xarray_helpers import prevent_dtype_int
import dfm_tools as dfmt
import pandas as pd

#TODO: many xarray_helpers tests are still in test_dfm_tools.py


@pytest.mark.requiressecrets
@pytest.mark.unittest
@pytest.mark.timeout(60) # useful since CDS downloads are terribly slow sometimes, so skip in that case
def test_prevent_dtype_int(tmp_path, file_nc_era5_pattern):
# file_nc_era5_pattern comes from file_nc_era5_pattern() in conftest.py
varn = "msl"
file_nc = os.path.join(tmp_path,f'era5_{varn}_*.nc')

#optional encoding
for zlib, size_expected in zip([False, True], [480000, 250000]):
data_xr = xr.open_mfdataset(file_nc)
prevent_dtype_int(data_xr, zlib=zlib)

#write to netcdf file
file_out = os.path.join(tmp_path, f'era5_prevent_dtype_int_{varn}_zlib_{zlib}.nc')
data_xr.to_netcdf(file_out)
data_xr_check = xr.open_dataset(file_out)
print(data_xr_check[varn].encoding)

assert np.allclose(data_xr[varn], data_xr_check[varn])

del data_xr
del data_xr_check

file_size = os.path.getsize(file_out)
print(file_size)
assert file_size < size_expected


@pytest.mark.requiressecrets
@pytest.mark.unittest
def test_merge_meteofiles(file_nc_era5_pattern):
Expand Down

0 comments on commit ed77eb0

Please sign in to comment.