#!/usr/bin/env python
# encoding: utf-8
"""
File Name : asc2nc
Project Name: mhm
Description : converts all ascii files of a typical (test_basin) mHM < v6.0 environment to
              netcdf files ready to be used in MPR from version 6.0 onwards
Author : ottor
Created : 28.08.17 14:53
"""
# IMPORTS
import os
import shutil
import pathlib
from itertools import product
import numpy
import pandas as pd
import xarray as xr
import argparse
# GLOBAL VARIABLES
# default input directory
IN_DIR = '../test_basin/input'
# default output directory
OUT_DIR = '../../MPR/reference/test_basin/input/temp'
# all file types scanned in input directory
POSSIBLE_SUFFIXES = ['.nc', '.asc']
# all folders scanned in input directory
FOLDER_LIST = ['lai', 'luse', 'morph']
# FUNCTIONS
def parse_args():
"""
parses all command line arguments
Returns
-------
Namespace object with args
"""
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
description='''This is a Python script to convert the input of mHM < v6.0 into
input for mHM >= v6.0.
author: Robert Schweppe
created: Oct 2018''')
parser.add_argument('-i', '--in_dir', action='store',
default=IN_DIR, dest='input_dir', metavar='input_dir',
                        help='input directory, default: {}'.format(IN_DIR))
parser.add_argument('-o', '--out_dir', action='store',
default=OUT_DIR, dest='output_dir', metavar='output_dir',
                        help='Output directory, where netcdf files for mHM >= v6.0 are stored'
                             ', default: {}'.format(OUT_DIR))
return parser.parse_args()
def get_all_subfiles(path, relation=None):
"""
    returns a generator yielding all files found in any subfolder of the given path,
    considering POSSIBLE_SUFFIXES and FOLDER_LIST; file paths are yielded relative
    to the given path

    Parameters
    ----------
    path: pathlib.Path
        the path to look for files in
    relation: pathlib.Path
        a parent folder the resulting file paths are made relative to
Returns
-------
generator
"""
if path.is_file():
if path.suffix in POSSIBLE_SUFFIXES:
yield path.relative_to(relation or path)
else:
for sub_path in path.iterdir():
if sub_path.name not in FOLDER_LIST and sub_path.is_dir():
continue
yield from get_all_subfiles(sub_path, relation or path)
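
# Example (hypothetical directory layout): for an input tree containing
# input/morph/dem.asc, input/lai/LAI_class.asc and input/meteo/pre.nc,
# list(get_all_subfiles(pathlib.Path('input'))) yields the relative paths
# morph/dem.asc and lai/LAI_class.asc, but skips meteo/pre.nc, because 'meteo'
# is not listed in FOLDER_LIST.
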
# CLASSES
class MyAsciiToNetcdfConverter(object):
POSSIBLE_LOOKUP_MODES = ['dims_as_col', 'dims_in_col']
    def __init__(self, input_file, output_file, lookup=None, sel=None, name=None, attrs=None, values_dtype=int,
                 iterate=None, csv_lut=False, lookup_mode='dims_in_col', lookup_nrows=None):
"""
        initializes an object that is capable of handling the conversion from ascii to netcdf
Parameters
----------
input_file: str
path to input file (ascii file with header information)
output_file: str
path to nc output file (can optionally include placeholders for formatting values)
e.g. './h_{horizon:0>2d}.nc' with arg iterate={'horizon': x}
lookup: str
            path to lookup table mapping values to the indices in the ascii file
            it is whitespace-separated and may look like this:
'''
ID HORIZON UD[mm] LD[mm] CLAY[%] SAND[%] Bd[gcm-3] MUID
1 1 0 50 33. 33. 1.5 DUMMY
1 2 50 100 33. 33. 1.5 DUMMY
1 3 100 200 33. 33. 1.5 DUMMY
'''
        sel: str or list of str
            only applies if lookup is given; selects columns in the lookup file by name
        name: str
            if only one variable is created, this is the name given to it
        attrs: dict
            if only one variable is created, these are the attributes given to it
        values_dtype: type
            dtype set for the values in the array; to support numpy.nan, set to float
        iterate: dict
            dict containing additional dimensions to create in the netcdf file;
            they are grabbed from the lookup file, created in the dataset and used for creating the output
        csv_lut: bool
            flag specifying the format of the lookup table: comma-separated if True,
            whitespace-separated if False
        lookup_mode: str
            the way the lookup table is parsed, either:
            dims_as_col - all columns are used as a dimension, requires iterate (e.g. for LAI_class)
                iterate must be the new dimension, e.g. {'month_of_year': range(1, 13)}
            dims_in_col - all unique values in a column are used as a dimension,
                requires iterate (e.g. for soil_class)
                iterate must be the column name and its index, e.g. {'horizon': 1}
        lookup_nrows: int
            number of rows to read from the lookup file
        """
self.lookup_nrows = lookup_nrows
self.lookup_mode = lookup_mode
        if self.lookup_mode not in self.POSSIBLE_LOOKUP_MODES:
            raise Exception('The provided lookup_mode "{}" is not valid. It must be one of: {}'.format(
                self.lookup_mode, ', '.join(self.POSSIBLE_LOOKUP_MODES)))
self.csv_lut = csv_lut
        self.iterate = iterate if iterate is not None else {}
self.values_dtype = values_dtype
self.attrs = attrs
if self.attrs is None:
self.attrs = {}
self.name = name
self.sel = sel
self.lookup = lookup
if self.lookup is not None:
self.lookup = self._format_path(self.lookup)
self.input_file = self._format_path(input_file)
self.output_file = self._format_path(output_file)
self.raw_data = None
self.data = None
self.iterators = {}
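
    # Minimal usage sketch (mirroring how the __main__ block below drives the
    # converter for the soil classes; the file names here are illustrative only):
    #   conv = MyAsciiToNetcdfConverter(
    #       input_file='soil_class.asc', output_file='soil_class.nc',
    #       lookup='soil_classdefinition.txt', values_dtype=float,
    #       lookup_mode='dims_in_col', iterate={'horizon': 1})
    #   conv.read()   # parses the ascii grid and lookup table into self.data
    #   conv.write()  # dumps self.data to netcdf, one file per variable
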
def read(self):
"""
        reads the ascii file and optionally the lookup table and creates an xr.Dataset or xr.DataArray at self.data
"""
data_meta = {}
with open(self.input_file) as f_in:
            # read the first six header lines with meta information
            for _ in range(6):
                key, value = f_in.readline().split()
                data_meta[key] = float(value)
            # read the remaining grid into a numpy array
            self.raw_data = numpy.loadtxt(f_in, dtype=self.values_dtype)
        # mask NODATA cells with NaN (requires a float dtype, i.e. values_dtype=float)
        self.raw_data[self.raw_data == data_meta['NODATA_value']] = numpy.nan
# span the coord arrays
lats = self._make_coord(data_meta, 'lat')[::-1]
lat_attrs = {'standard_name': 'latitude',
'long_name': 'latitude',
'units': 'degrees_north',
'axis': 'Y'}
lons = self._make_coord(data_meta, 'lon')
lon_attrs = {'standard_name': 'longitude',
'long_name': 'longitude',
'units': 'degrees_east',
'axis': 'X'}
if self.lookup is not None:
# read the lookup table with the specified columns, set the ID as index, skip initial row
            if self.csv_lut:
                sep = ','
                skiprows = 0
            else:
                sep = r'\s+'
                skiprows = 1
            index_col = [0]
            if self.lookup_mode == self.POSSIBLE_LOOKUP_MODES[1]:
                index_col.extend([_ for _ in list(self.iterate.values()) if _ is not None])
            # sep=r'\s+' replaces delim_whitespace, which is deprecated since pandas 2.2
            lookup_data = pd.read_csv(self.lookup, skiprows=skiprows, usecols=self.sel, nrows=self.lookup_nrows,
                                      sep=sep, index_col=index_col)
# iteratively build the nc Dataset
coords = {'lat': lats, 'lon': lons}
if self.lookup_mode == self.POSSIBLE_LOOKUP_MODES[1]:
                # update coords by the unique values occurring in the additional index column
self.iterators = {key: lookup_data.index.get_level_values(value).unique().values for key, value in
self.iterate.items()}
# TODO: the horizon coord needs to get the value of the lower boundary of each layer
coords.update(self.iterators)
                self.data = xr.Dataset(
                    {col_name.split('[')[0]: xr.DataArray(
                        data=self._convert_raw(lookup_data[col_name]),
                        coords=coords,
                        dims=list(coords.keys()),
                        name=col_name.split('[')[0],
                        # the bracketed part of a column header (e.g. 'UD[mm]')
                        # becomes the units attribute
                        attrs={'units': col_name.split('[')[-1].rstrip(']')})
                     for col_name in lookup_data.columns},
                    attrs=self.attrs)
elif self.lookup_mode == self.POSSIBLE_LOOKUP_MODES[0]:
                # update coords by the additional columns
self.iterators = self.iterate
coords.update(self.iterators)
self.data = xr.DataArray(data=self._convert_raw(lookup_data),
coords=coords,
dims=list(coords.keys()),
name=self.name,
attrs=self.attrs).sortby(['lon', 'lat'])
else:
self.data = xr.DataArray(data=self.raw_data,
coords={'lon': lons, 'lat': lats},
dims=['lat', 'lon'],
name=self.name,
attrs=self.attrs,
).sortby(['lon', 'lat'])
self.data.lon.attrs = lon_attrs
self.data.lat.attrs = lat_attrs
def _convert_raw(self, series):
"""
        applies the lookup table values to the raw data array

        Parameters
        ----------
        series: pd.Series or pd.DataFrame
            lookup values indexed by the class ids occurring in the raw array

        Returns
        -------
        numpy.ndarray
"""
converted_clone = self.raw_data.copy()
        # here all new dimensions are appended to the temporary field converted_clone
        for new_axes in self.iterators.values():
            # dynamically append one trailing axis per new dimension and use
            # numpy broadcasting to replicate the 2D grid along it
            converted_clone = converted_clone[..., numpy.newaxis] * numpy.ones(len(new_axes))
if self.iterators:
if self.lookup_mode == self.POSSIBLE_LOOKUP_MODES[1]:
                # loop over the cartesian product of all new dimension values
                for axes in product(*list(self.iterators.values())):
                    # https://stackoverflow.com/questions/5036816/numpy-lookup-map-or-point
                    # select the slice in the array, converting axis values to
                    # their corresponding index positions
                    array_slice = tuple(
                        [slice(None) for x in self.raw_data.shape]
                        + [list(list(self.iterators.values())[i_a]).index(a)
                           for i_a, a in enumerate(axes)])
                    # select the slice in the lookup data
                    lookup_slice = tuple([slice(None)] + list(axes))
                    # select the lookup data
                    lookup_sel = series[lookup_slice]
                    # get the lookup values and index them by the index array;
                    # a NaN is appended because searchsorted returns
                    # len(lookup_sel.index) when a NaN is encountered
                    # (pd.concat replaces Series.append, removed in pandas 2.0)
                    lookup_with_nan = pd.concat([lookup_sel, pd.Series([numpy.nan])])
                    converted_clone[array_slice] = lookup_with_nan.values[
                        lookup_sel.index.searchsorted(converted_clone[array_slice])]
elif self.lookup_mode == self.POSSIBLE_LOOKUP_MODES[0]:
for index in numpy.unique(self.raw_data[~numpy.isnan(self.raw_data)]):
converted_clone[self.raw_data == index] = series.loc[index, :]
        else:
            # converted_clone = series.values[series.index.searchsorted(converted_clone)]
            # substitute each class id in the grid by its lookup value
            # (Series.items replaces iteritems, which was removed in pandas 2.0)
            for idx, val in series.items():
                converted_clone[converted_clone == idx] = val
return converted_clone
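
    # Worked example for the simple branch (empty self.iterators, e.g. the
    # geology_class case): with raw_data = [[1, 2], [2, 1]] and
    # series = pd.Series([0.3, 0.5], index=[1, 2]), every cell holding class id 1
    # becomes 0.3 and every cell holding 2 becomes 0.5,
    # yielding [[0.3, 0.5], [0.5, 0.3]].
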
def _make_coord(self, metas, which='lat'):
"""
        creates an array of cell-center coordinates based on the meta information from the asc file header
Parameters
----------
metas: dict
must contain entries 'xllcorner', 'yllcorner', 'nrows', 'ncols', 'cellsize'
which: str
one of 'lat' or 'lon'
Returns
-------
np.array with corresponding latitude or longitude values
"""
coord_dict = {'lat': ('yllcorner', 'nrows'), 'lon': ('xllcorner', 'ncols')}
ll = metas[coord_dict[which][0]]
n = metas[coord_dict[which][1]]
w = metas['cellsize']
return numpy.linspace(start=ll + 0.5 * w,
stop=ll + (n + 0.5) * w,
num=int(n),
endpoint=False)
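
    # Worked example: for yllcorner=0.0, cellsize=0.5 and nrows=4 this returns
    # the cell centers [0.25, 0.75, 1.25, 1.75]; read() reverses the latitude
    # axis so that values run from north to south, matching the ascii row order.
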
def _format_path(self, path):
"""
        expands environment variables in a path and converts it to an absolute pathlib.Path

        Parameters
        ----------
        path: str
            raw path

        Returns
        -------
        pathlib.Path
"""
return pathlib.Path(os.path.abspath(os.path.expandvars(path)))
def write(self):
"""
        dumps the data to one or more netcdf files
"""
if hasattr(self.data, 'data_vars'):
            # if multiple data arrays are contained in the dataset, write each to a separate file
for data_var in self.data.data_vars:
output_file = pathlib.Path(self.output_file.parent, data_var + self.output_file.suffix)
self.data[data_var].to_netcdf(output_file)
else:
            # if only a single data array is contained, set its name from the output file stem
self.data.name = self.output_file.stem
self.data.to_netcdf(self.output_file)
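
    # For example, a Dataset holding the variables 'UD' and 'LD' (see the lookup
    # table sample in __init__) with an output_file of 'soil_class.nc' is written
    # to 'UD.nc' and 'LD.nc' in the same directory, while a single DataArray is
    # written to 'soil_class.nc' itself.
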
# SCRIPT
if __name__ == '__main__':
args = parse_args()
input_dir = pathlib.Path(args.input_dir)
if input_dir.is_file():
raise Exception("Input directory must be a directory, it is a file")
path_list = get_all_subfiles(input_dir)
for path in path_list:
print('working on file: {}'.format(path))
kwargs = {}
if path.stem.endswith('_class'):
kwargs['lookup'] = pathlib.Path(input_dir, path.parent, path.stem + 'definition.txt')
if path.stem == 'LAI_class':
#kwargs['csv_lut'] = False
kwargs['sel'] = ['ID', 'Jan.', 'Feb.', 'Mar.', 'Apr.', 'May', 'Jun.', 'Jul.', 'Aug.', 'Sep.',
'Oct.', 'Nov.', 'Dec.']
kwargs['lookup_mode'] = 'dims_as_col'
kwargs['attrs'] = {'units': '-'}
kwargs['iterate'] = {'month_of_year': list(range(1,13))}
elif path.stem == 'soil_class':
kwargs['iterate'] = {'horizon': 1}
elif path.stem == 'geology_class':
kwargs['sel'] = ['GeoParam(i)', 'ClassUnit', 'Karstic']
kwargs['lookup_nrows'] = 12
elif '_class_horizon_' in path.stem:
kwargs['lookup'] = pathlib.Path(input_dir, path.parent, 'soil_classdefinition_iFlag_soilDB_1.txt')
if path.suffix == '.nc':
shutil.copy(pathlib.Path(input_dir, path), pathlib.Path(args.output_dir, path.name))
else:
my_conv = MyAsciiToNetcdfConverter(
input_file=pathlib.Path(input_dir, path),
output_file=pathlib.Path(args.output_dir, path.stem + '.nc'),
values_dtype=float,
**kwargs
)
my_conv.read()
my_conv.write()