-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathdata_loader.py
542 lines (427 loc) · 28 KB
/
data_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
import numpy as np
import pandas as pd
from tqdm import tqdm
name = "DataLoader"
regression_datasets = ["AustraliaRainfall",
"HouseholdPowerConsumption1",
"HouseholdPowerConsumption2",
"BeijingPM25Quality",
"BeijingPM10Quality",
"Covid3Month",
"LiveFuelMoistureContent",
"FloodModeling1",
"FloodModeling2",
"FloodModeling3",
"AppliancesEnergy",
"BenzeneConcentration",
"NewsHeadlineSentiment",
"NewsTitleSentiment",
"BIDMC32RR",
"BIDMC32HR",
"BIDMC32SpO2",
"IEEEPPG",
"PPGDalia"]
# The following code is adapted from the python package sktime to read .ts file.
class TsFileParseException(Exception):
"""
Should be raised when parsing a .ts file and the format is incorrect.
"""
pass
def load_from_tsfile_to_dataframe(full_file_path_and_name, return_separate_X_and_y=True,
replace_missing_vals_with='NaN'):
"""Loads data from a .ts file into a Pandas DataFrame.
Parameters
----------
full_file_path_and_name: str
The full pathname of the .ts file to read.
return_separate_X_and_y: bool
true if X and Y values should be returned as separate Data Frames (X) and a numpy array (y), false otherwise.
This is only relevant for data that
replace_missing_vals_with: str
The value that missing values in the text file should be replaced with prior to parsing.
Returns
-------
DataFrame, ndarray
If return_separate_X_and_y then a tuple containing a DataFrame and a numpy array containing the relevant time-series and corresponding class values.
DataFrame
If not return_separate_X_and_y then a single DataFrame containing all time-series and (if relevant) a column "class_vals" the associated class values.
"""
# Initialize flags and variables used when parsing the file
metadata_started = False
data_started = False
has_problem_name_tag = False
has_timestamps_tag = False
has_univariate_tag = False
has_class_labels_tag = False
has_target_labels_tag = False
has_data_tag = False
previous_timestamp_was_float = None
previous_timestamp_was_int = None
previous_timestamp_was_timestamp = None
num_dimensions = None
is_first_case = True
instance_list = []
class_val_list = []
line_num = 0
# Parse the file
# print(full_file_path_and_name)
with open(full_file_path_and_name, 'r', encoding='utf-8') as file:
for line in tqdm(file):
# print(".", end='')
# Strip white space from start/end of line and change to lowercase for use below
line = line.strip().lower()
# Empty lines are valid at any point in a file
if line:
# Check if this line contains metadata
# Please note that even though metadata is stored in this function it is not currently published externally
if line.startswith("@problemname"):
# Check that the data has not started
if data_started:
raise TsFileParseException("metadata must come before data")
# Check that the associated value is valid
tokens = line.split(' ')
token_len = len(tokens)
if token_len == 1:
raise TsFileParseException("problemname tag requires an associated value")
problem_name = line[len("@problemname") + 1:]
has_problem_name_tag = True
metadata_started = True
elif line.startswith("@timestamps"):
# Check that the data has not started
if data_started:
raise TsFileParseException("metadata must come before data")
# Check that the associated value is valid
tokens = line.split(' ')
token_len = len(tokens)
if token_len != 2:
raise TsFileParseException("timestamps tag requires an associated Boolean value")
elif tokens[1] == "true":
timestamps = True
elif tokens[1] == "false":
timestamps = False
else:
raise TsFileParseException("invalid timestamps value")
has_timestamps_tag = True
metadata_started = True
elif line.startswith("@univariate"):
# Check that the data has not started
if data_started:
raise TsFileParseException("metadata must come before data")
# Check that the associated value is valid
tokens = line.split(' ')
token_len = len(tokens)
if token_len != 2:
raise TsFileParseException("univariate tag requires an associated Boolean value")
elif tokens[1] == "true":
univariate = True
elif tokens[1] == "false":
univariate = False
else:
raise TsFileParseException("invalid univariate value")
has_univariate_tag = True
metadata_started = True
elif line.startswith("@classlabel"):
# Check that the data has not started
if data_started:
raise TsFileParseException("metadata must come before data")
# Check that the associated value is valid
tokens = line.split(' ')
token_len = len(tokens)
if token_len == 1:
raise TsFileParseException("classlabel tag requires an associated Boolean value")
if tokens[1] == "true":
class_labels = True
elif tokens[1] == "false":
class_labels = False
else:
raise TsFileParseException("invalid classLabel value")
# Check if we have any associated class values
if token_len == 2 and class_labels:
raise TsFileParseException("if the classlabel tag is true then class values must be supplied")
has_class_labels_tag = True
class_label_list = [token.strip() for token in tokens[2:]]
metadata_started = True
elif line.startswith("@targetlabel"):
# Check that the data has not started
if data_started:
raise TsFileParseException("metadata must come before data")
# Check that the associated value is valid
tokens = line.split(' ')
token_len = len(tokens)
if token_len == 1:
raise TsFileParseException("targetlabel tag requires an associated Boolean value")
if tokens[1] == "true":
target_labels = True
elif tokens[1] == "false":
target_labels = False
else:
raise TsFileParseException("invalid targetLabel value")
has_target_labels_tag = True
class_val_list = []
metadata_started = True
# Check if this line contains the start of data
elif line.startswith("@data"):
if line != "@data":
raise TsFileParseException("data tag should not have an associated value")
if data_started and not metadata_started:
raise TsFileParseException("metadata must come before data")
else:
has_data_tag = True
data_started = True
# If the 'data tag has been found then metadata has been parsed and data can be loaded
elif data_started:
# Check that a full set of metadata has been provided
incomplete_regression_meta_data = not has_problem_name_tag or not has_timestamps_tag or not has_univariate_tag or not has_target_labels_tag or not has_data_tag
incomplete_classification_meta_data = not has_problem_name_tag or not has_timestamps_tag or not has_univariate_tag or not has_class_labels_tag or not has_data_tag
if incomplete_regression_meta_data and incomplete_classification_meta_data:
raise TsFileParseException("a full set of metadata has not been provided before the data")
# Replace any missing values with the value specified
line = line.replace("?", replace_missing_vals_with)
# Check if we dealing with data that has timestamps
if timestamps:
# We're dealing with timestamps so cannot just split line on ':' as timestamps may contain one
has_another_value = False
has_another_dimension = False
timestamps_for_dimension = []
values_for_dimension = []
this_line_num_dimensions = 0
line_len = len(line)
char_num = 0
while char_num < line_len:
# Move through any spaces
while char_num < line_len and str.isspace(line[char_num]):
char_num += 1
# See if there is any more data to read in or if we should validate that read thus far
if char_num < line_len:
# See if we have an empty dimension (i.e. no values)
if line[char_num] == ":":
if len(instance_list) < (this_line_num_dimensions + 1):
instance_list.append([])
instance_list[this_line_num_dimensions].append(pd.Series())
this_line_num_dimensions += 1
has_another_value = False
has_another_dimension = True
timestamps_for_dimension = []
values_for_dimension = []
char_num += 1
else:
# Check if we have reached a class label
if line[char_num] != "(" and target_labels:
class_val = line[char_num:].strip()
# if class_val not in class_val_list:
# raise TsFileParseException(
# "the class value '" + class_val + "' on line " + str(
# line_num + 1) + " is not valid")
class_val_list.append(float(class_val))
char_num = line_len
has_another_value = False
has_another_dimension = False
timestamps_for_dimension = []
values_for_dimension = []
else:
# Read in the data contained within the next tuple
if line[char_num] != "(" and not target_labels:
raise TsFileParseException(
"dimension " + str(this_line_num_dimensions + 1) + " on line " + str(
line_num + 1) + " does not start with a '('")
char_num += 1
tuple_data = ""
while char_num < line_len and line[char_num] != ")":
tuple_data += line[char_num]
char_num += 1
if char_num >= line_len or line[char_num] != ")":
raise TsFileParseException(
"dimension " + str(this_line_num_dimensions + 1) + " on line " + str(
line_num + 1) + " does not end with a ')'")
# Read in any spaces immediately after the current tuple
char_num += 1
while char_num < line_len and str.isspace(line[char_num]):
char_num += 1
# Check if there is another value or dimension to process after this tuple
if char_num >= line_len:
has_another_value = False
has_another_dimension = False
elif line[char_num] == ",":
has_another_value = True
has_another_dimension = False
elif line[char_num] == ":":
has_another_value = False
has_another_dimension = True
char_num += 1
# Get the numeric value for the tuple by reading from the end of the tuple data backwards to the last comma
last_comma_index = tuple_data.rfind(',')
if last_comma_index == -1:
raise TsFileParseException(
"dimension " + str(this_line_num_dimensions + 1) + " on line " + str(
line_num + 1) + " contains a tuple that has no comma inside of it")
try:
value = tuple_data[last_comma_index + 1:]
value = float(value)
except ValueError:
raise TsFileParseException(
"dimension " + str(this_line_num_dimensions + 1) + " on line " + str(
line_num + 1) + " contains a tuple that does not have a valid numeric value")
# Check the type of timestamp that we have
timestamp = tuple_data[0: last_comma_index]
try:
timestamp = int(timestamp)
timestamp_is_int = True
timestamp_is_timestamp = False
except ValueError:
timestamp_is_int = False
if not timestamp_is_int:
try:
timestamp = float(timestamp)
timestamp_is_float = True
timestamp_is_timestamp = False
except ValueError:
timestamp_is_float = False
if not timestamp_is_int and not timestamp_is_float:
try:
timestamp = timestamp.strip()
timestamp_is_timestamp = True
except ValueError:
timestamp_is_timestamp = False
# Make sure that the timestamps in the file (not just this dimension or case) are consistent
if not timestamp_is_timestamp and not timestamp_is_int and not timestamp_is_float:
raise TsFileParseException(
"dimension " + str(this_line_num_dimensions + 1) + " on line " + str(
line_num + 1) + " contains a tuple that has an invalid timestamp '" + timestamp + "'")
if previous_timestamp_was_float is not None and previous_timestamp_was_float and not timestamp_is_float:
raise TsFileParseException(
"dimension " + str(this_line_num_dimensions + 1) + " on line " + str(
line_num + 1) + " contains tuples where the timestamp format is inconsistent")
if previous_timestamp_was_int is not None and previous_timestamp_was_int and not timestamp_is_int:
raise TsFileParseException(
"dimension " + str(this_line_num_dimensions + 1) + " on line " + str(
line_num + 1) + " contains tuples where the timestamp format is inconsistent")
if previous_timestamp_was_timestamp is not None and previous_timestamp_was_timestamp and not timestamp_is_timestamp:
raise TsFileParseException(
"dimension " + str(this_line_num_dimensions + 1) + " on line " + str(
line_num + 1) + " contains tuples where the timestamp format is inconsistent")
# Store the values
timestamps_for_dimension += [timestamp]
values_for_dimension += [value]
# If this was our first tuple then we store the type of timestamp we had
if previous_timestamp_was_timestamp is None and timestamp_is_timestamp:
previous_timestamp_was_timestamp = True
previous_timestamp_was_int = False
previous_timestamp_was_float = False
if previous_timestamp_was_int is None and timestamp_is_int:
previous_timestamp_was_timestamp = False
previous_timestamp_was_int = True
previous_timestamp_was_float = False
if previous_timestamp_was_float is None and timestamp_is_float:
previous_timestamp_was_timestamp = False
previous_timestamp_was_int = False
previous_timestamp_was_float = True
# See if we should add the data for this dimension
if not has_another_value:
if len(instance_list) < (this_line_num_dimensions + 1):
instance_list.append([])
if timestamp_is_timestamp:
timestamps_for_dimension = pd.DatetimeIndex(timestamps_for_dimension)
instance_list[this_line_num_dimensions].append(
pd.Series(index=timestamps_for_dimension, data=values_for_dimension))
this_line_num_dimensions += 1
timestamps_for_dimension = []
values_for_dimension = []
elif has_another_value:
raise TsFileParseException(
"dimension " + str(this_line_num_dimensions + 1) + " on line " + str(
line_num + 1) + " ends with a ',' that is not followed by another tuple")
elif has_another_dimension and target_labels:
raise TsFileParseException(
"dimension " + str(this_line_num_dimensions + 1) + " on line " + str(
line_num + 1) + " ends with a ':' while it should list a class value")
elif has_another_dimension and not target_labels:
if len(instance_list) < (this_line_num_dimensions + 1):
instance_list.append([])
instance_list[this_line_num_dimensions].append(pd.Series(dtype=np.float32))
this_line_num_dimensions += 1
num_dimensions = this_line_num_dimensions
# If this is the 1st line of data we have seen then note the dimensions
if not has_another_value and not has_another_dimension:
if num_dimensions is None:
num_dimensions = this_line_num_dimensions
if num_dimensions != this_line_num_dimensions:
raise TsFileParseException("line " + str(
line_num + 1) + " does not have the same number of dimensions as the previous line of data")
# Check that we are not expecting some more data, and if not, store that processed above
if has_another_value:
raise TsFileParseException(
"dimension " + str(this_line_num_dimensions + 1) + " on line " + str(
line_num + 1) + " ends with a ',' that is not followed by another tuple")
elif has_another_dimension and target_labels:
raise TsFileParseException(
"dimension " + str(this_line_num_dimensions + 1) + " on line " + str(
line_num + 1) + " ends with a ':' while it should list a class value")
elif has_another_dimension and not target_labels:
if len(instance_list) < (this_line_num_dimensions + 1):
instance_list.append([])
instance_list[this_line_num_dimensions].append(pd.Series())
this_line_num_dimensions += 1
num_dimensions = this_line_num_dimensions
# If this is the 1st line of data we have seen then note the dimensions
if not has_another_value and num_dimensions != this_line_num_dimensions:
raise TsFileParseException("line " + str(
line_num + 1) + " does not have the same number of dimensions as the previous line of data")
# Check if we should have class values, and if so that they are contained in those listed in the metadata
if target_labels and len(class_val_list) == 0:
raise TsFileParseException("the cases have no associated class values")
else:
dimensions = line.split(":")
# If first row then note the number of dimensions (that must be the same for all cases)
if is_first_case:
num_dimensions = len(dimensions)
if target_labels:
num_dimensions -= 1
for dim in range(0, num_dimensions):
instance_list.append([])
is_first_case = False
# See how many dimensions that the case whose data in represented in this line has
this_line_num_dimensions = len(dimensions)
if target_labels:
this_line_num_dimensions -= 1
# All dimensions should be included for all series, even if they are empty
if this_line_num_dimensions != num_dimensions:
raise TsFileParseException("inconsistent number of dimensions. Expecting " + str(
num_dimensions) + " but have read " + str(this_line_num_dimensions))
# Process the data for each dimension
for dim in range(0, num_dimensions):
dimension = dimensions[dim].strip()
if dimension:
data_series = dimension.split(",")
data_series = [float(i) for i in data_series]
instance_list[dim].append(pd.Series(data_series))
else:
instance_list[dim].append(pd.Series())
if target_labels:
class_val_list.append(float(dimensions[num_dimensions].strip()))
line_num += 1
# Check that the file was not empty
if line_num:
# Check that the file contained both metadata and data
complete_regression_meta_data = has_problem_name_tag and has_timestamps_tag and has_univariate_tag and has_target_labels_tag and has_data_tag
complete_classification_meta_data = has_problem_name_tag and has_timestamps_tag and has_univariate_tag and has_class_labels_tag and has_data_tag
if metadata_started and not complete_regression_meta_data and not complete_classification_meta_data:
raise TsFileParseException("metadata incomplete")
elif metadata_started and not data_started:
raise TsFileParseException("file contained metadata but no data")
elif metadata_started and data_started and len(instance_list) == 0:
raise TsFileParseException("file contained metadata but no data")
# Create a DataFrame from the data parsed above
data = pd.DataFrame(dtype=np.float32)
for dim in range(0, num_dimensions):
data['dim_' + str(dim)] = instance_list[dim]
# Check if we should return any associated class labels separately
if target_labels:
if return_separate_X_and_y:
return data, np.asarray(class_val_list)
else:
data['class_vals'] = pd.Series(class_val_list)
return data
else:
return data
else:
raise TsFileParseException("empty file")