-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdataset.py
142 lines (106 loc) · 3.59 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import cv2
from urllib import request as req
import numpy as np
import zipfile
import tensorflow as tf
def load_data(data_config):
    """Load the training dataset described by *data_config*.

    Parameters
    ----------
    data_config : dict
        Expects keys ``'kind'`` ('TEXT' or 'IMAGES') and ``'train_uri'``.
        For TEXT data, ``'label'`` names the target column.

    Returns
    -------
    tuple
        TEXT: ``(features, labels)`` — a DataFrame with the label column
        dropped, and the label Series.
        IMAGES: two empty lists; the archive is extracted to ``./dataset``
        on disk and read later by an image generator.
        Any other kind: two empty lists.
    """
    print('loading dataset...')
    data_type = data_config['kind']
    data = []
    label = []
    if data_type == 'TEXT':
        # train_uri may be a local path or a URL; pandas handles both.
        data = pd.read_csv(data_config['train_uri'])
        label = data[data_config['label']]
        data = data.drop(axis=1, columns=[data_config['label']])
    elif data_type == 'IMAGES':
        # Download the archive. Context managers close both the HTTP
        # response and the local file even on error (the original left
        # the response open and closed the file manually).
        with req.urlopen(data_config['train_uri']) as resp, \
                open('./dataset.zip', 'wb') as archive:
            archive.write(resp.read())
        with zipfile.ZipFile('./dataset.zip', 'r') as zip_ref:
            zip_ref.extractall('./dataset')
        data = []
        label = []
    print('load dataset successful')
    return data, label
def get_input_shape(data, shape):
    """Return *shape* with every ``None`` dimension replaced by ``-1``.

    Keras reports the batch dimension as ``None`` while ``ndarray.reshape``
    expects ``-1`` for an inferred dimension. A new list is returned so the
    caller's ``shape`` is left untouched (the original aliased and mutated
    it in place, and compared with ``== None`` instead of ``is None``).

    Parameters
    ----------
    data : unused; kept for interface compatibility with existing callers.
    shape : sequence of int or None

    Returns
    -------
    list of int
        A shape suitable for ``ndarray.reshape``.
    """
    new_shape = [-1 if dim is None else dim for dim in shape]
    print(new_shape)
    return new_shape
def normalization(data, norm):
    """Optionally scale *data* according to the ``norm`` config.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
    norm : dict
        ``'usage'`` (bool) — whether to scale at all;
        ``'method'`` — ``'MinMax'`` or ``'Standard'``.

    Returns
    -------
    The scaled ndarray, or *data* unchanged when scaling is disabled or
    the method is unrecognized.
    """
    # Guard clause replaces the original `== False` comparison.
    if not norm['usage']:
        return data
    method = norm['method']
    if method == 'MinMax':
        return MinMaxScaler().fit_transform(data)
    if method == 'Standard':
        return StandardScaler().fit_transform(data)
    # Unknown method: fall through without scaling, as the original did.
    return data
def get_dataset(data_config, model):
    """Build train/validation data shaped for *model*'s input layer.

    For IMAGES, returns two Keras directory iterators over ``./dataset``
    (populated beforehand by ``load_data``); for anything else, loads
    tabular data, splits it 70/30, normalizes, and reshapes to the model
    input shape.

    Parameters
    ----------
    data_config : dict
        Keys used here: ``'kind'``, ``'normalization'``, ``'shuffle'``.
    model : a Keras model whose first layer defines the expected input shape.

    Returns
    -------
    (data, label)
        IMAGES: ``([train_iter, valid_iter], [])`` — labels come from the
        directory structure, so the label list is empty.
        Otherwise: ``([x_train, x_val], [y_train, y_val])``.
    """
    # NOTE(review): the star-unpacking assumes output_shape is a
    # one-element container holding the shape tuple — confirm against
    # the Keras version in use.
    shape = list(*model.layers[0].output_shape)
    data, label = load_data(data_config)
    norm_type = data_config['normalization']
    if data_config['kind'] == 'IMAGES':
        # preprocessing for image data
        # 30% of the images are held out for validation via the generator.
        datagen = ImageDataGenerator(rescale=1.0/255.0, validation_split=0.3)
        color_mode = 'rgb'
        # Assumes channels-last layout: shape = (batch, height, width, channels).
        if shape[3] == 1:
            color_mode = 'grayscale'
        elif shape[3] == 4:
            color_mode = 'rgba'
        train = datagen.flow_from_directory(
            directory='./dataset',
            shuffle=data_config['shuffle'],
            subset='training',
            color_mode=color_mode,
            target_size=shape[1:3]
        )
        valid = datagen.flow_from_directory(
            directory='./dataset',
            shuffle=data_config['shuffle'],
            subset='validation',
            color_mode=color_mode,
            target_size=shape[1:3]
        )
        data = [train, valid]
        label = []
    else:
        x = np.array(data)
        y = np.array(label)
        # NOTE(review): stratify=y with shuffle=False raises in
        # scikit-learn — verify the config never disables shuffling here.
        x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3, stratify=y,
                                                          shuffle=data_config['shuffle'])
        x_train = normalization(x_train, norm_type)
        x_val = normalization(x_val, norm_type)
        # Reshape to the model input; None (batch) dims become -1.
        train = x_train.reshape(get_input_shape(x_train, shape))
        valid = x_val.reshape(get_input_shape(x_val, shape))
        data = [train, valid]
        label = [y_train, y_val]
    print(data)
    print(label)
    return data, label
def url_to_image(url, shape):
    """Fetch an image from *url*, decode it, and resize it to the model input.

    Parameters
    ----------
    url : str
        HTTP(S) location of the image.
    shape : sequence
        Model input shape, channels-last: (batch, height, width, channels).

    Returns
    -------
    numpy.ndarray
        BGR image of height ``shape[1]`` and width ``shape[2]``.
    """
    # Close the HTTP response deterministically (the original left it open).
    with req.urlopen(url) as resp:
        raw = np.asarray(bytearray(resp.read()), dtype="uint8")
    image = cv2.imdecode(raw, cv2.IMREAD_COLOR)
    # cv2.resize expects dsize as (width, height), but shape[1:3] is
    # (height, width) — the original swapped the two axes for any
    # non-square target size.
    image = cv2.resize(image, (shape[2], shape[1]))
    return image
def get_image_data_from_csv(df, shape):
    """Download every image listed in ``df['url']``, resized per *shape*.

    Parameters
    ----------
    df : pandas.DataFrame with a ``'url'`` column.
    shape : model input shape passed through to ``url_to_image``.

    Returns
    -------
    list of decoded, resized images, in the order the URLs appear.
    """
    return [url_to_image(image_url, shape) for image_url in df['url']]