-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfeatures_type.py
227 lines (181 loc) · 6.77 KB
/
features_type.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
"""Functions to load or set the type of features of the databases."""
import pandas as pd
import os
from time import time
import database
from database.constants import METADATA_PATH
# Directory receiving a timestamped copy of every dump of features' types
# (see the backup step at the end of _dump_feature_types).
backup_dir = 'backup/'
# Created at import time; the path is relative, so it resolves against the
# current working directory of the process importing this module.
os.makedirs(backup_dir, exist_ok=True)
def _ask_feature_type_df(df):
"""Ask to the user the types of the feature of the data frame.
Available types:
0 - Categorical
1 - Ordinal
2 - Continue • real
3 - Continue • integer
4 - Date timestamp
5 - Date exploded
6 - Binary
-1 - Not a feature
Parameters:
-----------
df : pandas.DataFrame
The data frame from which to determine the feature (=columns) types.
Returns:
--------
pandas.Series
Series with df.columns as index and integers as values.
"""
types = pd.Series(0, index=df.columns)
print(
'\n'
' -------------------------------------------------------\n'
'|Set the type of features in the data frame. |\n'
'|-------------------------------------------------------|\n'
'|Type an integer in [-1, 6] to set a category. |\n'
'|Leave empty to select default choice [bracketed]. |\n'
'|Type "end" to exit and set all unaswered to default. |\n'
'--------------------------------------------------------'
)
for i, feature in enumerate(df.columns):
# Ask the feature type to the user
while True:
t = input(
f'\n\n'
f'Feature (ID: {i}): {feature}\n\n'
f'Type? [0 - Categorical]\n'
f' 1 - Ordinal\n'
f' 2 - Continue • real\n'
f' 3 - Continue • integer\n'
f' 4 - Date timestamp\n'
f' 5 - Date exploded\n'
f' 6 - Binary\n'
f' -1 - Not a feature\n'
)
# By typing 'end', the unanswered are set to default
if t == 'end':
return types
# Convert empty (default) to 0 - Categorical
if t == '':
t = 0
# Try to convert user's input to integer
try:
t = int(t)
except ValueError:
pass
# Check if the integer is in the good range
if isinstance(t, int) and t <= 6 and t >= -1:
break # t matchs all conditions, so break the loop
print('\nError: enter an integer in [-1, 6] or type "end".')
types[feature] = t
return types
def ask_feature_type_helper():
    """Interactively set and dump the features' types of chosen data frames.

    The user is first asked for a database (among the acronyms of NHIS and
    TB), then for one of its data frames. The types of the features of that
    data frame are then asked with _ask_feature_type_df and dumped, without
    anonymization, with _dump_feature_types. The helper then asks for
    another data frame of the same database, until the user types "none"
    (to change database) or "exit" (to leave the helper).
    """
    nhis, tb = database.NHIS(), database.TB()
    available_db_names = [db.acronym for db in [nhis, tb]]

    # None means that no database is currently selected. Keeping the
    # selection across iterations prevents from asking the database again
    # when the user failed on the second input (the data frame name).
    db_name = None

    while True:
        if db_name is None:
            db_name = input(
                f'\n'
                f'Which database do you want to set the features\' types?\n'
                f'Available choices: {available_db_names}\n'
                f'Type "exit" to end.\n'
            )

        # Load appropriate database
        if db_name == nhis.acronym:
            db = nhis
        elif db_name == tb.acronym:
            db = tb
        elif db_name == 'exit':
            return
        else:
            print(f'\nAnswer must be in {available_db_names}')
            db_name = None  # Invalid answer: ask the database again
            continue

        available_df_names = db.df_names()

        df_name = input(
            f'\n'
            f'Which data frame of {db_name} do you want to set the '
            f'features\' types?\n'
            f'Available: {available_df_names}\n'
            f'Type "none" to change database.\n'
            f'Type "exit" to end.\n'
        )

        if df_name == 'none':
            db_name = None  # Forget the selection to ask the database again
            continue

        if df_name == 'exit':
            return

        if df_name not in available_df_names:
            print(f'\nAnswer must be in {available_df_names}')
            continue  # db_name is kept: only the data frame is asked again

        df = db[df_name]
        types = _ask_feature_type_df(df)
        _dump_feature_types(types, db, df_name, anonymize=False)
def _dump_feature_types(types, db, df_name, anonymize=True):
    """Write the features' types to csv, optionally hiding features' names.

    The series is saved in the metadata folder of the database, in a file
    named after the data frame's file (without its extension). A second,
    timestamped copy is written in the module's backup folder.

    Parameters:
    -----------
    types: pandas.Series
        Series with features' names as index and features' types as values.
    db : Database object
        Used to dump results in the right folder.
    df_name : string
        Name or path of the data frame from which has been computed the types.
        Used to dump the results in the right folder.
    anonymize : bool
        Whether to anonymize feature names or not when dumping.
        False: features' name is dumped. True: only id is dumped.

    """
    # A registered data frame name is resolved to its path; anything else
    # is taken as a path itself.
    known_paths = db.frame_paths
    source_path = known_paths[df_name] if df_name in known_paths else df_name

    # Name of the source file stripped from its extension
    stem = os.path.splitext(os.path.basename(source_path))[0]
    target_dir = f'{METADATA_PATH}features_types/{db.acronym}/'

    # Replace features' names by their position when anonymizing
    if anonymize:
        types = pd.Series(types.values, index=range(len(types)))

    # Make sure the destination folder exists before writing
    os.makedirs(target_dir, exist_ok=True)

    # Main dump
    target = target_dir + stem
    types.to_csv(f'{target}.csv', header=False)

    # Timestamped copy: the destination path is flattened into a file name
    # so that all backups live side by side in the same folder.
    flattened = target.replace("/", "_")
    stamp = f'{time():.1f}'
    types.to_csv(f'{backup_dir}{flattened}_{stamp}.csv', header=False)
def _load_feature_types(db, df_name, anonymized=True):
    """Load the features' types deanonymizing the features' names.

    The types are read from the csv file named after the data frame's file
    (without its extension), in the metadata folder of the database.

    Parameters:
    -----------
    db : Database class
        The features' database.
    df_name : string
        Name of the features' data frame. Must be a key of db.frame_paths.
    anonymized : bool
        Whether the features have been anonymized before being dumped
        (i.e no feature names but only their id).

    Returns:
    --------
    pandas.Series
        Series with features' names as index and features' types as values.

    """
    filename = os.path.basename(db.frame_paths[df_name])
    basename, _ = os.path.splitext(filename)
    filepath = f'{METADATA_PATH}features_types/{db.acronym}/{basename}.csv'

    # Load types series. The `squeeze` keyword of read_csv was removed in
    # pandas 2.0; squeezing the columns axis of the resulting frame is the
    # documented equivalent and works on older versions as well.
    types = pd.read_csv(filepath, index_col=0, header=None).squeeze('columns')

    # Deanonymize features' names: the dumped ids are replaced by the
    # columns of the data frame, relying on their order.
    if anonymized:
        types = pd.Series(types.values, index=db[df_name].columns)

    return types
if __name__ == '__main__':
    # NHIS and TB are only referenced by the commented-out examples below.
    from database import NHIS, TB

    # Run the interactive helper when the module is executed as a script.
    ask_feature_type_helper()

    # Manual usage examples of the private functions, kept for reference:
    # types = _ask_feature_type_df(NHIS['family'])
    # _dump_feature_types(types, NHIS, 'family')
    # print(_load_feature_types(NHIS, 'family'))
    # print(_load_feature_types(TB, '20000'))