forked from sergeyf/ki_43A
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path01_import_data.py
134 lines (119 loc) · 4.13 KB
/
01_import_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 5 13:40:03 2021
@author: sergey feldman
"""
import pickle
import pandas as pd
# import data from excel files
# note that all values of 99, 999, 9999 will be treated as missing
# some of the 9s are also missing for certain covariates, but these are left alone
# because many columns have "real" 9s and we don't want to NaN those
df = pd.read_excel("data/DataRequest_Sanchez-Alonso_12.10.20.xlsx", na_values=[99, "99", 999, "999", 9999, "9999"])
# import train/test split
test_train = pd.read_excel("data/BEAN_testing_training_n130.xlsx")
# this entire sprint will not touch the test data, so all we need is the training data
train_ids = test_train.loc[test_train["Dataset"] == "Training", "ID"]
df = df[df["FSID"].isin(train_ids)]
# the TENSION scale variables
tension_covariates = [
"feelsad",
"noapptit",
"mmryloss",
"frustrat",
"insomnia",
"coldbody",
"runaway",
"headache",
"afraid",
"feeltird",
"helpless",
"shakines",
"sexprob",
"priodprb",
"bealone",
"painbody",
"homesick",
"feelhot",
"vagdisch",
"feeldizz",
"hartpalp",
"losscntr",
"brethles",
"hairwhit",
"wtchnge",
]
"""
AGEM_AN03 has the actual age of the child for the measurement stored in HAZ_AN01
we have to exclude HAZ values that happened after the MULLEN scores at 6m and 24m
for predicting 6m mullen, can use: HAZ_AN01 through HAZ_AN03 (HAZ_AN04 can happen at 6m or after)
for predicing 24m mullen, can use: HAZ_AN01 through HAZ_AN09 (HAZ_AN10 can happen at 24m or after)
caveat: HAZ_AN07 has missing values, so we're excluding that one
"""
haz_covariates_for_6m = [f"HAZ_AN0{i}" for i in [1, 2, 3]]
df["HAZ_mean_up_to_6m"] = df[haz_covariates_for_6m].mean(axis=1)
df["HAZ_min_up_to_6m"] = df[haz_covariates_for_6m].min(axis=1)
df["HAZ_max_up_to_6m"] = df[haz_covariates_for_6m].max(axis=1)
haz_covariates_for_24m = [f"HAZ_AN0{i}" for i in [1, 2, 3, 4, 5, 6, 8, 9]]
df["HAZ_mean_up_to_24m"] = df[haz_covariates_for_24m].mean(axis=1)
df["HAZ_min_up_to_24m"] = df[haz_covariates_for_24m].min(axis=1)
df["HAZ_max_up_to_24m"] = df[haz_covariates_for_24m].max(axis=1)
other_covariates = [
"wall_1", # categorical but only 2 unique values (3 and 4) so doesn't need special codeing
"medu_1",
"fedu_1",
"inco_1",
"SEX",
"room_1",
"WT_1",
"HT_1",
"MUAC_1",
]
output_covariates_6m = [
"gmraw_6",
"gmtsc_6",
"vrraw_6",
"vrtsc_6",
"fmraw_6",
"fmtsc_6",
"rlraw_6",
"rltsc_6",
"elraw_6",
"eltsc_6",
]
output_covariates_24m = [
"gmraw_24",
"gmtsc_24",
"vrraw_24",
"vrtsc_24",
"fmraw_24",
"fmtsc_24",
"rlraw_24",
"rltsc_24",
"elraw_24",
"eltsc_24",
]
"""
now we can put together all the covariates for 6m and 24m and save for easy access
"""
input_covariates_6m = (
tension_covariates + other_covariates + ["HAZ_mean_up_to_6m", "HAZ_min_up_to_6m", "HAZ_max_up_to_6m"]
)
output_covariates_6m_raw = [i for i in output_covariates_6m if "raw" in i]
df["mullen_6m_raw_average"] = df[output_covariates_6m_raw].mean(1) # simple average
output_covariates_6m_tsc = [i for i in output_covariates_6m if "tsc" in i]
df["mullen_6m_tsc_average"] = df[output_covariates_6m_tsc].mean(1) # simple average
df_6m = df[input_covariates_6m + ["FSID", "mullen_6m_raw_average", "mullen_6m_tsc_average"]]
input_covariates_24m = (
tension_covariates + other_covariates + ["HAZ_mean_up_to_24m", "HAZ_min_up_to_24m", "HAZ_max_up_to_24m"]
)
output_covariates_24m_raw = [i for i in output_covariates_24m if "raw" in i]
df["mullen_24m_raw_average"] = df[output_covariates_24m_raw].mean(1) # simple average
output_covariates_24m_tsc = [i for i in output_covariates_24m if "tsc" in i]
df["mullen_24m_tsc_average"] = df[output_covariates_24m_tsc].mean(1) # simple average
df_24m = df[input_covariates_24m + ["FSID", "mullen_24m_raw_average", "mullen_24m_tsc_average"]]
# not all mullen scores are available for 24m
df_24m = df_24m[~df_24m["mullen_24m_raw_average"].isnull()]
# save to disk
with open("data/processed_data.pickle", "wb") as f:
pickle.dump((df_6m, input_covariates_6m, df_24m, input_covariates_24m), f)