-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreTreat.py
60 lines (48 loc) · 1.83 KB
/
preTreat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def cutOffByMin(minNum):
    """Keep only the rows whose row-wise mean exceeds ``minNum`` in every frame.

    Works on the module-level ``data`` object: reads ``data.typeNum`` and
    replaces each ``data.trainDataFrames[i]`` with the subset of rows whose
    labels survive the threshold in *all* ``typeNum`` frames.
    The frames are assumed to be pandas DataFrames sharing a row index
    (presumably one row per feature -- confirm against the loader).

    Args:
        minNum: Exclusive lower bound for the mean of a row.

    Returns:
        None. The frames on ``data`` are replaced and the shape of the first
        frame is printed.
    """
    remainIndexs = None
    for i in range(data.typeNum):
        # One mean per row (axis=1 averages across the columns).
        meanFt = data.trainDataFrames[i].mean(axis=1)
        remainIndex = meanFt[meanFt > minNum].index
        if remainIndexs is None:
            remainIndexs = remainIndex
        else:
            # Use the explicit set operation: `&` between Index objects is
            # no longer a set intersection in pandas >= 2.0.
            remainIndexs = remainIndexs.intersection(remainIndex)
    # Apply the shared row selection to every frame so they stay aligned.
    for i in range(data.typeNum):
        data.trainDataFrames[i] = data.trainDataFrames[i].loc[remainIndexs]
    print(data.trainDataFrames[0].shape)
    return
def cutOffByMeanLog(remainRatio, cachePath):
    """Keep only the rows whose std/mean ratio is below ``remainRatio`` in every frame.

    Works on the module-level ``data`` object: for each of the
    ``data.typeNum`` frames it computes the row-wise standard deviation
    divided by the row-wise mean, draws a KDE of that ratio, and collects the
    row labels whose ratio is below ``remainRatio``. Only labels that pass in
    *all* frames are kept. The combined KDE plot is written to
    ``cachePath + "picDataDistribute.png"``.

    Args:
        remainRatio: Exclusive upper bound for std/mean of a row.
        cachePath: String prefix for the output picture; it is concatenated
            as-is, so it presumably needs a trailing path separator.

    Returns:
        None. The frames on ``data`` are replaced and the shape of the first
        frame is printed.
    """
    remainIndexs = None
    for i in range(data.typeNum):
        meanPre = data.trainDataFrames[i].mean(axis=1)  # row-wise mean
        # Row-wise std relative to the mean (coefficient of variation).
        varPre = data.trainDataFrames[i].std(axis=1) / meanPre
        # Draw this frame's distribution onto the current figure.
        varPre.plot(kind="kde", xlim=[0, 2])
        # Keep the rows whose ratio is strictly below remainRatio.
        remainIndex = varPre[varPre < remainRatio].index
        if remainIndexs is None:
            remainIndexs = remainIndex
        else:
            # Use the explicit set operation: `&` between Index objects is
            # no longer a set intersection in pandas >= 2.0.
            remainIndexs = remainIndexs.intersection(remainIndex)
    plt.savefig(cachePath + "picDataDistribute.png", dpi=600)
    # Close the figure so a later call starts from a clean canvas instead of
    # drawing on top of these curves (and so figures do not accumulate).
    plt.close()
    # Apply the shared row selection to every frame so they stay aligned.
    for i in range(data.typeNum):
        data.trainDataFrames[i] = data.trainDataFrames[i].loc[remainIndexs]
    print(data.trainDataFrames[0].shape)
    return
def toLog():
    """Replace every frame on the module-level ``data`` with ``log(x + 1)`` of itself.

    Reads ``data.typeNum`` and overwrites ``data.trainDataFrames[idx]`` for
    each index in that range. Returns None.
    """
    for idx in range(data.typeNum):
        # Shift by one first so that a zero entry maps to log(1) == 0.
        shifted = data.trainDataFrames[idx] + 1
        data.trainDataFrames[idx] = np.log(shifted)
def preTreatRun(dataCache, minNum, remainRatio, cachePath):
    """Run the three pre-treatment steps on ``dataCache`` and return it.

    Binds ``dataCache`` to the module-level name ``data`` (which the step
    functions read), then runs, in order: the minimum-mean filter, the
    std/mean filter (which also saves a plot under ``cachePath``), and the
    log transform. A progress line is printed before each step.

    Args:
        dataCache: Object exposing ``typeNum`` and ``trainDataFrames``.
        minNum: Threshold forwarded to ``cutOffByMin``.
        remainRatio: Threshold forwarded to ``cutOffByMeanLog``.
        cachePath: Output path prefix forwarded to ``cutOffByMeanLog``.

    Returns:
        The module-level ``data`` object after all steps have run.
    """
    global data
    data = dataCache  # the step functions below operate on this global

    # Step 1: drop rows with a low mean.
    print("\n1_1 pretreat min by", minNum)
    cutOffByMin(minNum)

    # Step 2: drop rows with a high std/mean ratio.
    print("1_2 remain std by", remainRatio)
    cutOffByMeanLog(remainRatio, cachePath)

    # Step 3: log-transform what is left.
    print("1_3 pretreat std as log")
    toLog()

    return data