-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathPrediction.R
228 lines (212 loc) · 8.46 KB
/
Prediction.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
library(e1071)
library(DMwR)
library(outliers)
library(ggplot2)
library("ROCR")
library(caret)
library(Boruta)
library(igraph)
library(ape)
library(phangorn)
# Load the project helper script. The helpers called below that are not
# defined in this file (allDescendants, PrepareDataRemoveOutliers,
# TuneParametersR) presumably come from here -- TODO confirm.
source("~/traintest_H3N2")
setwd("~/ALT0")

# Load the tree; the .Rdata file is expected to provide `flutree`.
load("~/flutree2018-5.Rdata")
tree <- flutree
# Height (distance from the root along branch lengths) of every node.
allHeights <- node.depth.edgelength(tree)
allD <- allDescendants(tree)

# Read the dataset (one row per clade, with its features) and drop the first
# column of the CSV.
df <- read.csv("df_2018-5.csv", sep = ",", header = TRUE, stringsAsFactors = FALSE)
df <- df[, 2:ncol(df)]
names(df) <- c("Clade","numberTipsClade","numberTipsTrimmed","sackin","colless","Variance","I2","B1","B2",
"avgLadder","ILnumber","pitchforks","maxHeight","MaxWidth","DelW","Stairs1","Stairs2","Cherries",
"BS","descinm","getstattest","skewness","kurtosis","MeanPairwiseDist","MaxPairwiseDist", "diameter",
"WienerIndex", "betweenness", "closeness", "eigenvector","MeadianEp","MaxEp","MeanEp",
"numberTipsTrimmed_3.4","Labels")

# Binarise the labels: values <= 1.1 become 0, everything else becomes 1.
# A logical mask is used instead of `[-ind]`, because negative indexing with an
# empty `ind` selects nothing and would leave the remaining labels untouched.
ind <- which(df$Labels <= 1.1)
length(ind)
is_low <- seq_len(nrow(df)) %in% ind
df$Labels[is_low] <- 0
df$Labels[!is_low] <- 1
round(length(which(df$Labels == 1)) / nrow(df), 1)

# Find the clades labelled 0 whose root lies within the last 3.4 time units of
# the tree. Their labels are uncertain because they have not had enough time
# to grow.
tree_height <- max(allHeights)
is_uncertain <- logical(nrow(df))
for (i in seq_len(nrow(df))) {
  if (df$Labels[i] == 0) {
    print(i)
    root <- df$Clade[i]
    if (tree_height - allHeights[root] <= 3.4) {
      is_uncertain[i] <- TRUE
    }
  }
}
res <- which(is_uncertain)
length(res)

# Keep the identifying columns of the uncertain clades for the report.
DD <- df[res, c(1, 2, 3, 34, 35)]
# Drop the columns that are not used as features: the first two columns and
# then column 32 of what remains ("numberTipsTrimmed_3.4").
df <- df[, 3:ncol(df)]
df <- df[, -32]
# The uncertain clades are predicted later with the trained model ...
df_predict <- df[res, ]
# ... and are excluded from both the training and the test data.
# setdiff() keeps every row when `res` is empty, whereas `df[-res, ]` would
# drop all of them.
df <- df[setdiff(seq_len(nrow(df)), res), ]

# Swap the two class labels if necessary (otherwise AUC is less than 0.50).
ii <- which(df$Labels == 1)
was_one <- seq_len(nrow(df)) %in% ii
df$Labels[was_one] <- 0
df$Labels[!was_one] <- 1

set.seed(123)
DataTT <- PrepareDataRemoveOutliers(df, 1)
train <- DataTT[[1]]
test <- DataTT[[2]]
# Find the best hyperparameters for the model.
B <- TuneParametersR(train, test, "linear")
#==================================================================================================
# Train the model with the selected hyperparameters.
# NOTE(review): the weights argument was previously spelled `class.weigth`,
# which svm() silently swallowed through `...`, so no class weights were ever
# applied. With the correct name the equal 0.5 weights now take effect;
# presumably they scale the cost of both classes equally -- confirm this is
# the intended setting, or drop the argument to reproduce earlier runs.
svm.fit <- svm(
  Labels ~ .,
  data = train,
  kernel = "linear", degree = 3, gamma = 0.03125,
  coef0 = 0, cost = 32, nu = 0.5,
  class.weights = c("0" = 0.5, "1" = 0.5)
)

# Accuracy on the held-out test set.
svm.prob <- predict(svm.fit, newdata = test)
agreement <- svm.prob == test$Labels
acc <- length(which(agreement)) / length(test$Labels)
round(acc, 2)

# Decision values are used as the ranking score for the ROC analysis.
svmmodel.predict <- predict(svm.fit, newdata = test, decision.values = TRUE)
svmmodel.probs <- attr(svmmodel.predict, "decision.values")
svmmodel.class <- predict(svm.fit, test, type = "class")
svmmodel.labels <- test$Labels

# ROC analysis for the test data.
svmmodel.prediction <- prediction(svmmodel.probs, svmmodel.labels)
svmmodel.performance <- performance(svmmodel.prediction, "tpr", "fpr")
svmmodel.auc <- performance(svmmodel.prediction, "auc")@y.values[[1]]
round(svmmodel.auc, 2)
plot(svmmodel.performance)
#==================================================================================================
#################################### prediction on recent clades ##################################
# Scale the held-out (uncertain) clades and predict their labels with the
# trained model.
df_predict <- scaleData(df_predict)
svm.prob <- predict(svm.fit, newdata = df_predict)
table(svm.prob)
DD <- cbind(DD, svm.prob)

# The class labels were swapped before training, so swap the predictions back.
# A logical mask is used instead of `[-ii]`: when no prediction equals 1, `ii`
# is empty and `DD$svm.prob[-ii] = 1` would assign nothing, leaving every
# prediction at 0.
ii <- which(DD$svm.prob == 1)
is_one <- seq_len(nrow(DD)) %in% ii
DD$svm.prob[is_one] <- 0
DD$svm.prob[!is_one] <- 1
write.csv(DD, "Prediction_Recent.csv")
#==================================================================================================
#################################### prediction on clades after 2016 ##############################
# Re-read the full dataset (all 35 columns, including the clade identifiers).
df <- read.csv("df_2018-5.csv", sep = ",", header = TRUE, stringsAsFactors = FALSE)
df <- df[, 2:ncol(df)]
names(df) <- c("Clade","numberTipsClade","numberTipsTrimmed","sackin","colless","Variance","I2","B1","B2",
"avgLadder","ILnumber","pitchforks","maxHeight","MaxWidth","DelW","Stairs1","Stairs2","Cherries",
"BS","descinm","getstattest","skewness","kurtosis","MeanPairwiseDist","MaxPairwiseDist", "diameter",
"WienerIndex", "betweenness", "closeness", "eigenvector","MeadianEp","MaxEp","MeanEp",
"numberTipsTrimmed_3.4","Labels")

# Binarise the labels (<= 1.1 -> 0, otherwise 1) with a mask that is also
# correct when no row satisfies the condition.
ind <- which(df$Labels <= 1.1)
length(ind)
is_low <- seq_len(nrow(df)) %in% ind
df$Labels[is_low] <- 0
df$Labels[!is_low] <- 1
round(length(which(df$Labels == 1)) / nrow(df), 1)

# Recompute the clades and keep the trimmed tip sets of the accepted ones.
myclades <- getClades2(flutree, MinTotalSize = 8, MinTrimSize = 8, TimeFrame = 1.4)
Final_trimmedClades <- myclades$trimclades[myclades$rejected == 0]
Final_trimmedClades_root <- as.numeric(names(which(myclades$rejected == 0)))

# Auxiliary per-sequence data: after the subsetting below, column 1 is matched
# against tip labels and column 3 is parsed as a date.
Auxdata <- read.csv("~/RNAP2018.csv", sep = ",", header = TRUE, stringsAsFactors = FALSE)
Auxdata <- Auxdata[, 2:4]
allHeights <- node.depth.edgelength(flutree)
max(allHeights)
hdata <- data.frame(tiplab = flutree$tip.label,
                    height = allHeights[seq_along(flutree$tip.label)])

# Find the clades whose (trimmed) tips were all sampled after 2016-01-01.
# NOTE(review): Final_trimmedClades[[i]] assumes the rows of df are in the same
# order as the accepted clades returned by getClades2 -- confirm, or look the
# entry up through Final_trimmedClades_root.
cutoff <- as.Date("2016-01-01")
after_cutoff <- logical(length(df$Clade))
for (i in seq_along(df$Clade)) {
  print(i)
  tr1 <- extract.clade(flutree, df$Clade[i])
  tr <- drop.tip(tr1, setdiff(tr1$tip.label, hdata$tiplab[Final_trimmedClades[[i]]]),
                 trim.internal = TRUE)
  ind <- match(tr$tip.label, Auxdata[, 1])
  date <- as.Date(Auxdata[ind, 3])
  if (min(date) > cutoff) {
    after_cutoff[i] <- TRUE
  }
}
res_2016 <- which(after_cutoff)
df$Clade[res_2016]
DD <- df[res_2016, c(1, 2, 3, 34, 35)]
# NOTE(review): unlike the training data, df_predict keeps all 35 columns here
# (the identifier columns and "numberTipsTrimmed_3.4" are not dropped before
# scaleData) -- confirm scaleData/predict handle the extra columns as intended.
df_predict <- df[res_2016, ]

# Prediction on the clades after 2016.
df_predict <- scaleData(df_predict)
svm.prob <- predict(svm.fit, newdata = df_predict)
table(svm.prob)
DD <- cbind(DD, svm.prob)

# The class labels were swapped before training, so swap the predictions back.
# The mask also works when no prediction equals 1 (then `[-ii]` would select
# nothing).
ii <- which(DD$svm.prob == 1)
is_one <- seq_len(nrow(DD)) %in% ii
DD$svm.prob[is_one] <- 0
DD$svm.prob[!is_one] <- 1
write.csv(DD, "Prediction_2016.csv")

# Attach the labels stored in `dd` (the table read back from "prediction_2016"
# at the end of this section) to the current clades. On a clean run `dd` does
# not exist yet, so it is loaded from that file when available; without it
# PredLab stays NA.
PredLab <- rep(NA, length(res_2016))
if (!is.data.frame(get0("dd")) && file.exists("prediction_2016")) {
  dd <- read.csv("prediction_2016")
}
PredLab
if (is.data.frame(get0("dd"))) {
  ind <- match(df$Clade[res_2016], dd$Clade)
  ii <- which(is.na(ind))
  # Keep only the clades found in dd. `ind[-ii]` would be empty whenever every
  # clade matched (ii empty), so filter with a logical mask instead.
  X <- dd[ind[!is.na(ind)], ]
  if (nrow(X) > 0) {
    ind <- match(X$Clade, DD$Clade)
    PredLab[ind] <- X$svm.prob
  }
}
DD <- cbind(DD, PredLab)
write.csv(DD, "prediction_2016")
dd <- read.csv("prediction_2016")
#==================================================================================================
############################# clades that include just 2016 strains ###############################
# Select the clades whose (trimmed) tips were all sampled within 2016.
# Uses flutree, df, hdata, Final_trimmedClades and Auxdata as left by the
# preceding sections of the script.
first_day <- as.Date("2016-01-01")
last_day <- as.Date("2016-12-31")
only_2016 <- logical(length(df$Clade))
for (i in seq_along(df$Clade)) {
  print(i)
  tr1 <- extract.clade(flutree, df$Clade[i])
  # Keep only the tips that belong to the trimmed version of this clade.
  tr <- drop.tip(tr1, setdiff(tr1$tip.label, hdata$tiplab[Final_trimmedClades[[i]]]),
                 trim.internal = TRUE)
  ind <- match(tr$tip.label, Auxdata[, 1])
  date <- as.Date(Auxdata[ind, 3])
  if (min(date) >= first_day && max(date) <= last_day) {
    only_2016[i] <- TRUE
  }
}
res_2016 <- which(only_2016)
#==================================================================================================
############################## train on the past and test on recent clades ########################
# Uses hdata, Final_trimmedClades and Auxdata as left by the "clades after
# 2016" section of the script.
load("~/flutree2018-5.Rdata")
tree <- flutree
allHeights <- node.depth.edgelength(tree)
allD <- allDescendants(tree)
setwd("~/ALT0")
df <- read.csv("df_2018-5.csv", sep = ",", header = TRUE, stringsAsFactors = FALSE)
df <- df[, 2:ncol(df)]
names(df) <- c("Clade","numberTipsClade","numberTipsTrimmed","sackin","colless","Variance","I2","B1","B2",
"avgLadder","ILnumber","pitchforks","maxHeight","MaxWidth","DelW","Stairs1","Stairs2","Cherries",
"BS","descinm","getstattest","skewness","kurtosis","MeanPairwiseDist","MaxPairwiseDist", "diameter",
"WienerIndex", "betweenness", "closeness", "eigenvector","MeadianEp","MaxEp","MeanEp",
"numberTipsTrimmed_3.4","Labels")

# Binarise the labels (<= 1.1 -> 0, otherwise 1) with a mask that is also
# correct when no row satisfies the condition.
ind <- which(df$Labels <= 1.1)
length(ind)
is_low <- seq_len(nrow(df)) %in% ind
df$Labels[is_low] <- 0
df$Labels[!is_low] <- 1
round(length(which(df$Labels == 1)) / nrow(df), 1)

# Find the clades labelled 0 whose root lies within the last 3.4 time units of
# the tree. Their labels are uncertain because they have not had enough time
# to grow.
tree_height <- max(allHeights)
is_uncertain <- logical(nrow(df))
for (i in seq_len(nrow(df))) {
  if (df$Labels[i] == 0) {
    print(i)
    root <- df$Clade[i]
    if (tree_height - allHeights[root] <= 3.4) {
      is_uncertain[i] <- TRUE
    }
  }
}
res <- which(is_uncertain)
length(res)
resCl <- df$Clade[res]

# Clades whose (trimmed) tips were all sampled after 2015-01-01 form the
# "recent" test candidates.
cutoff <- as.Date("2015-01-01")
after_cutoff <- logical(length(df$Clade))
for (i in seq_along(df$Clade)) {
  print(i)
  tr1 <- extract.clade(flutree, df$Clade[i])
  tr <- drop.tip(tr1, setdiff(tr1$tip.label, hdata$tiplab[Final_trimmedClades[[i]]]),
                 trim.internal = TRUE)
  ind <- match(tr$tip.label, Auxdata[, 1])
  date <- as.Date(Auxdata[ind, 3])
  if (min(date) > cutoff) {
    after_cutoff[i] <- TRUE
  }
}
res_2015 <- which(after_cutoff)
res_2015Cl <- df$Clade[res_2015]

# Test set: recent clades; training set: all other clades. The uncertain
# clades are removed from both.
test <- df[res_2015, ]
test <- test[!(test$Clade %in% df$Clade[res]), ]
dim(test)
# setdiff() keeps every row when `res_2015` is empty, whereas `df[-res_2015, ]`
# would drop all of them.
train <- df[setdiff(seq_len(nrow(df)), res_2015), ]
train <- train[!(train$Clade %in% df$Clade[res]), ]
dim(train)

# Stack test on top of train, drop the non-feature columns once, then split
# back by row position. seq_len() keeps the split correct when either part is
# empty.
n_test <- nrow(test)
df <- rbind(test, train)
df <- df[, 3:ncol(df)]
df <- df[, -32]
test <- df[seq_len(n_test), ]
train <- df[n_test + seq_len(nrow(df) - n_test), ]
B <- TuneParametersR(train, test, "linear")