-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathNA_mining_utils.R
122 lines (116 loc) · 4.97 KB
/
NA_mining_utils.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
library(ape)
library(phangorn)
library(seqinr)
library(stringr)
library(e1071)
# Extracting subtrees in tree form with pruning using Maryam's code
#' @param flutree Tree full tree file
#' @param myclades Output from getClades2() function
#' @return List of phylogenetic trees with names as the root node
tree_trim <- function(flutree, myclades){
# getting clades without rejection
Final_trimmedClades=myclades$trimclades[myclades$rejected==0]
# get root ids from final_trimmed_clades
Final_trimmedClades_root=as.numeric(names(which(myclades$rejected==0)))
# Get branch lengths
allHeights=node.depth.edgelength(flutree); max(allHeights)
# grab all tips and their lengths
hdata=data.frame(tiplab=flutree$tip.label, height=allHeights[1:length(flutree$tip.label)])
# remove subtrees without proper growth
print(max(allHeights)-allHeights[Final_trimmedClades_root[314]] <= 1.4)
res=numeric()
for(i in 1:length(Final_trimmedClades_root)){
if((max(allHeights)-allHeights[Final_trimmedClades_root[i]]) <= 1.4){
res=c(res,i)
}
}
Final_root=Final_trimmedClades_root[-res]
result <- list()
for (i in match(Final_root, Final_trimmedClades_root)){
if (i %% 5 == 0){print(i)}
tr1 <- extract.clade(flutree,Final_trimmedClades_root[i]) # get all from root
tr <- drop.tip(tr1,setdiff(tr1$tip.label,hdata$tiplab[Final_trimmedClades[[i]]]), trim.internal = TRUE)
root <- Final_trimmedClades_root[i]
result[[as.character(root)]] <- tr
}
return(result)
}
#' Function to map HA sample names to NA sample names
#' @param HA HA index, as a string (can get from)
HA_to_NA <- function(HA, map_file){
idx <- which(map_file[,2] == paste0(">",HA,"_A"))
output <- stringr::str_match(map_file[idx,1], regex(">(.*?)_A"))[2]
return(output)
}
#' Extracting features from NA sequences using subtrees
#' @param subtree The subtree as a phylo object
#' @param alignment_file The alignment file in fasta format and loaded as read.fasta from the seqinr package
#' @param alignment_key The clade key csv file for HA alignments with sample names as the first column.
#' Sequence of samples should be the same as that of the HA alignment fasta file
#' @param map_file The mapping file of samples between HA and NA, generated courtesy of Pak. HA second column, NA first column
#' @return A vector of length 6 with min, max and median/mean of hamming distance and GC content respectively
get_subtree_NA_feat<- function(subtree, alignment_file, alignment_key, map_file){
labs <- subtree$tip.label
# get index matching to alignment_file
match_idx <- numeric(length(labs))
for (i in 1:length(labs)){
NA_lab <- HA_to_NA(HA = labs[i], map_file = map_file)
if (!is.na(NA_lab)){
match_idx[i] <- which(alignment_key[,1] == HA_to_NA(HA = labs[i],map_file = map_file))
}
}
if (length(which(match_idx != 0)) == 0){ # means that there is no match between HA and NA
return(rep(NA,6))
} else {
match_idx <- match_idx[match_idx != 0] # only grab non-zero
# generating a matrix of sequence where each letter is it's own element in a vector
seq_mat <- c()
for (i in 1:length(match_idx)){
seq_mat <- rbind(seq_mat, getSequence(alignment_file[match_idx[i]])[[1]])
}
if (nrow(seq_mat) == 1){ # there is only one match
gc_content <- rep(seqinr::GC(seq_mat[1,]),3)
distance <- c(0,0,0)
output <- c(distance, gc_content)
return(output)
} else {
gc_content <- t(apply(seq_mat, 1, function(x){
seqinr::GC(x)
}))
distance <- as.vector(e1071::hamming.distance(seq_mat))
distance <- distance[distance != 0] # remove 0s since they are identical sequences non informative
output <- c(min(distance), max(distance), median(distance), min(gc_content), max(gc_content), mean(gc_content))
return(output)
}
}
}
#' Modified clade feature function from Maryam's script
#' @param tr The NA tree of interest
#' @return vector of 27 features similar to all subtrees
Clade_features_modified=function(tr){
features <- numeric()
features[1]=sackin(as.treeshape(tr),"pda")
features[2]=colless(as.treeshape(tr),"pda")
features[3]=var(node.depth.edgelength(tr)[1:length(tr$tip.label)])
features[4]=computeI2(tr)
features[5]=computeB1(tr)
features[6]=computeB2(tr)
features[7]=avgLadder(tr, normalise = TRUE)
features[8]=ILnumber(tr, normalise = TRUE)
features[9]=pitchforks(tr, normalise = TRUE)
features[10]=maxHeight(tr, normalise = TRUE)
features[11]=computeMaxWidth(tr)
features[12]=computeDelW(tr)
features[13]=computeStairs1(tr)
features[14]=computeStairs2(tr)
features[15]=computeCherries(tr, DOUBLE = FALSE)
features[16]=BS(tr)
features[17]=descinm(tr,m=2)$W
features[18]=getstattest(tr)$W
features[19]=skewness(i_bl(tr))
features[20]=kurtosis(i_bl(tr))
features[21]=tips_pairwise_distance(tr)
features[22]=tips_pairwise_distance_max(tr)
features[23:27]=computeNetworkStats (tr, weight = FALSE, meanpath = FALSE, maxOnly = TRUE)
return(features)
}