-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfunctions.py
122 lines (111 loc) · 2.72 KB
/
functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#functions
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Activation
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import load_model
#read fasta
def read_fasta(fa):
    """Yield ``(header, sequence)`` pairs from an iterable of FASTA lines.

    Args:
        fa: Iterable of text lines (for example an open file handle).

    Yields:
        Tuples ``(header, sequence)``. ``header`` is the stripped header
        line, including its leading ``>``. ``sequence`` is the
        concatenation of every stripped line that followed the header up
        to the next header or the end of input.

    Lines that appear before the first header are discarded.
    """
    header = None
    chunks = []
    for raw in fa:
        text = raw.strip()
        if not text.startswith(">"):
            # Sequence line: collect it for the record currently open.
            chunks.append(text)
            continue
        # A new header closes the previous record, if there was one.
        if header:
            yield (header, ''.join(chunks))
        header = text
        chunks = []
    # Flush the last record, which has no following header to close it.
    if header:
        yield (header, ''.join(chunks))
#count kmer
def countoverlap(seq, kmer):
    """Return how many positions of ``seq`` start an occurrence of ``kmer``.

    Occurrences may overlap: ``countoverlap("AAAA", "AA")`` is 3. Every
    index ``0 .. len(seq) - 1`` is tested, so an empty ``kmer`` matches at
    each of them and the result equals ``len(seq)``.
    """
    positions = range(len(seq))
    return sum(1 for start in positions if seq.startswith(kmer, start))
#get the kmer
def _build_kmer_table():
    """Build the ordered k-mer feature list (k = 1..5) and its reverse index."""
    from itertools import product  # local import: no new file-level dependency

    nucleotides = "ACGT"
    # Every 1- to 4-mer, shortest first, each length in lexicographic order.
    tails = [
        "".join(combo)
        for k in range(1, 5)
        for combo in product(nucleotides, repeat=k)
    ]
    kmers = []
    for lead in nucleotides:
        # Feature order per leading base: the base itself, then base + tail.
        kmers.append(lead)
        kmers.extend(lead + tail for tail in tails)
    return kmers, {kmer: pos for pos, kmer in enumerate(kmers)}


# The table is identical for every sequence, so build it once at import time
# instead of on every call.
_KMERS, _KMER_INDEX = _build_kmer_table()
_MAX_K = 5


def get_kmer(seq):
    """Return the normalised k-mer feature vector of ``seq``.

    The vector has one entry per A/C/G/T k-mer with k = 1..5 (1364 entries).
    Entries are grouped by leading base (A, C, G, T); within a group the
    single base comes first, followed by that base prefixed to every 1- to
    4-mer in lexicographic order.

    Each entry starts as the overlapping occurrence count of its k-mer in
    ``seq``. Two positional bonuses are then added:

    * if ``seq`` starts with ``T``, each of ``seq[0:1] .. seq[0:5]`` gets +1;
    * if ``seq[9]`` is ``A``, each of ``seq[9:10] .. seq[5:10]`` and
      ``seq[9:11] .. seq[9:14]`` gets +1.

    For short sequences several of these slices are the same string, so the
    same k-mer receives several bonuses. Finally every entry is divided by
    the total occurrence count taken *before* the bonuses were added.

    Args:
        seq: Nucleotide sequence as a string. Only upper-case ``A``, ``C``,
            ``G`` and ``T`` contribute; fragments containing any other
            character (``N``, IUPAC codes, lower case) are ignored.

    Returns:
        A list of 1364 floats. If ``seq`` contains no countable k-mer at all
        (empty or entirely non-ACGT) the list is all zeros instead of
        raising ``ZeroDivisionError``.
    """
    counts = [0] * len(_KMERS)
    total = 0.0
    seq_len = len(seq)

    # Single pass: look up every window of length 1..5 at each position. This
    # yields the same overlapping counts as scanning the sequence per k-mer.
    for start in range(seq_len):
        longest = min(_MAX_K, seq_len - start)
        for k in range(1, longest + 1):
            pos = _KMER_INDEX.get(seq[start:start + k])
            if pos is not None:
                counts[pos] += 1
                total += 1

    if not total:
        # Nothing countable: avoid dividing by zero below.
        return [0.0] * len(counts)

    # Positional bonuses. Slices are taken verbatim, so out-of-range ends are
    # simply truncated by Python's slicing rules.
    bonus = []
    if seq.startswith("T"):
        bonus.extend(seq[0:end] for end in range(1, 6))
    if seq[9:10] == "A":
        bonus.extend(seq[begin:10] for begin in range(9, 4, -1))
        bonus.extend(seq[9:end] for end in range(11, 15))

    for fragment in bonus:
        # Fragments with ambiguous or unknown bases have no feature slot.
        pos = _KMER_INDEX.get(fragment)
        if pos is not None:
            counts[pos] += 1

    # Normalise by the pre-bonus total so bonuses raise a feature's weight.
    return [count / total for count in counts]
#prediction
def prediction(dat, sp):
    """Load the saved model selected by ``sp`` and predict classes for ``dat``.

    Args:
        dat: Feature matrix passed straight to the model's
            ``predict_classes``.
            # NOTE(review): presumably rows of ``get_kmer`` features; confirm
            # against the caller.
        sp: Model selector. ``1`` -> ``Ele_piRNN.h5``, ``2`` ->
            ``Dro_piRNN.h5``, ``3`` -> ``Rat_piRNN.h5``, ``4`` ->
            ``Hum_piRNN.h5``. The files are loaded relative to the current
            working directory.

    Returns:
        Whatever ``model.predict_classes(dat, verbose=0)`` returns.

    Raises:
        ValueError: If ``sp`` is not one of the supported selectors. This is
            raised before any model file is touched.
    """
    model_files = {
        1: 'Ele_piRNN.h5',
        2: 'Dro_piRNN.h5',
        3: 'Rat_piRNN.h5',
        4: 'Hum_piRNN.h5',
    }
    try:
        model_path = model_files.get(sp)
    except TypeError:
        # Unhashable selector (e.g. a list) can never be a valid choice.
        model_path = None
    if model_path is None:
        raise ValueError(
            "unsupported species selector %r; expected one of %s"
            % (sp, sorted(model_files))
        )

    model = load_model(model_path)
    # NOTE(review): ``predict_classes`` exists only on older Keras Sequential
    # models; newer releases removed it. Kept as-is because the replacement
    # depends on the model's output layer, which is not visible here.
    Y = model.predict_classes(dat, verbose=0)
    return Y
#output
def output(Y_pre, ids, dics):
    """Return the entries of ``dics`` whose prediction equals 1.

    Args:
        Y_pre: Sequence of predicted labels, aligned by position with ``ids``.
        ids: Sequence of keys; ``ids[i]`` belongs to ``Y_pre[i]``.
        dics: Mapping from each id to its associated value.

    Returns:
        A new dict ``{id: dics[id]}`` holding only the ids whose predicted
        label compares equal to 1.
    """
    return {
        ids[pos]: dics[ids[pos]]
        for pos, label in enumerate(Y_pre)
        if label == 1
    }