forked from bio-ontology-research-group/deepgoplus
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaminoacids.py
40 lines (36 loc) · 1.09 KB
/
aminoacids.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import numpy as np
AALETTER = [
'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
AANUM = len(AALETTER)
AAINDEX = dict()
for i in range(len(AALETTER)):
AAINDEX[AALETTER[i]] = i + 1
INVALID_ACIDS = set(['U', 'O', 'B', 'Z', 'J', 'X', '*'])
MAXLEN = 2000
NGRAMS = {}
for i in range(20):
for j in range(20):
for k in range(20):
ngram = AALETTER[i] + AALETTER[j] + AALETTER[k]
index = 400 * i + 20 * j + k + 1
NGRAMS[ngram] = index
def is_ok(seq):
for c in seq:
if c in INVALID_ACIDS:
return False
return True
def to_ngrams(seq):
l = min(MAXLEN, len(seq) - 3)
ngrams = np.zeros((l,), dtype=np.int32)
for i in range(l):
ngrams[i] = NGRAMS.get(seq[i: i + 3], 0)
return ngrams
def to_onehot(seq, start=0):
onehot = np.zeros((MAXLEN, 21), dtype=np.int32)
l = min(MAXLEN, len(seq))
for i in range(start, start + l):
onehot[i, AAINDEX.get(seq[i - start], 0)] = 1
onehot[0:start, 0] = 1
onehot[start + l:, 0] = 1
return onehot