# data_process.py
"""
Process the data set before inference.
"""
from hashlib import sha256
from os import path

import numpy
import torch

def get_sequences_without_embeddings(sequences, emb_dir, per_res=False):
    """
    Collecting sequences that do not have generated embeddings.
    sequences - DICT of all sequences in the input (keys are sequence ids,
        values are protein sequences)
    emb_dir - STRING that defines the directory where embeddings are saved
    per_res - BOOL that determines whether per-residue embeddings are needed
    returns DICT with sequences that lack embeddings
    """
    seqs_wo_emb = {}
    for seq_id, sequence in sequences.items():
        seq_code = sha256(sequence.encode('utf-8')).hexdigest()
        if not path.exists(f"{emb_dir}/mean_{seq_code}.pt"):
            seqs_wo_emb[seq_id] = sequence
        if per_res and not path.exists(f"{emb_dir}/per_res_{seq_code}.pt"):
            seqs_wo_emb[seq_id] = sequence
    return seqs_wo_emb
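
# A usage sketch (the ids and directory below are hypothetical, not part of
# this module):
#
#     seqs = {"seq1": "MKTAYIAKQR", "seq2": "MDNELE"}
#     missing = get_sequences_without_embeddings(seqs, "embeddings_cache",
#                                                per_res=True)
#     # "missing" keeps only the sequences whose mean (or per-residue)
#     # embedding file is absent from "embeddings_cache"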

def collect_mean_embeddings(sequences, embeddings, emb_dir, input_size=1024):
    """
    Collecting mean embeddings into a dictionary.
    sequences - DICT of all sequences in the input (keys are sequence ids,
        values are protein sequences)
    embeddings - DICT with generated embeddings. Keys are "mean_representations"
        and "per_res_representations"; each value is a DICT whose keys are
        sequence ids and whose values are embedding numpy arrays
    emb_dir - STRING that determines the path to the embeddings 'cache'
        directory
    input_size - INT that notes the dimension of each embedding vector
    returns DICT with keys "x_test" (values are embedding tensors) and
        "y_test" (values are placeholder temperature labels)
    """
    dataset = {}
    dataset['y_test'] = torch.tensor((), dtype=torch.int32)
    for i, seq_id in enumerate(sequences):
        if emb_dir and path.exists(emb_dir):
            # Loading the embedding from the cache directory
            seq_code = sha256(sequences[seq_id].encode('utf-8')).hexdigest()
            embedding = torch.load(f"{emb_dir}/mean_{seq_code}.pt")["mean_representations"]
        else:
            # Taking the freshly generated embedding
            embedding = torch.from_numpy(embeddings["mean_representations"][seq_id])
        if i:
            # Stacking each subsequent embedding under the previous ones
            dataset["x_test"] = torch.vstack((dataset["x_test"],
                                              torch.flatten(embedding)))
        else:
            dataset["x_test"] = torch.reshape(embedding, (1, input_size))
        # 999 is a placeholder label: true temperatures are unknown at inference
        dataset["y_test"] = torch.cat((dataset["y_test"],
                                       torch.tensor([999]).int()), 0)
    return dataset
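
# A usage sketch, assuming embeddings were generated upstream as numpy arrays
# keyed by sequence id (the names and shapes below are hypothetical):
#
#     embs = {"mean_representations":
#             {"seq1": numpy.zeros(1024, dtype=numpy.float32)}}
#     data = collect_mean_embeddings({"seq1": "MKTAYIAKQR"}, embs, emb_dir=None)
#     # data["x_test"] has shape (1, 1024); data["y_test"] holds one
#     # placeholder label (999) per sequence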

def collect_per_res_embeddings(sequences, original_sequences, embeddings, emb_dir,
                               input_size=1024, smoothen=False, window_size=21):
    """
    Collecting per-residue embeddings into a dictionary.
    sequences - DICT of all sequences in the input (keys are sequence ids,
        values are protein sequences)
    original_sequences - DICT with the unprocessed input sequences (keys are
        sequence ids, values are protein sequences); used to label each
        residue or window in "z_test"
    embeddings - DICT with generated embeddings. Keys are "mean_representations"
        and "per_res_representations"; each value is a DICT whose keys are
        sequence ids and whose values are embedding numpy arrays
    emb_dir - STRING that determines the path to the embeddings 'cache'
        directory
    input_size - INT that notes the dimension of each embedding vector
    smoothen - BOOL that indicates whether to apply sliding-window average
        smoothing to the embeddings
    window_size - INT width of the smoothing window (in residues)
    returns DICT with keys "x_test" (values are embedding tensors), "y_test"
        (values are fake temperature labels) and "z_test" (values are the
        residues or subsequences that the embeddings describe)
    """
    dataset = {}
    dataset['y_test'] = torch.tensor((), dtype=torch.int32)
    dataset['z_test'] = {}
    smoothened_embeddings = None
    for i, seq_id in enumerate(sequences):
        iterations_for_seq = len(sequences[seq_id])
        if emb_dir and path.exists(emb_dir):
            # Loading the per-residue embedding from the cache directory
            seq_code = sha256(sequences[seq_id].encode('utf-8')).hexdigest()
            embedding = torch.load(f"{emb_dir}/per_res_{seq_code}.pt")["per_res_representations"]
        else:
            # Taking the freshly generated embedding
            embedding = torch.from_numpy(embeddings["per_res_representations"][seq_id])
        for j in range(iterations_for_seq):
            if i == 0 and j == 0:
                dataset["x_test"] = torch.reshape(embedding[j], (1, input_size))
            else:
                dataset["x_test"] = torch.vstack((dataset["x_test"],
                                                  torch.flatten(embedding[j])))
            if not smoothen:
                dataset["y_test"] = torch.cat((dataset["y_test"],
                                               torch.tensor([999]).int()), 0)
                dataset["z_test"][f"{seq_id}_{j}"] = original_sequences[seq_id][j]
        if smoothen:
            # Rows of the current sequence start at this offset in "x_test";
            # averaging from row 0 would mix in previously processed sequences.
            offset = dataset["x_test"].shape[0] - iterations_for_seq
            for j in range(iterations_for_seq - window_size + 1):
                # Mean embedding of the window of residues [j, j+window_size)
                smoothened_embedding = dataset["x_test"][offset+j:offset+j+window_size].mean(dim=0)
                if smoothened_embeddings is None:
                    smoothened_embeddings = smoothened_embedding
                else:
                    smoothened_embeddings = torch.vstack((smoothened_embeddings,
                                                          smoothened_embedding))
                dataset["z_test"][f"{seq_id}_{j}-{j+window_size}"] = ''.join(
                    original_sequences[seq_id][j:j+window_size])
                dataset["y_test"] = torch.cat((dataset["y_test"],
                                               torch.tensor([999]).int()), 0)
    if smoothen:
        dataset["x_test"] = smoothened_embeddings
    return dataset
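
# A usage sketch of the smoothing mode (hypothetical names and shapes): for a
# 30-residue sequence and window_size=21 there are 30 - 21 + 1 = 10 windows,
# each labelled in "z_test" by its residue range:
#
#     embs = {"per_res_representations":
#             {"seq1": numpy.zeros((30, 1024), dtype=numpy.float32)}}
#     seqs = {"seq1": "M" * 30}
#     data = collect_per_res_embeddings(seqs, seqs, embs, emb_dir=None,
#                                       smoothen=True)
#     # data["x_test"] has shape (10, 1024)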

def load_tensor_from_NPZ(NPZ_file, keywords):
    """
    Loading embeddings from a file into a dictionary.
    NPZ_file - STRING path to the NPZ file
    keywords - LIST with keywords that identify which arrays of the file to load
    returns DICT with the given keywords as keys and torch tensors as values
    """
    dataset = {}
    with numpy.load(NPZ_file, allow_pickle=True) as data_loaded:
        for keyword in keywords:
            dataset[keyword] = torch.from_numpy(data_loaded[keyword])
    return dataset
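
# A usage sketch (the file name and keywords below are hypothetical; keywords
# must match array names stored in the NPZ file):
#
#     data = load_tensor_from_NPZ("embeddings.npz", ["x_test", "y_test"])
#     # data["x_test"] and data["y_test"] are torch tensors converted from
#     # the saved numpy arrays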