-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
345 lines (274 loc) · 14 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
import time
import random
import numpy as np
import streamlit as st
import selfies
from selfies import encoder, decoder
########## RDKIT & SELFIES ##########
import rdkit
from rdkit import Chem
from rdkit.Chem import MolToSmiles as mol2smi
from rdkit.Chem import MolFromSmiles as smi2mol
from rdkit.Chem import AllChem, Mol, Draw
from rdkit.Chem.AtomPairs.Sheridan import GetBPFingerprint, GetBTFingerprint
from rdkit.Chem.Pharm2D import Generate, Gobbi_Pharm2D
from rdkit.DataStructs.cDataStructs import TanimotoSimilarity
st.set_page_config(
page_title='Mutating Molecules with a Genetic Algorithm',
layout='centered',
page_icon=":octopus:",
initial_sidebar_state="expanded")
st.sidebar.header("Adjustable Parameters")
st.sidebar.write("A generation is a set of molecules to be evolved")
gen_num = st.sidebar.slider("Generations", min_value=1000, max_value=10000)
st.sidebar.write("Population size of each generation")
gen_size = st.sidebar.slider("Size", min_value=100, max_value=500)
#st.sidebar.write("")
st.sidebar.write('''A run at the lowest setting takes about 30 seconds''')
run_num = st.sidebar.slider("Number of runs", min_value = 1, max_value = 10)
def get_ECFP4(mol):
''' Return rdkit ECFP4 fingerprint object for mol
Parameters:
mol (rdkit.Chem.rdchem.Mol) : RdKit mol object
Returns:
rdkit ECFP4 fingerprint object for mol
'''
return AllChem.GetMorganFingerprint(mol, 2)
def sanitize_smiles(smi):
'''Return a canonical smile representation of smi
Parameters:
smi (string) : smile string to be canonicalized
Returns:
mol (rdkit.Chem.rdchem.Mol) : RdKit mol object (None if invalid smile string smi)
smi_canon (string) : Canonicalized smile representation of smi (None if invalid smile string smi)
conversion_successful (bool): True/False to indicate if conversion was successful
'''
try:
mol = smi2mol(smi, sanitize=True)
smi_canon = mol2smi(mol, isomericSmiles=False, canonical=True)
return (mol, smi_canon, True)
except:
return (None, None, False)
def mutate_selfie(selfie, max_molecules_len, write_fail_cases=False):
'''Return a mutated selfie string (only one mutation on slefie is performed)
Mutations are done until a valid molecule is obtained
Rules of mutation: With a 50% propbabily, either:
1. Add a random SELFIE character in the string
2. Replace a random SELFIE character with another
Parameters:
selfie (string) : SELFIE string to be mutated
max_molecules_len (int) : Mutations of SELFIE string are allowed up to this length
write_fail_cases (bool) : If true, failed mutations are recorded in "selfie_failure_cases.txt"
Returns:
selfie_mutated (string) : Mutated SELFIE string
smiles_canon (string) : canonical smile of mutated SELFIE string
'''
valid=False
fail_counter = 0
chars_selfie = get_selfie_chars(selfie)
while not valid:
fail_counter += 1
alphabet = list(selfies.get_semantic_robust_alphabet()) # 34 SELFIE characters
choice_ls = [1, 2] # 1=Insert; 2=Replace; 3=Delete
random_choice = np.random.choice(choice_ls, 1)[0]
# Insert a character in a Random Location
if random_choice == 1:
random_index = np.random.randint(len(chars_selfie)+1)
random_character = np.random.choice(alphabet, size=1)[0]
selfie_mutated_chars = chars_selfie[:random_index] + [random_character] + chars_selfie[random_index:]
# Replace a random character
elif random_choice == 2:
random_index = np.random.randint(len(chars_selfie))
random_character = np.random.choice(alphabet, size=1)[0]
if random_index == 0:
selfie_mutated_chars = [random_character] + chars_selfie[random_index+1:]
else:
selfie_mutated_chars = chars_selfie[:random_index] + [random_character] + chars_selfie[random_index+1:]
# Delete a random character
elif random_choice == 3:
random_index = np.random.randint(len(chars_selfie))
if random_index == 0:
selfie_mutated_chars = chars_selfie[random_index+1:]
else:
selfie_mutated_chars = chars_selfie[:random_index] + chars_selfie[random_index+1:]
else:
raise Exception('Invalid Operation trying to be performed')
selfie_mutated = "".join(x for x in selfie_mutated_chars)
sf = "".join(x for x in chars_selfie)
try:
smiles = decoder(selfie_mutated)
mol, smiles_canon, done = sanitize_smiles(smiles)
if len(selfie_mutated_chars) > max_molecules_len or smiles_canon=="":
done = False
if done:
valid = True
else:
valid = False
except:
valid=False
if fail_counter > 1 and write_fail_cases == True:
f = open("selfie_failure_cases.txt", "a+")
f.write('Tried to mutate SELFIE: '+str(sf)+' To Obtain: '+str(selfie_mutated) + '\n')
f.close()
return (selfie_mutated, smiles_canon)
def get_selfie_chars(selfie):
'''Obtain a list of all selfie characters in string selfie
Parameters:
selfie (string) : A selfie string - representing a molecule
Example:
>>> get_selfie_chars('[C][=C][C][=C][C][=C][Ring1][Branch1_1]')
['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[Branch1_1]']
Returns:
chars_selfie: list of selfie characters present in molecule selfie
'''
chars_selfie = [] # A list of all SELFIE sybols from string selfie
while selfie != '':
chars_selfie.append(selfie[selfie.find('['): selfie.find(']')+1])
selfie = selfie[selfie.find(']')+1:]
return chars_selfie
def get_reward(selfie_A_chars, selfie_B_chars):
'''Return the levenshtein similarity between the selfies characters in 'selfie_A_chars' & 'selfie_B_chars'
Parameters:
selfie_A_chars (list) : list of characters of a single SELFIES
selfie_B_chars (list) : list of characters of a single SELFIES
Returns:
reward (int): Levenshtein similarity between the two SELFIES
'''
reward = 0
iter_num = max(len(selfie_A_chars), len(selfie_B_chars)) # Larger of the selfie chars to iterate over
for i in range(iter_num):
if i+1 > len(selfie_A_chars) or i+1 > len(selfie_B_chars):
return reward
if selfie_A_chars[i] == selfie_B_chars[i]:
reward += 1
return reward
# Executable code for EXPERIMENT C (Three different choices):
# TIOTOXENE RUN
# N = 20 # Number of runs
# simlr_path_collect = []
# svg_file_name = 'Tiotixene_run.svg'
# starting_mol_name = 'Tiotixene'
# data_file_name = '20_runs_data_Tiotixene.txt'
# starting_smile = 'CN1CCN(CC1)CCC=C2C3=CC=CC=C3SC4=C2C=C(C=C4)S(=O)(=O)N(C)C'
# show_gen_out = False
# len_random_struct = len(get_selfie_chars(encoder(starting_smile))) # Length of the starting SELFIE structure
# CELECOXIB RUN
# N = 20 # Number of runs
# simlr_path_collect = []
# svg_file_name = 'Celecoxib_run.svg'
# starting_mol_name = 'Celecoxib'
# data_file_name = '20_runs_data_Celecoxib.txt'
# starting_smile = 'CC1=CC=C(C=C1)C2=CC(=NN2C3=CC=C(C=C3)S(=O)(=O)N)C(F)(F)F'
# show_gen_out = False
# len_random_struct = len(get_selfie_chars(encoder(starting_smile))) # Length of the starting SELFIE structure
# Troglitazone RUN
N = run_num # Number of runs
simlr_path_collect = []
svg_file_name = 'Scopolamine.svg'
starting_mol_name = 'Cannflavin A'
data_file_name = 'generated molecules.txt'
starting_smile = 'CC1=C(C2=C(CCC(O2)(C)COC3=CC=C(C=C3)CC4C(=O)NC(=O)S4)C(=C1O)C)C'
show_gen_out = False
len_random_struct = len(get_selfie_chars(encoder(starting_smile))) # Length of the starting SELFIE structure
st.title("Mutating drug molecules ")
#st.markdown("I made this app using code from the paper *Beyond Generative Models: Superfast Traversal, Optimization, Novelty, Exploration and Discovery (STONED) Algorithm for Molecules using SELFIES*. I used the molecule scopolamine as the starting structure, a phytochemical with anti-parkinsons drug properties. The genetic algorithm is capable of producing novel molecules on par with GPU powered deep learning models without the expensive resources")
st.markdown('''This app demonstrates the algorithm for novel molecule generation described in the paper *Beyond Generative Models: Superfast Traversal, Optimization, Novelty, Exploration and Discovery (STONED) Algorithm for Molecules using SELFIES*.
The algoirthm uses a starting structure, Cannflavin A, the genetic algorithm then introduces "mutations" until a structurally viable compound is created. The algorithm is capable of producing novel molecules on par with GPU powered
deep learning models without the expensive resources.") ''')
#st.sidebar.title("")
if st.sidebar.button("Generate Molecules"):
for i in range(N):
print('Run number: ', i)
with open(data_file_name, 'a') as myfile:
myfile.write('RUN {} \n'.format(i))
# celebx = 'CC1=CC=C(C=C1)C2=CC(=NN2C3=CC=C(C=C3)S(=O)(=O)N)C(F)(F)F'
starting_selfie = encoder(starting_smile)
starting_selfie_chars = get_selfie_chars(starting_selfie)
max_molecules_len = len(starting_selfie_chars)
# Random selfie initiation:
alphabet = list(selfies.get_semantic_robust_alphabet()) # 34 SELFIE characters
selfie = ''
for i in range(random.randint(1, len_random_struct)): # max_molecules_len = max random selfie string length
selfie = selfie + np.random.choice(alphabet, size=1)[0]
starting_selfie = [selfie]
print('Starting SELFIE: ', starting_selfie)
generation_size = gen_size
num_generations = gen_num
save_best = []
simlr_path = []
reward_path = []
# Initial set of molecules
population = np.random.choice(starting_selfie, size=500).tolist() # All molecules are in SELFIES representation
for gen_ in range(num_generations):
# Calculate fitness for all of them
fitness = [get_reward(starting_selfie_chars, get_selfie_chars(x)) for x in population]
fitness = [float(x)/float(max_molecules_len) for x in fitness] # Between 0 and 1
# Keep the best member & mutate the rest
# Step 1: Keep the best molecule
best_idx = np.argmax(fitness)
best_selfie = population[best_idx]
# Diplay some Outputs:
if show_gen_out:
print('Generation: {}/{}'.format(gen_, num_generations))
print(' Top fitness: ', fitness[best_idx])
print(' Top SELFIE: ', best_selfie)
with open(data_file_name, 'a') as myfile:
myfile.write(' SELFIE: {} FITNESS: {} \n'.format(best_selfie, fitness[best_idx]))
# Maybe also print the tanimoto score:
mol = Chem.MolFromSmiles(decoder(best_selfie))
target = Chem.MolFromSmiles(starting_smile)
fp_mol = get_ECFP4(mol)
fp_target = get_ECFP4(target)
score = TanimotoSimilarity(fp_mol, fp_target)
simlr_path.append(score)
reward_path.append(fitness[best_idx])
save_best.append(best_selfie)
# Step 2: Get mutated selfies
new_population = []
for i in range(generation_size-1):
# selfie_mutated, _ = mutate_selfie(best_selfie, max_molecules_len, write_fail_cases=True)
selfie_mutated, _ = mutate_selfie(best_selfie, len_random_struct, write_fail_cases=True) # 100 == max_mol_len allowen in mutation
new_population.append(selfie_mutated)
new_population.append(best_selfie)
# Define new population for the next generation
population = new_population[:]
if score >= 1:
print('Limit reached')
simlr_path_collect.append(simlr_path)
break
mols = []
scores = []
with open('./generated molecules.txt', 'r') as f:
for line in f:
if not 'RUN' in line:
line = line.rstrip().split()
slf = line[1]
score = line[-1]
m = Chem.MolFromSmiles(selfies.decoder(slf))
mols.append(m)
mol = mols[-1:]
scores.append(score)
mscore = scores[-1:]
# st.write(len(mols))
# st.write(scores[-3:])
# st.image(Draw.MolsToGridImage(mol, molsPerRow=3, returnPNG=True))
#st.image(rdkit.Chem.Draw.MolToMPL(mol[0], size=(300, 300), kekulize=True, wedgeBonds=True, imageType=None, fitImage=False, options=None))
#st.image(rdkit.Chem.Draw.MolsToImage(mols[-1:], subImgSize=(2000, 2000), legends=None))
#st.image(rdkit.Chem.Draw.MolsToImage(mols[-5:], subImgSize=(4000, 4000), title='RDKit Molecule'))
#st.image(rdkit.Chem.Draw.MolsToGridImage(mols[-1], molsPerRow=3, subImgSize=(1000, 1000), highlightBondLists=None, useSVG=True))
cann = Chem.MolFromSmiles("CC(=CCCC(=CCC1=C(C2=C(C=C1O)OC(=CC2=O)C3=CC(=C(C=C3)O)OC)O)C)C")
filename = "CannflavinA.png"
filename2 = "Novel.png"
novel = mols[-15]
with st.container():
st.write("Number of molecules generated: ", len(mols))
st.write("Similiarty to Cannflavin A: ", float(scores[-15]))
with st.container():
col1,col2 = st.columns(2)
rdkit.Chem.Draw.MolToImageFile(cann, filename, size=(2000, 2000), kekulize=True, wedgeBonds=True)
col1.subheader("Original Molecule")
col1.image("CannflavinA.png")
rdkit.Chem.Draw.MolToImageFile(novel, filename2, size=(2000, 2000), kekulize=True, wedgeBonds=True)
col2.subheader("Mutated Molecule")
col2.image("Novel.png")
#col2.image(rdkit.Chem.Draw.MolsToImage(mols[-1:], subImgSize=(2000, 2000), legends=None))