diff --git a/README.md b/README.md index 106017e..3a44fca 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Argenomic is an open-source implementation of an illumination algorithm for opti ## Getting Started -After installing the software and running the tests, a basic usage example of argenomic (i.e. the rediscovery of Thiotixene) can be called upon in the following manner: +After installing the software and running the tests, a basic usage example of argenomic (i.e. the rediscovery of Troglitazone) can be called upon in the following manner: ``` python3 illuminate.py generations=100 ``` @@ -61,7 +61,7 @@ Important dependencies of the Argenomic software environment and where to find t * Jan Jensen for his work in developing and open-sourcing a graph-based genetic algorithm for molecular optimisation, which served as impetus for this project. -* Jean-Baptiste Mouret and Jeff Clune for their breakthrough invention of illumination algorithms, providing a holistic view of high-performing solutions throughout a search space. +* Jean-Baptiste Mouret and Jeff Clune for their breakthrough invention of illumination algorithms, providing a holistic view of high-performing solutions throughout a search space. * Pat Walters for his scripts indicating how to run structural alerts using the RDKit and ChEMBL, and for his many enlightening medicinal chemistry blog posts. 
diff --git a/__pycache__/cynosure.cpython-37.pyc b/__pycache__/cynosure.cpython-37.pyc new file mode 100644 index 0000000..59612b5 Binary files /dev/null and b/__pycache__/cynosure.cpython-37.pyc differ diff --git a/argenomic/__init__.py b/argenomic/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/argenomic/__init__.py @@ -0,0 +1 @@ + diff --git a/argenomic/__pycache__/__init__.cpython-37.pyc b/argenomic/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..41c2eef Binary files /dev/null and b/argenomic/__pycache__/__init__.cpython-37.pyc differ diff --git a/argenomic/__pycache__/base.cpython-37.pyc b/argenomic/__pycache__/base.cpython-37.pyc new file mode 100644 index 0000000..403cecc Binary files /dev/null and b/argenomic/__pycache__/base.cpython-37.pyc differ diff --git a/argenomic/__pycache__/infrastructure.cpython-37.pyc b/argenomic/__pycache__/infrastructure.cpython-37.pyc index 6f20b5c..5e8e75b 100644 Binary files a/argenomic/__pycache__/infrastructure.cpython-37.pyc and b/argenomic/__pycache__/infrastructure.cpython-37.pyc differ diff --git a/argenomic/__pycache__/mechanism.cpython-37.pyc b/argenomic/__pycache__/mechanism.cpython-37.pyc index be8afdb..06fa850 100644 Binary files a/argenomic/__pycache__/mechanism.cpython-37.pyc and b/argenomic/__pycache__/mechanism.cpython-37.pyc differ diff --git a/argenomic/__pycache__/operations.cpython-37.pyc b/argenomic/__pycache__/operations.cpython-37.pyc index 8eb7fa7..f33b51d 100644 Binary files a/argenomic/__pycache__/operations.cpython-37.pyc and b/argenomic/__pycache__/operations.cpython-37.pyc differ diff --git a/argenomic/base.py b/argenomic/base.py new file mode 100644 index 0000000..7c0706f --- /dev/null +++ b/argenomic/base.py @@ -0,0 +1,19 @@ +from typing import List, Tuple +from dataclasses import dataclass + +class Elite: + def __init__(self, index): + self.index = index + self.molecule = None + + def update(self, molecule): + if self.molecule is None or 
(molecule.fitness - self.molecule.fitness) > 0.0: + self.molecule = molecule + return None + +@dataclass +class Molecule: + smiles: str + pedigree: Tuple[str, str ,str] + fitness: float = None + descriptor: List[float] = None diff --git a/argenomic/infrastructure.py b/argenomic/infrastructure.py index c7f7136..e9cf14c 100644 --- a/argenomic/infrastructure.py +++ b/argenomic/infrastructure.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd from typing import List, Tuple +from dataclasses import dataclass from datetime import datetime from sklearn.cluster import KMeans @@ -18,22 +19,14 @@ rdBase.DisableLog('rdApp.error') from rdkit.Chem import Lipinski +from argenomic.base import Molecule, Elite -class elite(): - def __init__(self, index, descriptor): - self.index = index - self.fitness = 0.0 - self.molecule = None - self.descriptor = descriptor - - def update(self, fitness, molecule, descriptor): - if self.fitness < fitness: - self.fitness = fitness - self.molecule = molecule - self.descriptor = descriptor - return None - -class archive: +class Archive: + """ + A composite class containing the current elite molecules in a CVT tree structure. Allows for processing of + new molecules, sampling of the existing elite molecules, and disk storage of the current state of the archive. + The CVT centers are either loaded from or deposited to cache disk storage. 
+ """ def __init__(self, archive_config, descriptor_config) -> None: self.archive_size = archive_config.size self.archive_accuracy = archive_config.accuracy @@ -46,132 +39,162 @@ def __init__(self, archive_config, descriptor_config) -> None: kmeans = KMeans(n_clusters=self.archive_size) kmeans = kmeans.fit(np.random.rand(archive_config.accuracy, self.archive_dimensions)) self.cvt_centers = kmeans.cluster_centers_ - np.savetxt(self.cvt_location, self.cvt_centers) + np.savetxt(self.cvt_location, self.cvt_centers) self.cvt = KDTree(self.cvt_centers, metric='euclidean') - self.elites = [elite(index, cvt_center) for index, cvt_center in enumerate(self.cvt_centers, start=0)] + self.elites = [Elite(index) for index, _ in enumerate(self.cvt_centers, start=0)] return None def cvt_index(self, descriptor: List[float]) -> int: + """ + Returns CVT index for the niche nearest to the given discriptor. + """ return self.cvt.query([descriptor], k=1)[1][0][0] - def add_to_archive(self, molecules: List[Chem.Mol], descriptors: List[List[float]], fitnesses: List[float]) -> None: - for molecule, descriptor, fitness in zip(molecules, descriptors, fitnesses): - self.elites[self.cvt_index(descriptor)].update(fitness, molecule, descriptor) + def add_to_archive(self, molecules) -> None: + """ + Takes in a list of molecules and adds them to the archive as prescribed by the MAP-Elites algorithm, + i.e. each niche only contains the most fit molecule. Other molecules are discarded. + """ + for molecule in molecules: + self.elites[self.cvt_index(molecule.descriptor)].update(molecule) return None def sample(self, size: int) -> List[Chem.Mol]: - pairs = [(elite.molecule, elite.fitness) for elite in self.elites if elite.fitness > 0.0] + """ + Returns a list of elite molecules of the requisted length. + The elite molecules are randomly drawn, weighted by their fitness. 
+ """ + pairs = [(elite.molecule, elite.molecule.fitness) for elite in self.elites if elite.molecule] molecules, weights = map(list, zip(*pairs)) return random.choices(molecules, k=size, weights=weights) def sample_pairs(self, size: int) -> List[Tuple[Chem.Mol, Chem.Mol]]: - pairs = [(elite.molecule, elite.fitness) for elite in self.elites if elite.fitness > 0.0] + """ + Returns a list of pairs of elite molecules of the requisted length. + The elite molecules are randomly drawn, weighted by their fitness. + """ + pairs = [(elite.molecule, elite.molecule.fitness) for elite in self.elites if elite.molecule] molecules, weights = map(list, zip(*pairs)) sample_molecules = random.choices(molecules, k=size, weights=weights) sample_pairs = np.random.choice(list(filter(None, sample_molecules)), size=(size, 2), replace=True) - sample_pairs = [tuple(sample_pair) for sample_pair in sample_pairs] + sample_pairs = [tuple(sample_pair) for sample_pair in sample_pairs] return sample_pairs - def store_archive(self, generation: float) -> None: - elites_smiles, elites_descriptors, elites_fitnesses = self.elites_data() - data = {'elites': elites_smiles, 'descriptors': elites_descriptors, 'fitnesses': elites_fitnesses} - pd.DataFrame(data=data).to_csv("archive_{}.csv".format(generation), index=False) - return None - - def store_statistics(self, generation: float) -> None: - elites_smiles, elites_descriptors, elites_fitnesses = self.elites_data() - fractional_size = len(elites_smiles)/self.archive_size - statistics = [generation, np.max(elites_fitnesses), np.mean(elites_fitnesses), np.std(elites_fitnesses), fractional_size] + def store_data(self, generation: float) -> None: + """ + Creates a dataframe representing the archive and writes it to disk. In addtion, basic statistics about + the state of the archive are saved to disk and printed to the IO stream. 
+ """ + archive_data = self.get_archive_data() + fractional_size = len(archive_data["smiles"])/self.archive_size + max_fitness, mean_fitness = np.max(archive_data["fitnesses"]), np.mean(archive_data["fitnesses"]) if os.path.isfile('statistics.csv'): with open('statistics.csv', 'a') as file: - csv.writer(file).writerow(statistics) + csv.writer(file).writerow([generation, max_fitness, mean_fitness, fractional_size]) file.close() else: with open('statistics.csv', 'w') as file: file.close() - print('Generation: {}, Size: {:.2f}'.format(statistics[0], statistics[4])) - print('Fitness Max: {:.7f}, Mean: {:.7f}, Std: {:.7f}'.format(statistics[1], statistics[2], statistics[3])) + pd.DataFrame(data=archive_data).to_csv("archive_{}.csv".format(generation), index=False) + print('Generation: {}, Size: {:.2f}'.format(generation, fractional_size)) + print('Fitness Max: {:.5f}, Fitness Mean: {:.5f}'.format(max_fitness, mean_fitness)) return None - - def elites_data(self) -> Tuple[List[str], List[float], List[float]]: - elites_list = [elite for elite in self.elites if elite.molecule] - elites_smiles = [Chem.MolToSmiles(elite.molecule) for elite in elites_list] - elites_descriptors = [elite.descriptor for elite in elites_list] - elites_fitnesses = [elite.fitness for elite in elites_list] - return elites_smiles, elites_descriptors, elites_fitnesses - - -class arbiter: + + def get_archive_data(self) -> None: + elite_indices = [elite.index for elite in self.elites if elite.molecule] + elite_molecules = [elite.molecule for elite in self.elites if elite.molecule] + elites_smiles = [molecule.smiles for molecule in elite_molecules] + elites_pedigree = [molecule.pedigree for molecule in elite_molecules] + elites_descriptors = [molecule.descriptor for molecule in elite_molecules] + elites_fitnesses = [molecule.fitness for molecule in elite_molecules] + archive_data = {'index': elite_indices, 'smiles': elites_smiles, 'pedigree': elites_pedigree, 'descriptors': elites_descriptors, 
'fitnesses': elites_fitnesses} + return archive_data + +class Arbiter: """ A catalog class containing different druglike filters for small molecules. Includes the option to run the structural filters from ChEMBL. """ def __init__(self, arbiter_config) -> None: + self.cache_smiles = [] self.rules_dict = pd.read_csv(hydra.utils.to_absolute_path("data/smarts/alert_collection.csv")) self.rules_dict= self.rules_dict[self.rules_dict.rule_set_name.isin(arbiter_config.rules)] self.rules_list = self.rules_dict["smarts"].values.tolist() self.tolerance_list = pd.to_numeric(self.rules_dict["max"]).values.tolist() self.pattern_list = [Chem.MolFromSmarts(smarts) for smarts in self.rules_list] - def __call__(self, molecules:List[Chem.Mol]) -> List[Chem.Mol]: + def __call__(self, molecules): """ Applies the chosen filters (hologenicity, veber_infractions, - ChEMBL structural alerts, ...) to a list of molecules. + ChEMBL structural alerts, ...) to a list of molecules and removes duplicates. """ filtered_molecules = [] + molecules = self.unique_molecules(molecules) for molecule in molecules: - if self.molecule_validity(molecule): + molecular_graph = Chem.MolFromSmiles(molecule.smiles) + if self.molecule_filter(molecular_graph): filtered_molecules.append(molecule) return filtered_molecules - def molecule_validity(self, molecule: Chem.Mol) -> bool: + def unique_molecules(self, molecules: List[Molecule]) -> List[Molecule]: + """ + Checks if a molecule in a lost of molcules is duplicated, either in this batch or before. 
+ """ + unique_molecules = [] + for molecule in molecules: + if molecule.smiles not in self.cache_smiles: + unique_molecules.append(molecule) + self.cache_smiles.append(molecule.smiles) + return unique_molecules + + def molecule_filter(self, molecular_graph: Chem.Mol) -> bool: """ - Checks if a given molecule passes through the chosen filters (hologenicity, + Checks if a given molecular structure passes through the chosen filters (hologenicity, veber_infractions, ChEMBL structural alerts, ...). """ - toxicity = self.toxicity(molecule) - hologenicity = self.hologenicity(molecule) - veber_infraction = self.veber_infraction(molecule) + toxicity = self.toxicity(molecular_graph) + hologenicity = self.hologenicity(molecular_graph) + veber_infraction = self.veber_infraction(molecular_graph) validity = not (toxicity or hologenicity or veber_infraction) - if molecule.HasSubstructMatch(Chem.MolFromSmarts('[R]')): - ring_infraction = self.ring_infraction(molecule) + if molecular_graph.HasSubstructMatch(Chem.MolFromSmarts('[R]')): + ring_infraction = self.ring_infraction(molecular_graph) validity = validity and not (ring_infraction) return validity - def toxicity(self, molecule: Chem.Mol) -> bool: + def toxicity(self, molecular_graph: Chem.Mol) -> bool: """ Checks if a given molecule fails the structural filters. """ for (pattern, tolerance) in zip(self.pattern_list, self.tolerance_list): - if len(molecule.GetSubstructMatches(pattern)) > tolerance: + if len(molecular_graph.GetSubstructMatches(pattern)) > tolerance: return True return False @staticmethod - def hologenicity(molecule: Chem.Mol) -> bool: + def hologenicity(molecular_graph: Chem.Mol) -> bool: """ Checks if a given molecule fails the hologenicity filters. 
""" - fluorine_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[F]'))) > 6 - bromide_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[Br]'))) > 3 - chlorine_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[Cl]'))) > 3 + fluorine_saturation = len(molecular_graph.GetSubstructMatches(Chem.MolFromSmarts('[F]'))) > 6 + bromide_saturation = len(molecular_graph.GetSubstructMatches(Chem.MolFromSmarts('[Br]'))) > 3 + chlorine_saturation = len(molecular_graph.GetSubstructMatches(Chem.MolFromSmarts('[Cl]'))) > 3 return chlorine_saturation or bromide_saturation or fluorine_saturation @staticmethod - def ring_infraction(molecule: Chem.Mol) -> bool: + def ring_infraction(molecular_graph: Chem.Mol) -> bool: """ Checks if a given molecule fails the ring infraction filters. """ - ring_allene = molecule.HasSubstructMatch(Chem.MolFromSmarts('[R]=[R]=[R]')) - macro_cycle = max([len(j) for j in molecule.GetRingInfo().AtomRings()]) > 6 - double_bond_in_small_ring = molecule.HasSubstructMatch(Chem.MolFromSmarts('[r3,r4]=[r3,r4]')) + ring_allene = molecular_graph.HasSubstructMatch(Chem.MolFromSmarts('[R]=[R]=[R]')) + macro_cycle = max([len(j) for j in molecular_graph.GetRingInfo().AtomRings()]) > 6 + double_bond_in_small_ring = molecular_graph.HasSubstructMatch(Chem.MolFromSmarts('[r3,r4]=[r3,r4]')) return ring_allene or macro_cycle or double_bond_in_small_ring @staticmethod - def veber_infraction(molecule: Chem.Mol) -> bool: + def veber_infraction(molecular_graph: Chem.Mol) -> bool: """ Checks if a given molecule fails the veber infraction filters. 
""" - rotatable_bond_saturation = Lipinski.NumRotatableBonds(molecule) > 10 - hydrogen_bond_saturation = Lipinski.NumHAcceptors(molecule) + Lipinski.NumHDonors(molecule) > 10 + rotatable_bond_saturation = Lipinski.NumRotatableBonds(molecular_graph) > 10 + hydrogen_bond_saturation = Lipinski.NumHAcceptors(molecular_graph) + Lipinski.NumHDonors(molecular_graph) > 10 return rotatable_bond_saturation or hydrogen_bond_saturation diff --git a/argenomic/mechanism.py b/argenomic/mechanism.py index 0c363ff..a09c9cb 100644 --- a/argenomic/mechanism.py +++ b/argenomic/mechanism.py @@ -15,13 +15,13 @@ from rdkit.Chem import rdMolDescriptors from rdkit.DataStructs.cDataStructs import TanimotoSimilarity -class descriptor: +class Descriptor: """ A strategy class for calculating the descriptor vector of a molecule. """ def __init__(self, config_descriptor) -> None: self.properties = [] - self.ranges = config_descriptor.ranges + self.ranges = config_descriptor.ranges self.property_names = config_descriptor.properties for name in self.property_names: module, fuction = name.split(".") @@ -29,59 +29,61 @@ def __init__(self, config_descriptor) -> None: self.properties.append(getattr(module, fuction)) return None - def __call__(self, molecule: Chem.Mol) -> List[float]: + def __call__(self, molecule) -> None: """ - Calculating the descriptor vector of a molecule. + Updates the descriptor vector of a molecule. """ descriptor = [] + molecular_graph = Chem.MolFromSmiles(molecule.smiles) for property, range in zip(self.properties, self.ranges): - descriptor.append(self.rescale(property(molecule), range)) - return descriptor + descriptor.append(self.rescale(property(molecular_graph), range)) + molecule.descriptor = descriptor + return molecule @staticmethod def rescale(feature: List[float], range: List[float]) -> List[float]: """ - Rescaling the feature to the unit range. + Rescales the feature to the unit range. 
""" rescaled_feature = (feature - range[0])/(range[1] - range[0]) return rescaled_feature -class fitness: +class Fitness: """ A strategy class for calculating the fitness of a molecule. """ def __init__(self, config_fitness) -> None: - self.memoized_cache = dict() self.fingerprint_type = config_fitness.type self.target = Chem.MolFromSmiles(config_fitness.target) self.target_fingerprint = self.get_fingerprint(self.target, self.fingerprint_type) return None - def __call__(self, molecule: Chem.Mol) -> float: - smiles = Chem.MolToSmiles(molecule) - if smiles in self.memoized_cache: - fitness = self.memoized_cache[smiles] - else: - molecule_fingerprint = self.get_fingerprint(molecule, self.fingerprint_type) - fitness = TanimotoSimilarity(self.target_fingerprint, molecule_fingerprint) - self.memoized_cache[smiles] = fitness - return fitness + def __call__(self, molecule) -> None: + """ + Updates the fitness value of a molecule. + """ + molecular_graph = Chem.MolFromSmiles(Chem.CanonSmiles(molecule.smiles)) + molecule_fingerprint = self.get_fingerprint(molecular_graph, self.fingerprint_type) + fitness = TanimotoSimilarity(self.target_fingerprint, molecule_fingerprint) + molecule.fitness = fitness + return molecule - def get_fingerprint(self, molecule: Chem.Mol, fingerprint_type: str): + def get_fingerprint(self, molecular_graph: Chem.Mol, fingerprint_type: str): method_name = 'get_' + fingerprint_type method = getattr(self, method_name) if method is None: raise Exception('{} is not a supported fingerprint type.'.format(fingerprint_type)) - return method(molecule) + return method(molecular_graph) + + def get_ECFP4(self, molecular_graph: Chem.Mol): + return AllChem.GetMorganFingerprint(molecular_graph, 2) - def get_ECFP4(self, molecule: Chem.Mol): - return AllChem.GetMorganFingerprint(molecule, 2) + def get_ECFP6(self, molecular_graph: Chem.Mol): + return AllChem.GetMorganFingerprint(molecular_graph, 3) - def get_ECFP6(self, molecule: Chem.Mol): - return 
AllChem.GetMorganFingerprint(molecule, 3) + def get_FCFP4(self, molecular_graph: Chem.Mol): + return AllChem.GetMorganFingerprint(molecular_graph, 2, useFeatures=True) - def get_FCFP4(self, molecule: Chem.Mol): - return AllChem.GetMorganFingerprint(molecule, 2, useFeatures=True) + def get_FCFP6(self, molecular_graph: Chem.Mol): + return AllChem.GetMorganFingerprint(molecular_graph, 3, useFeatures=True) - def get_FCFP6(self, molecule: Chem.Mol): - return AllChem.GetMorganFingerprint(molecule, 3, useFeatures=True) diff --git a/argenomic/operations.py b/argenomic/operations.py index bf6af71..f473dcb 100644 --- a/argenomic/operations.py +++ b/argenomic/operations.py @@ -12,52 +12,59 @@ from rdkit.Chem import AllChem from rdkit.Chem import rdMMPA -class mutator: +from argenomic.base import Molecule + +class Mutator: """ - A catalog class containing and implementing mutations to small molecules - according to the principles of positional analogue scanning. + A catalog class containing and implementing mutations to small molecules according to the principles of positional analogue scanning. 
""" - def __init__(self) -> None: - self.mutation_data = pd.read_csv(hydra.utils.to_absolute_path("data/smarts/mutation_collection.tsv"), sep='\t') + def __init__(self, config_mutator) -> None: + self.mutation_data = pd.read_csv(hydra.utils.to_absolute_path(config_mutator.data_file), sep='\t') - def __call__(self, molecule:Chem.Mol) -> List[Chem.Mol]: + def __call__(self, molecule) -> List[Molecule]: sampled_mutation = self.mutation_data.sample(n=1, weights='probability').iloc[0] reaction = AllChem.ReactionFromSmarts(sampled_mutation['smarts']) + pedigree = ("mutation", sampled_mutation['smarts'], molecule.smiles) try: - molecules = [products[0] for products in reaction.RunReactants([molecule])] + molecular_graphs = [products[0] for products in reaction.RunReactants([Chem.MolFromSmiles(molecule.smiles)])] + smiles_list = [Chem.MolToSmiles(molecular_graph) for molecular_graph in molecular_graphs if molecular_graph is not None] + molecules = [Molecule(Chem.CanonSmiles(smiles), pedigree) for smiles in smiles_list if Chem.MolFromSmiles(smiles)] except: molecules = [] return molecules -class crossover: +class Crossover: """ A strategy class implementing a parent-centric crossover of small molecules. 
""" def __init__(self): pass - def __call__(self, molecule_pair:Tuple[Chem.Mol, Chem.Mol]) -> List[Chem.Mol]: - molecule_cores, molecule_sidechains = self.fragmentate(molecule_pair) - molecules = self.merge(molecule_cores, molecule_sidechains) + def __call__(self, molecule_pair): + pedigree = ("crossover", molecule_pair[0].smiles, molecule_pair[1].smiles) + smiles_list = self.merge(molecule_pair) + molecules = [Molecule(Chem.CanonSmiles(smiles), pedigree) for smiles in smiles_list if Chem.MolFromSmiles(smiles)] return molecules - def merge(self, molecule_cores:List[Chem.Mol], molecule_sidechains:List[Chem.Mol]) -> List[Chem.Mol]: - molecules = [] - random.shuffle(molecule_sidechains) + def merge(self, molecule_pair): + molecular_graphs = [] + graph_cores, graph_sidechains = self.fragment(molecule_pair) + random.shuffle(graph_sidechains) reaction = AllChem.ReactionFromSmarts('[*:1]-[1*].[1*]-[*:2]>>[*:1]-[*:2]') - for core, sidechain in zip(molecule_cores, molecule_sidechains): - molecules.append(reaction.RunReactants((core, sidechain))[0][0]) - return molecules + for core, sidechain in zip(graph_cores, graph_sidechains): + molecular_graphs.append(reaction.RunReactants((core, sidechain))[0][0]) + smiles_list = [Chem.MolToSmiles(molecular_graph) for molecular_graph in molecular_graphs if molecular_graph is not None] + return smiles_list - def fragmentate(self, molecule_pair:Tuple[Chem.Mol, Chem.Mol]) -> Tuple[List[Chem.Mol], List[Chem.Mol]]: - molecule_cores = [] - molecule_sidechains = [] + def fragment(self, molecule_pair): + graph_cores = [] + graph_sidechains = [] for molecule in molecule_pair: - molecule_frags = rdMMPA.FragmentMol(molecule, maxCuts=1, resultsAsMols=False) - if len(molecule_frags) > 0: - _, molecule_frags = map(list, zip(*molecule_frags)) - for molecule_pair in molecule_frags: - core, sidechain = molecule_pair.split(".") - molecule_cores.append(Chem.MolFromSmiles(core.replace("[*:1]", "[1*]"))) - 
molecule_sidechains.append(Chem.MolFromSmiles(sidechain.replace("[*:1]", "[1*]"))) - return molecule_cores, molecule_sidechains + graph_frags = rdMMPA.FragmentMol(Chem.MolFromSmiles(molecule.smiles), maxCuts=1, resultsAsMols=False) + if len(graph_frags) > 0: + _, graph_frags = map(list, zip(*graph_frags)) + for frag_pair in graph_frags: + core, sidechain = frag_pair.split(".") + graph_cores.append(Chem.MolFromSmiles(core.replace("[*:1]", "[1*]"))) + graph_sidechains.append(Chem.MolFromSmiles(sidechain.replace("[*:1]", "[1*]"))) + return graph_cores, graph_sidechains diff --git a/configuration/config.yaml b/configuration/config.yaml index f7387f3..0136a7e 100644 --- a/configuration/config.yaml +++ b/configuration/config.yaml @@ -1,5 +1,5 @@ --- -data_file: data/smiles/guacamol_initial_rediscovery_thiotixene.smi +data_file: data/smiles/guacamol_intitial_rediscovery_troglitazone.smi batch_size: 40 initial_size: 100 workers: 1 @@ -25,9 +25,11 @@ descriptor: - - 40 - 130 fitness: - target: CN(C)S(=O)(=O)c1ccc2Sc3ccccc3C(=CCCN4CCN(C)CC4)c2c1 + target: "O=C1NC(=O)SC1Cc4ccc(OCC3(Oc2c(c(c(O)c(c2CC3)C)C)C)C)cc4" type: ECFP4 arbiter: rules: - Glaxo +mutator: + data_file: data/smarts/mutation_collection.tsv diff --git a/data/README.md b/data/README.md index d465147..6dc6d9f 100644 --- a/data/README.md +++ b/data/README.md @@ -1,21 +1,13 @@ -# Sample Package Data +# Data +This directory contains data that were included with the argenomic package. This is also a place where non-code related additional information (such as data files, molecular structures, etc.) can +conveniently be stored. Please note that the output files are automatically stored in their own folder. -This directory contains sample additional data you may want to include with your package. -This is a place where non-code related additional information (such as data files, molecular structures, etc.) can -go that you want to ship alongside your code. 
+An overview of the subdirectories: -Please note that it is not recommended to place large files in your git directory. If your project requires files larger -than a few megabytes in size it is recommended to host these files elsewhere. This is especially true for binary files -as the `git` structure is unable to correctly take updates to these files and will store a complete copy of every version -in your `git` history which can quickly add up. As a note most `git` hosting services like GitHub have a 1 GB per repository -cap. +* cvt: Contains the cvt centroids as stored in cache. New cvt centroids, as generated by argenomic, will be stored here automatically. -## Including package data +* figures: A collection of figures generated with data from argenomic. -Modify your package's `setup.py` file and the `setup()` command. Include the -[`package_data`](http://setuptools.readthedocs.io/en/latest/setuptools.html#basic-use) keyword and point it at the -correct files. +* smarts: Two smarts-files: alert_collection.csv (containing the smarts needed to remove unwanted molecules) and mutation_collection.tsv (containing the smarts causing the mutations and their probability weights). -## Manifest - -* `look_and_say.dat`: first entries of the "Look and Say" integer series, sequence [A005150](https://oeis.org/A005150) +* smiles: A number of files containing databases of smiles from ZINC, ChEMBL and the Guacamol projects. 
diff --git a/data/cvt/cache_2_25000.csv b/data/cvt/cache_2_25000.csv new file mode 100644 index 0000000..2d43a16 --- /dev/null +++ b/data/cvt/cache_2_25000.csv @@ -0,0 +1,150 @@ +5.214209003438609047e-01 3.491316341809322932e-01 +6.572639038034633341e-01 8.567305610109627878e-01 +1.303307299222913884e-01 5.618729675703061366e-01 +8.897109583652169640e-01 1.823853610120633184e-01 +1.112530223485239667e-01 2.125501570542110974e-01 +8.810485063108196568e-01 6.122016082233591172e-01 +3.172242608672095621e-01 7.877221983336107902e-01 +5.638951314398871206e-01 1.390602294501244174e-01 +3.747759490579619657e-01 5.166290158658712883e-01 +7.156505009795108840e-01 5.158020582770069806e-01 +8.855806372593864051e-01 7.760577077222354792e-01 +1.932917979672891251e-01 8.695757025678164664e-01 +5.193996142549586192e-01 6.819997613639713352e-01 +7.720992401187742882e-01 2.812513181445543964e-01 +1.967793652955975303e-01 2.039334324085603223e-01 +2.352549408824252120e-01 3.542762225886592065e-02 +4.177055087677166156e-01 8.890403494605696455e-01 +7.693683978100818166e-01 4.049010623334153847e-02 +8.902332386247120422e-01 3.473863236770655671e-01 +3.922246776846531335e-01 1.488768714888923927e-01 +2.559342321943171017e-01 4.216102113216871339e-01 +8.798530818662737563e-01 8.592640781857143750e-01 +3.612778158118024052e-02 1.036870890873520246e-01 +3.510767573549956611e-01 9.637000722224431382e-01 +1.224900179721745141e-01 7.161863573661265692e-01 +8.612922384536358944e-02 4.426260471780079642e-01 +6.555067788482615843e-01 6.677712290512817228e-01 +5.504482583622519742e-01 5.197246061218662172e-01 +2.831731491435160453e-01 6.398368855366747265e-01 +5.314617680314005499e-01 2.296585498913069068e-01 +3.984072570036540917e-01 3.337886928463870140e-01 +8.333238035694940926e-01 1.134076843720980277e-01 +3.163692070297491332e-02 9.512468980935808727e-01 +7.205792391115668494e-01 2.041550631812601435e-01 +8.087555260558467740e-01 7.138242631876635658e-01 +6.794411380054670158e-01 
4.416724920884920746e-01 +5.393332710466596236e-01 9.658971387152128774e-01 +3.702092163590727791e-01 6.038501060352695626e-01 +9.681447104216851507e-01 9.601857249202805367e-01 +9.616608612212861340e-01 7.270940358476680743e-01 +1.387118746086496990e-01 2.850561698172589531e-01 +3.195399580739361478e-01 2.629479407135443814e-01 +7.573648509812191953e-01 4.274626637112869942e-01 +7.694632049488048242e-01 9.593063682048701413e-01 +5.853447568827151448e-01 8.066919752545115285e-01 +2.170097874003349969e-01 5.061635451836496991e-01 +3.970089323290530992e-01 7.505582749837897971e-01 +8.050315478644549660e-01 1.984576609380733214e-01 +4.812033557458149091e-01 3.097518436134361064e-02 +9.604678072063290717e-01 8.117042389708366557e-01 +2.008381024663641878e-01 7.794389048430581113e-01 +4.692908801723650325e-01 2.865736759496665398e-01 +8.073741232549369595e-01 3.662128565463809715e-01 +7.382048913037019267e-01 7.913194942197723236e-01 +1.761703699387680011e-01 9.639133896156604209e-01 +3.496167881473810635e-02 5.804975950556504616e-01 +1.928213400661419885e-01 3.575214785642295001e-01 +1.077711933279865697e-01 8.939322765817070415e-01 +6.374027015053089773e-01 6.113297330041378430e-02 +1.445612372438661031e-01 3.788953199110395298e-02 +2.633122054127355893e-01 8.438824838695733188e-01 +7.963634650595494691e-01 6.323165707597924179e-01 +6.737949008337890122e-01 2.766189733124589933e-01 +5.274702277241158921e-01 4.373025059938003278e-01 +4.304150612853208102e-02 2.846488432599008633e-01 +9.647113321368945282e-01 8.882386748209299965e-01 +4.532849626097593676e-01 4.923688927297610318e-01 +7.310348249978582125e-01 8.857302526180675928e-01 +9.579362671208806468e-01 2.252990049955453378e-01 +9.535781752654851617e-01 4.777069044746957149e-01 +9.040628422715089041e-01 8.111615467527399925e-02 +6.182423593013797181e-01 9.590362561636518457e-01 +1.695183616744483923e-01 1.261849951640344747e-01 +3.912728348717070537e-02 8.599571285833029766e-01 +2.643933415487987881e-01 
9.489729036337286594e-01 +3.203002274601614574e-01 1.065119796974987043e-01 +8.876138547138956447e-02 6.331635974709677317e-01 +8.813397046919623223e-01 5.209305045404831747e-01 +2.993241591845399174e-01 3.495323031802242131e-01 +2.707368953274269519e-01 7.325525350208567099e-01 +9.604093329178933214e-01 3.069369274839566231e-01 +3.465402413666332504e-01 6.908887071187498341e-01 +7.127350216961223506e-01 6.052670469128804687e-01 +5.888521290504437067e-01 6.310897921876609251e-01 +4.373809204843845944e-01 9.663557115721628232e-01 +4.506617210695661457e-01 4.030904916641723235e-01 +8.532788650421817556e-01 3.281661719360373652e-02 +3.478303020998657624e-02 1.859259674906943438e-01 +1.626961335708531631e-01 6.430188206375511140e-01 +3.988095228969700257e-01 8.184398893195028446e-01 +1.047020046074574884e-01 3.549161442712227554e-01 +6.394421950638540242e-01 5.547949071104841812e-01 +7.341152933424237581e-01 1.215903491414830895e-01 +2.863958769918517611e-02 3.846665214332258831e-01 +5.081150544763712862e-02 3.576573965102103347e-02 +2.153929104872027156e-01 5.866033110529029759e-01 +4.958841042067115290e-01 7.522586885094930675e-01 +6.146361852750878985e-01 2.078919852605342622e-01 +6.973805024019630316e-01 3.730181185213271577e-02 +6.739361084321531248e-01 7.581619302505517854e-01 +8.878768404318930774e-01 6.939084709914972171e-01 +5.882726959291096902e-01 2.985770494865925828e-01 +3.385692702107084839e-02 4.962881216910638993e-01 +3.649212802963819335e-01 4.229624519518367931e-01 +6.099014853388573520e-01 4.675345834068886375e-01 +4.694455197182448858e-01 1.013493495098539143e-01 +5.169048233667209447e-01 5.814256101608156202e-01 +8.034783458500311948e-01 5.592932459274203572e-01 +4.373281756997882130e-01 6.662247776421277301e-01 +3.945369906704876972e-01 4.896706580165904876e-02 +1.299618742969136376e-01 5.004433189751911204e-01 +8.648002620121615358e-01 2.776245412123384826e-01 +9.591990260907414800e-01 4.054440505067447020e-02 +5.777460896787789402e-01 
8.884680499674402210e-01 +2.365335527911612834e-01 2.783666312626085215e-01 +3.658831115952154711e-02 6.836500758455124060e-01 +5.657516560805646577e-01 4.101823160162576531e-02 +9.031657002227472253e-01 9.463393694189856653e-01 +5.940950057534915008e-01 7.269034663708533461e-01 +3.981189669637171802e-01 2.371576058571607803e-01 +2.931982213424287886e-01 1.894775508857726720e-01 +2.163667400438791799e-01 6.912413185724788356e-01 +3.365126995284586364e-01 8.836112486165982327e-01 +6.216043792037700966e-01 3.736353447618341317e-01 +8.103602201395208038e-01 7.923019377477317216e-01 +4.400573378598088281e-02 7.765960057559868623e-01 +2.913708750605449316e-01 5.625327134151457686e-01 +2.991312276686851623e-01 4.888558862352131995e-01 +4.899724900427019358e-01 9.087993656686448318e-01 +4.716804015209817846e-01 1.871896263836810848e-01 +3.191700905342936823e-01 3.501468874862628811e-02 +9.572032351641173165e-01 6.354205068526487254e-01 +7.124398600540035043e-01 3.531460747367958786e-01 +1.624822601983763604e-01 4.286909982746384129e-01 +1.007349828790176649e-01 9.660750420822548090e-01 +1.239233587164308426e-01 8.113025574272882956e-01 +9.617165396406597289e-01 3.963283021692636243e-01 +6.502062645640349636e-01 1.430817365785390094e-01 +7.378736400348993030e-01 6.933054273821108726e-01 +4.886903807060516414e-01 8.305469251510426698e-01 +8.432538584906650314e-01 9.549108008529461067e-01 +9.644638498155262418e-01 1.347018225560029925e-01 +6.883723988416640127e-01 9.531433944856376739e-01 +8.083828824495542520e-01 4.806184952696599955e-01 +4.458542432605734351e-01 5.796875996515719054e-01 +8.016880637375696228e-01 8.726789294931538432e-01 +2.454045033751177107e-01 1.169903171997553892e-01 +9.606265008504202285e-01 5.589727681639240497e-01 +8.758801013322721207e-01 4.318442003214353897e-01 +1.017579884037961846e-01 1.227205974955919432e-01 diff --git a/data/figures/logo.png b/data/figures/logo.png index b6f2a09..a19d9ff 100644 Binary files a/data/figures/logo.png and 
b/data/figures/logo.png differ diff --git a/data/figures/logo_old.png b/data/figures/logo_old.png new file mode 100644 index 0000000..b6f2a09 Binary files /dev/null and b/data/figures/logo_old.png differ diff --git a/data/figures/new_logo.png b/data/figures/new_logo.png new file mode 100644 index 0000000..e6f078c Binary files /dev/null and b/data/figures/new_logo.png differ diff --git a/data/smarts/mutation_collection.tsv b/data/smarts/mutation_collection.tsv index 23cb007..99bedce 100644 --- a/data/smarts/mutation_collection.tsv +++ b/data/smarts/mutation_collection.tsv @@ -94,27 +94,3 @@ add_ring [*;!r;!H0:1]~[*;!r:2]~[*;!r;!H0:3]>>[*:1]1~[*:2]~[*:3]1 0.05 add_ring [*;!r;!H0:1]~[*!r:2]~[*!r:3]~[*;!r;!H0:4]>>[*:1]1~[*:2]~[*:3]~[*:4]1 0.05 add_ring [*;!r;!H0:1]~[*!r:2]~[*:3]~[*:4]~[*;!r;!H0:5]>>[*:1]1~[*:2]~[*:3]~[*:4]~[*:5]1 0.45 add_ring [*;!r;!H0:1]~[*!r:2]~[*:3]~[*:4]~[*!r:5]~[*;!r;!H0:6]>>[*:1]1~[*:2]~[*:3]~[*:4]~[*:5]~[*:6]1 0.45 -append_group [*;!H0:1]>>[*:1]-C-O 0.15 -append_group [*;!H0:1]>>[*:1]-[C](=O)-N 0.15 -append_group [*;!H0:1]>>[*:1]-C#N 0.15 -append_group [*;!H0:1]>>[*:1]-C=N 0.15 -append_group [*;!H0:1]>>[*:1]-[C](=N)-N 0.1 -append_group [*;!H0:1]>>[*:1]-N-[C](=N)-N 0.1 -append_group [*;!H0:1]>>[*:1]-N-[C](=O)-O 0.1 -append_group [*;!H0:1]>>[*:1]-[S](=O)(=O)-O 0.05 -append_group [*;!H0:1]>>[*:1]-[S](=O)(=O)-N 0.05 -delete_group [*:1]~C-O>>[*:1] 0.15 -delete_group [*:1]~[C](=O)-N>>[*:1] 0.15 -delete_group [*:1]~C#N>>[*:1] 0.15 -delete_group [*:1]~C=N>>[*:1] 0.15 -delete_group [*:1]~[C](=N)-N>>[*:1] 0.1 -delete_group [*:1]~N-[C](=N)-N>>[*:1] 0.1 -delete_group [*:1]~N-[C](=O)-O>>[*:1] 0.1 -delete_group [*:1]~[S](=O)(=O)-O>>[*:1] 0.05 -delete_group [*:1]~[S](=O)(=O)-N>>[*:1] 0.05 -insert_group [*:1]~[*:2]>>[*:1]-C-O-[*:2] 0.2 -insert_group [*:1]~[*:2]>>[*:1]-[C](=O)-N-[*:2] 0.2 -insert_group [*:1]~[*:2]>>[*:1]-[S](=O)(=O)-N-[*:2] 0.2 -insert_group [*:1]~[*:2]>>[*:1]-[C](=O)-O-[*:2] 0.1 -insert_group [*:1]~[*:2]>>[*:1]-[C](=O)-[N](-F)-[*:2] 0.15 
-insert_group [*:1]~[*:2]>>[*:1]-C-[C](-F)=C-C-[*:2] 0.15 diff --git a/environment.yml b/environment.yml index 66ca998..ca14446 100644 --- a/environment.yml +++ b/environment.yml @@ -77,6 +77,7 @@ dependencies: - python-dateutil=2.8.1=py_0 - python_abi=3.7=1_cp37m - pytz=2020.1=py_0 + - pyyaml=5.3.1=py37h7b6447c_1 - rdkit=2020.03.3=py37hdd87690_0 - readline=8.0=h7b6447c_0 - scikit-learn=0.23.1=py37h423224d_0 @@ -115,6 +116,4 @@ dependencies: - hydra-core==1.0.0rc2 - importlib-resources==3.0.0 - omegaconf==2.0.1rc11 - - pyyaml==5.3.1 -prefix: /home/jonas/anaconda3/envs/argenomic-stable - +prefix: /home/jonasver/anaconda3/envs/argenomic-stable diff --git a/illuminate.py b/illuminate.py index 001d5bc..b985891 100644 --- a/illuminate.py +++ b/illuminate.py @@ -1,6 +1,7 @@ import hydra +import numpy as np import pandas as pd -from typing import List, Tuple +from typing import List, Tuple, Type from rdkit import Chem from rdkit.Chem import PandasTools as pdtl @@ -8,78 +9,81 @@ from dask import bag from dask.distributed import Client -from argenomic.operations import crossover, mutator -from argenomic.mechanism import descriptor, fitness -from argenomic.infrastructure import archive, arbiter +from argenomic.base import Molecule +from argenomic.operations import Mutator, Crossover +from argenomic.infrastructure import Arbiter, Archive +from argenomic.mechanism import Fitness, Descriptor -class illumination: +class Illuminate: def __init__(self, config) -> None: self.data_file = config.data_file + self.generations = config.generations self.batch_size = config.batch_size self.initial_size = config.initial_size - self.generations = config.generations - self.mutator = mutator() - self.crossover = crossover() - self.arbiter = arbiter(config.arbiter) - self.descriptor = descriptor(config.descriptor) - self.archive = archive(config.archive, config.descriptor) - self.fitness = fitness(config.fitness) + self.arbiter = Arbiter(config.arbiter) + self.fitness = Fitness(config.fitness) 
+ self.mutator = Mutator(config.mutator) + self.crossover = Crossover() + self.descriptor = Descriptor(config.descriptor) + self.archive = Archive(config.archive, config.descriptor) self.client = Client(n_workers=config.workers, threads_per_worker=config.threads) return None def __call__(self) -> None: self.initial_population() - for generation in range(self.generations): + for generation in range(1, self.generations): molecules = self.generate_molecules() - molecules, descriptors, fitnesses = self.process_molecules(molecules) - self.archive.add_to_archive(molecules, descriptors, fitnesses) - self.archive.store_statistics(generation) - self.archive.store_archive(generation) + molecules = self.process_molecules(molecules) + self.archive.add_to_archive(molecules) + self.archive.store_data(generation) return None def initial_population(self) -> None: - dataframe = pd.read_csv(hydra.utils.to_absolute_path(self.data_file)) - pdtl.AddMoleculeColumnToFrame(dataframe, 'smiles', 'molecule') - molecules = dataframe['molecule'].sample(n=self.initial_size).tolist() - molecules = self.arbiter(self.unique_molecules(molecules)) - molecules, descriptors, fitnesses = self.process_molecules(molecules) - self.archive.add_to_archive(molecules, descriptors, fitnesses) + molecules = self.arbiter(self.load_from_database()) + molecules = self.calculate_descriptors(molecules) + molecules = self.calculate_fitnesses(molecules) + self.archive.add_to_archive(molecules) + self.archive.store_data(0) return None - def generate_molecules(self) -> None: + def load_from_database(self) -> List[Molecule]: + dataframe = pd.read_csv(hydra.utils.to_absolute_path(self.data_file)) + smiles_list = dataframe['smiles'].sample(n=self.initial_size).tolist() + pedigree = ("database", "no reaction", "no parent") + molecules = [Molecule(Chem.CanonSmiles(smiles), pedigree) for smiles in smiles_list] + return molecules + + def generate_molecules(self) -> List[Molecule]: molecules = [] - sample_molecules = 
self.archive.sample(self.batch_size) - sample_molecule_pairs = self.archive.sample_pairs(self.batch_size) - for molecule in sample_molecules: - molecules.extend(self.mutator(molecule)) - for molecule_pair in sample_molecule_pairs: - molecules.extend(self.crossover(molecule_pair)) - molecules = self.arbiter(self.unique_molecules(molecules)) + molecule_samples = self.archive.sample(self.batch_size) + molecule_sample_pairs = self.archive.sample_pairs(self.batch_size) + for molecule in molecule_samples: + molecules.extend(self.mutator(molecule)) + for molecule_pair in molecule_sample_pairs: + molecules.extend(self.crossover(molecule_pair)) return molecules - def process_molecules(self, molecules: List[Chem.Mol]) -> Tuple[List[List[float]],List[float]]: - descriptors = bag.map(self.descriptor, bag.from_sequence(molecules)).compute() - molecules, descriptors = zip(*[(molecule, descriptor) for molecule, descriptor in zip(molecules, descriptors)\ - if all(1.0 > property > 0.0 for property in descriptor)]) - molecules, descriptors = list(molecules), list(descriptors) - fitnesses = bag.map(self.fitness, bag.from_sequence(molecules)).compute() - return molecules, descriptors, fitnesses + def process_molecules(self, molecules: List[Molecule]) -> List[Molecule]: + molecules = self.arbiter(molecules) + molecules = self.calculate_descriptors(molecules) + molecules = self.calculate_fitnesses(molecules) + return molecules - @staticmethod - def unique_molecules(molecules: List[Chem.Mol]) -> List[Chem.Mol]: - molecules = [Chem.MolFromSmiles(Chem.MolToSmiles(molecule)) for molecule in molecules if molecule is not None] - molecule_records = [(molecule, Chem.MolToSmiles(molecule)) for molecule in molecules if molecule is not None] - molecule_dataframe = pd.DataFrame(molecule_records, columns = ['molecules', 'smiles']) - molecule_dataframe.drop_duplicates('smiles', inplace = True) - return molecule_dataframe['molecules'] + def calculate_fitnesses(self, molecules: List[Molecule]) -> 
List[Molecule]: + molecules = bag.map(self.fitness, bag.from_sequence(molecules)).compute() + return molecules + def calculate_descriptors(self, molecules: List[Molecule]) -> List[Molecule]: + molecules = bag.map(self.descriptor, bag.from_sequence(molecules)).compute() + molecules = [molecule for molecule in molecules if all(1.0 > property > 0.0 for property in molecule.descriptor)] + return molecules @hydra.main(config_path="configuration", config_name="config.yaml") def launch(config) -> None: print(config.pretty()) - current_instance = illumination(config) + current_instance = Illuminate(config) current_instance() current_instance.client.close() diff --git a/tests/__init__.pyc b/tests/__init__.pyc new file mode 100644 index 0000000..288db23 Binary files /dev/null and b/tests/__init__.pyc differ diff --git a/tests/__pycache__/__init__.cpython-36.pyc b/tests/__pycache__/__init__.cpython-36.pyc deleted file mode 100644 index 293048f..0000000 Binary files a/tests/__pycache__/__init__.cpython-36.pyc and /dev/null differ diff --git a/tests/__pycache__/__init__.cpython-37.pyc b/tests/__pycache__/__init__.cpython-37.pyc deleted file mode 100644 index 6b277ce..0000000 Binary files a/tests/__pycache__/__init__.cpython-37.pyc and /dev/null differ diff --git a/tests/__pycache__/test_infrastructure.cpython-36-pytest-5.4.3.pyc b/tests/__pycache__/test_infrastructure.cpython-36-pytest-5.4.3.pyc deleted file mode 100644 index e78492f..0000000 Binary files a/tests/__pycache__/test_infrastructure.cpython-36-pytest-5.4.3.pyc and /dev/null differ diff --git a/tests/__pycache__/test_infrastructure.cpython-37-PYTEST.pyc b/tests/__pycache__/test_infrastructure.cpython-37-PYTEST.pyc deleted file mode 100644 index 548384a..0000000 Binary files a/tests/__pycache__/test_infrastructure.cpython-37-PYTEST.pyc and /dev/null differ diff --git a/tests/__pycache__/test_infrastructure.cpython-37-pytest-5.4.3.pyc b/tests/__pycache__/test_infrastructure.cpython-37-pytest-5.4.3.pyc deleted file mode 
100644 index 2653076..0000000 Binary files a/tests/__pycache__/test_infrastructure.cpython-37-pytest-5.4.3.pyc and /dev/null differ diff --git a/tests/__pycache__/test_mechanism.cpython-36-pytest-5.4.3.pyc b/tests/__pycache__/test_mechanism.cpython-36-pytest-5.4.3.pyc deleted file mode 100644 index 76d2df0..0000000 Binary files a/tests/__pycache__/test_mechanism.cpython-36-pytest-5.4.3.pyc and /dev/null differ diff --git a/tests/__pycache__/test_mechanism.cpython-37-PYTEST.pyc b/tests/__pycache__/test_mechanism.cpython-37-PYTEST.pyc deleted file mode 100644 index 3a52a69..0000000 Binary files a/tests/__pycache__/test_mechanism.cpython-37-PYTEST.pyc and /dev/null differ diff --git a/tests/__pycache__/test_mechanism.cpython-37-pytest-5.4.3.pyc b/tests/__pycache__/test_mechanism.cpython-37-pytest-5.4.3.pyc deleted file mode 100644 index c03beea..0000000 Binary files a/tests/__pycache__/test_mechanism.cpython-37-pytest-5.4.3.pyc and /dev/null differ diff --git a/tests/__pycache__/test_operations.cpython-36-pytest-5.4.3.pyc b/tests/__pycache__/test_operations.cpython-36-pytest-5.4.3.pyc deleted file mode 100644 index bcebbc2..0000000 Binary files a/tests/__pycache__/test_operations.cpython-36-pytest-5.4.3.pyc and /dev/null differ diff --git a/tests/__pycache__/test_operations.cpython-37-PYTEST.pyc b/tests/__pycache__/test_operations.cpython-37-PYTEST.pyc deleted file mode 100644 index bb5aa63..0000000 Binary files a/tests/__pycache__/test_operations.cpython-37-PYTEST.pyc and /dev/null differ diff --git a/tests/__pycache__/test_operations.cpython-37-pytest-5.4.3.pyc b/tests/__pycache__/test_operations.cpython-37-pytest-5.4.3.pyc deleted file mode 100644 index f9885bc..0000000 Binary files a/tests/__pycache__/test_operations.cpython-37-pytest-5.4.3.pyc and /dev/null differ diff --git a/tests/test_config.yaml b/tests/test_config.yaml index d6177d1..33566a3 100644 --- a/tests/test_config.yaml +++ b/tests/test_config.yaml @@ -21,4 +21,5 @@ fitness: arbiter: rules: - Glaxo - 
+mutator: + data_file: data/smarts/mutation_collection.tsv diff --git a/tests/test_infrastructure.py b/tests/test_infrastructure.py index e306dbc..74a50e0 100644 --- a/tests/test_infrastructure.py +++ b/tests/test_infrastructure.py @@ -1,7 +1,9 @@ import pytest import omegaconf +import numpy as np from rdkit import Chem -from argenomic.infrastructure import archive, arbiter +from argenomic.infrastructure import Archive, Arbiter +from argenomic.base import Molecule @pytest.fixture def default_archive(): @@ -9,7 +11,7 @@ def default_archive(): ''' Returns an archive instance of a 150 niches, spanned by ExactMolWt and MolLogP. ''' - return archive(configuration_file.archive, configuration_file.descriptor) + return Archive(configuration_file.archive, configuration_file.descriptor) @pytest.fixture def default_arbiter(): @@ -17,16 +19,17 @@ def default_arbiter(): ''' Returns an arbiter instance, initialised with GSK structural alerts. ''' - return arbiter(configuration_file.arbiter) + return Arbiter(configuration_file.arbiter) @pytest.fixture def default_molecules(): - smiles = ["Clc1ccc(cc1)C(c2ccccc2)N3CCN(CC3)CCOCC(=O)O", "CC1=CC(Cl)=CC(C(=O)N[C@@H]2C[C@@H]3CCCC[C@@H]32)=C1C"] - molecules = [Chem.MolFromSmiles(individual_smiles) for individual_smiles in smiles] + smiles_list = ["Clc1ccc(cc1)C(c2ccccc2)N3CCN(CC3)CCOCC(=O)O", "CC1=CC(Cl)=CC(C(=O)N[C@@H]2C[C@@H]3CCCC[C@@H]32)=C1C"] + pedigree = ("database", "no reaction", "no parent") + molecules = [Molecule(Chem.CanonSmiles(smiles), pedigree, fitness=0.2 , descriptor=[np.random.rand(), np.random.rand()]) for smiles in smiles_list] return molecules def test_default_archive(default_archive, default_molecules): - default_archive.add_to_archive(default_molecules, [[0.1, 0.1], [0.9, 0.9]], [0.0, 1.0]) + default_archive.add_to_archive(default_molecules) assert len(default_archive.sample(2)) == 2 assert len(default_archive.sample_pairs(5)) == 5 diff --git a/tests/test_mechanism.py b/tests/test_mechanism.py index 
76744f8..eff1c5c 100644 --- a/tests/test_mechanism.py +++ b/tests/test_mechanism.py @@ -1,7 +1,8 @@ import pytest import omegaconf from rdkit import Chem -from argenomic.mechanism import descriptor, fitness +from argenomic.mechanism import Descriptor, Fitness +from argenomic.base import Molecule @pytest.fixture def default_descriptor(): @@ -9,30 +10,30 @@ def default_descriptor(): Returns a descriptor instance, set-up to calculate normalised ExactMolWt and MolLogP. ''' configuration_file = omegaconf.OmegaConf.load("./tests/test_config.yaml") - return descriptor(configuration_file.descriptor) + return Descriptor(configuration_file.descriptor) @pytest.fixture def default_fitness(): configuration_file = omegaconf.OmegaConf.load("./tests/test_config.yaml") - return fitness(configuration_file.fitness) + return Fitness(configuration_file.fitness) @pytest.fixture def default_molecules(): - ''' - Returns a list of two molecules. - ''' - smiles = ["Clc1ccc(cc1)C(c2ccccc2)N3CCN(CC3)CCOCC(=O)O", "CC1=CC(Cl)=CC(C(=O)N[C@@H]2C[C@@H]3CCCC[C@@H]32)=C1C"] - molecules = [Chem.MolFromSmiles(individual_smiles) for individual_smiles in smiles] + smiles_list = ["Clc1ccc(cc1)C(c2ccccc2)N3CCN(CC3)CCOCC(=O)O", "CC1=CC(Cl)=CC(C(=O)N[C@@H]2C[C@@H]3CCCC[C@@H]32)=C1C"] + pedigree = ("database", "no reaction", "no parent") + molecules = [Molecule(Chem.CanonSmiles(smiles), pedigree) for smiles in smiles_list] return molecules def test_default_descriptor(default_descriptor, default_molecules): - descriptors = default_descriptor(default_molecules) - for descriptor in descriptors: - assert 0.00 <= descriptor - assert descriptor <= 1.00 + for molecule in default_molecules: + molecule = default_descriptor(molecule) + for descriptor in molecule.descriptors: + assert 0.00 <= descriptor + assert descriptor <= 1.00 + def test_default_descriptor(default_fitness, default_molecules): for molecule in default_molecules: - fitness = default_fitness(molecule) - assert 0.00 <= fitness - assert fitness <= 1.00 + 
molecule = default_fitness(molecule) + assert 0.00 <= molecule.fitness + assert molecule.fitness <= 1.00 diff --git a/tests/test_operations.py b/tests/test_operations.py index bfa9dad..f9c50f6 100644 --- a/tests/test_operations.py +++ b/tests/test_operations.py @@ -1,29 +1,29 @@ import pytest import omegaconf from rdkit import Chem -from argenomic.operations import mutator, crossover - +from argenomic.operations import Mutator, Crossover +from argenomic.base import Molecule + @pytest.fixture def default_mutator(): ''' Returns an instance of a mutator. ''' - return mutator() + configuration_file = omegaconf.OmegaConf.load("./tests/test_config.yaml") + return Mutator(configuration_file.mutator) @pytest.fixture def default_crossover(): ''' Returns an instance of a crossover. ''' - return crossover() + return Crossover() @pytest.fixture def default_molecules(): - ''' - Returns a list of two molecules. - ''' - smiles = ["Clc1ccc(cc1)C(c2ccccc2)N3CCN(CC3)CCOCC(=O)O", "CC1=CC(Cl)=CC(C(=O)N[C@@H]2C[C@@H]3CCCC[C@@H]32)=C1C"] - molecules = [Chem.MolFromSmiles(individual_smiles) for individual_smiles in smiles] + smiles_list = ["Clc1ccc(cc1)C(c2ccccc2)N3CCN(CC3)CCOCC(=O)O", "CC1=CC(Cl)=CC(C(=O)N[C@@H]2C[C@@H]3CCCC[C@@H]32)=C1C"] + pedigree = ("database", "no reaction", "no parent") + molecules = [Molecule(Chem.CanonSmiles(smiles), pedigree) for smiles in smiles_list] return molecules @pytest.mark.xfail @@ -32,11 +32,13 @@ def test_default_mutator(default_mutator, default_molecules): Tests the action of the mutator. May fail occasionally due to stochasticity. The result of this test is reported separtely. ''' - for molecule in default_molecules: + molecules = default_molecules + for molecule in molecules: assert len(default_mutator(molecule)) > 0 def test_default_crossover(default_crossover, default_molecules): ''' Tests the action of the crossover. 
''' - assert len(default_crossover(default_molecules)) > 0 + molecules = default_molecules + assert len(default_crossover(molecules)) > 0