diff --git a/AdaptivePELE/analysis/backtrackAdaptiveTrajectory.py b/AdaptivePELE/analysis/backtrackAdaptiveTrajectory.py
index fe63a9ea..9fc820a7 100644
--- a/AdaptivePELE/analysis/backtrackAdaptiveTrajectory.py
+++ b/AdaptivePELE/analysis/backtrackAdaptiveTrajectory.py
@@ -88,8 +88,11 @@ def main(trajectory, snapshot, epoch, outputPath, out_filename, topology, use_pd
     epoch = str(epoch)
     sys.stderr.write("Writing pathway...\n")
     with open(outputPath+out_filename, "a") as f:
-        f.write("ENDMDL\n".join(itertools.chain.from_iterable(pathway)))
-
+        if topology:
+            # Quick fix to avoid problems when visualizing with PyMol
+            f.write("ENDMDL\nMODEL 2\n".join(itertools.chain.from_iterable(pathway)))
+        else:
+            f.write("ENDMDL\n".join(itertools.chain.from_iterable(pathway)))

 if __name__ == "__main__":
     traj, num_snapshot, num_epoch, output_path, output_filename, top, use_pdb = parseArguments()
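# A minimal sketch (not part of the patch) of why the "ENDMDL\nMODEL 2\n"
# separator above helps: frames extracted from xtc trajectories carry no MODEL
# records, so PyMOL would read the concatenated pathway as a single state
# unless an explicit MODEL line separates the frames. `frames` is a
# hypothetical stand-in for the real `pathway` list of PDB-block strings.
frames = [
    "MODEL 1\nATOM      1  N   ALA A   1      11.104   6.134  -6.504  1.00  0.00           N\n",
    "ATOM      1  N   ALA A   1      11.204   6.234  -6.604  1.00  0.00           N\n",
]
with open("pathway.pdb", "w") as f:
    # every frame after the first is opened with its own MODEL record
    f.write("ENDMDL\nMODEL 2\n".join(frames))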
diff --git a/AdaptivePELE/analysis/bestStructs.py b/AdaptivePELE/analysis/bestStructs.py
index 1b7a8b7d..3a84d521 100644
--- a/AdaptivePELE/analysis/bestStructs.py
+++ b/AdaptivePELE/analysis/bestStructs.py
@@ -37,11 +37,12 @@
 OUTPUT_FOLDER = 'BestStructs'
 DIR = os.path.abspath(os.getcwd())
 STEPS=3
+HELP = "USE:\n\n- For xtc: python bestStructs.py 5 --top topology.pdb\n\n- For pdb: python bestStructs.py 5"


 def parse_args():
-    parser = argparse.ArgumentParser()
+    parser = argparse.ArgumentParser(description=HELP, formatter_class=argparse.RawTextHelpFormatter)
     parser.add_argument("crit", type=str, nargs='+', help="Criteria we want to rank and output the strutures for. Must be a column of the report. i.e: Binding Energy")
     parser.add_argument("--crit2", type=str, nargs='+', help="Second Criteria we want to rank and output the strutures for. Must be a column of the report. i.e: SasaLig, i.e.2. 3", default="")
     parser.add_argument("--path", type=str, help="Path to Pele's results root folder i.e: path=/Pele/results/", default=DIR)
@@ -124,7 +125,7 @@ def main(criteria, path=DIR, n_structs=10, sort_order="min", out_freq=FREQ, outp
 def extract_snapshot_from_pdb(path, f_id, output, topology, step, out_freq, f_out):
     f_in = glob.glob(os.path.join(os.path.dirname(path), "*trajectory*_{}.pdb".format(f_id)))
     if not f_in:
-        glob.glob(os.path.join(os.path.dirname(path), "*trajectory*_{}.*".format(f_id)))
+        f_in = glob.glob(os.path.join(os.path.dirname(path), "*trajectory*_{}.*".format(f_id)))
     if len(f_in) == 0:
         sys.exit("Trajectory {} not found. Be aware that PELE trajectories must contain the label \'trajectory\' in their file name to be detected".format("*trajectory*_{}".format(f_id)))
     f_in = f_in[0]
@@ -147,7 +148,7 @@
             raise AttributeError("Model not found. Check the -f option.")
         traj.append("ENDMDL\n")
         f.write("\n".join(traj))
-
+    print("Model {} selected".format(os.path.join(output, f_out)))

 def extract_snapshot_from_xtc(path, f_id, output, topology, step, out_freq, f_out):
     f_in = glob.glob(os.path.join(os.path.dirname(path), "*trajectory*_{}.xtc".format(f_id)))
@@ -156,6 +157,8 @@ def extract_snapshot_from_xtc(path, f_id, output, topology, step, out_freq, f_ou
     if len(f_in) == 0:
         sys.exit("Trajectory {} not found. Be aware that PELE trajectories must contain the label \'trajectory\' in their file name to be detected".format("*trajectory*_{}".format(f_id)))
     splitTrajectory.main(output, [f_in[0], ], topology, [(step)/out_freq+1, ], template= f_out)
+    print("Model {} selected".format(f_out))
+
diff --git a/AdaptivePELE/analysis/cluster.py b/AdaptivePELE/analysis/cluster.py
new file mode 100644
index 00000000..48f6ee74
--- /dev/null
+++ b/AdaptivePELE/analysis/cluster.py
@@ -0,0 +1,345 @@
+import matplotlib
+import mdtraj
+import time
+matplotlib.use('TkAgg')
+import signal
+import matplotlib.pyplot as plt
+from matplotlib.widgets import RectangleSelector
+import numpy as np
+from multiprocessing import Pool
+import os
+import hdbscan
+import sklearn.metrics as mt
+from tqdm import tqdm, trange
+import prody
+import errno
+import argparse
+import pandas as pd
+import glob
+import re
+import sys
+import warnings
+import AdaptivePELE.analysis.splitTrajectory as st
+import AdaptivePELE.analysis.backtrackAdaptiveTrajectory as bk
+
+"""
+
+   Description: Parse all the reports found under 'path' and sort them all
+   by the chosen criteria (Binding Energy as default), taking into account the
+   frequency with which our PELE control file writes a structure (the -ofreq
+   param, 1 by default). To sort from higher to lower value use -f "max";
+   otherwise the structures are ranked from lower to higher criteria values.
+   The number of structures to be ranked is controlled by -i 'nstruct' (default 10).
+
+   For any problem do not hesitate to contact us through the email address written below.
+
+"""
+
+__author__ = "Daniel Soler Viladrich"
+__email__ = "daniel.soler@nostrumbiodiscovery.com"
+
+# DEFAULT VALUES
+ORDER = "min"
+CRITERIA = ["Binding", "Energy"]
+OUTPUT = "Structure_{}.pdb"
+FREQ = 1
+REPORT = "report"
+TRAJ = "trajectory"
+ACCEPTED_STEPS = 'numberOfAcceptedPeleSteps'
+OUTPUT_FOLDER = 'BestStructs'
+DIR = os.path.abspath(os.getcwd())
+STEPS = 3
+
+
+def parse_args():
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("crit1", type=int, help="First criterion we want to rank and output the structures for. Must be a column of the report. i.e: Binding Energy")
+    parser.add_argument("crit2", type=int, help="Second criterion we want to rank and output the structures for. Must be a column of the report. i.e: Binding Energy")
+    parser.add_argument("ad_steps", type=int, help="Adaptive steps i.e: self.ad_steps")
+    parser.add_argument("--path", type=str, help="Path to Pele's results root folder i.e: path=/Pele/results/", default=DIR)
+    parser.add_argument("--ofreq", "-f", type=int, help="Every how many steps a snapshot was written to the PELE trajectory i.e: self.ad_steps", default=FREQ)
+    parser.add_argument("--out", "-o", type=str, help="Output Path. i.e: BindingEnergies_apo", default=OUTPUT_FOLDER)
+    parser.add_argument("--numfolders", "-nm", action="store_true", help="Not to parse non numerical folders")
+    parser.add_argument("--top", "-t", type=str, help="Topology file for xtc", default=None)
+    parser.add_argument("--first", action="store_true", help="Skip first line")
+    parser.add_argument("--zcol", type=int, help="Third criterion we want to rank and output the structures for. Must be a column of the report. i.e: SASA", default=2)
+    parser.add_argument("--resname", type=str, help="Resname of the ligand. Required for clustering", default="LIG")
+    parser.add_argument("--percentage", type=int, help="Percentage of snapshots, selected by the first criterion, to cluster", default=30)
+    parser.add_argument("--thresh", type=float, help="Threshold of the first criterion below which structures will be clustered. i.e --thresh -60 (binding energy)", default=None)
+    parser.add_argument("--cpus", type=int, help="Number of workers to use", default=1)
+    args = parser.parse_args()
+
+    return args.crit1, args.crit2, args.zcol, args.ad_steps, os.path.abspath(args.path), args.ofreq, args.out, args.numfolders, args.top, args.first, args.resname, args.percentage, args.thresh, args.cpus
+
+
+def is_adaptive():
+    folders = glob.glob("{}/*/".format(DIR))
+    folders_numerical = [os.path.basename(os.path.normpath(folder)) for folder in folders]
+    if len(folders_numerical) > 1:
+        return True
+    else:
+        return False
+
+
+def extract_ligand_coords(info):
+    # resname is passed through the pool input so the worker does not rely on a global
+    path, snapshot, resname = info
+    traj = mdtraj.load_frame(path, snapshot, top="topology.pdb")
+    atoms_info = traj.topology.to_dataframe()[0]
+    condition = atoms_info['resName'] == resname
+    atom_numbers_ligand = atoms_info[condition].serial.tolist()
+    coords = []
+    for atom_num in atom_numbers_ligand:
+        try:
+            coords.extend(traj.xyz[0, atom_num-1].tolist())
+        except IndexError:
+            continue
+    return coords
+
+
+def cluster(data, resname, topology=None, out_freq=1, cpus=1):
+    print("Clustering")
+
+    # Extract data
+    epoch = data[DIR].tolist()
+    trajectory = data[REPORT].tolist()
+    snapshot = data.iloc[:, 4].tolist()
+    snapshots = ["{}/*trajectory_{}.*".format(os.path.basename(os.path.dirname(e)), traj) for e, traj in zip(epoch, trajectory)]
+
+    # Get Files
+    paths = []
+    for s in snapshots:
+        if glob.glob(s):
+            paths.extend(glob.glob(s))
+        else:
+            paths.extend(glob.glob(os.path.basename(s)))
+
+    # Extract atom coordinates from files
+    # Parallelized across the requested number of cpus
+    t0 = time.time()
+    print("Using {} cpus".format(cpus))
+    p = Pool(processes=cpus)
+    # Python 2.7 compatibility: pass argument lists instead of using starmap
+    pool_input = [[pt, s, resname] for pt, s in zip(paths, snapshot)]
+    all_coords = p.map(extract_ligand_coords, pool_input)
+    p.close()
+    t1 = time.time()
+    print("Time to extract atom coords")
+    print(t1-t0)
+
+    # Cluster
+    cluster_with_dbscan(paths, snapshot, all_coords, topology=topology)
+
+
+def main(criteria1, criteria2, criteria3, ad_steps, path=DIR, out_freq=FREQ, output=OUTPUT_FOLDER, numfolders=False, topology=None, skip_first=False,
+         resname="LIG", percentage=30, threshold=None, cpus=1):
+    """
+
+    Description: Rank the trajectories found in the report files under path
+    by the chosen criteria. Finally, output the best n_structs.
+
+    Input:
+
+       Path: Path to look for *report* files in all its subfolders.
+
+       Criteria: Criteria to sort the structures.
+       Needs to be the name of one of the PELE report file columns.
+       (Default= "Binding Energy")
+
+       n_structs: Number of structures to create.
+
+       sort_order: "min" to sort from lower to higher, "max" from higher to lower.
+
+       out_freq: Output frequency of our PELE control file.
+
+    Output:
+
+       f_out: Name of the n output structures.
+    """
+    # Check whether it is an adaptive simulation or not
+    adaptive = is_adaptive()
+
+    # Find reports
+    reports = find_reports(path, numfolders)
+
+    # Retrieve column names from report
+    steps, crit1_name, crit2_name, crit3_name = get_column_names(reports, STEPS, criteria1, criteria2, criteria3)
+
+    # Retrieve data from reports
+    data = parse_values(reports, criteria1, criteria2, steps, crit1_name, crit2_name, skip_first)
+
+    # Filter values
+    if threshold:
+        filter_data = Filter(data, percentage, crit1_name, thresh=threshold)
+    else:
+        filter_data = Filter(data, percentage, crit1_name)
+
+    # Cluster
+    cluster(filter_data, resname, topology=topology, out_freq=out_freq, cpus=cpus)
+
+
+def Filter(values, percentage, column_name, thresh=None):
+    if thresh:
+        condition = (values[column_name] < thresh)
+        data_filtered = values[condition]
+    else:
+        n_values_to_take = int(values.shape[0]*percentage/100)
+        data_filtered = values.nlargest(n_values_to_take, column_name)
+    print("Filtered simulation data {}".format(data_filtered.shape))
+    return data_filtered
+
+
+def find_reports(path, numfolders):
+    reports = glob.glob(os.path.join(path, "*/*report_*"))
+    reports = glob.glob(os.path.join(path, "*report_*")) if not reports else reports
+    reports = filter_non_numerical_folders(reports, numfolders)
+    try:
+        reports[0]
+    except IndexError:
+        raise IndexError("No report file found. Check that you are in Adaptive's or PELE's root folder")
+    return reports
+
+
+def parse_values(reports, criteria1, criteria2, steps, crit1_name, crit2_name, first=False, cpus=1):
+    """
+
+    Description: Parse the 'reports' and create a sorted array
+    of size n_structs following the criteria chosen by the user.
+
+    """
+    info_reports = [retrieve_report_data(report) for report in reports]
+    data = pd.concat(info_reports)
+    data.drop_duplicates(subset=[crit1_name, crit2_name], inplace=True)
+    print("Simulation data {}".format(data.shape))
+    return data
+
+
+def retrieve_report_data(report):
+    # Get report
+    report_number = os.path.basename(report).split("_")[-1]
+    # Read data
+    try:
+        data = pd.read_csv(report, sep=' ', engine='python')
+    except pd.errors.EmptyDataError:
+        warnings.warn("Report {} corrupted".format(report), UserWarning)
+    # Skip first line if asked
+    # if first and os.path.basename(os.path.dirname(report)):
+    #     data = data.iloc[1:]
+    # Insert path and filename
+    data.insert(0, DIR, [report]*data.shape[0])
+    data.insert(1, REPORT, [report_number]*data.shape[0])
+    return data
+
+
+def filter_non_numerical_folders(reports, numfolders):
+    """
+    Filter out non-numerical folders among
+    the folders to parse
+    """
+    if(numfolders):
+        new_reports = [report for report in reports if(os.path.basename(os.path.dirname(report)).isdigit())]
+        return new_reports
+    else:
+        return reports
+
+
+def get_column_names(reports, steps, criteria1, criteria2, criteria3):
+    data = pd.read_csv(reports[0], sep=' ', engine='python')
+    data = list(data)
+    return data[int(steps)-1], data[criteria1-1], data[criteria2-1], data[criteria3-1]
+
+
+def mkdir_p(path):
+    try:
+        os.makedirs(path)
+    except OSError as exc:  # Python >2.5
+        if exc.errno == errno.EEXIST and os.path.isdir(path):
+            pass
+        else:
+            raise
+
+
+def cluster_with_dbscan(paths, snapshots, all_coordinates, out_freq=1, topology=None):
+    """
+    Use the high-performance hdbscan library to run
+    an all-atom clustering of the chosen structures
+    """
+    n_samples = len(snapshots)
+
+    # Cluster
+    labels = []
+    results = []
+    t0 = time.time()
+    try:
+        db = hdbscan.HDBSCAN(min_samples=int(n_samples*0.10)+1).fit(all_coordinates)
+    except ValueError:
+        raise ValueError("Ligand not found. Check the option --resname, i.e. python cluster.py 5 6 7 --resname LIG")
+    result = db.labels_
+    labels.append(len(set(result)))
+    results.append(result)
+    t1 = time.time()
+    print("Time clustering")
+    print(t1-t0)
+
+    # Get best result
+    t0 = time.time()
+    mx_idx = np.argmax(np.array(labels))
+    final_result = results[mx_idx]
+    try:
+        silhouette_samples = mt.silhouette_samples(all_coordinates, final_result)
+    except ValueError:
+        raise ValueError("Clustering failed. The structures do not follow any pattern, or there are not enough of them")
+    max_clust = {label: [path, snap, sil] for (path, snap, label, sil) in zip(paths, snapshots, final_result, silhouette_samples)}
+
+    # Get the representative of each cluster (member with the highest silhouette)
+    for path, snapshot, label, sil in zip(paths, snapshots, final_result, silhouette_samples):
+        if sil > max_clust[label][2]:
+            max_clust[label] = [path, snapshot, sil]
+
+    # Get structures
+    for i, (label, info) in enumerate(max_clust.items()):
+        # if label == -1: continue
+        output = "Clusters"
+        if not os.path.exists(output):
+            os.mkdir(output)
+        f_out = "cluster_{}.pdb".format(label+1)
+        f_in, snapshot, _ = info
+
+        # XTC
+        if topology:
+            found = st.main(output, [f_in, ], topology, [snapshot, ], template=f_out)
+            if found:
+                print("MODEL {} has been selected as {}".format(f_in, f_out))
+            else:
+                print("MODEL {} not found. Check -f option".format(f_in))
+        # PDB
+        else:
+            traj = []
+            model = (snapshot)/out_freq+1
+            with open(f_in, 'r') as input_file:
+                file_content = input_file.read()
+            trajectory_selected = re.search('MODEL\s+%d(.*?)ENDMDL' % int(model), file_content, re.DOTALL)
+            with open(os.path.join(output, f_out), 'w') as f:
+                traj.append("MODEL %d" % int(model))
+                try:
+                    traj.append(trajectory_selected.group(1))
+                except AttributeError:
+                    raise AttributeError("Model {} not found. Check the -f option.".format(f_in))
+                traj.append("ENDMDL\n")
+                f.write("\n".join(traj))
+            print("MODEL {} has been selected as {}".format(f_in, f_out))
+    t1 = time.time()
+    print("Time post processing")
+    print(t1-t0)
+
+
+if __name__ == "__main__":
+    criteria1, criteria2, criteria3, ad_steps, path, out_freq, output, numfolders, topology, skip_first, resname, percentage, thresh, cpus = parse_args()
+    main(criteria1, criteria2, criteria3, ad_steps, path, out_freq, output, numfolders, topology, skip_first, resname, percentage, thresh, cpus)
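# A self-contained sketch (with synthetic data) of the clustering core used in
# cluster.py above: HDBSCAN is fit on one flattened coordinate row per snapshot,
# and silhouette scores pick the most representative member of each cluster.
# The 10% min_samples heuristic mirrors cluster_with_dbscan(); the two Gaussian
# blobs are an assumption standing in for extract_ligand_coords() output.
import numpy as np
import hdbscan
import sklearn.metrics as mt

rng = np.random.default_rng(0)
coords = np.vstack([rng.normal(0, 0.05, (50, 30)),   # 50 snapshots near pose A
                    rng.normal(1, 0.05, (50, 30))])  # 50 snapshots near pose B
db = hdbscan.HDBSCAN(min_samples=int(coords.shape[0]*0.10)+1).fit(coords)
labels = db.labels_  # -1 marks noise
# silhouette_samples raises ValueError with fewer than 2 labels, which is the
# failure mode the patch wraps in its "Clustering failed" error
sil = mt.silhouette_samples(coords, labels)

best = {}  # cluster id -> (snapshot index, silhouette) of its representative
for i, (lab, s) in enumerate(zip(labels, sil)):
    if lab not in best or s > best[lab][1]:
        best[lab] = (i, s)
print(best)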
run_trajectory_", default="trajectory_") parser.add_argument('--use_pdb', action="store_true", help="To use when having pdb files with .xtc extension") + parser.add_argument('--png', action="store_true", help="Save plot in png format") + parser.add_argument('--CA', action="store_true", help="Cluster by CA") + parser.add_argument('--sidechains', action="store_true", help="Cluster by sidechain RMSD") + parser.add_argument('--restart', action="store_true", help="Restart analysis from previous clusters") args = parser.parse_args() - return args.nClusters, args.crit1, args.crit2, args.ligand_resname, args.atomId, args.o, args.top, args.cpus, args.report, args.traj, args.use_pdb + return args.nClusters, args.crit1, args.crit2, args.ligand_resname, args.atomId, args.o, args.top, args.cpus, args.report, args.traj, args.use_pdb, args.png, args.CA, args.sidechains, args.restart -def plotClusters(fields1, fields2, crit1, crit2, output): +class cd: + """Context manager for changing the current working directory""" + def __init__(self, newPath): + self.newPath = os.path.expanduser(newPath) + + def __enter__(self): + self.savedPath = os.getcwd() + os.chdir(self.newPath) + + def __exit__(self, etype, value, traceback): + os.chdir(self.savedPath) + + +def write_snapshot(snap_num, trajectory, filename, topology=None, use_pdb=False): + if not topology: + snapshots = utilities.getSnapshots(trajectory, topology=topology, use_pdb=use_pdb) + with open(filename, "w") as fw: + fw.write(snapshots[snap_num]) + else: + splitTrajectory.main("", [trajectory, ], topology, [snap_num+1,],template=filename, use_pdb=use_pdb) + + + + +def plotClusters(fields1, fields2, crit1, crit2, output, png=False): labels = ["cluster_{}".format(i) for i in np.arange(len(fields1))] fig, ax = plt.subplots() ax.scatter(fields1, fields2, label=labels) @@ -37,7 +67,10 @@ def plotClusters(fields1, fields2, crit1, crit2, output): ax.set_xlabel(crit1) ax.set_ylabel(crit2) print("Plotting") - fig.savefig(os.path.join(output, "ClusterMap.pdf"), format='pdf', dpi=1200) + if png: + fig.savefig(os.path.join(output, "ClusterMap.png")) + else: + fig.savefig(os.path.join(output, "ClusterMap.pdf"), format='pdf', dpi=1200) def writePDB(pmf_xyzg, title="clusters.pdb"): templateLine = "HETATM%s H%sCLT L 502 %s%s%s 0.75%s H\n" @@ -70,7 +103,7 @@ def writeInitialStructures(field1, field2, crit1, crit2, centers_info, filename_ def get_centers_info(trajectoryFolder, trajectoryBasename, num_clusters, clusterCenters): - centersInfo = {x: {"structure": None, "minDist": 1e6, "center": None} for x in xrange(num_clusters)} + centersInfo = {x: {"structure": None, "minDist": 1e6, "center": None} for x in range(num_clusters)} trajFiles = glob.glob(os.path.join(trajectoryFolder, trajectoryBasename)) for traj in trajFiles: @@ -82,7 +115,7 @@ def get_centers_info(trajectoryFolder, trajectoryBasename, num_clusters, cluster nSnap = snapshot[0] snapshotCoords = snapshot[1:] dist = np.sqrt(np.sum((clusterCenters-snapshotCoords)**2, axis=1)) - for clusterInd in xrange(num_clusters): + for clusterInd in range(num_clusters): if dist[clusterInd] < centersInfo[clusterInd]['minDist']: centersInfo[clusterInd]['minDist'] = dist[clusterInd] centersInfo[clusterInd]['structure'] = (epoch, int(iTraj), nSnap) @@ -93,38 +126,103 @@ def get_centers_info(trajectoryFolder, trajectoryBasename, num_clusters, cluster def get_metric(criteria, epoch_num, traj_num, snap_num, report): report = os.path.join(str(epoch_num), "{}{}".format(report, traj_num)) report_data = pd.read_csv(report, sep=' ', 
engine='python') - value = report_data.iloc[snap_num, criteria-1] + print(snap_num, type(snap_num)) + print(type(report_data["Step"].tolist()[0])) + print(report_data) + print(report_data["numberOfAcceptedPeleSteps"] == (snap_num-1)) + value = report_data[(report_data["numberOfAcceptedPeleSteps"] == (snap_num-1))].values[0][criteria-1] header = list(report_data)[criteria-1] return value, header -def main(num_clusters, criteria1, criteria2, output_folder, ligand_resname, atom_ids, cpus=2, topology=None, report="report_", traj="trajectory_", use_pdb=False): - if not glob.glob("*/extractedCoordinates/coord_*"): - extractCoords.main(lig_resname=ligand_resname, non_Repeat=True, atom_Ids=atom_ids, nProcessors=cpus, parallelize=False, topology=topology, use_pdb=use_pdb) +def assesClusterConvergence(df, num_clusters, traj_name = "trajectory_", topology=None): + for i in range(num_clusters): + path = "ClustersSummary/Cluster{}".format(i) + if not os.path.exists(path): + os.makedirs(path) + selected_clust = df[df["Cluster"] == float(i)].astype(float) + maximum = selected_clust.nlargest(10, "Binding Energy") + epochs = maximum[simulationToCsv.EPOCH].values + trajs = maximum[simulationToCsv.TRAJ].values + steps = maximum[simulationToCsv.STEPS].values + for j, (epoch, traj, step) in enumerate(zip(epochs, trajs, steps)): + trajectory = "{}/{}{}.xtc".format(int(epoch), traj_name, int(traj)) if topology else "{}/{}{}.pdb".format(int(epoch), traj_name, int(traj)) + write_snapshot(int(step), trajectory, os.path.join(path, "Cluster_{}_{}.pdb".format(i, j)), topology=topology) + + +def save_to_df(input): + df, traject, dtraj = input + dfs_tmp = [] + epoch, traj = traject.strip(".dat").split("_")[-2:] + for i, d in enumerate(dtraj): + df_tmp = df[(df[simulationToCsv.EPOCH] == float(epoch)) & (df[simulationToCsv.TRAJ] == float(traj)) & (df[simulationToCsv.STEPS] == float(i))] + df_tmp["Cluster"] = d + dfs_tmp.append(df_tmp) + #df.update(df_tmp) + return dfs_tmp + +def main(num_clusters, criteria1, criteria2, ligand_resname, output_folder = "ClusterCentroids", atom_ids="", cpus=2, topology=None, report="report_", traj="trajectory_", use_pdb=False, png=False, CA=0, sidechains=0, restart="all"): + #Create multiprocess pool + if cpus>1: + pool = mp.Pool(cpus) + else: + pool=mp.Pool(1) + #Extract COM ligand for each snapshot + if not glob.glob("allTrajs/traj*"): + extractCoords.main(lig_resname=ligand_resname, non_Repeat=True, atom_Ids=atom_ids, nProcessors=cpus, parallelize=True, topology=topology, use_pdb=use_pdb, protein_CA=CA, sidechains=sidechains) + + print("Clusterize trajectories by RMSD of COM") trajectoryFolder = "allTrajs" trajectoryBasename = "*traj*" stride = 1 clusterCountsThreshold = 0 folders = utilities.get_epoch_folders(".") folders.sort(key=int) - - clusteringObject = cluster.Cluster(num_clusters, trajectoryFolder, - trajectoryBasename, alwaysCluster=True, - stride=stride) - clusteringObject.clusterTrajectories() - clusteringObject.eliminateLowPopulatedClusters(clusterCountsThreshold) - clusterCenters = clusteringObject.clusterCenters + if not restart: + + clusteringObject = cluster.Cluster(num_clusters, trajectoryFolder, + trajectoryBasename, alwaysCluster=True, + stride=stride) + clusteringObject.clusterTrajectories() + clusteringObject.eliminateLowPopulatedClusters(clusterCountsThreshold) + clusterCenters = clusteringObject.clusterCenters + np.savetxt("clustercenters.dat", clusterCenters) + dtrajs = clusteringObject.dtrajs + + print("Extract metrics for each snapshot") + min_metric_trajs = {} + 
epochs = [folder for folder in glob.glob("./*/") if folder.isdigit()] + reports = simulationToCsv.gather_reports() + fields = simulationToCsv.retrieve_fields(reports[0]) + df = simulationToCsv.init_df(fields) + df = simulationToCsv.fill_data(reports, df, pool) + + print("Update data with metrics and clusters") + df.index = range(df.shape[0]) + df["Cluster"] = [None]*df.shape[0] + input_list = [ [df, Traj, d] for d, Traj in zip(dtrajs, clusteringObject.trajFilenames) ] + results = pool.map(save_to_df, input_list) + for data in results: + for df_tmp in data: + df.update(df_tmp) + df.to_csv("Simulation.csv", index=False) + if restart: + df = pd.read_csv("Simulation.csv") + clusterCenters = np.loadtxt("clustercenters.dat") + print(clusterCenters) centersInfo = get_centers_info(trajectoryFolder, trajectoryBasename, num_clusters, clusterCenters) - COMArray = [centersInfo[i]['center'] for i in xrange(num_clusters)] + COMArray = [centersInfo[i]['center'] for i in range(num_clusters)] + print("Retrieve clusters and metric") fields1 = [] fields2 = [] + print(centersInfo) for cluster_num in centersInfo: epoch_num, traj_num, snap_num = map(int, centersInfo[cluster_num]['structure']) field1, crit1_name = get_metric(criteria1, epoch_num, traj_num, snap_num, report) field2, crit2_name = get_metric(criteria2, epoch_num, traj_num, snap_num, report) - fields1.append(field1) - fields2.append(field2) + fields1.append(field1) + fields2.append(field2) if output_folder is not None: outputFolder = os.path.join(output_folder, "") @@ -132,10 +230,13 @@ def main(num_clusters, criteria1, criteria2, output_folder, ligand_resname, atom os.makedirs(outputFolder) else: outputFolder = "" + print("Output structures") writePDB(COMArray, outputFolder+"clusters_%d_KMeans_allSnapshots.pdb" % num_clusters) writeInitialStructures(fields1, fields2, crit1_name, crit2_name, centersInfo, outputFolder+"cluster_{}_{}_{}_{}_{}.pdb", traj, topology=topology, use_pdb=use_pdb) - plotClusters(fields1, fields2, crit1_name, crit2_name, outputFolder) + plotClusters(fields1, fields2, crit1_name, crit2_name, outputFolder, png=png) + assesClusterConvergence(df, num_clusters, traj, topology) + return if __name__ == "__main__": - n_clusters, criteria1, criteria2, lig_name, atom_id, output, top, cpus, report, traj, use_pdb = parseArgs() - main(n_clusters, criteria1, criteria2, output, lig_name, atom_id, cpus, top, report, traj, use_pdb) + n_clusters, criteria1, criteria2, lig_name, atom_id, output, top, cpus, report, traj, use_pdb, png, CA, sidechains, restart = parseArgs() + main(n_clusters, criteria1, criteria2, lig_name, output, atom_id, cpus, top, report, traj, use_pdb, png, CA, sidechains, restart) diff --git a/AdaptivePELE/analysis/numberOfClusters.py b/AdaptivePELE/analysis/numberOfClusters.py index 04d6ff45..aebb7770 100644 --- a/AdaptivePELE/analysis/numberOfClusters.py +++ b/AdaptivePELE/analysis/numberOfClusters.py @@ -10,6 +10,8 @@ matplotlib.use('wxagg') elif 'login' in machine: matplotlib.use('TkAgg') +else: + matplotlib.use("agg") import matplotlib.pyplot as plt try: # This might fail for older versions of matplotlib (e.g in life cluster) @@ -292,7 +294,8 @@ def main(filename, outputPath): plt.xlabel("Contact ratio") if filename != "": plt.savefig("%s%s_hist.png" % (outputPath, filename)) - plt.show() + else: + plt.show() if __name__ == "__main__": file_name, outputFolder = printHelp() diff --git a/AdaptivePELE/analysis/reportSimulation.py b/AdaptivePELE/analysis/reportSimulation.py new file mode 100644 index 00000000..6a21cd06 --- 
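# A small sketch (illustrative column values) of the DataFrame bookkeeping that
# save_to_df() and main() rely on above: the master table gets an empty
# "Cluster" column, per-trajectory sub-frames are filled with cluster ids, and
# pd.DataFrame.update() merges them back in place by index alignment.
import pandas as pd

df = pd.DataFrame({"Epoch": [0.0, 0.0, 1.0], "Traj": [1.0, 1.0, 1.0],
                   "numberOfAcceptedPeleSteps": [0.0, 1.0, 0.0]})
df["Cluster"] = [None]*df.shape[0]

sub = df[(df["Epoch"] == 0.0) & (df["Traj"] == 1.0)].copy()
sub["Cluster"] = [3, 7]  # e.g. the discretized trajectory (dtraj) assignments
df.update(sub)           # rows 0 and 1 now carry their cluster ids
print(df)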
diff --git a/AdaptivePELE/analysis/numberOfClusters.py b/AdaptivePELE/analysis/numberOfClusters.py
index 04d6ff45..aebb7770 100644
--- a/AdaptivePELE/analysis/numberOfClusters.py
+++ b/AdaptivePELE/analysis/numberOfClusters.py
@@ -10,6 +10,8 @@
     matplotlib.use('wxagg')
 elif 'login' in machine:
     matplotlib.use('TkAgg')
+else:
+    matplotlib.use("agg")
 import matplotlib.pyplot as plt
 try:
     # This might fail for older versions of matplotlib (e.g in life cluster)
@@ -292,7 +294,8 @@
         plt.xlabel("Contact ratio")
     if filename != "":
         plt.savefig("%s%s_hist.png" % (outputPath, filename))
-    plt.show()
+    else:
+        plt.show()

 if __name__ == "__main__":
     file_name, outputFolder = printHelp()
diff --git a/AdaptivePELE/analysis/reportSimulation.py b/AdaptivePELE/analysis/reportSimulation.py
new file mode 100644
index 00000000..6a21cd06
--- /dev/null
+++ b/AdaptivePELE/analysis/reportSimulation.py
@@ -0,0 +1,161 @@
+from fpdf import FPDF
+import re
+import sys
+import os
+import subprocess
+from AdaptivePELE.analysis import plotAdaptive
+import argparse
+
+CONTACTS = "contacts"
+BE = "bindingEnergy"
+SASA = "sasa"
+kind_Print, colModifier, traj_range = "PRINT_BE_RMSD", 4, None
+
+
+def arg_parse():
+    parser = argparse.ArgumentParser(description='Build a summary pdf file of the simulation')
+    parser.add_argument('control_file', type=str, help='adaptive control file')
+    parser.add_argument('--traj', type=str, help='Trajectory file name i.e. run_traj_', default='trajectory_')
+    parser.add_argument('--report', type=str, help='Report file name i.e run_report_', default='report_')
+    args = parser.parse_args()
+    return args
+
+
+def retrieve_metrics(control_file):
+    metrics_names = ["rmsd", "com_distance", "distanceToPoint", BE, SASA]
+    discard = ["random", "constrainAtomToPosition", "sphericalBox", "toThisOtherAtom", "constrainThisAtom", "springConstant", "equilibriumDistance", ]
+    try:
+        with open(control_file, "r") as f:
+            metrics = [line for line in f if "type" in (line.strip()) and not any(x in line for x in discard)]
+    except IOError:
+        raise IOError("PELE control file {} not found. Check the path inside the Adaptive control file".format(control_file))
+    pattern = r'"([A-Za-z0-9_\./\\-]*)"'
+    metrics = re.findall(pattern, "".join(metrics))
+    metrics = [metric for metric in metrics if metric in metrics_names]
+    return metrics
+
+
+def write_report(metrics, resname, initial_column=4, traj="trajectory_", report="report_"):
+
+    OUTPUT = 'adaptive_report.pdf'
+
+    pdf = FPDF()
+    pdf.add_page()
+
+    # Title
+    pdf.set_font('Arial', 'B', 15)
+    pdf.cell(80)
+    pdf.cell(30, 10, 'Simulation Report', align='C')
+
+    # Plot Binding Energy vs SASA
+    plot(1+metrics.index(BE)+initial_column, 1+metrics.index(SASA)+initial_column, ".", "BE.png", "BindingE", "sasa", report=report)
+    pdf.cell(-100)
+    pdf.set_font('Arial', 'B', 11)
+    pdf.cell(10, 49, 'Interaction Energy vs Sasa' + 34*"\t" + "Total Energy vs Interaction Energy")
+    pdf.image("BE.png", 10, 40, 83)
+
+    # Plot Total Energy vs Binding Energy
+    plot(initial_column, 1+metrics.index(BE)+initial_column, ".", "total.png", "totalE", "BindingE", report=report)
+    pdf.cell(0)
+    pdf.set_font('Arial', 'B', 11)
+    pdf.image("total.png", 100, 40, 83)
+
+    # Contacts
+    create_contact_plot(".")
+    pdf.cell(0)
+    pdf.set_font('Arial', 'B', 11)
+    pdf.image("{}_threshold.png".format(CONTACTS), 10, 110, 83)
+    pdf.image("{}_hist.png".format(CONTACTS), 100, 110, 83)
+
+    pdf.add_page()
+
+    # Plot other metrics against binding energy
+    images_per_page = 1
+    images_per_row = 0
+    # metrics = ["rmsd", "com_distance", "distanceToPoint", "clusters"]
+    metrics_names = ["rmsd", "com_distance", "distanceToPoint"]
+    for user_metric in metrics_names:
+        if images_per_page > 4:
+            pdf.add_page()
+            images_per_page = 1
+        if user_metric in metrics or user_metric == "clusters":
+            pdf, images_per_page, images_per_row = write_metric(
+                pdf, user_metric, BE, metrics, images_per_page, images_per_row, resname, traj=traj, report=report)
+
+    # Output report
+    pdf.output(OUTPUT, 'F')
+
+
+def plot_clusters(metric1, metric2, metrics, resname, traj, report):
+    command = "python -m AdaptivePELE.analysis.clusterAdaptiveRun 200 {} {} {} --report {} --traj {} --png".format(
+        get_column(metric1, metrics), get_column(metric2, metrics), resname, report, traj)
+    os.system(command)
+
+
+def get_column(metric, metrics, initial_pos=4):
+    return 1+metrics.index(metric)+initial_pos
+
+
+def write_metric(pdf, metric1, metric2, metrics, images_per_page, images_per_row, resname=None, initial_pos=4, traj="trajectory_", report="report_"):
+
+    # Create image
+    if metric1 == "clusters":
+        image_name = "Cluster_analisis/ClusterMap.png"
+        plot_clusters("bindingEnergy", "sasa", metrics, resname, traj, report)
+    else:
+        image_name = "{}.png".format(metric1)
+        plot(1+metrics.index(metric1)+initial_pos, 1+metrics.index(metric2)+initial_pos, ".",
+             image_name, metric1, metric2, zcol=initial_pos + 1 + metrics.index("sasa"), report=report)
+
+    # Move over the pdf
+    pdf.image(image_name, pdf.get_x(), pdf.get_y(), 83)
+    images_per_row += 1
+    if images_per_row == 1:
+        pdf.cell(80, 0, "", 0, 0)
+    else:
+        pdf.set_xy(10, 100)
+        pdf.cell(80, 0, "", 0, 2)
+        images_per_row = 0
+    images_per_page += 1
+
+    # Update cursor
+    return pdf, images_per_page, images_per_row
+
+
+def plot(Xcol, Ycol, path, name, xcol_name, ycol_name, zcol=5, report="report_"):
+    try:
+        plot_line = plotAdaptive.generatePrintString(8, Xcol, Ycol, report, "PRINT_BE_RMSD", zcol, None).strip("\n")
+    except TypeError:
+        raise TypeError("Report not found. Use the flag --report, i.e. --report run_report_")
+    command = '''gnuplot -e "set terminal png; set output '{}'; set xlabel '{}'; set ylabel '{}'; {}"'''.format(
+        name, xcol_name, ycol_name, plot_line)
+    os.system(command)
+
+
+def create_contact_plot(path, filenames=CONTACTS):
+    command = "python -m AdaptivePELE.analysis.numberOfClusters -f contacts"
+    os.system(command)
+
+
+def retrieve_fields(control_file):
+    with open(control_file, "r") as f:
+        content = f.readlines()
+    control_file_line = [line for line in content if line.strip().startswith('"controlFile"')][0]
+    ligand_res_line = [line for line in content if line.strip().startswith('"ligandResname"')][0]
+    pele, resname = control_file_line.split(":")[1].strip().strip(",").strip('"'), ligand_res_line.split(":")[1].strip().strip(",").strip('"')
+    if not os.path.isfile(pele):
+        path = os.path.dirname(os.path.abspath(control_file))
+        pele = os.path.join(path, pele)
+    return pele, resname
+
+
+def main(control_file, traj, report):
+    print("Search for the PELE control file")
+    pele_conf, resname = retrieve_fields(control_file)
+    print("Retrieve metrics")
+    metrics = retrieve_metrics(pele_conf)
+    print("Build report")
+    write_report(metrics, resname, 4, traj, report)
+    print("Analysis finished successfully")
+
+
+if __name__ == "__main__":
+    args = arg_parse()
+    main(args.control_file, args.traj, args.report)
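# A minimal pyfpdf sketch of the page layout reportSimulation.py builds above:
# a centered title cell, then pre-rendered png plots dropped at absolute
# coordinates (x, y in mm; w scales the image). The png file names are
# placeholders; in the real script they come from the gnuplot calls in plot().
from fpdf import FPDF

pdf = FPDF()
pdf.add_page()
pdf.set_font('Arial', 'B', 15)
pdf.cell(80)                                   # push the cursor to the right
pdf.cell(30, 10, 'Simulation Report', align='C')
pdf.image("BE.png", 10, 40, 83)                # left plot
pdf.image("total.png", 100, 40, 83)            # right plot, same row
pdf.output('adaptive_report.pdf', 'F')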
diff --git a/AdaptivePELE/analysis/simulationToCsv.py b/AdaptivePELE/analysis/simulationToCsv.py
new file mode 100644
index 00000000..cbf8c2ec
--- /dev/null
+++ b/AdaptivePELE/analysis/simulationToCsv.py
@@ -0,0 +1,85 @@
+import pandas as pd
+import argparse
+import os
+import glob
+
+EPOCH = 'Epoch'
+TRAJ = 'Traj'
+STEPS = 'numberOfAcceptedPeleSteps'
+
+
+def arg_parse():
+    parser = argparse.ArgumentParser(description='Create a csv file with the summary of the simulation. Must be run from the Adaptive folder')
+    args = parser.parse_args()
+    return args
+
+
+def retrieve_fields(report):
+    data = pd.read_csv(report, sep=' ', engine='python')
+    return list(data)[2:]
+
+
+def gather_reports():
+    reports = glob.glob(os.path.join("*/*report*"))
+    reports = glob.glob(os.path.join("*report*")) if not reports else reports
+    reports = filter_non_numerical_folders(reports)
+    try:
+        reports[0]
+    except IndexError:
+        raise IndexError("No report file found. Check that you are in Adaptive's or PELE's root folder")
+    return reports
+
+
+def filter_non_numerical_folders(reports, numfolders=True):
+    """
+    Filter out non-numerical folders among
+    the folders to parse
+    """
+    if(numfolders):
+        new_reports = [report for report in reports if(os.path.basename(os.path.dirname(report)).isdigit())]
+        return new_reports
+    else:
+        return reports
+
+
+def fill_data(reports, df, pool):
+    workers = []
+    if pool is None:
+        # serial version
+        for report in reports:
+            data = extract_data(report)
+            df = pd.concat([df, data], axis=0)
+    else:
+        results = pool.map(extract_data, reports)
+        for data in results:
+            df = pd.concat([df, data], axis=0)
+    return df
+
+
+def extract_data(report):
+    data = pd.read_csv(report, sep=' ', engine='python')
+    data = data.drop(data.columns[0], axis=1)
+    data = data.drop(data.columns[0], axis=1)
+    size = data.shape[0]
+    data.insert(0, EPOCH, [int(os.path.dirname(report))]*size)
+    data.insert(1, TRAJ, [int(os.path.basename(report).split("_")[-1])]*size)
+    return data
+
+
+def init_df(fields):
+    df = pd.DataFrame({EPOCH: [], TRAJ: []})
+    for field in fields:
+        df[field] = []
+    return df
+
+
+def main():
+    pool = None
+    epochs = [folder for folder in glob.glob("./*/") if os.path.basename(os.path.normpath(folder)).isdigit()]
+    reports = gather_reports()
+    fields = retrieve_fields(reports[0])
+    df = init_df(fields)
+    df = fill_data(reports, df, pool)
+    df.to_csv("report_summary.csv", index=False, decimal=".", float_format='%.2f')
+
+
+if __name__ == "__main__":
+    arg_parse()
+    main()
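# Once simulationToCsv.py has written report_summary.csv, downstream analysis
# is plain pandas. A hedged example: the five snapshots with the lowest binding
# energy ("Binding Energy" is an assumption, it must match a column of your
# PELE report).
import pandas as pd

df = pd.read_csv("report_summary.csv")
cols = ["Epoch", "Traj", "numberOfAcceptedPeleSteps", "Binding Energy"]
print(df.nsmallest(5, "Binding Energy")[cols])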
i.e --sort min', default="max") + args = parser.parse_args() + + return args.crit, args.epoch, args.n, args.top, args.outputPath, args.outputFile, args.sort + + +def analyze_unbinding(crit1, epoch=".", top=None, n=1, outputPath=".", out_filename=None, sort="max"): + files_out, epochs, file_ids, step_indexes = bs.main(crit1, path=epoch, topology=top, n_structs=n, output=outputPath, sort_order=sort) + initial_filename = out_filename + for epoch, traj, snap in zip(epochs, file_ids, step_indexes): + if not initial_filename: + out_filename = "{}_{}_{}_pathway.pdb".format(epoch, traj, snap) + bt.main(int(traj), int(snap), epoch, outputPath, out_filename, top) + + +if __name__ == "__main__": + crit, epoch, n, top, output_path, output_name, sort = parse_args() + analyze_unbinding(crit, epoch=epoch, top=top, n=n, outputPath=output_path, out_filename=output_name, sort=sort)