Allows user supplied phylogenic tree.

WilsonSayresLab · May 22, 2017 · 0fbdafd · 0fbdafd
1 parent ee6b8c9
commit 0fbdafd
Show file tree

Hide file tree

Showing 6 changed files with 112 additions and 82 deletions.
diff --git a/AlignmentProcessor.py b/AlignmentProcessor.py
@@ -135,15 +135,17 @@ def calculateKaKs(outdir, method, cpu):
 	if ck.returncode == 0:
 		return True
 
-def runcodeml(cpu, outdir, forward, cleanup):
+def runcodeml(cpu, outdir, forward, ntree, cleanup):
 	'''Runs codeml on a directory.'''
 	# Build commands and add options if necessary
 	cmd = ("python bin/04_CallCodeML.py -t " + str(cpu) + " -i " + outdir + 
 			"03_phylipFiles" + " -o " + outdir + "04_CodemlOutput")
-	if cleanup == False:
+	if cleanup == True:
 		cmd += " --cleanUp"
 	if forward:
 		cmd += " -f " + forward
+	if ntree:
+		cmd += " -n " + ntree
 	cm = Popen(split(cmd))
 	cm.wait()
 	if cm.returncode == 0:
@@ -156,7 +158,7 @@ def main():
 	# Set arguments
 	parser = argparse.ArgumentParser(description="AlignmentProcessor will run \
 the subsituion rate pipeline to produce trimmed axt or phylip files for use \
-with KaKs_calculator or PhyMl.\nAlignmentProcessor1.2 Copyright 2016 by \
+with KaKs_calculator or PhyMl.\nAlignmentProcessor1.4 Copyright 2016 by \
 Shawn Rupp\nThis program comes with ABSOLUTELY NO WARRANTY\nThis is free \
 software, and you are welcome to redistribute it under certain conditions\n")
 	parser.add_argument("-i", help="Path to input file.")
@@ -182,7 +184,8 @@ def main():
 	parser.add_argument("-t", type=int, default=1, help="Number of threads.")
 	parser.add_argument("-f", default="", 
 help="Forward species (name must be the same as it appears in input files.")
-	parser.add_argument("--cleanUp", action="store_false", 
+	parser.add_argument("-n", help = "Path to optional Newick tree.")
+	parser.add_argument("--cleanUp", action="store_true", 
 help="Remove temporary files (it may be useful to retain phylogenic trees \
 for future use).")
 	# Parse arguments and assign to variables
@@ -201,6 +204,7 @@ def main():
 	codeml = args.codeml
 	cpu = args.t
 	forward = args.f
+	ntree = args.n
 	cleanup = args.cleanUp
 	# Check inout commands prior to running:
 	if not ref:
@@ -231,7 +235,7 @@ def main():
 				done = calculateKaKs(outdir, method, cpu)
 			# Run codeml
 			elif codeml == True:
-				done = runcodeml(cpu, outdir, forward, cleanup)
+				done = runcodeml(cpu, outdir, forward, ntree, cleanup)
 			else:
 				# Exit if neither program was called
 				done = True

diff --git a/AlignmentProcessorReadMe.pdf b/AlignmentProcessorReadMe.pdf
diff --git a/bin/01_SplitFasta.py b/bin/01_SplitFasta.py
@@ -24,40 +24,51 @@ def splitFasta(infile, outdir):
 	# Parse input fasta
 	with open(infile, "r") as fasta:
 		newid = True
+		prev = True
 		seq = ""
 		n = 0
 		for line in fasta:
-			if line != "\n":
+			if line.strip():
 				if line[0] == ">":
-					line, geneid = convertHeader(line)
+					prev = True
+					build, geneid = convertHeader(line)
 					# Concatenate lines for all species for each gene
-					seq += str(line)
-					# Determine number of sequences and species names
-					n += 1
 					if newid == True:
 						# Set reference species ID as file name
 						filename = geneid
 						newid = False
 				else:
 					# Concatenate remaining lines
-					seq += str(line)
-			elif line == "\n" and newid == False:
+					line = line.upper()
+					if ("A" not in line or "C" not in line or "G" not in line
+						 or "T" not in line):
+						pass
+					else:
+						# Save gene if it contains nucleotides
+						if prev == True:
+							n += 1
+							seq += build
+							prev == False
+						seq += str(line)
+			elif not line.strip() and newid == False:
 				# Use empty lines to determine where genes end
-				if n >= 2:
+				if n >= 2 and seq.count("\n") > 3:
 					# Print gene sequences to file if there are at least two
 					# species and reset for next gene
 					outfile = (outdir + filename + "." + str(n) + ".fa")
 					with open(outfile, "w") as output:
 						output.write(seq)
 					newid = True
+					prev = False
 					seq = ""
 					n = 0
 					passed += 1
-				elif n < 2:
+				else:
 					# Skip genes with only one sequence and save ID in log
 					with open(log, "a") as logfile:
 						logfile.write(geneid + "\n")
 					excluded += 1
+					newid = True
 	with open(log, "a") as logfile:
 		logfile.write(("\nTotal transcripts written to file: {}\n").format(passed))
 		logfile.write(("Total transcripts with only one sequence: {}").format(excluded))
@@ -68,13 +79,24 @@ def convertHeader(line):
 		# Extract relevant data from UCSC header
 		genebuild = line[1:].split()[0]
 		genebuild = genebuild.split("_")
-		build = ">" + str(genebuild[1]) + "\n"
-		geneid = str(genebuild[0].split(".")[0])
+		if line[1] == "E":
+			# Ensembl IDs
+			build = ">" + str(genebuild[1]) + "\n"
+			geneid = str(genebuild[0].split(".")[0])
+		elif line[1] == "N":
+			# NCBI IDs
+			build = ">" + str(genebuild[2]) + "\n"
+			geneid = str(genebuild[0]) + "_" + str(genebuild[1])
 	else:
 		# Extract build and geneid 
 		build = ">" + line.split(".")[0][1:].rstrip() + "\n"
 		geneid = str(line.split(".")[1]).rstrip()
-	return build, geneid
+	if geneid and build:
+		return build, geneid
+	else:
+		print("Please use a fasta file with Ensembl, NCBI, or Galaxy Stitch \
+Gene Blocks IDs.")
+		quit()
 
 def main():
 	parser = argparse.ArgumentParser(description="This will take the \

diff --git a/bin/02_FilterFasta.py b/bin/02_FilterFasta.py
@@ -82,7 +82,7 @@ def seqDict(fasta, n):
 				species = line[1:].strip()
 			if line[0] != ">":
 				codons = []
-				seq = line.strip().upper()
+				seq = line.strip()
 				for i in range(0, len(seq), 3):
 					codons.append(seq[i:i +3])
 					i += 3
@@ -130,7 +130,6 @@ def fixCodons(newseq):
 def removeStops(n, newseq, retainStops, geneid, log):
 	'''This will replace internal stop codons with gaps, create a list
 of genes with internal stop codons, and remove those sequences.'''
-	count = n
 	remove = []
 	for species in newseq:
 		# Replace terminal stop codons so the program can identify
@@ -142,7 +141,7 @@ def removeStops(n, newseq, retainStops, geneid, log):
 				newseq[species][-1] = "---"
 		except (IndexError, ValueError) as e:
 			# Remove empty sequences
-			count -= 1
+			n -= 1
 			del newseq[species] 
 			with open(log, "a") as logfile:
 				logfile.write(geneid + "\t" + species + 
@@ -163,18 +162,18 @@ def removeStops(n, newseq, retainStops, geneid, log):
 	if retainStops == False:
 		# Remove key after iterating to avoid using multiple breaks
 		for species in remove:
-			count -= 1
+			n -= 1
 			del newseq[species]
-	if count >= 2:
-		return True, newseq, count
-	elif count < 2:
+	if n >= 2:
+		return True, newseq, n
+	elif n < 2:
 		# Skip files with only one sequence left
 		pass
 
 def countBases(n, newseq, percent, geneid, log):
 	'''Counts the number of nucleotides and only writes the sequence to an
 output file if they compose greater than the cutoff threshold of the sequence'''
-	count = n
+	failed = []
 	for species in newseq:
 		seq = ""
 		for i in newseq[species]:
@@ -187,21 +186,20 @@ def countBases(n, newseq, percent, geneid, log):
 		aligned = acount + tcount + ccount + gcount
 		try:
 			# Determine whether or not the sequence passes the treshold
-			if aligned/len(seq) >= percent:
-				pass
-			else:
+			if aligned/len(seq) < percent:
 				# Remove sequences that do not meet the threshold
-				del newseq[species]
+				failed.append(species)
 				with open(log, "a") as logfile:
 					logfile.write(geneid + "\t" + species + 
 								"\tLow Nucleotide Count\n")
-				count -= 1
-				break
 		except ZeroDivisionError:
-			del newseq[species]
-	if count >= 2:
-		return True, newseq, count
-	elif count < 2:
+			failed.append(species)
+	for i in failed:
+		del newseq[i]
+		n -= 1
+	if n >= 2:
+		return True, newseq, n
+	elif n < 2:
 		# Do not proceed if dict does not have at least two sequences
 		pass 
 

diff --git a/bin/04_CallCodeML.py b/bin/04_CallCodeML.py
@@ -73,9 +73,9 @@ def main():
 	ap = os.getcwd() + "/"
 	if " " in ap:
 		# Change to warning ########################################################
-		print("\tError: AlignmentProcessor will not run properly if there \
+		print("\tWARNING: AlignmentProcessor will not run properly if there \
 is a space in its PATH name.")
-		quit()
+		ap = ap.replace(" (ASU)", "")
 	run = False
 	# Parse command
 	parser = argparse.ArgumentParser(description="Runs CodeML on all files \
@@ -85,6 +85,7 @@ def main():
 	parser.add_argument("-t", type=int, default=1, help="Number of threads.")
 	parser.add_argument("-f", default="", 
 help="Forward species (name must be the same as it appears in input files.")
+	parser.add_argument("-n", help = "Path to optional Newick tree.")
 	parser.add_argument("--cleanUp", action="store_true", 
 help="Remove temporary files (it may be useful to retain phylogenic trees \
 for future use).")
@@ -100,6 +101,7 @@ def main():
 	if cpu > MAXCPU:
 		cpu = MAXCPU
 	forward = args.f
+	ntree = args.n
 	# Reads in required data
 	finished, completed = outputFiles(outdir)
 	ctl, multiple = controlFiles(indir, outdir, forward, cpu)
@@ -109,7 +111,7 @@ def main():
 		genes = glob(indir + "*.phylip")
 		l = int(len(genes))
 		func = partial(parallelize, ap, outdir, finished, completed, multiple,
-						cpu, ctl, forward)
+						cpu, ctl, forward, ntree)
 		print(("\tRunning CodeML on {0!s} genes with {1!s} threads...."
 				).format(l, cpu))
 		pool = Pool(processes = cpu)

diff --git a/bin/parallelCodeML.py b/bin/parallelCodeML.py
@@ -9,6 +9,7 @@
 from glob import glob
 from subprocess import Popen
 from shlex import split
+from shutil import rmtree
 import math
 import shutil
 import os
@@ -21,40 +22,39 @@ def makeTree(ap, gene, wd, treefile, forward):
 	# Call PhyML to make gene tree
 	phy = Popen(split(ap + "PhyML/PhyML -q -i " + gene), stdout = DEVNULL)
 	phy.wait()
-	if phy.returncode == 0:
-		# Move PhyML output to temp directory
-		output = glob(gene + "_phyml_*") 
-		for i in output:
-			shutil.copy(i, wd)
-			os.remove(i)
-		# Read in PhyML tree
-		with open(treefile, "r") as genetree:
-			try:
-				tree = genetree.readlines()[0]
-			except IndexError:
-				pass
-		# Remove branch lables introduced by PhyML
-		tree = re.sub(r"\d+\.\d+:", ":", tree)
-		# Add forward node to tree if specified 
-		if forward:
-			if forward in tree:
-				# Determine location and length of species name
-				i = tree.index(forward) + len(forward)
-				if ":" in tree:
-					# Find end of branch length
-					comma = tree.find(",", i)
-					paren = tree.find(")", i)
-					i = min([comma, paren])
-				# Insert space and node symbol after species name
-				tree = (tree[:i] + " #1" + tree[i:])
-		elif forward not in tree:
+	# Move PhyML output to temp directory
+	output = glob(gene + "_phyml_*") 
+	for i in output:
+		shutil.copy(i, wd)
+		os.remove(i)
+	# Read in PhyML tree
+	with open(treefile, "r") as genetree:
+		try:
+			tree = genetree.readlines()[0]
+		except IndexError:
 			pass
-		with open(treefile, "w") as outtree:
-			# Overwrite treefile
-			string = ""
-			for i in tree:
-				string += i
-			outtree.write(string)
+	# Remove branch lables introduced by PhyML
+	tree = re.sub(r"\d+\.\d+:", ":", tree)
+	# Add forward node to tree if specified 
+	if forward:
+		if forward in tree:
+			# Determine location and length of species name
+			i = tree.index(forward) + len(forward)
+			if ":" in tree:
+				# Find end of branch length
+				comma = tree.find(",", i)
+				paren = tree.find(")", i)
+				i = min([comma, paren])
+			# Insert space and node symbol after species name
+			tree = (tree[:i] + " #1" + tree[i:])
+	elif forward not in tree:
+		pass
+	with open(treefile, "w") as outtree:
+		# Overwrite treefile
+		string = ""
+		for i in tree:
+			string += i
+		outtree.write(string)
 
 def makeCtl(gene, outfile, tempctl, treefile, ctl):
 	'''Creates unique control file'''
@@ -72,7 +72,7 @@ def makeCtl(gene, outfile, tempctl, treefile, ctl):
 #-----------------------------------------------------------------------------
 
 def parallelize(ap, outdir, finished, completed, multiple, cpu, ctl, 
-				forward, gene):
+				forward, treefile, gene):
 	filename = gene.split("/")[-1]
 	geneid = filename.split(".")[0]
 	if (geneid + "\n") in completed:
@@ -90,27 +90,31 @@ def parallelize(ap, outdir, finished, completed, multiple, cpu, ctl,
 		if multiple == True:
 			if filename.split(".")[1] == "2":
 				# Skip pairwise genes and remove directory
-				os.rmdir(wd)
+				rmtree(wd)
 				pass
 			else:
-				treefile = wd + filename + "_phyml_tree.txt"
-				# Make control file and run PhyML
+				if not treefile:
+					# Run Phyml
+					treefile = wd + filename + "_phyml_tree.txt"
+					makeTree(ap, gene, wd, treefile, forward)
+				# Make control file
 				makeCtl(gene, outfile, tempctl, treefile, ctl)
-				makeTree(ap, gene, wd, treefile, forward)
 				os.chdir(wd)
 				# Call CodeML
-				cm = Popen(split(ap + "paml/bin/codeml " + tempctl), 
-								stdout = DEVNULL)
-				cm.wait()
+				with open(wd + "codemlLog.txt", "w") as tmpout:
+					cm = Popen(split(ap + "paml/bin/codeml " + tempctl),
+									shell = True, stdout = tmpout)
+					cm.wait()
 		elif multiple == False:
 			# Make control file
 			treefile = wd + filename + ""
 			makeCtl(gene, outfile, tempctl, treefile, ctl)
 			# Call CodeML for all files
 			os.chdir(wd)
-			cm = Popen(split(ap + "paml/bin/codeml " + tempctl),
-							stdout = DEVNULL)
-			cm.wait()
+			with open(wd + "codemlLog.txt", "w") as tmpout:
+				cm = Popen(split(ap + "paml/bin/codeml " + tempctl),
+								shell = True, stdout = tmpout)
+				cm.wait()
 		with open(finished, "a") as fin:
 			# Append gene id to list when done
 			fin.write(geneid + "\n")