Skip to content

Commit

Permalink
Allows user supplied phylogenic tree.
Browse files Browse the repository at this point in the history
  • Loading branch information
icwells committed May 22, 2017
1 parent ee6b8c9 commit 0fbdafd
Show file tree
Hide file tree
Showing 6 changed files with 112 additions and 82 deletions.
14 changes: 9 additions & 5 deletions AlignmentProcessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,15 +135,17 @@ def calculateKaKs(outdir, method, cpu):
if ck.returncode == 0:
return True

def runcodeml(cpu, outdir, forward, cleanup):
def runcodeml(cpu, outdir, forward, ntree, cleanup):
'''Runs codeml on a directory.'''
# Build commands and add options if necessary
cmd = ("python bin/04_CallCodeML.py -t " + str(cpu) + " -i " + outdir +
"03_phylipFiles" + " -o " + outdir + "04_CodemlOutput")
if cleanup == False:
if cleanup == True:
cmd += " --cleanUp"
if forward:
cmd += " -f " + forward
if ntree:
cmd += " -n " + ntree
cm = Popen(split(cmd))
cm.wait()
if cm.returncode == 0:
Expand All @@ -156,7 +158,7 @@ def main():
# Set arguments
parser = argparse.ArgumentParser(description="AlignmentProcessor will run \
the subsituion rate pipeline to produce trimmed axt or phylip files for use \
with KaKs_calculator or PhyMl.\nAlignmentProcessor1.2 Copyright 2016 by \
with KaKs_calculator or PhyMl.\nAlignmentProcessor1.4 Copyright 2016 by \
Shawn Rupp\nThis program comes with ABSOLUTELY NO WARRANTY\nThis is free \
software, and you are welcome to redistribute it under certain conditions\n")
parser.add_argument("-i", help="Path to input file.")
Expand All @@ -182,7 +184,8 @@ def main():
parser.add_argument("-t", type=int, default=1, help="Number of threads.")
parser.add_argument("-f", default="",
help="Forward species (name must be the same as it appears in input files.")
parser.add_argument("--cleanUp", action="store_false",
parser.add_argument("-n", help = "Path to optional Newick tree.")
parser.add_argument("--cleanUp", action="store_true",
help="Remove temporary files (it may be useful to retain phylogenic trees \
for future use).")
# Parse arguments and assign to variables
Expand All @@ -201,6 +204,7 @@ def main():
codeml = args.codeml
cpu = args.t
forward = args.f
ntree = args.n
cleanup = args.cleanUp
# Check inout commands prior to running:
if not ref:
Expand Down Expand Up @@ -231,7 +235,7 @@ def main():
done = calculateKaKs(outdir, method, cpu)
# Run codeml
elif codeml == True:
done = runcodeml(cpu, outdir, forward, cleanup)
done = runcodeml(cpu, outdir, forward, ntree, cleanup)
else:
# Exit if neither program was called
done = True
Expand Down
Binary file modified AlignmentProcessorReadMe.pdf
Binary file not shown.
46 changes: 34 additions & 12 deletions bin/01_SplitFasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,40 +24,51 @@ def splitFasta(infile, outdir):
# Parse input fasta
with open(infile, "r") as fasta:
newid = True
prev = True
seq = ""
n = 0
for line in fasta:
if line != "\n":
if line.strip():
if line[0] == ">":
line, geneid = convertHeader(line)
prev = True
build, geneid = convertHeader(line)
# Concatenate lines for all species for each gene
seq += str(line)
# Determine number of sequences and species names
n += 1
if newid == True:
# Set reference species ID as file name
filename = geneid
newid = False
else:
# Concatenate remaining lines
seq += str(line)
elif line == "\n" and newid == False:
line = line.upper()
if ("A" not in line or "C" not in line or "G" not in line
or "T" not in line):
pass
else:
# Save gene if it contains nucleotides
if prev == True:
n += 1
seq += build
prev == False
seq += str(line)
elif not line.strip() and newid == False:
# Use empty lines to determine where genes end
if n >= 2:
if n >= 2 and seq.count("\n") > 3:
# Print gene sequences to file if there are at least two
# species and reset for next gene
outfile = (outdir + filename + "." + str(n) + ".fa")
with open(outfile, "w") as output:
output.write(seq)
newid = True
prev = False
seq = ""
n = 0
passed += 1
elif n < 2:
else:
# Skip genes with only one sequence and save ID in log
with open(log, "a") as logfile:
logfile.write(geneid + "\n")
excluded += 1
newid = True
with open(log, "a") as logfile:
logfile.write(("\nTotal transcripts written to file: {}\n").format(passed))
logfile.write(("Total transcripts with only one sequence: {}").format(excluded))
Expand All @@ -68,13 +79,24 @@ def convertHeader(line):
# Extract relevant data from UCSC header
genebuild = line[1:].split()[0]
genebuild = genebuild.split("_")
build = ">" + str(genebuild[1]) + "\n"
geneid = str(genebuild[0].split(".")[0])
if line[1] == "E":
# Ensembl IDs
build = ">" + str(genebuild[1]) + "\n"
geneid = str(genebuild[0].split(".")[0])
elif line[1] == "N":
# NCBI IDs
build = ">" + str(genebuild[2]) + "\n"
geneid = str(genebuild[0]) + "_" + str(genebuild[1])
else:
# Extract build and geneid
build = ">" + line.split(".")[0][1:].rstrip() + "\n"
geneid = str(line.split(".")[1]).rstrip()
return build, geneid
if geneid and build:
return build, geneid
else:
print("Please use a fasta file with Ensembl, NCBI, or Galaxy Stitch \
Gene Blocks IDs.")
quit()

def main():
parser = argparse.ArgumentParser(description="This will take the \
Expand Down
34 changes: 16 additions & 18 deletions bin/02_FilterFasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def seqDict(fasta, n):
species = line[1:].strip()
if line[0] != ">":
codons = []
seq = line.strip().upper()
seq = line.strip()
for i in range(0, len(seq), 3):
codons.append(seq[i:i +3])
i += 3
Expand Down Expand Up @@ -130,7 +130,6 @@ def fixCodons(newseq):
def removeStops(n, newseq, retainStops, geneid, log):
'''This will replace internal stop codons with gaps, create a list
of genes with internal stop codons, and remove those sequences.'''
count = n
remove = []
for species in newseq:
# Replace terminal stop codons so the program can identify
Expand All @@ -142,7 +141,7 @@ def removeStops(n, newseq, retainStops, geneid, log):
newseq[species][-1] = "---"
except (IndexError, ValueError) as e:
# Remove empty sequences
count -= 1
n -= 1
del newseq[species]
with open(log, "a") as logfile:
logfile.write(geneid + "\t" + species +
Expand All @@ -163,18 +162,18 @@ def removeStops(n, newseq, retainStops, geneid, log):
if retainStops == False:
# Remove key after iterating to avoid using multiple breaks
for species in remove:
count -= 1
n -= 1
del newseq[species]
if count >= 2:
return True, newseq, count
elif count < 2:
if n >= 2:
return True, newseq, n
elif n < 2:
# Skip files with only one sequence left
pass

def countBases(n, newseq, percent, geneid, log):
'''Counts the number of nucleotides and only writes the sequence to an
output file if they compose greater than the cutoff threshold of the sequence'''
count = n
failed = []
for species in newseq:
seq = ""
for i in newseq[species]:
Expand All @@ -187,21 +186,20 @@ def countBases(n, newseq, percent, geneid, log):
aligned = acount + tcount + ccount + gcount
try:
# Determine whether or not the sequence passes the treshold
if aligned/len(seq) >= percent:
pass
else:
if aligned/len(seq) < percent:
# Remove sequences that do not meet the threshold
del newseq[species]
failed.append(species)
with open(log, "a") as logfile:
logfile.write(geneid + "\t" + species +
"\tLow Nucleotide Count\n")
count -= 1
break
except ZeroDivisionError:
del newseq[species]
if count >= 2:
return True, newseq, count
elif count < 2:
failed.append(species)
for i in failed:
del newseq[i]
n -= 1
if n >= 2:
return True, newseq, n
elif n < 2:
# Do not proceed if dict does not have at least two sequences
pass

Expand Down
8 changes: 5 additions & 3 deletions bin/04_CallCodeML.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,9 @@ def main():
ap = os.getcwd() + "/"
if " " in ap:
# Change to warning ########################################################
print("\tError: AlignmentProcessor will not run properly if there \
print("\tWARNING: AlignmentProcessor will not run properly if there \
is a space in its PATH name.")
quit()
ap = ap.replace(" (ASU)", "")
run = False
# Parse command
parser = argparse.ArgumentParser(description="Runs CodeML on all files \
Expand All @@ -85,6 +85,7 @@ def main():
parser.add_argument("-t", type=int, default=1, help="Number of threads.")
parser.add_argument("-f", default="",
help="Forward species (name must be the same as it appears in input files.")
parser.add_argument("-n", help = "Path to optional Newick tree.")
parser.add_argument("--cleanUp", action="store_true",
help="Remove temporary files (it may be useful to retain phylogenic trees \
for future use).")
Expand All @@ -100,6 +101,7 @@ def main():
if cpu > MAXCPU:
cpu = MAXCPU
forward = args.f
ntree = args.n
# Reads in required data
finished, completed = outputFiles(outdir)
ctl, multiple = controlFiles(indir, outdir, forward, cpu)
Expand All @@ -109,7 +111,7 @@ def main():
genes = glob(indir + "*.phylip")
l = int(len(genes))
func = partial(parallelize, ap, outdir, finished, completed, multiple,
cpu, ctl, forward)
cpu, ctl, forward, ntree)
print(("\tRunning CodeML on {0!s} genes with {1!s} threads...."
).format(l, cpu))
pool = Pool(processes = cpu)
Expand Down
92 changes: 48 additions & 44 deletions bin/parallelCodeML.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from glob import glob
from subprocess import Popen
from shlex import split
from shutil import rmtree
import math
import shutil
import os
Expand All @@ -21,40 +22,39 @@ def makeTree(ap, gene, wd, treefile, forward):
# Call PhyML to make gene tree
phy = Popen(split(ap + "PhyML/PhyML -q -i " + gene), stdout = DEVNULL)
phy.wait()
if phy.returncode == 0:
# Move PhyML output to temp directory
output = glob(gene + "_phyml_*")
for i in output:
shutil.copy(i, wd)
os.remove(i)
# Read in PhyML tree
with open(treefile, "r") as genetree:
try:
tree = genetree.readlines()[0]
except IndexError:
pass
# Remove branch lables introduced by PhyML
tree = re.sub(r"\d+\.\d+:", ":", tree)
# Add forward node to tree if specified
if forward:
if forward in tree:
# Determine location and length of species name
i = tree.index(forward) + len(forward)
if ":" in tree:
# Find end of branch length
comma = tree.find(",", i)
paren = tree.find(")", i)
i = min([comma, paren])
# Insert space and node symbol after species name
tree = (tree[:i] + " #1" + tree[i:])
elif forward not in tree:
# Move PhyML output to temp directory
output = glob(gene + "_phyml_*")
for i in output:
shutil.copy(i, wd)
os.remove(i)
# Read in PhyML tree
with open(treefile, "r") as genetree:
try:
tree = genetree.readlines()[0]
except IndexError:
pass
with open(treefile, "w") as outtree:
# Overwrite treefile
string = ""
for i in tree:
string += i
outtree.write(string)
# Remove branch lables introduced by PhyML
tree = re.sub(r"\d+\.\d+:", ":", tree)
# Add forward node to tree if specified
if forward:
if forward in tree:
# Determine location and length of species name
i = tree.index(forward) + len(forward)
if ":" in tree:
# Find end of branch length
comma = tree.find(",", i)
paren = tree.find(")", i)
i = min([comma, paren])
# Insert space and node symbol after species name
tree = (tree[:i] + " #1" + tree[i:])
elif forward not in tree:
pass
with open(treefile, "w") as outtree:
# Overwrite treefile
string = ""
for i in tree:
string += i
outtree.write(string)

def makeCtl(gene, outfile, tempctl, treefile, ctl):
'''Creates unique control file'''
Expand All @@ -72,7 +72,7 @@ def makeCtl(gene, outfile, tempctl, treefile, ctl):
#-----------------------------------------------------------------------------

def parallelize(ap, outdir, finished, completed, multiple, cpu, ctl,
forward, gene):
forward, treefile, gene):
filename = gene.split("/")[-1]
geneid = filename.split(".")[0]
if (geneid + "\n") in completed:
Expand All @@ -90,27 +90,31 @@ def parallelize(ap, outdir, finished, completed, multiple, cpu, ctl,
if multiple == True:
if filename.split(".")[1] == "2":
# Skip pairwise genes and remove directory
os.rmdir(wd)
rmtree(wd)
pass
else:
treefile = wd + filename + "_phyml_tree.txt"
# Make control file and run PhyML
if not treefile:
# Run Phyml
treefile = wd + filename + "_phyml_tree.txt"
makeTree(ap, gene, wd, treefile, forward)
# Make control file
makeCtl(gene, outfile, tempctl, treefile, ctl)
makeTree(ap, gene, wd, treefile, forward)
os.chdir(wd)
# Call CodeML
cm = Popen(split(ap + "paml/bin/codeml " + tempctl),
stdout = DEVNULL)
cm.wait()
with open(wd + "codemlLog.txt", "w") as tmpout:
cm = Popen(split(ap + "paml/bin/codeml " + tempctl),
shell = True, stdout = tmpout)
cm.wait()
elif multiple == False:
# Make control file
treefile = wd + filename + ""
makeCtl(gene, outfile, tempctl, treefile, ctl)
# Call CodeML for all files
os.chdir(wd)
cm = Popen(split(ap + "paml/bin/codeml " + tempctl),
stdout = DEVNULL)
cm.wait()
with open(wd + "codemlLog.txt", "w") as tmpout:
cm = Popen(split(ap + "paml/bin/codeml " + tempctl),
shell = True, stdout = tmpout)
cm.wait()
with open(finished, "a") as fin:
# Append gene id to list when done
fin.write(geneid + "\n")

0 comments on commit 0fbdafd

Please sign in to comment.