Skip to content

Commit

Permalink
Bug fixes:
Browse files Browse the repository at this point in the history
Packaging function fixes: 1) zip option; 2) the code to detect d value
in MACS xls file.

Venn for DHS overlap will not be skipped incorrectly.

CEAS --name

Some "rep" need to be "_rep".

Add shell=True to subprocess.call() to allow shell redirection.

Option changes:

Package into 'sample#sample_id.tar.bz2' file.

Use python code to sort and extract top N summits for SeqPos.

info() function flush the output after each writing.

packaging results becomes the step 8.

Set top # of peaks (by peak height) to run SeqPos.
  • Loading branch information
taoliu committed Mar 8, 2011
1 parent 81b586e commit 071f413
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 32 deletions.
72 changes: 41 additions & 31 deletions Scripts/ChIP-Seq_Pipeline1.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python
# Time-stamp: <2011-03-08 12:20:57 Tao Liu>
# Time-stamp: <2011-03-08 16:33:11 Tao Liu>

"""Script Description: A demo ChIP-seq pipeline script. From reads
mapping to motif analysis. It will do 4 validity checks before running
Expand Down Expand Up @@ -342,7 +342,7 @@ def step1_bowtie (configs):
if configs["others.startstep"] <= 1 and 1 <= configs["others.endstep"]:
info("Step 1: BOWTIE...")
else:
info("Bowtie is skipped as requested.")
info("Step 1 Bowtie is skipped as requested.")
return False
if configs["data.raw_data_type"] == "seq":
pass
Expand Down Expand Up @@ -419,14 +419,14 @@ def step2_macs (configs):
if configs["others.startstep"] <= 2 and 2 <= configs["others.endstep"]:
info("Step 2: MACS...")
else:
info("MACS is skipped as requested.")
info("Step 2 MACS is skipped as requested.")
return False
if configs["data.raw_data_type"] == "alignment":
_step2_macs_alignment(configs)
elif configs["data.raw_data_type"] == "seq":
_step2_macs_seq(configs)
else:
info("Since raw data typs is not alignment or sequences, MACS is skipped.")
info("Since raw data typs is not alignment or sequences, step 2 MACS is skipped.")
return False

def _step2_macs_alignment (configs):
Expand Down Expand Up @@ -609,7 +609,7 @@ def step3_ceas (configs):
if configs["others.startstep"] <= 3 and 3 <= configs["others.endstep"]:
info("Step 3: CEAS...")
else:
info("CEAS is skipped as requested.")
info("Step 3 CEAS is skipped as requested.")
return False

# check the input
Expand Down Expand Up @@ -656,7 +656,7 @@ def step4_venn (configs):
if configs["others.startstep"] <= 4 and 4 <= configs["others.endstep"]:
info("Step 4: Venn diagram...")
else:
info("Venn diagram is skipped as requested.")
info("Step 4 Venn diagram is skipped as requested.")
return False

# check the input
Expand All @@ -680,19 +680,17 @@ def step4_venn (configs):
warn("Venn diagram can't process > 3 files!")
command_line = "touch "+configs["venn.replicates_output_png"]
run_cmd(command_line)
return True
elif configs["data.number_replicates"] == 1:
# no replicates, no Venn diagram
info("No replicates, so no Venn diagram for replicates")
command_line = "touch "+configs["venn.replicates_output_png"]
run_cmd(command_line)
return True

# run Venn diagram for replicates
command_line = configs["venn.venn_diagram_main"]+" -t Overlap_of_Replicates"+" "+" ".join(peak_bed_file_replicates)+" "+" ".join(map(lambda x:"-l replicate_"+str(x),xrange(1,configs["data.number_replicates"]+1)))
run_cmd(command_line)
command_line = "mv venn_diagram.png "+configs["venn.replicates_output_png"]
run_cmd(command_line)
else:
# run Venn diagram for replicates
command_line = configs["venn.venn_diagram_main"]+" -t Overlap_of_Replicates"+" "+" ".join(peak_bed_file_replicates)+" "+" ".join(map(lambda x:"-l replicate_"+str(x),xrange(1,configs["data.number_replicates"]+1)))
run_cmd(command_line)
command_line = "mv venn_diagram.png "+configs["venn.replicates_output_png"]
run_cmd(command_line)

# run Venn for DHS overlap
command_line = configs["venn.venn_diagram_main"]+" -t Overlap_with_DHS"+" -l Peaks "+peak_bed_file+" -l DHS "+DHS_bed_file
Expand All @@ -711,7 +709,7 @@ def step5_cor (configs):
if configs["others.startstep"] <= 5 and 5 <= configs["others.endstep"]:
info("Step 5: Correlation plot...")
else:
info("Correlation plot is skipped as requested.")
info("Step 5 Correlation plot is skipped as requested.")
return False

# check the input
Expand Down Expand Up @@ -772,7 +770,7 @@ def step6_conserv (configs):
if configs["others.startstep"] <= 6 and 6 <= configs["others.endstep"]:
info("Step 6: Conservation plot...")
else:
info("Conservation plot is skipped as requested.")
info("Step 6 Conservation plot is skipped as requested.")
return False

# check the input
Expand All @@ -798,7 +796,7 @@ def step7_seqpos (configs):
if configs["others.startstep"] <= 7 and 7 <= configs["others.endstep"]:
info("Step 7: Conservation plot...")
else:
info("SeqPos is skipped as requested.")
info("Step 7 SeqPos is skipped as requested.")
return False

# check the input
Expand All @@ -809,10 +807,18 @@ def step7_seqpos (configs):
sys.exit(1)

# generate top # of peaks
top_n = configs["seqpos.seqpos_top_peaks"]
top_n_summits = "top"+top_n+"_summits.bed"
command_line = "sort -n -r -k 5 "+peak_summits_file+" | head -"+top_n+" > "+top_n_summits
run_cmd(command_line)
psf_fhd = open(peak_summits_file)
p_list = []
for i in psf_fhd:
p_list.append( (i,float(i.split()[-1])) )
top_n = int(configs["seqpos.seqpos_top_peaks"])
top_n_summits = map(lambda x:x[0],sorted(p_list,key=lambda x:x[1],reverse=True)[:top_n])
top_n_summits_file = "top"+str(top_n)+"_summits.bed"
top_n_summits_fhd = open(top_n_summits_file,"w")
for i in top_n_summits:
top_n_summits_fhd.write(i)
top_n_summits_fhd.close()
info("Top %d summits are written to %s" % (top_n,top_n_summits_file))

# run SeqPos: use the current daisy version as standard, you may need to modify the options.
# options
Expand All @@ -830,23 +836,25 @@ def step7_seqpos (configs):
seqpos_motif_option = " -m transfac.xml,pbm.xml,jaspar.xml "
seqpos_filter_option = " -s "+configs["sample.species"]

command_line = configs["seqpos.seqpos_main"]+" -d "+seqpos_width_option+seqpos_pvalue_option+seqpos_motif_option+seqpos_filter_option+" "+top_n_summits+" "+configs["sample.assembly_name"]
command_line = configs["seqpos.seqpos_main"]+" -d "+seqpos_width_option+seqpos_pvalue_option+seqpos_motif_option+seqpos_filter_option+" "+top_n_summits_file+" "+configs["sample.assembly_name"]
run_cmd(command_line)
command_line = "zip "+configs["seqpos.output_zip"]+" result/"
command_line = "zip -r "+configs["seqpos.output_zip"]+" results/"
run_cmd(command_line)
return True

def step8_package_result ( configs ):
"""Package result and generate a summary file.
"""Package result and generate a summary file. -> a subfolder
named as sample#sample_id and zip it as sample#sample_id.tar.bz2
"""
if configs["others.startstep"] <= 8 and 8 <= configs["others.endstep"]:
info("Step 8: Packaging results...")
else:
info("Packaging is skipped as requested.")
info("Step 8 Packaging is skipped as requested.")
return False

command_line = "mkdir "+configs["sample.sample_id"]
subfolder = "sample"+configs["sample.sample_id"]
command_line = "mkdir "+subfolder
run_cmd(command_line)

all_files = []
Expand Down Expand Up @@ -881,16 +889,18 @@ def step8_package_result ( configs ):
f = open(configs["macs.output_xls"])
n = 0
for l in f:
if l.startswith("# d= "):
l.rstrip()
if l.startswith("# d = "):
n = int(l[6:])
break
logfhd.write("Value d from MACS is %d\n----\n" % n)
f.close()

command_line = "mv "+" ".join(all_files)+" "+configs["sample.sample_id"]+"/"
command_line = "mv "+" ".join(all_files)+" "+subfolder+"/"
run_cmd(command_line)
command_line = "cp log "+configs["sample.sample_id"]+"/"
command_line = "cp log "+subfolder+"/"
run_cmd(command_line)
command_line = "tar -jcf "+configs["sample.sample_id"]+".tar.bz2 "+configs["sample.sample_id"]+"/"
command_line = "tar -jcf "+subfolder+".tar.bz2 "+subfolder+"/"
run_cmd(command_line)


Expand Down Expand Up @@ -925,7 +935,7 @@ def main():
if stopatstep:
configs["others.endstep"] = startstep
else:
configs["others.endstep"] = 7
configs["others.endstep"] = 8

configs["others.cwd"] = os.getcwd()

Expand Down
2 changes: 1 addition & 1 deletion test/Scripts/test1_ChIP-Seq_Pipeline1.conf
Original file line number Diff line number Diff line change
Expand Up @@ -181,4 +181,4 @@ SEQPOS_PVALUE_CUTOFF = 0.001
# SeqPos know motif DBs. The motif DB you want to search against. It
# includes: transfac.xml, pbm.xml, jaspar.xml, hpdi.xml,
# y1h.xml. Please use comma to separate multiple xml files.
SEQPOS_MOTIF_DB_SELECTION = transfac.xml,pbm.xml,jaspar.xml
SEQPOS_MOTIF_DB_SELECTION = transfac.xml,jaspar.xml

0 comments on commit 071f413

Please sign in to comment.