From 071f41382838ce544150f386ed592ffbfc2b737b Mon Sep 17 00:00:00 2001
From: Tao Liu <vladimir.liu@gmail.com>
Date: Tue, 8 Mar 2011 16:36:33 -0500
Subject: [PATCH] Bug fixes:

Packaging function fixes: 1) zip option; 2) the code to detect d value
in MACS xls file.

Venn for DHS overlap will not be skipped incorrectly.

CEAS --name

Some "rep" need to be "_rep".

Add shell=True to subprocess.call() to allow shell redirection.

Option changes:

Package into 'sample#sample_id.tar.bz2' file.

Use python code to sort and extract top N summits for SeqPos.

info() function flush the output after each writing.

packaging results becomes the step 8.

Set top # of peaks (by peak height) to run SeqPos.
---
 Scripts/ChIP-Seq_Pipeline1.py              | 72 ++++++++++++----------
 test/Scripts/test1_ChIP-Seq_Pipeline1.conf |  2 +-
 2 files changed, 42 insertions(+), 32 deletions(-)

diff --git a/Scripts/ChIP-Seq_Pipeline1.py b/Scripts/ChIP-Seq_Pipeline1.py
index e2040b3..6be0ec1 100644
--- a/Scripts/ChIP-Seq_Pipeline1.py
+++ b/Scripts/ChIP-Seq_Pipeline1.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Time-stamp: <2011-03-08 12:20:57 Tao Liu>
+# Time-stamp: <2011-03-08 16:33:11 Tao Liu>
 
 """Script Description: A demo ChIP-seq pipeline script. From reads
 mapping to motif analysis. It will do 4 validity checks before running
@@ -342,7 +342,7 @@ def step1_bowtie (configs):
     if configs["others.startstep"] <= 1 and 1 <= configs["others.endstep"]:
         info("Step 1: BOWTIE...")
     else:
-        info("Bowtie is skipped as requested.")
+        info("Step 1 Bowtie is skipped as requested.")
         return False
     if configs["data.raw_data_type"] == "seq":
         pass
@@ -419,14 +419,14 @@ def step2_macs (configs):
     if configs["others.startstep"] <= 2 and 2 <= configs["others.endstep"]:
         info("Step 2: MACS...")
     else:
-        info("MACS is skipped as requested.")
+        info("Step 2 MACS is skipped as requested.")
         return False
     if configs["data.raw_data_type"] == "alignment":
         _step2_macs_alignment(configs)
     elif configs["data.raw_data_type"] == "seq":
         _step2_macs_seq(configs)
     else:
-        info("Since raw data typs is not alignment or sequences, MACS is skipped.")
+        info("Since raw data typs is not alignment or sequences, step 2 MACS is skipped.")
         return False
 
 def _step2_macs_alignment (configs):
@@ -609,7 +609,7 @@ def step3_ceas (configs):
     if configs["others.startstep"] <= 3 and 3 <= configs["others.endstep"]:
         info("Step 3: CEAS...")
     else:
-        info("CEAS is skipped as requested.")
+        info("Step 3 CEAS is skipped as requested.")
         return False
 
     # check the input
@@ -656,7 +656,7 @@ def step4_venn (configs):
     if configs["others.startstep"] <= 4 and 4 <= configs["others.endstep"]:
         info("Step 4: Venn diagram...")
     else:
-        info("Venn diagram is skipped as requested.")
+        info("Step 4 Venn diagram is skipped as requested.")
         return False
 
     # check the input
@@ -680,19 +680,17 @@ def step4_venn (configs):
         warn("Venn diagram can't process > 3 files!")
         command_line = "touch "+configs["venn.replicates_output_png"]
         run_cmd(command_line)
-        return True
     elif configs["data.number_replicates"] == 1:
         # no replicates, no Venn diagram
         info("No replicates, so no Venn diagram for replicates")
         command_line = "touch "+configs["venn.replicates_output_png"]
         run_cmd(command_line)
-        return True
-        
-    # run Venn diagram for replicates
-    command_line = configs["venn.venn_diagram_main"]+" -t Overlap_of_Replicates"+" "+" ".join(peak_bed_file_replicates)+" "+" ".join(map(lambda x:"-l replicate_"+str(x),xrange(1,configs["data.number_replicates"]+1)))
-    run_cmd(command_line)
-    command_line = "mv venn_diagram.png "+configs["venn.replicates_output_png"]
-    run_cmd(command_line)
+    else:
+        # run Venn diagram for replicates
+        command_line = configs["venn.venn_diagram_main"]+" -t Overlap_of_Replicates"+" "+" ".join(peak_bed_file_replicates)+" "+" ".join(map(lambda x:"-l replicate_"+str(x),xrange(1,configs["data.number_replicates"]+1)))
+        run_cmd(command_line)
+        command_line = "mv venn_diagram.png "+configs["venn.replicates_output_png"]
+        run_cmd(command_line)
 
     # run Venn for DHS overlap
     command_line = configs["venn.venn_diagram_main"]+" -t Overlap_with_DHS"+" -l Peaks "+peak_bed_file+" -l DHS "+DHS_bed_file
@@ -711,7 +709,7 @@ def step5_cor (configs):
     if configs["others.startstep"] <= 5 and 5 <= configs["others.endstep"]:
         info("Step 5: Correlation plot...")
     else:
-        info("Correlation plot is skipped as requested.")
+        info("Step 5 Correlation plot is skipped as requested.")
         return False
 
     # check the input
@@ -772,7 +770,7 @@ def step6_conserv (configs):
     if configs["others.startstep"] <= 6 and 6 <= configs["others.endstep"]:
         info("Step 6: Conservation plot...")
     else:
-        info("Conservation plot is skipped as requested.")
+        info("Step 6 Conservation plot is skipped as requested.")
         return False
 
     # check the input
@@ -798,7 +796,7 @@ def step7_seqpos (configs):
     if configs["others.startstep"] <= 7 and 7 <= configs["others.endstep"]:
         info("Step 7: Conservation plot...")
     else:
-        info("SeqPos is skipped as requested.")
+        info("Step 7 SeqPos is skipped as requested.")
         return False
 
     # check the input
@@ -809,10 +807,18 @@ def step7_seqpos (configs):
         sys.exit(1)
 
     # generate top # of peaks
-    top_n = configs["seqpos.seqpos_top_peaks"]
-    top_n_summits = "top"+top_n+"_summits.bed"
-    command_line = "sort -n -r -k 5 "+peak_summits_file+" | head -"+top_n+" > "+top_n_summits
-    run_cmd(command_line)
+    psf_fhd = open(peak_summits_file)
+    p_list = []
+    for i in psf_fhd:
+        p_list.append( (i,float(i.split()[-1])) )
+    top_n = int(configs["seqpos.seqpos_top_peaks"])
+    top_n_summits = map(lambda x:x[0],sorted(p_list,key=lambda x:x[1],reverse=True)[:top_n])
+    top_n_summits_file = "top"+str(top_n)+"_summits.bed"
+    top_n_summits_fhd = open(top_n_summits_file,"w")
+    for i in top_n_summits:
+        top_n_summits_fhd.write(i)
+    top_n_summits_fhd.close()
+    info("Top %d summits are written to %s" % (top_n,top_n_summits_file))
     
     # run SeqPos: use the current daisy version as standard, you may need to modify the options.
     # options
@@ -830,23 +836,25 @@ def step7_seqpos (configs):
         seqpos_motif_option = " -m transfac.xml,pbm.xml,jaspar.xml "
     seqpos_filter_option = " -s "+configs["sample.species"]
     
-    command_line = configs["seqpos.seqpos_main"]+" -d "+seqpos_width_option+seqpos_pvalue_option+seqpos_motif_option+seqpos_filter_option+" "+top_n_summits+" "+configs["sample.assembly_name"]
+    command_line = configs["seqpos.seqpos_main"]+" -d "+seqpos_width_option+seqpos_pvalue_option+seqpos_motif_option+seqpos_filter_option+" "+top_n_summits_file+" "+configs["sample.assembly_name"]
     run_cmd(command_line)
-    command_line = "zip "+configs["seqpos.output_zip"]+" result/"
+    command_line = "zip -r "+configs["seqpos.output_zip"]+" results/"
     run_cmd(command_line)
     return True
 
 def step8_package_result ( configs ):
-    """Package result and generate a summary file.
+    """Package result and generate a summary file. -> a subfolder
+    named as sample#sample_id and zip it as sample#sample_id.tar.bz2
 
     """
     if configs["others.startstep"] <= 8 and 8 <= configs["others.endstep"]:
         info("Step 8: Packaging results...")
     else:
-        info("Packaging is skipped as requested.")
+        info("Step 8 Packaging is skipped as requested.")
         return False
 
-    command_line = "mkdir "+configs["sample.sample_id"]
+    subfolder = "sample"+configs["sample.sample_id"]
+    command_line = "mkdir "+subfolder
     run_cmd(command_line)
     
     all_files = []
@@ -881,16 +889,18 @@ def step8_package_result ( configs ):
     f = open(configs["macs.output_xls"])
     n = 0
     for l in f:
-        if l.startswith("# d= "):
+        l.rstrip()
+        if l.startswith("# d = "):
             n = int(l[6:])
+            break
     logfhd.write("Value d from MACS is %d\n----\n" % n)
     f.close()
 
-    command_line = "mv "+" ".join(all_files)+" "+configs["sample.sample_id"]+"/"
+    command_line = "mv "+" ".join(all_files)+" "+subfolder+"/"
     run_cmd(command_line)
-    command_line = "cp log "+configs["sample.sample_id"]+"/"
+    command_line = "cp log "+subfolder+"/"
     run_cmd(command_line)
-    command_line = "tar -jcf "+configs["sample.sample_id"]+".tar.bz2 "+configs["sample.sample_id"]+"/"
+    command_line = "tar -jcf "+subfolder+".tar.bz2 "+subfolder+"/"
     run_cmd(command_line)
 
 
@@ -925,7 +935,7 @@ def main():
     if stopatstep:
         configs["others.endstep"] = startstep
     else:
-        configs["others.endstep"] = 7
+        configs["others.endstep"] = 8
 
     configs["others.cwd"] = os.getcwd()
     
diff --git a/test/Scripts/test1_ChIP-Seq_Pipeline1.conf b/test/Scripts/test1_ChIP-Seq_Pipeline1.conf
index 58b874e..c4e6c3e 100644
--- a/test/Scripts/test1_ChIP-Seq_Pipeline1.conf
+++ b/test/Scripts/test1_ChIP-Seq_Pipeline1.conf
@@ -181,4 +181,4 @@ SEQPOS_PVALUE_CUTOFF = 0.001
 # SeqPos know motif DBs. The motif DB you want to search against. It
 # includes: transfac.xml, pbm.xml, jaspar.xml, hpdi.xml,
 # y1h.xml. Please use comma to separate multiple xml files.
-SEQPOS_MOTIF_DB_SELECTION = transfac.xml,pbm.xml,jaspar.xml
+SEQPOS_MOTIF_DB_SELECTION = transfac.xml,jaspar.xml