From d28c72beb354ab48ecddd0e9671b2bce33d44fed Mon Sep 17 00:00:00 2001
From: Julien FUMEY <julien.fumey@gmail.com>
Date: Wed, 28 Dec 2022 18:34:10 +0100
Subject: [PATCH] add process to extract results of busco to a csv file

---
 analyse_wg_ncbi.nf   | 37 ++++++++++++++++++++++--------
 bin/extractResult.py | 54 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+), 9 deletions(-)
 create mode 100644 bin/extractResult.py

diff --git a/analyse_wg_ncbi.nf b/analyse_wg_ncbi.nf
index 13925dc..924c66e 100644
--- a/analyse_wg_ncbi.nf
+++ b/analyse_wg_ncbi.nf
@@ -160,7 +160,7 @@ process unzipFasta{
     tuple val(spName), file(fasta) from fastaFile
 
     output:
-    tuple val(spName), file('unzip.fasta') optional true  into fastaUnzipped, fastaUnzipped2
+    tuple val(spName), file(fasta), file('unzip.fasta') optional true  into fastaUnzipped, fastaUnzipped2  // fasta: the input file, forwarded next to its unzipped copy
     
 
     script:
@@ -217,11 +217,11 @@ process removeAltScaffold{
     label 'samtools'
 
     input:
-    tuple val(spName), file(infasta) from fastaUnzipped
+    tuple val(spName), file(fasta), file(infasta) from fastaUnzipped  // fasta travels with the species name; infasta is the file this process reads
     file(goodScaffold) from listGoodScaffold
 
     output:
-    tuple val(spName), file('genome_trimmed.fasta') into trimmedFasta
+    tuple val(spName), file(fasta), file('genome_trimmed.fasta') into trimmedFasta  // fasta is re-emitted so trimmedFasta has the same tuple shape as fastaUnzipped
 
     script:
     """
@@ -232,7 +232,7 @@ process removeAltScaffold{
 
 
 process busco{
-    publishDir "${resultsDir}/results/${spName}/", mode:'copy'
+    //publishDir "${resultsDir}/results/${spName}/", mode:'copy'  // NOTE(review): per-species publishing is disabled by this patch; the reason is not stated - confirm it is intentional
     label 'busco'
 
     maxForks 50
@@ -241,17 +241,36 @@ process busco{
     input:
     val buscoref from buscoRefFile
     val buscoDLPath from buscoDLpath
-    tuple val(spName), file(fastaUnzipped) from ( notrim ? fastaUnzipped2 : trimmedFasta )
+    tuple val(spName), file(fasta), file(fastaUnzipped) from ( notrim ? fastaUnzipped2 : trimmedFasta )
 
     output:
     //tuple val(spName), path("*-busco.batch_summary.txt"), emit: batch_summary
-    tuple val(spName), file("${spName.replaceAll(/\s/,'_')}/short_summary.*json") into short_summary_json
-    tuple val(spName), file("${spName.replaceAll(/\s/,'_')}/short_summary.*txt") into short_summary_txt
-    tuple val(spName), file("${spName.replaceAll(/\s/,'_')}/full_table*.tsv") optional true into full_tables 
-    tuple val(spName), file("${spName.replaceAll(/\s/,'_')}/missing_busco_list.*tsv") optional true into busco_list
+    //tuple val(spName), file(fasta), file("${spName.replaceAll(/\s/,'_')}/short_summary.*json") into short_summary_json
+    tuple val(spName), file(fasta), file("${spName.replaceAll(/\s/,'_')}/short_summary.*txt") into short_summary_txt  // fasta is the input declared above, forwarded so extractResults can report its name
+    //tuple val(spName), file("${spName.replaceAll(/\s/,'_')}/full_table*.tsv") optional true into full_tables 
+    //tuple val(spName), file("${spName.replaceAll(/\s/,'_')}/missing_busco_list.*tsv") optional true into busco_list
 
     script:
     """
     busco -i ${fastaUnzipped} -m genome -o ${spName.replaceAll(/\s/,'_')} -l ${buscoref} --download_path ${buscoDLPath} -c 40 --offline -f --metaeuk_parameters='--remove-tmp-files=1' --metaeuk_rerun_parameters='--remove-tmp-files=1'
     """
 }
+
+process extractResults{  // turns one BUSCO short summary (.txt) into a CSV holding a header and one data row
+    label 'extractResults'
+
+    input:
+    tuple val(spName), file(fasta), file(summary) from short_summary_txt
+
+    output:
+    file('busco_results.csv') into finalResults  // must be the name passed to --output below
+
+    script:  // run through python3 because bin/extractResult.py is added non-executable; arguments are quoted because spName may contain whitespace
+    """
+    python3 "${baseDir}/bin/extractResult.py" --input "${summary}" --species "${spName}" --genomeFile "${fasta.getName()}" --output busco_results.csv
+    """
+}
+
+finalResults.collectFile(name: "busco_results.csv", keepHeader: true, skip: 1).subscribe{  // merge the per-species files into one CSV with a single header line
+	f -> f.copyTo(resultsDir.resolve(f.name))
+}
diff --git a/bin/extractResult.py b/bin/extractResult.py
new file mode 100755
index 0000000..1a2a2dc
--- /dev/null
+++ b/bin/extractResult.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+"""Convert a BUSCO short_summary*.txt report into a one-row CSV file."""
+
+import argparse
+import sys
+
+# Lines starting with one of these carry no tab-separated value/label pair.
+SKIPPED_PREFIXES = ('#', 'C:', '***** Results: *****', 'Assembly')
+
+HEADER = (
+    "'Species','Genome file','Busco groups searched','Total length','Perc gaps',"
+    "'Scaffold N50','Contigs N50','Complete','Perc complete','Single copy',"
+    "'Perc single copy','Duplicated','Perc duplicated','Fragmented',"
+    "'Perc fragmented','Missing','Perc missing'\n"
+)
+
+# (label in the BUSCO report, whether count / searched groups is also written)
+COLUMNS = (
+    ('Total BUSCO groups searched', False),
+    ('Total length', False),
+    ('Percent gaps', False),
+    ('Scaffold N50', False),
+    ('Contigs N50', False),
+    ('Complete BUSCOs (C)', True),
+    ('Complete and single-copy BUSCOs (S)', True),
+    ('Complete and duplicated BUSCOs (D)', True),
+    ('Fragmented BUSCOs (F)', True),
+    ('Missing BUSCOs (M)', True),
+)
+
+
+def parse_summary(path):
+    """Return {label: value} read from the tab-separated rows of *path*."""
+    values = {}
+    with open(path) as summary:
+        for raw in summary:
+            line = raw.strip()
+            if line == 'Dependencies and versions:':
+                break  # nothing after this heading is collected
+            if not line or line.startswith(SKIPPED_PREFIXES):
+                continue
+            fields = line.split('\t')
+            if len(fields) > 1:  # rows without a tab hold no value/label pair
+                values[fields[1]] = fields[0]
+    return values
+
+
+def build_row(species, genome_file, values):
+    """Return the single-quoted CSV data row whose cells line up with HEADER.
+
+    Exits with an error message when *values* lacks a label listed in COLUMNS.
+
+    NOTE(review): the computed 'Perc complete/single copy/duplicated/
+    fragmented/missing' cells are fractions of the searched groups (0-1),
+    not percentages - confirm that this is what consumers expect.
+    """
+    missing = [label for label, _ in COLUMNS if label not in values]
+    if missing:
+        sys.exit(f"BUSCO summary lacks expected field(s): {', '.join(missing)}")
+    total = int(values['Total BUSCO groups searched'])
+    cells = [species, genome_file]
+    for label, with_ratio in COLUMNS:
+        cells.append(values[label])
+        if with_ratio:
+            cells.append(int(values[label]) / total)
+    return ','.join(f"'{cell}'" for cell in cells) + '\n'
+
+
+def main():
+    """Read the BUSCO summary named on the command line and write the CSV."""
+    parser = argparse.ArgumentParser(
+        description='Parse input file from Busco to the output file',
+    )
+    # Both paths are opened unconditionally, so a missing one is a usage error.
+    parser.add_argument('--input', required=True, help='Input file')
+    parser.add_argument('--species', help='Species name')
+    parser.add_argument('--genomeFile', help='Name of the genome file')
+    parser.add_argument('--output', required=True, help='output file (csv format)')
+    args = parser.parse_args()
+
+    row = build_row(args.species, args.genomeFile, parse_summary(args.input))
+    with open(args.output, 'w') as fhout:
+        fhout.write(HEADER)
+        fhout.write(row)
+
+
+if __name__ == '__main__':
+    main()