From d28c72beb354ab48ecddd0e9671b2bce33d44fed Mon Sep 17 00:00:00 2001 From: Julien FUMEY Date: Wed, 28 Dec 2022 18:34:10 +0100 Subject: [PATCH] add process to extract results of busco to a csv file --- analyse_wg_ncbi.nf | 37 ++++++++++++++++++++++-------- bin/extractResult.py | 54 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 9 deletions(-) create mode 100644 bin/extractResult.py diff --git a/analyse_wg_ncbi.nf b/analyse_wg_ncbi.nf index 13925dc..924c66e 100644 --- a/analyse_wg_ncbi.nf +++ b/analyse_wg_ncbi.nf @@ -160,7 +160,7 @@ process unzipFasta{ tuple val(spName), file(fasta) from fastaFile output: - tuple val(spName), file('unzip.fasta') optional true into fastaUnzipped, fastaUnzipped2 + tuple val(spName), file(fasta) file('unzip.fasta') optional true into fastaUnzipped, fastaUnzipped2 script: @@ -217,11 +217,11 @@ process removeAltScaffold{ label 'samtools' input: - tuple val(spName), file(infasta) from fastaUnzipped + tuple val(spName), file(fasta), file(infasta) from fastaUnzipped file(goodScaffold) from listGoodScaffold output: - tuple val(spName), file('genome_trimmed.fasta') into trimmedFasta + tuple val(spName), file(fasta), file('genome_trimmed.fasta') into trimmedFasta script: """ @@ -232,7 +232,7 @@ process removeAltScaffold{ process busco{ - publishDir "${resultsDir}/results/${spName}/", mode:'copy' + //publishDir "${resultsDir}/results/${spName}/", mode:'copy' label 'busco' maxForks 50 @@ -241,17 +241,36 @@ process busco{ input: val buscoref from buscoRefFile val buscoDLPath from buscoDLpath - tuple val(spName), file(fastaUnzipped) from ( notrim ? fastaUnzipped2 : trimmedFasta ) + tuple val(spName), file(fasta), file(fastaUnzipped) from ( notrim ? fastaUnzipped2 : trimmedFasta ) output: //tuple val(spName), path("*-busco.batch_summary.txt"), emit: batch_summary - tuple val(spName), file("${spName.replaceAll(/\s/,'_')}/short_summary.*json") into short_summary_json - tuple val(spName), file("${spName.replaceAll(/\s/,'_')}/short_summary.*txt") into short_summary_txt - tuple val(spName), file("${spName.replaceAll(/\s/,'_')}/full_table*.tsv") optional true into full_tables - tuple val(spName), file("${spName.replaceAll(/\s/,'_')}/missing_busco_list.*tsv") optional true into busco_list + //tuple val(spName), file(val), file("${spName.replaceAll(/\s/,'_')}/short_summary.*json") into short_summary_json + tuple val(spName), file(val), file("${spName.replaceAll(/\s/,'_')}/short_summary.*txt") into short_summary_txt + //tuple val(spName), file("${spName.replaceAll(/\s/,'_')}/full_table*.tsv") optional true into full_tables + //tuple val(spName), file("${spName.replaceAll(/\s/,'_')}/missing_busco_list.*tsv") optional true into busco_list script: """ busco -i ${fastaUnzipped} -m genome -o ${spName.replaceAll(/\s/,'_')} -l ${buscoref} --download_path ${buscoDLPath} -c 40 --offline -f --metaeuk_parameters='--remove-tmp-files=1' --metaeuk_rerun_parameters='--remove-tmp-files=1' """ } + +process extractResults{ + label 'extractResults' + + input: + tuple val(spName), file(fasta), file(json) from short_summary_txt + + output: + file('busco_results.csv') into finalResults + + script: + """ + extractResults.sh --input ${json} --species ${spName} --genomeFile ${fasta.getName()} --output results.csv + """ +} + +finalResults.collectFile(name: "busco_results.csv", keepHeader: true, skip: 1).subscribe{ + f -> f.copyTo(resultsDir.resolve(f.name)) +} diff --git a/bin/extractResult.py b/bin/extractResult.py new file mode 100644 index 0000000..1a2a2dc --- /dev/null +++ b/bin/extractResult.py @@ -0,0 +1,54 @@ +#! /usr/bin/Python3 + +import json +import argparse + +parser = argparse.ArgumentParser( + description='Parse input file from Busco to the output file', + ) + +parser.add_argument('--input', help='Input file') +parser.add_argument('--species', help='Species name') +parser.add_argument('--genomeFile', help='Name of the genome file') +parser.add_argument('--output', help='output file (csv format)') +args = parser.parse_args() + +#open json file and load it +data_dict = {} +with open(args.input) as result_file: + line = result_file.readline() + while line: + linestrp = line.strip() + if linestrp.startswith('#') or linestrp == '': + pass + else: + if linestrp.startswith('C:') or linestrp == "***** Results: *****" or linestrp.startswith("Assembly"): + pass + elif linestrp == "Dependencies and versions:": + break + else: + linesplt = linestrp.split('\t') + data_dict[linesplt[1]] = linesplt[0] + line = result_file.readline() + +with open(args.output, 'w') as fhout: + fhout.write("'Species','Genome file','Busco groups searched','Total length','Perc gaps','Scaffold N50','Contigs N50','Complete','Perc complete','Single copy','Perc single copy','Duplicated','Perc duplicated','Fragmented','Perc fragmented','Missing','Perc missing'\n") + strToWrite = f"'{args.species}'" + strToWrite += f",'{args.genomeFile}'" + strToWrite += f",'{data_dict['Total BUSCO groups searched']}'" + strToWrite += f",'{data_dict['Total length']}'" + strToWrite += f",'{data_dict['Percent gaps']}'" + strToWrite += f",'{data_dict['Scaffold N50']}'" + strToWrite += f",'{data_dict['Contigs N50']}'" + strToWrite += f",'{data_dict['Complete BUSCOs (C)']}'" + strToWrite += f",'{int(data_dict['Complete BUSCOs (C)'])/int(data_dict['Total BUSCO groups searched'])}'" + strToWrite += f",'{data_dict['Complete and single-copy BUSCOs (S)']}'" + strToWrite += f",'{int(data_dict['Complete and single-copy BUSCOs (S)'])/int(data_dict['Total BUSCO groups searched'])}'" + strToWrite += f",'{data_dict['Complete and duplicated BUSCOs (D)']}'" + strToWrite += f",'{int(data_dict['Complete and duplicated BUSCOs (D)'])/int(data_dict['Total BUSCO groups searched'])}'" + strToWrite += f",'{data_dict['Fragmented BUSCOs (F)']}'" + strToWrite += f",'{int(data_dict['Fragmented BUSCOs (F)'])/int(data_dict['Total BUSCO groups searched'])}'" + strToWrite += f",'{data_dict['Missing BUSCOs (M)']}'" + strToWrite += f",'{int(data_dict['Missing BUSCOs (M)'])/int(data_dict['Total BUSCO groups searched'])}'\n" + + fhout.write(strToWrite) \ No newline at end of file