Skip to content

Commit

Permalink
add process to extract results of busco to a csv file
Browse files Browse the repository at this point in the history
  • Loading branch information
julienfumey committed Dec 28, 2022
1 parent f589fe1 commit d28c72b
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 9 deletions.
37 changes: 28 additions & 9 deletions analyse_wg_ncbi.nf
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ process unzipFasta{
tuple val(spName), file(fasta) from fastaFile

output:
tuple val(spName), file('unzip.fasta') optional true into fastaUnzipped, fastaUnzipped2
tuple val(spName), file(fasta) file('unzip.fasta') optional true into fastaUnzipped, fastaUnzipped2


script:
Expand Down Expand Up @@ -217,11 +217,11 @@ process removeAltScaffold{
label 'samtools'

input:
tuple val(spName), file(infasta) from fastaUnzipped
tuple val(spName), file(fasta), file(infasta) from fastaUnzipped
file(goodScaffold) from listGoodScaffold

output:
tuple val(spName), file('genome_trimmed.fasta') into trimmedFasta
tuple val(spName), file(fasta), file('genome_trimmed.fasta') into trimmedFasta

script:
"""
Expand All @@ -232,7 +232,7 @@ process removeAltScaffold{


process busco{
publishDir "${resultsDir}/results/${spName}/", mode:'copy'
//publishDir "${resultsDir}/results/${spName}/", mode:'copy'
label 'busco'

maxForks 50
Expand All @@ -241,17 +241,36 @@ process busco{
input:
val buscoref from buscoRefFile
val buscoDLPath from buscoDLpath
tuple val(spName), file(fastaUnzipped) from ( notrim ? fastaUnzipped2 : trimmedFasta )
tuple val(spName), file(fasta), file(fastaUnzipped) from ( notrim ? fastaUnzipped2 : trimmedFasta )

output:
//tuple val(spName), path("*-busco.batch_summary.txt"), emit: batch_summary
tuple val(spName), file("${spName.replaceAll(/\s/,'_')}/short_summary.*json") into short_summary_json
tuple val(spName), file("${spName.replaceAll(/\s/,'_')}/short_summary.*txt") into short_summary_txt
tuple val(spName), file("${spName.replaceAll(/\s/,'_')}/full_table*.tsv") optional true into full_tables
tuple val(spName), file("${spName.replaceAll(/\s/,'_')}/missing_busco_list.*tsv") optional true into busco_list
//tuple val(spName), file(val), file("${spName.replaceAll(/\s/,'_')}/short_summary.*json") into short_summary_json
tuple val(spName), file(val), file("${spName.replaceAll(/\s/,'_')}/short_summary.*txt") into short_summary_txt
//tuple val(spName), file("${spName.replaceAll(/\s/,'_')}/full_table*.tsv") optional true into full_tables
//tuple val(spName), file("${spName.replaceAll(/\s/,'_')}/missing_busco_list.*tsv") optional true into busco_list

script:
"""
busco -i ${fastaUnzipped} -m genome -o ${spName.replaceAll(/\s/,'_')} -l ${buscoref} --download_path ${buscoDLPath} -c 40 --offline -f --metaeuk_parameters='--remove-tmp-files=1' --metaeuk_rerun_parameters='--remove-tmp-files=1'
"""
}

process extractResults{
label 'extractResults'

input:
tuple val(spName), file(fasta), file(json) from short_summary_txt

output:
file('busco_results.csv') into finalResults

script:
"""
extractResults.sh --input ${json} --species ${spName} --genomeFile ${fasta.getName()} --output results.csv
"""
}

finalResults.collectFile(name: "busco_results.csv", keepHeader: true, skip: 1).subscribe{
f -> f.copyTo(resultsDir.resolve(f.name))
}
54 changes: 54 additions & 0 deletions bin/extractResult.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#! /usr/bin/Python3

import json
import argparse

parser = argparse.ArgumentParser(
description='Parse input file from Busco to the output file',
)

parser.add_argument('--input', help='Input file')
parser.add_argument('--species', help='Species name')
parser.add_argument('--genomeFile', help='Name of the genome file')
parser.add_argument('--output', help='output file (csv format)')
args = parser.parse_args()

#open json file and load it
data_dict = {}
with open(args.input) as result_file:
line = result_file.readline()
while line:
linestrp = line.strip()
if linestrp.startswith('#') or linestrp == '':
pass
else:
if linestrp.startswith('C:') or linestrp == "***** Results: *****" or linestrp.startswith("Assembly"):
pass
elif linestrp == "Dependencies and versions:":
break
else:
linesplt = linestrp.split('\t')
data_dict[linesplt[1]] = linesplt[0]
line = result_file.readline()

with open(args.output, 'w') as fhout:
fhout.write("'Species','Genome file','Busco groups searched','Total length','Perc gaps','Scaffold N50','Contigs N50','Complete','Perc complete','Single copy','Perc single copy','Duplicated','Perc duplicated','Fragmented','Perc fragmented','Missing','Perc missing'\n")
strToWrite = f"'{args.species}'"
strToWrite += f",'{args.genomeFile}'"
strToWrite += f",'{data_dict['Total BUSCO groups searched']}'"
strToWrite += f",'{data_dict['Total length']}'"
strToWrite += f",'{data_dict['Percent gaps']}'"
strToWrite += f",'{data_dict['Scaffold N50']}'"
strToWrite += f",'{data_dict['Contigs N50']}'"
strToWrite += f",'{data_dict['Complete BUSCOs (C)']}'"
strToWrite += f",'{int(data_dict['Complete BUSCOs (C)'])/int(data_dict['Total BUSCO groups searched'])}'"
strToWrite += f",'{data_dict['Complete and single-copy BUSCOs (S)']}'"
strToWrite += f",'{int(data_dict['Complete and single-copy BUSCOs (S)'])/int(data_dict['Total BUSCO groups searched'])}'"
strToWrite += f",'{data_dict['Complete and duplicated BUSCOs (D)']}'"
strToWrite += f",'{int(data_dict['Complete and duplicated BUSCOs (D)'])/int(data_dict['Total BUSCO groups searched'])}'"
strToWrite += f",'{data_dict['Fragmented BUSCOs (F)']}'"
strToWrite += f",'{int(data_dict['Fragmented BUSCOs (F)'])/int(data_dict['Total BUSCO groups searched'])}'"
strToWrite += f",'{data_dict['Missing BUSCOs (M)']}'"
strToWrite += f",'{int(data_dict['Missing BUSCOs (M)'])/int(data_dict['Total BUSCO groups searched'])}'\n"

fhout.write(strToWrite)

0 comments on commit d28c72b

Please sign in to comment.