diff --git a/CHANGELOG.md b/CHANGELOG.md index 004375b5..bf69e711 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,13 +7,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Added` +- [#730](https://github.com/nf-core/mag/pull/730) - Migrated from local BUSCO module to nf-core one (added by @dialvarezs) + ### `Changed` ### `Fixed` ### `Dependencies` -### `Dependencies` +| Tool | Previous version | New version | +| ----- | ---------------- | ----------- | +| BUSCO | 5.4.3 | 5.8.2 | +| csvtk | | 0.31.0 | + +### `Deprecated` + +- [#730](https://github.com/nf-core/mag/pull/730) - Deprecated `--busco_clean` due to not being supported in the nf-core BUSCO module ## 3.3.0 [2024-12-19] diff --git a/bin/combine_tables.py b/bin/combine_tables.py index 2b8d3767..2cff1f85 100755 --- a/bin/combine_tables.py +++ b/bin/combine_tables.py @@ -5,6 +5,7 @@ import argparse import sys +import warnings import pandas as pd @@ -110,10 +111,14 @@ def main(args=None): if args.binqc_summary and args.binqc_tool == "busco": busco_results = pd.read_csv(args.binqc_summary, sep="\t") - if not bins.equals(busco_results["GenomeBin"].sort_values().reset_index(drop=True)): + busco_bins = set(busco_results["Input_file"]) + + if set(bins) != busco_bins and len(busco_bins.intersection(set(bins))) > 0: + warnings.warn("Bins in BUSCO summary do not match bins in bin depths summary") + elif len(busco_bins.intersection(set(bins))) == 0: sys.exit("Bins in BUSCO summary do not match bins in bin depths summary!") results = pd.merge( - results, busco_results, left_on="bin", right_on="GenomeBin", how="outer" + results, busco_results, left_on="bin", right_on="Input_file", how="outer" ) # assuming depths for all bins are given if args.binqc_summary and args.binqc_tool == "checkm": diff --git a/bin/run_busco.sh b/bin/run_busco.sh deleted file mode 100755 index b0864e22..00000000 --- a/bin/run_busco.sh +++ /dev/null @@ -1,166 +0,0 @@ -#! 
/usr/bin/env bash - -# Originally written by Sabrina Krakau and James Fellows Yates and released -# under the MIT license. -# See git repository (https://github.com/nf-core/mag) for full license text. - -p=$1 -cp_augustus_config=$2 -db=$3 -bin=$4 -task_cpus=$5 -lineage_dataset_provided=$6 -busco_clean=$7 -extra_args=$8 - -# ensure augustus has write access to config directory -if [ ${cp_augustus_config} = "Y" ]; then - cp -r /usr/local/config/ augustus_config/ - export AUGUSTUS_CONFIG_PATH=augustus_config -fi - -# place db in extra folder to ensure BUSCO recognizes it as path (instead of downloading it) -if [ ${lineage_dataset_provided} = "Y" ]; then - mkdir dataset - mv ${db} dataset/ -fi - -# set nullgob: if pattern matches no files, expand to a null string rather than to itself -shopt -s nullglob - -# only used for saving busco downloads -most_spec_db="NA" - -if - busco ${p} \ - --mode genome \ - --in ${bin} \ - --cpu ${task_cpus} \ - ${extra_args} \ - --out "BUSCO" >${bin}_busco.log 2>${bin}_busco.err -then - - # get name of used specific lineage dataset - summaries=(BUSCO/short_summary.specific.*.BUSCO.txt) - if [ ${#summaries[@]} -ne 1 ]; then - echo "ERROR: none or multiple 'BUSCO/short_summary.specific.*.BUSCO.txt' files found. Expected one." - exit 1 - fi - [[ $summaries =~ BUSCO/short_summary.specific.(.*).BUSCO.txt ]] - db_name_spec="${BASH_REMATCH[1]}" - most_spec_db=${db_name_spec} - echo "Used specific lineage dataset: ${db_name_spec}" - - if [ ${lineage_dataset_provided} = "Y" ]; then - cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.specific_lineage.${db_name_spec}.${bin}.txt - - # if lineage dataset is provided, BUSCO analysis does not fail in case no genes can be found as when using the auto selection setting - # report bin as failed to allow consistent warnings within the pipeline for both settings - if egrep -q $'WARNING:\tBUSCO did not find any match.' 
${bin}_busco.log; then - echo "WARNING: BUSCO could not find any genes for the provided lineage dataset! See also ${bin}_busco.log." - echo -e "${bin}\tNo genes" >"${bin}_busco.failed_bin.txt" - fi - else - # auto lineage selection - if { egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && - egrep -q $'INFO:\tLineage \\S+ is selected, supported by ' ${bin}_busco.log; } || - { egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && - egrep -q $'INFO:\tThe results from the Prodigal gene predictor indicate that your data belongs to the mollicutes clade. Testing subclades...' ${bin}_busco.log && - egrep -q $'INFO:\tUsing local lineages directory ' ${bin}_busco.log; }; then - # the second statement is necessary, because certain mollicute clades use a different genetic code, are not part of the BUSCO placement tree, are tested separately - # and cause different log messages - echo "Domain and specific lineage could be selected by BUSCO." - cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.specific_lineage.${db_name_spec}.${bin}.txt - - db_name_gen="" - summaries_gen=(BUSCO/short_summary.generic.*.BUSCO.txt) - if [ ${#summaries_gen[@]} -lt 1 ]; then - echo "No 'BUSCO/short_summary.generic.*.BUSCO.txt' file found. Assuming selected domain and specific lineages are the same." 
- cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.domain.${db_name_spec}.${bin}.txt - db_name_gen=${db_name_spec} - else - [[ $summaries_gen =~ BUSCO/short_summary.generic.(.*).BUSCO.txt ]] - db_name_gen="${BASH_REMATCH[1]}" - echo "Used generic lineage dataset: ${db_name_gen}" - cp BUSCO/short_summary.generic.${db_name_gen}.BUSCO.txt short_summary.domain.${db_name_gen}.${bin}.txt - fi - - for f in BUSCO/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa; do - cat BUSCO/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.${db_name_gen}.faa.gz - break - done - for f in BUSCO/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna; do - cat BUSCO/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.${db_name_gen}.fna.gz - break - done - - elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'INFO:\tNo marker genes were found. Root lineage \\S+ is kept' ${bin}_busco.log; then - echo "Domain could be selected by BUSCO, but no more specific lineage." - cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.domain.${db_name_spec}.${bin}.txt - - elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'INFO:\tNot enough markers were placed on the tree \\([0-9]*\\). Root lineage \\S+ is kept' ${bin}_busco.log; then - echo "Domain could be selected by BUSCO, but no more specific lineage." - cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.domain.${db_name_spec}.${bin}.txt - - elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'INFO:\tRunning virus detection pipeline' ${bin}_busco.log; then - # TODO double-check if selected dataset is not one of bacteria_*, archaea_*, eukaryota_*? - echo "Domain could not be selected by BUSCO, but virus dataset was selected." 
- cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.specific_lineage.${db_name_spec}.${bin}.txt - else - echo "ERROR: Some not expected case occurred! See ${bin}_busco.log." >&2 - exit 1 - fi - fi - - for f in BUSCO/run_${db_name_spec}/busco_sequences/single_copy_busco_sequences/*faa; do - cat BUSCO/run_${db_name_spec}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.${db_name_spec}.faa.gz - break - done - for f in BUSCO/run_${db_name_spec}/busco_sequences/single_copy_busco_sequences/*fna; do - cat BUSCO/run_${db_name_spec}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.${db_name_spec}.fna.gz - break - done - -elif egrep -q $'ERROR:\tNo genes were recognized by BUSCO' ${bin}_busco.err; then - echo "WARNING: BUSCO analysis failed due to no recognized genes! See also ${bin}_busco.err." - echo -e "${bin}\tNo genes" >"${bin}_busco.failed_bin.txt" - -elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'ERROR:\tPlacements failed' ${bin}_busco.err; then - echo "WARNING: BUSCO analysis failed due to failed placements! See also ${bin}_busco.err. Still using results for selected generic lineage dataset." 
- echo -e "${bin}\tPlacements failed" >"${bin}_busco.failed_bin.txt" - - message=$(egrep $'INFO:\t\\S+ selected' ${bin}_busco.log) - [[ $message =~ INFO:[[:space:]]([_[:alnum:]]+)[[:space:]]selected ]] - db_name_gen="${BASH_REMATCH[1]}" - most_spec_db=${db_name_gen} - echo "Used generic lineage dataset: ${db_name_gen}" - cp BUSCO/auto_lineage/run_${db_name_gen}/short_summary.txt short_summary.domain.${db_name_gen}.${bin}.txt - - for f in BUSCO/auto_lineage/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa; do - cat BUSCO/auto_lineage/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.${db_name_gen}.faa.gz - break - done - for f in BUSCO/auto_lineage/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna; do - cat BUSCO/auto_lineage/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.${db_name_gen}.fna.gz - break - done - -else - echo "ERROR: BUSCO analysis failed for some unknown reason! See also ${bin}_busco.err." >&2 - exit 1 -fi - -# additionally output genes predicted with Prodigal (GFF3) -if [ -f BUSCO/logs/prodigal_out.log ]; then - mv BUSCO/logs/prodigal_out.log "${bin}_prodigal.gff" -fi - -# output value of most_spec_db -echo ${most_spec_db} >info_most_spec_db.txt - -# if needed delete temporary BUSCO files -if [ ${busco_clean} = "Y" ]; then - find . -depth -type d -name "augustus_config" -execdir rm -rf "{}" \; - find . -depth -type d -name "auto_lineage" -execdir rm -rf "{}" \; - find . -depth -type d -name "run_*" -execdir rm -rf "{}" + -fi diff --git a/bin/summary_busco.py b/bin/summary_busco.py deleted file mode 100755 index 9701783b..00000000 --- a/bin/summary_busco.py +++ /dev/null @@ -1,226 +0,0 @@ -#!/usr/bin/env python - -## Originally written by Daniel Straub, Sabrina Krakau, and Hadrien Gourlé -## and released under the MIT license. -## See git repository (https://github.com/nf-core/mag) for full license text. 
- -## USAGE: ./summary.busco.py -sd -ss -f - -import re -import sys -import argparse -import os.path -import pandas as pd - - -def parse_args(args=None): - parser = argparse.ArgumentParser() - parser.add_argument( - "-a", - "--auto", - default=False, - action="store_true", - help="BUSCO run in auto lineage selection mode.", - ) - parser.add_argument( - "-sd", - "--summaries_domain", - nargs="+", - metavar="FILE", - help="List of BUSCO summary files for domains.", - ) - parser.add_argument( - "-ss", - "--summaries_specific", - nargs="+", - metavar="FILE", - help="List of BUSCO summary files for specific lineages.", - ) - parser.add_argument( - "-f", - "--failed_bins", - nargs="+", - metavar="FILE", - help="List of files containing bin name for which BUSCO analysis failed.", - ) - parser.add_argument( - "-o", - "--out", - required=True, - metavar="FILE", - type=argparse.FileType("w"), - help="Output file containing final BUSCO summary.", - ) - return parser.parse_args(args) - - -def main(args=None): - args = parse_args(args) - - if ( - not args.summaries_domain - and not args.summaries_specific - and not args.failed_bins - ): - sys.exit( - "Either --summaries_domain, --summaries_specific or --failed_bins must be specified!" 
- ) - - # "# Summarized benchmarking in BUSCO notation for file /path/to/MEGAHIT-testset1.contigs.fa" - # " C:0.0%[S:0.0%,D:0.0%],F:0.0%,M:100.0%,n:148" - - regexes = [ - r"# Summarized benchmarking in BUSCO notation for file (\S+)", - r"# The lineage dataset is: (\S+) \(", - r" C:(\S+)%\[S:", - r"%\[S:(\S+)%,D:", - r"%,D:(\S+)%\],F:", - r"%\],F:(\S+)%,M:", - r"%,M:(\S+)%,n:", - r"%,n:(\S+)", - ] - columns_domain = [ - "GenomeBin", - "Domain", - "%Complete (domain)", - "%Complete and single-copy (domain)", - "%Complete and duplicated (domain)", - "%Fragmented (domain)", - "%Missing (domain)", - "Total number (domain)", - ] - columns_specific = [ - "GenomeBin", - "Specific lineage dataset", - "%Complete (specific)", - "%Complete and single-copy (specific)", - "%Complete and duplicated (specific)", - "%Fragmented (specific)", - "%Missing (specific)", - "Total number (specific)", - ] - - if args.auto: - columns = [ - "GenomeBin", - "Domain", - "%Complete (domain)", - "%Complete and single-copy (domain)", - "%Complete and duplicated (domain)", - "%Fragmented (domain)", - "%Missing (domain)", - "Total number (domain)", - "Specific lineage dataset", - "%Complete (specific)", - "%Complete and single-copy (specific)", - "%Complete and duplicated (specific)", - "%Fragmented (specific)", - "%Missing (specific)", - "Total number (specific)", - ] - else: - columns = [ - "GenomeBin", - "Specific lineage dataset", - "%Complete (specific)", - "%Complete and single-copy (specific)", - "%Complete and duplicated (specific)", - "%Fragmented (specific)", - "%Missing (specific)", - "Total number (specific)", - ] - - # Search each summary file using its regex - results_domain = [] - if args.summaries_domain: - for file in args.summaries_domain: - with open(file) as infile: - results = [] - text = infile.read() - for index, regex in enumerate(regexes): - match = re.search(regex, text) - if match: - if index == 0: - results.append(os.path.basename(match.group(1))) - else: - 
results.append(match.group(1)) - results_domain.append(results) - df_domain = pd.DataFrame(results_domain, columns=columns_domain) - - results_specific = [] - if args.summaries_specific: - for file in args.summaries_specific: - with open(file) as infile: - results = [] - text = infile.read() - for index, regex in enumerate(regexes): - match = re.search(regex, text) - if match: - if index == 0: - results.append(os.path.basename(match.group(1))) - else: - results.append(match.group(1)) - results_specific.append(results) - df_specific = pd.DataFrame(results_specific, columns=columns_specific) - - # Add entries for bins with failed analysis (for domain and specific lineage where applicable) - failed = [] - if args.failed_bins: - for file in args.failed_bins: - with open(file) as infile: - line = infile.readline() - # in case of failed placements domain summary was used and specific part will be filled with NAs when merging - if re.split(r"[\t\n]", line)[1] != "Placements failed": - failed_bin = re.split(r"[\t\n]", line)[0] - if args.auto: - results = [ - failed_bin, - pd.NA, - "0.0", - "0.0", - "0.0", - "0.0", - "100.0", - pd.NA, - pd.NA, - pd.NA, - pd.NA, - pd.NA, - pd.NA, - pd.NA, - pd.NA, - ] - else: - results = [ - failed_bin, - pd.NA, - "0.0", - "0.0", - "0.0", - "0.0", - "100.0", - pd.NA, - ] - failed.append(results) - df_failed = pd.DataFrame(failed, columns=columns) - - # merge results - if args.auto: - df_final = df_domain.merge(df_specific, on="GenomeBin", how="outer").append( - df_failed - ) - # check if 'Domain' is 'NA', but 'Specific lineage dataset' given -> 'Viruses' - df_final.loc[ - pd.isna(df_final["Domain"]) - & pd.notna(df_final["Specific lineage dataset"]), - "Domain", - ] = "Viruses" - - else: - df_final = df_specific.append(df_failed) - - df_final.to_csv(args.out, sep="\t", index=False) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/conf/base.config b/conf/base.config index 3bb06c8f..ac183ba3 100644 --- a/conf/base.config +++ 
b/conf/base.config @@ -156,9 +156,10 @@ process { withName: MAG_DEPTHS { memory = { 16.GB * task.attempt } } - withName: BUSCO { - cpus = { 8 * task.attempt } - memory = { 20.GB * task.attempt } + withName: BUSCO_BUSCO { + cpus = { 10 * task.attempt } + memory = { 12.GB * task.attempt } + errorStrategy = { task.exitStatus in (130..145) ? 'retry' : 'ignore' } } withName: MAXBIN2 { errorStrategy = { task.exitStatus in [1, 255] ? 'ignore' : 'retry' } diff --git a/conf/modules.config b/conf/modules.config index 701598db..a31aaad8 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -377,29 +377,26 @@ process { ] } - withName: BUSCO_DB_PREPARATION { - publishDir = [path: { "${params.outdir}/GenomeBinning/QC/BUSCO" }, mode: params.publish_dir_mode, pattern: "*.tar.gz"] - } - - withName: BUSCO { + withName: BUSCO_BUSCO { ext.args = [ params.busco_db ? '--offline' : '' ].join(' ').trim() publishDir = [ - path: { "${params.outdir}/GenomeBinning/QC/BUSCO" }, - mode: params.publish_dir_mode, - pattern: "*.{log,err,faa.gz,fna.gz,gff,txt}" + [ + path: { "${params.outdir}/GenomeBinning/QC/BUSCO/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.{txt,json}" + ], + [ + path: { "${params.outdir}/GenomeBinning/QC/BUSCO" }, + mode: params.publish_dir_mode, + overwrite: false, + pattern: "busco_downloads/lineages/*", + enabled: params.save_busco_db + ] ] } - withName: BUSCO_SAVE_DOWNLOAD { - publishDir = [path: { "${params.outdir}/GenomeBinning/QC/BUSCO" }, mode: params.publish_dir_mode, overwrite: false, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] - } - - withName: 'BUSCO_SUMMARY|QUAST_BINS|QUAST_BINS_SUMMARY' { - publishDir = [path: { "${params.outdir}/GenomeBinning/QC" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }] - } - withName: ARIA2_UNTAR { publishDir = [path: { "${params.outdir}/GenomeBinning/QC/CheckM/checkm_downloads" }, mode: params.publish_dir_mode, overwrite: false, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, enabled: params.save_checkm_data] } @@ -424,7 +421,7 @@ process { ] } - withName: COMBINE_BINQC_TSV { + withName: CONCAT_BINQC_TSV { ext.prefix = { "${params.binqc_tool}_summary" } publishDir = [ path: { "${params.outdir}/GenomeBinning/QC" }, diff --git a/conf/test.config b/conf/test.config index 04fced63..a1aced51 100644 --- a/conf/test.config +++ b/conf/test.config @@ -29,8 +29,7 @@ params { skip_krona = false min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 - busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz" - busco_clean = true + busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb12.2024-11-14.tar.gz" skip_gtdbtk = true gtdbtk_min_completeness = 0.01 skip_concoct = true diff --git a/conf/test_ancient_dna.config b/conf/test_ancient_dna.config index e8dab425..e555c5d4 100644 --- a/conf/test_ancient_dna.config +++ b/conf/test_ancient_dna.config @@ -31,7 +31,7 @@ params { skip_krona = true min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 - busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz" + busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb12.2024-11-14.tar.gz" skip_gtdbtk = true gtdbtk_min_completeness = 0.01 ancient_dna = true diff --git a/conf/test_binrefinement.config b/conf/test_binrefinement.config index 9602197c..c8ede6f1 100644 --- a/conf/test_binrefinement.config +++ b/conf/test_binrefinement.config @@ -31,12 +31,11 @@ params { skip_krona = true min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 - busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz" + busco_db = 
"https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb12.2024-11-14.tar.gz" skip_gtdbtk = true gtdbtk_min_completeness = 0.01 refine_bins_dastool = true refine_bins_dastool_threshold = 0 // TODO not using 'both' until #489 merged postbinning_input = 'refined_bins_only' - busco_clean = true } diff --git a/conf/test_host_rm.config b/conf/test_host_rm.config index e241e03e..d185b649 100644 --- a/conf/test_host_rm.config +++ b/conf/test_host_rm.config @@ -28,7 +28,7 @@ params { input = params.pipelines_testdata_base_path + 'mag/samplesheets/samplesheet.host_rm.csv' min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 - busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz" + busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb12.2024-11-14.tar.gz" skip_gtdbtk = true gtdbtk_min_completeness = 0.01 skip_concoct = true diff --git a/conf/test_hybrid.config b/conf/test_hybrid.config index cfb0991c..55b5d939 100644 --- a/conf/test_hybrid.config +++ b/conf/test_hybrid.config @@ -27,7 +27,7 @@ params { input = params.pipelines_testdata_base_path + 'mag/samplesheets/samplesheet.hybrid.csv' min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 - busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz" + busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb12.2024-11-14.tar.gz" skip_gtdbtk = true gtdbtk_min_completeness = 0.01 skip_concoct = true diff --git a/modules.json b/modules.json index 05e3b3dd..d2bbd153 100644 --- a/modules.json +++ b/modules.json @@ -36,6 +36,11 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, + "busco/busco": { + "branch": "master", + "git_sha": "d34caf3c0d3cf5b9bae0fae6107bab0933c96f37", + "installed_by": ["modules"] + }, "cat/fastq": { "branch": "master", "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", @@ -64,12 +69,12 @@ }, "checkm2/databasedownload": { "branch": "master", - "git_sha": 
"e17652681c856afaf2e240ba4c98bf4631a0fe2d", + "git_sha": "ad43b5d5d464698d11b599b004adc1c00615ef1b", "installed_by": ["modules"] }, "checkm2/predict": { "branch": "master", - "git_sha": "e17652681c856afaf2e240ba4c98bf4631a0fe2d", + "git_sha": "ad43b5d5d464698d11b599b004adc1c00615ef1b", "installed_by": ["modules"] }, "chopper": { @@ -102,6 +107,11 @@ "git_sha": "baa30accc6c50ea8a98662417d4f42ed18966353", "installed_by": ["fasta_binning_concoct"] }, + "csvtk/concat": { + "branch": "master", + "git_sha": "aa5c23023134cf2d8b75a95d53557890e40261b9", + "installed_by": ["modules"] + }, "dastool/dastool": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", diff --git a/modules/local/busco.nf b/modules/local/busco.nf deleted file mode 100644 index 4d0a561d..00000000 --- a/modules/local/busco.nf +++ /dev/null @@ -1,59 +0,0 @@ -process BUSCO { - tag "${bin}" - - conda "bioconda::busco=5.4.3" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/busco:5.4.3--pyhdfd78af_0': - 'biocontainers/busco:5.4.3--pyhdfd78af_0' }" - - input: - tuple val(meta), path(bin) - tuple val(db_meta), path(db) - - output: - tuple val(meta), path("short_summary.domain.*.${bin}.txt") , optional:true , emit: summary_domain - tuple val(meta), path("short_summary.specific_lineage.*.${bin}.txt"), optional:true , emit: summary_specific - tuple env(most_spec_db), path('busco_downloads/') , optional:true , emit: busco_downloads - path("${bin}_busco.log") - path("${bin}_busco.err") - path("${bin}_buscos.*.faa.gz") , optional:true - path("${bin}_buscos.*.fna.gz") , optional:true - path("${bin}_prodigal.gff") , optional:true , emit: prodigal_genes - tuple val(meta), path("${bin}_busco.failed_bin.txt") , optional:true , emit: failed_bin - path "versions.yml" , emit: versions - - script: - def args = task.ext.args ?: '' - def cp_augustus_config = workflow.profile.toString().indexOf("conda") != -1 ? "N" : "Y" - def lineage_dataset_provided = "${db_meta.lineage}" - def busco_clean = params.busco_clean ? "Y" : "N" - - def p = params.busco_auto_lineage_prok ? 
"--auto-lineage-prok" : "--auto-lineage" - if ( "${lineage_dataset_provided}" == "Y" ) { - p = "--lineage_dataset dataset/${db}" - } else if ( "${lineage_dataset_provided}" == "N" ) { - p += " --offline --download_path ${db}" - } else { - lineage_dataset_provided = "" - } - """ - run_busco.sh \\ - "${p}" \\ - "${cp_augustus_config}" \\ - "${db}" \\ - "${bin}" \\ - ${task.cpus} \\ - "${lineage_dataset_provided}" \\ - "${busco_clean}" \\ - "${args}" - - most_spec_db=\$( versions.yml - "${task.process}": - python: \$(python --version 2>&1 | sed 's/Python //g') - R: \$(R --version 2>&1 | sed -n 1p | sed 's/R version //' | sed 's/ (.*//') - busco: \$(busco --version 2>&1 | sed 's/BUSCO //g') - END_VERSIONS - """ -} diff --git a/modules/local/busco_db_preparation.nf b/modules/local/busco_db_preparation.nf deleted file mode 100644 index e3418cb6..00000000 --- a/modules/local/busco_db_preparation.nf +++ /dev/null @@ -1,26 +0,0 @@ -process BUSCO_DB_PREPARATION { - tag "${database.baseName}" - - conda "conda-forge::sed=4.7" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'nf-core/ubuntu:20.04' }" - - input: - path database - - output: - tuple val("${database.getSimpleName()}"), path("buscodb/*"), emit: db - path "versions.yml" , emit: versions - - script: - """ - mkdir buscodb - tar -xf ${database} -C buscodb - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - tar: \$(tar --version 2>&1 | sed -n 1p | sed 's/tar (GNU tar) //') - END_VERSIONS - """ -} diff --git a/modules/local/busco_save_download.nf b/modules/local/busco_save_download.nf deleted file mode 100644 index 099c4150..00000000 --- a/modules/local/busco_save_download.nf +++ /dev/null @@ -1,24 +0,0 @@ -process BUSCO_SAVE_DOWNLOAD { - // execute sequentially to avoid artefacts when saving files for multiple busco instances - maxForks 1 - - conda "conda-forge::bash=5.2.21" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container - ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' - : 'nf-core/ubuntu:20.04' }" - - input: - path(busco_downloads) - - output: - path 'busco_downloads/**', includeInputs: true, emit: busco_files - path 'versions.yml' , emit: versions - - script: - """ - cat <<-END_VERSIONS > versions.yml - "${task.process}": - bash: \$(echo \$BASH_VERSION) - END_VERSIONS - """ -} diff --git a/modules/local/busco_summary.nf b/modules/local/busco_summary.nf deleted file mode 100644 index bafcc495..00000000 --- a/modules/local/busco_summary.nf +++ /dev/null @@ -1,35 +0,0 @@ -process BUSCO_SUMMARY { - - conda "conda-forge::pandas=1.4.3" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/pandas:1.4.3' : - 'biocontainers/pandas:1.4.3' }" - - input: - path(summaries_domain) - path(summaries_specific) - path(failed_bins) - - output: - path "busco_summary.tsv", emit: summary - path "versions.yml" , emit: versions - - script: - def reference = params.busco_db.toString().contains('odb10') - def auto = reference ? "" : "-a" - def ss = summaries_specific.sort().size() > 0 ? "-ss ${summaries_specific}" : "" - def sd = summaries_domain.sort().size() > 0 ? "-sd ${summaries_domain}" : "" - def f = "" - if ("${reference}" == false && failed_bins.sort().size() > 0) - f = "-f ${failed_bins}" - """ - summary_busco.py $auto $ss $sd $f -o busco_summary.tsv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version 2>&1 | sed 's/Python //g') - pandas: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('pandas').version)") - END_VERSIONS - """ -} - diff --git a/modules/local/combine_tsv.nf b/modules/local/combine_tsv.nf deleted file mode 100644 index 1fe7ec1a..00000000 --- a/modules/local/combine_tsv.nf +++ /dev/null @@ -1,26 +0,0 @@ -process COMBINE_TSV { - - // Using bioawk as already use that for CONVERT_DEPTHS and does same thing - conda "bioconda::bioawk=1.0" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/bioawk:1.0--hed695b0_5' : - 'biocontainers/bioawk:1.0--hed695b0_5' }" - - input: - path(bin_summaries, stageAs: "bin_summaries/*.tsv") - - output: - path("*.tsv") , emit: combined - path "versions.yml", emit: versions - - script: - def prefix = task.ext.prefix ?: "bin_depths_summary_combined" - """ - bioawk '(NR == 1) || (FNR > 1)' ${bin_summaries} > ${prefix}.tsv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - bioawk: \$(bioawk --version | cut -f 3 -d ' ' ) - END_VERSIONS - """ -} diff --git a/modules/nf-core/busco/busco/environment.yml b/modules/nf-core/busco/busco/environment.yml new file mode 100644 index 00000000..53e5e90e --- /dev/null +++ b/modules/nf-core/busco/busco/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + +dependencies: + - bioconda::busco=5.8.2 diff --git a/modules/nf-core/busco/busco/main.nf b/modules/nf-core/busco/busco/main.nf new file mode 100644 index 00000000..609cae95 --- /dev/null +++ b/modules/nf-core/busco/busco/main.nf @@ -0,0 +1,111 @@ +process BUSCO_BUSCO { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/busco:5.8.2--pyhdfd78af_0': + 'biocontainers/busco:5.8.2--pyhdfd78af_0' }" + + input: + tuple val(meta), path(fasta, stageAs:'tmp_input/*') + val mode // Required: One of genome, proteins, or transcriptome + val lineage // Required: lineage for checking against, or "auto/auto_prok/auto_euk" for enabling auto-lineage + path busco_lineages_path // Recommended: busco lineages file - downloads if not set + path config_file // Optional: busco configuration file + + output: + tuple val(meta), path("*-busco.batch_summary.txt") , emit: batch_summary + tuple val(meta), path("short_summary.*.txt") , emit: short_summaries_txt , optional: true + tuple val(meta), path("short_summary.*.json") , emit: short_summaries_json , optional: true + tuple val(meta), path("*-busco/*/run_*/full_table.tsv") , emit: full_table , optional: true + tuple val(meta), path("*-busco/*/run_*/missing_busco_list.tsv") , emit: missing_busco_list , optional: true + tuple val(meta), path("*-busco/*/run_*/single_copy_proteins.faa") , emit: single_copy_proteins , optional: true + tuple val(meta), path("*-busco/*/run_*/busco_sequences") , emit: seq_dir , optional: true + tuple val(meta), path("*-busco/*/translated_proteins") , emit: translated_dir , optional: true + tuple val(meta), path("*-busco") , emit: busco_dir + tuple val(meta), path("busco_downloads/lineages/*") , emit: downloaded_lineages , optional: true + + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + if ( mode !in [ 'genome', 'proteins', 'transcriptome' ] ) { + error "Mode must be one of 'genome', 'proteins', or 'transcriptome'." + } + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}-${lineage}" + def busco_config = config_file ? "--config ${config_file}" : '' + def busco_lineage = lineage in [ 'auto', 'auto_prok', 'auto_euk'] + ? 
lineage.replaceFirst('auto', '--auto-lineage').replaceAll('_', '-') + : "--lineage_dataset ${lineage}" + def busco_lineage_dir = busco_lineages_path ? "--download_path ${busco_lineages_path}" : '' + """ + # Nextflow changes the container --entrypoint to /bin/bash (container default entrypoint: /usr/local/env-execute) + # Check for container variable initialisation script and source it. + if [ -f "/usr/local/env-activate.sh" ]; then + set +u # Otherwise, errors out because of various unbound variables + . "/usr/local/env-activate.sh" + set -u + fi + + # If the augustus config directory is not writable, then copy to writeable area + if [ ! -w "\${AUGUSTUS_CONFIG_PATH}" ]; then + # Create writable tmp directory for augustus + AUG_CONF_DIR=\$( mktemp -d -p \$PWD ) + cp -r \$AUGUSTUS_CONFIG_PATH/* \$AUG_CONF_DIR + export AUGUSTUS_CONFIG_PATH=\$AUG_CONF_DIR + echo "New AUGUSTUS_CONFIG_PATH=\${AUGUSTUS_CONFIG_PATH}" + fi + + # Ensure the input is uncompressed + INPUT_SEQS=input_seqs + mkdir "\$INPUT_SEQS" + cd "\$INPUT_SEQS" + for FASTA in ../tmp_input/*; do + if [ "\${FASTA##*.}" == 'gz' ]; then + gzip -cdf "\$FASTA" > \$( basename "\$FASTA" .gz ) + else + ln -s "\$FASTA" . + fi + done + cd .. + + busco \\ + --cpu $task.cpus \\ + --in "\$INPUT_SEQS" \\ + --out ${prefix}-busco \\ + --mode $mode \\ + $busco_lineage \\ + $busco_lineage_dir \\ + $busco_config \\ + $args + + # clean up + rm -rf "\$INPUT_SEQS" + + # Move files to avoid staging/publishing issues + mv ${prefix}-busco/batch_summary.txt ${prefix}-busco.batch_summary.txt + mv ${prefix}-busco/*/short_summary.*.{json,txt} . || echo "Short summaries were not available: No genes were found." 
+ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + busco: \$( busco --version 2>&1 | sed 's/^BUSCO //' ) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}-${lineage}" + def fasta_name = files(fasta).first().name - '.gz' + """ + touch ${prefix}-busco.batch_summary.txt + mkdir -p ${prefix}-busco/$fasta_name/run_${lineage}/busco_sequences + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + busco: \$( busco --version 2>&1 | sed 's/^BUSCO //' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/busco/busco/meta.yml b/modules/nf-core/busco/busco/meta.yml new file mode 100644 index 00000000..8f719e08 --- /dev/null +++ b/modules/nf-core/busco/busco/meta.yml @@ -0,0 +1,163 @@ +name: busco_busco +description: Benchmarking Universal Single Copy Orthologs +keywords: + - quality control + - genome + - transcriptome + - proteome +tools: + - busco: + description: BUSCO provides measures for quantitative assessment of genome assembly, + gene set, and transcriptome completeness based on evolutionarily informed expectations + of gene content from near-universal single-copy orthologs selected from OrthoDB. + homepage: https://busco.ezlab.org/ + documentation: https://busco.ezlab.org/busco_userguide.html + tool_dev_url: https://gitlab.com/ezlab/busco + doi: "10.1007/978-1-4939-9173-0_14" + licence: ["MIT"] + identifier: biotools:busco +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Nucleic or amino acid sequence file in FASTA format. + pattern: "*.{fasta,fna,fa,fasta.gz,fna.gz,fa.gz}" + - - mode: + type: string + description: The mode to run Busco in. 
One of genome, proteins, or transcriptome + pattern: "{genome,proteins,transcriptome}" + - - lineage: + type: string + description: The BUSCO lineage to use, or "auto", "auto_prok" or "auto_euk" + to automatically select lineage + - - busco_lineages_path: + type: directory + description: Path to local BUSCO lineages directory. + - - config_file: + type: file + description: Path to BUSCO config file. +output: + - batch_summary: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - "*-busco.batch_summary.txt": + type: file + description: Summary of all sequence files analyzed + pattern: "*-busco.batch_summary.txt" + - short_summaries_txt: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - short_summary.*.txt: + type: file + description: Short Busco summary in plain text format + pattern: "short_summary.*.txt" + - short_summaries_json: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - short_summary.*.json: + type: file + description: Short Busco summary in JSON format + pattern: "short_summary.*.json" + - full_table: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - "*-busco/*/run_*/full_table.tsv": + type: file + description: Full BUSCO results table + pattern: "full_table.tsv" + - missing_busco_list: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - "*-busco/*/run_*/missing_busco_list.tsv": + type: file + description: List of missing BUSCOs + pattern: "missing_busco_list.tsv" + - single_copy_proteins: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test' ] + - "*-busco/*/run_*/single_copy_proteins.faa": + type: file + description: Fasta file of single copy proteins (transcriptome mode) + pattern: "single_copy_proteins.faa" + - seq_dir: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - "*-busco/*/run_*/busco_sequences": + type: directory + description: BUSCO sequence directory + pattern: "busco_sequences" + - translated_dir: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - "*-busco/*/translated_proteins": + type: directory + description: Six frame translations of each transcript made by the transcriptome + mode + pattern: "translated_proteins" + - busco_dir: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - "*-busco": + type: directory + description: BUSCO lineage specific output + pattern: "*-busco" + - downloaded_lineages: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g.
[ id:'test' ] + - "busco_downloads/lineages/*": + type: directory + description: Lineages downloaded by BUSCO when running the analysis, for example bacteria_odb12 + pattern: "busco_downloads/lineages/*" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@priyanka-surana" + - "@charles-plessy" + - "@mahesh-panchal" + - "@muffato" + - "@jvhagey" + - "@gallvp" +maintainers: + - "@priyanka-surana" + - "@charles-plessy" + - "@mahesh-panchal" + - "@muffato" + - "@jvhagey" + - "@gallvp" diff --git a/modules/nf-core/busco/busco/tests/main.nf.test b/modules/nf-core/busco/busco/tests/main.nf.test new file mode 100644 index 00000000..55954a73 --- /dev/null +++ b/modules/nf-core/busco/busco/tests/main.nf.test @@ -0,0 +1,417 @@ +nextflow_process { + + name "Test Process BUSCO_BUSCO" + script "../main.nf" + process "BUSCO_BUSCO" + + tag "modules" + tag "modules_nfcore" + tag "busco" + tag "busco/busco" + + test("test_busco_genome_single_fasta") { + + config './nextflow.config' + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz', checkIfExists: true) + ] + input[1] = 'genome' + input[2] = 'bacteria_odb12' // Launch with 'auto' to use --auto-lineage, and specified lineages // 'auto' removed from test due to memory issues + input[3] = [] // Download busco lineage + input[4] = [] // No config + """ + } + } + + then { + assert process.success + + with(path(process.out.short_summaries_txt[0][1]).text) { + assert contains('BUSCO version') + assert contains('The lineage dataset is') + assert contains('BUSCO was run in mode') + assert contains('Complete BUSCOs') + assert contains('Missing BUSCOs') + assert contains('Dependencies and versions') + } + + with(path(process.out.short_summaries_json[0][1]).text) { + assert contains('one_line_summary') + assert contains('mode') + 
assert contains('dataset') + } + + assert snapshot( + process.out.batch_summary[0][1], + process.out.full_table[0][1], + process.out.missing_busco_list[0][1], + process.out.versions[0] + ).match() + + with(file(process.out.seq_dir[0][1]).listFiles().collect { it.name }) { + assert contains('single_copy_busco_sequences.tar.gz') + assert contains('multi_copy_busco_sequences.tar.gz') + assert contains('fragmented_busco_sequences.tar.gz') + } + + with(path("${process.out.busco_dir[0][1]}/logs/busco.log").text) { + assert contains('DEBUG:busco.run_BUSCO') + assert contains('Results from dataset') + assert contains('how to cite BUSCO') + } + + assert process.out.single_copy_proteins == [] + assert process.out.translated_dir == [] + } + } + + test("test_busco_genome_multi_fasta") { + + config './nextflow.config' + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/genome.fasta', checkIfExists: true) + ] + ] + input[1] = 'genome' + input[2] = 'bacteria_odb12' + input[3] = [] + input[4] = [] + """ + } + } + + then { + assert process.success + + with(path(process.out.short_summaries_txt[0][1][0]).text) { + assert contains('BUSCO version') + assert contains('The lineage dataset is') + assert contains('BUSCO was run in mode') + assert contains('Complete BUSCOs') + assert contains('Missing BUSCOs') + assert contains('Dependencies and versions') + } + + with(path(process.out.short_summaries_txt[0][1][1]).text) { + assert contains('BUSCO version') + assert contains('The lineage dataset is') + assert contains('BUSCO was run in mode') + assert contains('Complete BUSCOs') + assert contains('Missing BUSCOs') + assert contains('Dependencies and versions') + } + + with(path(process.out.short_summaries_json[0][1][0]).text) { + 
assert contains('one_line_summary') + assert contains('mode') + assert contains('dataset') + } + + with(path(process.out.short_summaries_json[0][1][1]).text) { + assert contains('one_line_summary') + assert contains('mode') + assert contains('dataset') + } + + assert snapshot( + process.out.full_table[0][1], + process.out.missing_busco_list[0][1], + process.out.versions[0] + ).match() + + with(file(process.out.seq_dir[0][1][0]).listFiles().collect { it.name }) { + assert contains('single_copy_busco_sequences.tar.gz') + assert contains('multi_copy_busco_sequences.tar.gz') + assert contains('fragmented_busco_sequences.tar.gz') + } + + with(file(process.out.seq_dir[0][1][1]).listFiles().collect { it.name }) { + assert contains('single_copy_busco_sequences.tar.gz') + assert contains('multi_copy_busco_sequences.tar.gz') + assert contains('fragmented_busco_sequences.tar.gz') + } + + with(path("${process.out.busco_dir[0][1]}/logs/busco.log").text) { + assert contains('DEBUG:busco.run_BUSCO') + assert contains('Results from dataset') + assert contains('how to cite BUSCO') + } + + assert process.out.single_copy_proteins == [] + assert process.out.translated_dir == [] + } + + } + + test("test_busco_eukaryote_metaeuk") { + + config './nextflow.metaeuk.config' + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ] + input[1] = 'genome' + input[2] = 'eukaryota_odb10' + input[3] = [] + input[4] = [] + """ + } + } + + then { + assert process.success + + with(path(process.out.short_summaries_txt[0][1]).text) { + assert contains('BUSCO version') + assert contains('The lineage dataset is') + assert contains('BUSCO was run in mode') + assert contains('Complete BUSCOs') + assert contains('Missing BUSCOs') + assert contains('Dependencies and versions') + } + + with(path(process.out.short_summaries_json[0][1]).text) { + assert contains('one_line_summary') + 
assert contains('mode') + assert contains('dataset') + } + + assert snapshot( + process.out.batch_summary[0][1], + process.out.full_table[0][1], + process.out.missing_busco_list[0][1], + process.out.versions[0] + ).match() + + with(file(process.out.seq_dir[0][1]).listFiles().collect { it.name }) { + assert contains('single_copy_busco_sequences.tar.gz') + assert contains('multi_copy_busco_sequences.tar.gz') + assert contains('fragmented_busco_sequences.tar.gz') + } + + with(path("${process.out.busco_dir[0][1]}/logs/busco.log").text) { + assert contains('DEBUG:busco.run_BUSCO') + assert contains('Results from dataset') + assert contains('how to cite BUSCO') + + } + + assert process.out.single_copy_proteins == [] + assert process.out.translated_dir == [] + } + + } + + test("test_busco_eukaryote_augustus") { + + config './nextflow.augustus.config' + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ] + input[1] = 'genome' + input[2] = 'eukaryota_odb10' + input[3] = [] + input[4] = [] + """ + } + } + + then { + assert process.success + + assert snapshot( + process.out.batch_summary[0][1], + process.out.versions[0] + ).match() + + with(path("${process.out.busco_dir[0][1]}/logs/busco.log").text) { + assert contains('DEBUG:busco.run_BUSCO') + assert contains('Augustus did not recognize any genes') + + } + + assert process.out.short_summaries_json == [] + assert process.out.short_summaries_txt == [] + assert process.out.missing_busco_list == [] + assert process.out.full_table == [] + assert process.out.single_copy_proteins == [] + assert process.out.translated_dir == [] + } + + } + + test("test_busco_protein") { + + config './nextflow.config' + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/proteome.fasta', 
checkIfExists: true) + ] + input[1] = 'proteins' + input[2] = 'bacteria_odb12' + input[3] = [] + input[4] = [] + """ + } + } + + then { + assert process.success + + with(path(process.out.short_summaries_txt[0][1]).text) { + assert contains('BUSCO version') + assert contains('The lineage dataset is') + assert contains('BUSCO was run in mode') + assert contains('Complete BUSCOs') + assert contains('Missing BUSCOs') + assert contains('Dependencies and versions') + } + + with(path(process.out.short_summaries_json[0][1]).text) { + assert contains('one_line_summary') + assert contains('mode') + assert contains('dataset') + } + + assert snapshot( + process.out.batch_summary[0][1], + process.out.full_table[0][1], + process.out.missing_busco_list[0][1], + process.out.versions[0] + ).match() + + with(file(process.out.seq_dir[0][1]).listFiles().collect { it.name }) { + assert contains('single_copy_busco_sequences.tar.gz') + assert contains('multi_copy_busco_sequences.tar.gz') + assert contains('fragmented_busco_sequences.tar.gz') + } + + with(path("${process.out.busco_dir[0][1]}/logs/busco.log").text) { + assert contains('DEBUG:busco.run_BUSCO') + assert contains('Results from dataset') + assert contains('how to cite BUSCO') + } + + assert process.out.single_copy_proteins == [] + assert process.out.translated_dir == [] + } + + } + + test("test_busco_transcriptome") { + + config './nextflow.config' + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/illumina/fasta/test1.contigs.fa.gz', checkIfExists: true) + ] + input[1] = 'transcriptome' + input[2] = 'bacteria_odb12' + input[3] = [] + input[4] = [] + """ + } + } + + then { + assert process.success + + with(path(process.out.short_summaries_txt[0][1]).text) { + assert contains('BUSCO version') + assert contains('The lineage dataset is') + assert contains('BUSCO was run in mode') + assert contains('Complete BUSCOs') + 
assert contains('Missing BUSCOs') + assert contains('Dependencies and versions') + } + + with(path(process.out.short_summaries_json[0][1]).text) { + assert contains('one_line_summary') + assert contains('mode') + assert contains('dataset') + } + + assert snapshot( + process.out.batch_summary[0][1], + process.out.full_table[0][1], + process.out.missing_busco_list[0][1], + process.out.translated_dir[0][1], + process.out.single_copy_proteins[0][1], + process.out.versions[0] + ).match() + + with(file(process.out.seq_dir[0][1]).listFiles().collect { it.name }) { + assert contains('single_copy_busco_sequences.tar.gz') + assert contains('multi_copy_busco_sequences.tar.gz') + assert contains('fragmented_busco_sequences.tar.gz') + } + + with(path("${process.out.busco_dir[0][1]}/logs/busco.log").text) { + assert contains('DEBUG:busco.run_BUSCO') + assert contains('Results from dataset') + assert contains('how to cite BUSCO') + } + } + + } + + test("minimal-stub") { + + options '-stub' + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz', checkIfExists: true) + ] + input[1] = 'genome' + input[2] = 'bacteria_odb12' + input[3] = [] + input[4] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.batch_summary, + process.out.versions + ).match() } + ) + } + } + +} \ No newline at end of file diff --git a/modules/nf-core/busco/busco/tests/main.nf.test.snap b/modules/nf-core/busco/busco/tests/main.nf.test.snap new file mode 100644 index 00000000..0f44373f --- /dev/null +++ b/modules/nf-core/busco/busco/tests/main.nf.test.snap @@ -0,0 +1,149 @@ +{ + "minimal-stub": { + "content": [ + [ + [ + { + "id": "test" + }, + "test-bacteria_odb12-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + "versions.yml:md5,c6e638f981761c13cd9ff7663cf707e6" + ] + ], + "meta": { + "nf-test": 
"0.9.2", + "nextflow": "24.10.2" + }, + "timestamp": "2024-12-13T15:30:45.505241761" + }, + "test_busco_eukaryote_augustus": { + "content": [ + "test-eukaryota_odb10-busco.batch_summary.txt:md5,3ea3bdc423a461dae514d816bdc61c89", + "versions.yml:md5,c6e638f981761c13cd9ff7663cf707e6" + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.1" + }, + "timestamp": "2024-12-11T13:07:45.550722277" + }, + "test_busco_genome_single_fasta": { + "content": [ + "test-bacteria_odb12-busco.batch_summary.txt:md5,e3e503e1540b633d95c273c465945740", + "full_table.tsv:md5,e2a08fdd9b2596322e70c5549d1affc7", + "missing_busco_list.tsv:md5,25417462f2c484f1942c86b21bcf77d0", + "versions.yml:md5,c6e638f981761c13cd9ff7663cf707e6" + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.1" + }, + "timestamp": "2024-12-11T12:43:40.359736221" + }, + "test_busco_genome_multi_fasta": { + "content": [ + [ + "full_table.tsv:md5,5e7df014f2804789f0d98ae2e09734ad", + "full_table.tsv:md5,e2a08fdd9b2596322e70c5549d1affc7" + ], + [ + "missing_busco_list.tsv:md5,d902f10173b463f81e4892ef64f63c50", + "missing_busco_list.tsv:md5,25417462f2c484f1942c86b21bcf77d0" + ], + "versions.yml:md5,c6e638f981761c13cd9ff7663cf707e6" + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.1" + }, + "timestamp": "2024-12-11T17:17:42.175675858" + }, + "test_busco_eukaryote_metaeuk": { + "content": [ + "test-eukaryota_odb10-busco.batch_summary.txt:md5,ff6d8277e452a83ce9456bbee666feb6", + "full_table.tsv:md5,9bfa9ef7d54ca6ad8bcf8e87729720b1", + "missing_busco_list.tsv:md5,325b529e5a8af2a392d747b4eddc150c", + "versions.yml:md5,c6e638f981761c13cd9ff7663cf707e6" + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.1" + }, + "timestamp": "2024-12-11T13:07:17.335085326" + }, + "test_busco_transcriptome": { + "content": [ + "test-bacteria_odb12-busco.batch_summary.txt:md5,6cd69d8a66b5f8b7fd4a9de758e7a739", + "full_table.tsv:md5,73a3a90c2fa8fef41cafed5a607fab66", + 
"missing_busco_list.tsv:md5,4778855c345f4e409750c9bbd38c5a0c", + [ + "9767721at2.faa:md5,1731738ca153959391f8302fd5a3679f", + "9778364at2.faa:md5,7a19a6b6696ae53efce30457b4dd1ab2", + "9782003at2.faa:md5,65d2a613c903852681981f8e8427dc70", + "9790352at2.faa:md5,5e18cfb68122dff7a61c5517246223fc", + "9791908at2.faa:md5,707ef4501f93a6e0dc217e037f26da54", + "9793681at2.faa:md5,e361d654145e70f06c386e75ad90f943", + "9800696at2.faa:md5,9e2f431e4aada7bdc2c317747105b874", + "9801107at2.faa:md5,83933b1426fc9abfe8891c49838cd02f", + "9801213at2.faa:md5,ec340354a86728189c3d1a294c0ccbad", + "9801753at2.faa:md5,39c09bd8a831c90aab44ded14c56d0e6", + "9802065at2.faa:md5,8361fa013dc1cd29af938c9d5ffebfe4", + "9802219at2.faa:md5,9e23aed07790f460da634f7f6132e73d", + "9802304at2.faa:md5,86b259197441716075f3d3d18f8743ba", + "9802309at2.faa:md5,b4b4613e9b69baa9274140c1b26cc27b", + "9802672at2.faa:md5,6c6d592c2fbb0d7a4e5e1f47a15644f0", + "9803420at2.faa:md5,eec6f7189ce9a596ed6ead06f2229c8a", + "9803541at2.faa:md5,132954cc7bfcb1c1fe9da105867c4b78", + "9803667at2.faa:md5,ec31d499f6b523cb081af6a3284a5a5c", + "9803773at2.faa:md5,efbe4c35075dd8c871827d4e5ac72922", + "9804006at2.faa:md5,fca5b560714ba37be0be3e2597f74c5a", + "9804243at2.faa:md5,3280570e4357fb4daedaea8a066dbf0b", + "9804478at2.faa:md5,98c2cfd8f089812a41a1e66fea630b2d", + "9804933at2.faa:md5,de648025c49061c614c77e7c9ce7ab62", + "9805026at2.faa:md5,eea9da88f3cd718514493d6890bf7660", + "9806637at2.faa:md5,c8a9e0c37a8aeb1fd44db64fd93aa3e1", + "9806651at2.faa:md5,f5abacf8930d78c81fdeb0c91c8681a7", + "9807064at2.faa:md5,1167d5c4c044b4eb82fac5d1955e7130", + "9807233at2.faa:md5,7c8adb6556a7f9a0244e7c7e5f75f20d", + "9807240at2.faa:md5,2eff2de1ab83b22f3234a529a44e22bb", + "9807458at2.faa:md5,bee695d260b2b7f8980a636fed6aa0c0", + "9808036at2.faa:md5,797ca476d2c7820151fec98d2815d6cb", + "9808348at2.faa:md5,4e8573a5d287e01aa4f5de8b48feaa42", + "9808936at2.faa:md5,30333f3f62f8e3d0ea6f6544d49572c6", + 
"9809052at2.faa:md5,0590efbf94fce0ad212513dcb2e8176f", + "9809084at2.faa:md5,37e6214b4204dc31858e2ef2bad5db4a", + "9809356at2.faa:md5,e18c1d5a4931a25baf7dbd1a40c417dc", + "9809796at2.faa:md5,857aac8a22c00472bfc9add7fde94c5c", + "9810191at2.faa:md5,72b63933bb045b680e0635eb03915cc0", + "9811804at2.faa:md5,da341c24e763a949d16432bb052af321", + "9812272at2.faa:md5,7a54f872dd8243c6814852d40cf1bfc0", + "9812943at2.faa:md5,149da17f067cdce328a73f6364a95b26", + "9813375at2.faa:md5,49835b9f3188434c771a840b628b07f6", + "9814755at2.faa:md5,9b4c4648d250c2e6d04acb78f9cf6df0" + ], + "single_copy_proteins.faa:md5,14124def13668c6d9b0d589207754b31", + "versions.yml:md5,c6e638f981761c13cd9ff7663cf707e6" + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.1" + }, + "timestamp": "2024-12-11T12:45:43.336777509" + }, + "test_busco_protein": { + "content": [ + "test-bacteria_odb12-busco.batch_summary.txt:md5,44d4cdebd61a3c8e8981ddf1829f83b3", + "full_table.tsv:md5,696bae3f377fd5dbaf19f1c522088d93", + "missing_busco_list.tsv:md5,d902f10173b463f81e4892ef64f63c50", + "versions.yml:md5,c6e638f981761c13cd9ff7663cf707e6" + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.1" + }, + "timestamp": "2024-12-11T12:45:16.960592213" + } +} \ No newline at end of file diff --git a/modules/nf-core/busco/busco/tests/nextflow.augustus.config b/modules/nf-core/busco/busco/tests/nextflow.augustus.config new file mode 100644 index 00000000..84daa69d --- /dev/null +++ b/modules/nf-core/busco/busco/tests/nextflow.augustus.config @@ -0,0 +1,5 @@ +process { + withName: 'BUSCO_BUSCO' { + ext.args = '--tar --augustus' + } +} diff --git a/modules/nf-core/busco/busco/tests/nextflow.config b/modules/nf-core/busco/busco/tests/nextflow.config new file mode 100644 index 00000000..1ec3fec0 --- /dev/null +++ b/modules/nf-core/busco/busco/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: 'BUSCO_BUSCO' { + ext.args = '--tar' + } +} diff --git 
a/modules/nf-core/busco/busco/tests/nextflow.metaeuk.config b/modules/nf-core/busco/busco/tests/nextflow.metaeuk.config new file mode 100644 index 00000000..c1418445 --- /dev/null +++ b/modules/nf-core/busco/busco/tests/nextflow.metaeuk.config @@ -0,0 +1,5 @@ +process { + withName: 'BUSCO_BUSCO' { + ext.args = '--tar --metaeuk' + } +} diff --git a/modules/nf-core/busco/busco/tests/old_test.yml b/modules/nf-core/busco/busco/tests/old_test.yml new file mode 100644 index 00000000..75177f5d --- /dev/null +++ b/modules/nf-core/busco/busco/tests/old_test.yml @@ -0,0 +1,624 @@ +- name: busco test_busco_genome_single_fasta + command: nextflow run ./tests/modules/nf-core/busco -entry test_busco_genome_single_fasta -c ./tests/config/nextflow.config + tags: + - busco + files: + - path: output/busco/short_summary.specific.bacteria_odb10.genome.fna.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: output/busco/short_summary.specific.bacteria_odb10.genome.fna.txt + contains: + - "BUSCO version" + - "The lineage dataset is" + - "BUSCO was run in mode" + - "Complete BUSCOs" + - "Missing BUSCOs" + - "Dependencies and versions" + - path: output/busco/test-bacteria_odb10-busco.batch_summary.txt + md5sum: bc2440f8a68d7fbf931ff911c1c3fdfa + - path: output/busco/test-bacteria_odb10-busco/genome.fna/logs/bbtools_err.log + - path: output/busco/test-bacteria_odb10-busco/genome.fna/logs/bbtools_out.log + md5sum: 9caf1a1434414c78562eb0bbb9c0e53f + - path: output/busco/test-bacteria_odb10-busco/genome.fna/logs/hmmsearch_err.log + - path: output/busco/test-bacteria_odb10-busco/genome.fna/logs/hmmsearch_out.log + contains: + - "# hmmsearch :: search profile(s) against a sequence database" + - "# target sequence database:" + - "Internal pipeline statistics summary:" + - "[ok]" + - path: output/busco/test-bacteria_odb10-busco/genome.fna/logs/prodigal_err.log + md5sum: 538510cfc7483498210f01e53fe035ad + - path: 
output/busco/test-bacteria_odb10-busco/genome.fna/logs/prodigal_out.log + md5sum: 61050b0706addc9498b2088a2d6efa9a + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/.checkpoint + contains: + - "Tool: prodigal" + - "Completed" + - "jobs" + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/predicted_genes/predicted.faa + md5sum: 836e9a80d33d8b89168f07ddc13ee991 + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/predicted_genes/predicted.fna + md5sum: 20eeb75f86842e6e136f02bca8b73a9f + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11.faa + md5sum: 836e9a80d33d8b89168f07ddc13ee991 + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11.fna + md5sum: 20eeb75f86842e6e136f02bca8b73a9f + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11_err.log + md5sum: 538510cfc7483498210f01e53fe035ad + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11_out.log + md5sum: 61050b0706addc9498b2088a2d6efa9a + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/.bbtools_output/.checkpoint + contains: + - "Tool: bbtools" + - "Completed" + - "jobs" + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/busco_sequences/fragmented_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/busco_sequences/multi_copy_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/busco_sequences/single_copy_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/full_table.tsv + md5sum: c56edab1dc1522e993c25ae2b730799f + - path: 
output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/hmmer_output.tar.gz + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/missing_busco_list.tsv + md5sum: b533ef30270f27160acce85a22d01bf5 + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/short_summary.json + contains: + - "one_line_summary" + - "mode" + - "lineage_dataset" + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/short_summary.txt + contains: + - "# BUSCO version is:" + - "Results:" + - "busco:" + - path: output/busco/test-bacteria_odb10-busco/logs/busco.log + contains: + - "DEBUG:busco.run_BUSCO" + - "Results from dataset" + - "how to cite BUSCO" + - path: output/busco/versions.yml + +- name: busco test_busco_genome_multi_fasta + command: nextflow run ./tests/modules/nf-core/busco -entry test_busco_genome_multi_fasta -c ./tests/config/nextflow.config + tags: + - busco + files: + - path: output/busco/short_summary.specific.bacteria_odb10.genome.fasta.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: output/busco/short_summary.specific.bacteria_odb10.genome.fasta.txt + contains: + - "BUSCO version" + - "The lineage dataset is" + - "BUSCO was run in mode" + - "Complete BUSCOs" + - "Missing BUSCOs" + - "Dependencies and versions" + - path: output/busco/short_summary.specific.bacteria_odb10.genome.fna.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: output/busco/short_summary.specific.bacteria_odb10.genome.fna.txt + contains: + - "BUSCO version" + - "The lineage dataset is" + - "BUSCO was run in mode" + - "Complete BUSCOs" + - "Missing BUSCOs" + - "Dependencies and versions" + - path: output/busco/test-bacteria_odb10-busco.batch_summary.txt + md5sum: 8c64c1a28b086ef2ee444f99cbed5f7d + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/logs/bbtools_err.log + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/logs/bbtools_out.log + md5sum: 
8f047bdb33264d22a83920bc2c63f29a + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/logs/hmmsearch_err.log + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/logs/hmmsearch_out.log + contains: + - "# hmmsearch :: search profile(s) against a sequence database" + - "# target sequence database:" + - "Internal pipeline statistics summary:" + - "[ok]" + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/logs/prodigal_err.log + md5sum: c1fdc6977332f53dfe7f632733bb4585 + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/logs/prodigal_out.log + md5sum: 50752acb1c5a20be886bfdfc06635bcb + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/prodigal_output/.checkpoint + contains: + - "Tool: prodigal" + - "Completed" + - "jobs" + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/prodigal_output/predicted_genes/predicted.faa + md5sum: 8166471fc5f08c82fd5643ab42327f9d + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/prodigal_output/predicted_genes/predicted.fna + md5sum: ddc508a18f60e7f3314534df50cdf8ca + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11.faa + md5sum: 8166471fc5f08c82fd5643ab42327f9d + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11.fna + md5sum: ddc508a18f60e7f3314534df50cdf8ca + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11_err.log + md5sum: c1fdc6977332f53dfe7f632733bb4585 + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11_out.log + md5sum: 50752acb1c5a20be886bfdfc06635bcb + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_4.faa + md5sum: e56fd59c38248dc21ac94355dca98121 + - path: 
output/busco/test-bacteria_odb10-busco/genome.fasta/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_4.fna + md5sum: b365f84bf99c68357952e0b98ed7ce42 + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_4_err.log + md5sum: e5f14d7925ba14a0f9850542f3739894 + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_4_out.log + md5sum: d41971bfc1b621d4ffd2633bc47017ea + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/run_bacteria_odb10/.bbtools_output/.checkpoint + contains: + - "Tool: bbtools" + - "Completed" + - "jobs" + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/run_bacteria_odb10/busco_sequences/fragmented_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/run_bacteria_odb10/busco_sequences/multi_copy_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/run_bacteria_odb10/busco_sequences/single_copy_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/run_bacteria_odb10/full_table.tsv + md5sum: c9651b88b10871abc260ee655898e828 + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/run_bacteria_odb10/hmmer_output.tar.gz + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/run_bacteria_odb10/missing_busco_list.tsv + md5sum: 9939309df2da5419de88c32d1435c779 + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/run_bacteria_odb10/short_summary.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/run_bacteria_odb10/short_summary.txt + contains: + - "# BUSCO version is:" + - "Results:" + - "busco:" + - path: output/busco/test-bacteria_odb10-busco/genome.fna/logs/bbtools_err.log + - path: output/busco/test-bacteria_odb10-busco/genome.fna/logs/bbtools_out.log + md5sum: 9caf1a1434414c78562eb0bbb9c0e53f + - 
path: output/busco/test-bacteria_odb10-busco/genome.fna/logs/hmmsearch_err.log + - path: output/busco/test-bacteria_odb10-busco/genome.fna/logs/hmmsearch_out.log + contains: + - "# hmmsearch :: search profile(s) against a sequence database" + - "# target sequence database:" + - "Internal pipeline statistics summary:" + - "[ok]" + - path: output/busco/test-bacteria_odb10-busco/genome.fna/logs/prodigal_err.log + md5sum: 538510cfc7483498210f01e53fe035ad + - path: output/busco/test-bacteria_odb10-busco/genome.fna/logs/prodigal_out.log + md5sum: 61050b0706addc9498b2088a2d6efa9a + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/.checkpoint + contains: + - "Tool: prodigal" + - "Completed" + - "jobs" + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/predicted_genes/predicted.faa + md5sum: 836e9a80d33d8b89168f07ddc13ee991 + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/predicted_genes/predicted.fna + md5sum: 20eeb75f86842e6e136f02bca8b73a9f + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11.faa + md5sum: 836e9a80d33d8b89168f07ddc13ee991 + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11.fna + md5sum: 20eeb75f86842e6e136f02bca8b73a9f + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11_err.log + md5sum: 538510cfc7483498210f01e53fe035ad + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11_out.log + md5sum: 61050b0706addc9498b2088a2d6efa9a + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/.bbtools_output/.checkpoint + contains: + - "Tool: bbtools" + - "Completed" + - "jobs" + - path: 
output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/busco_sequences/fragmented_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/busco_sequences/multi_copy_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/busco_sequences/single_copy_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/full_table.tsv + md5sum: c56edab1dc1522e993c25ae2b730799f + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/hmmer_output.tar.gz + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/missing_busco_list.tsv + md5sum: b533ef30270f27160acce85a22d01bf5 + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/short_summary.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/short_summary.txt + contains: + - "# BUSCO version is:" + - "Results:" + - "busco:" + - path: output/busco/test-bacteria_odb10-busco/logs/busco.log + contains: + - "DEBUG:busco.run_BUSCO" + - "Results from dataset" + - "how to cite BUSCO" + - path: output/busco/versions.yml + +- name: busco test_busco_eukaryote_metaeuk + command: nextflow run ./tests/modules/nf-core/busco -entry test_busco_eukaryote_metaeuk -c ./tests/config/nextflow.config + tags: + - busco + files: + - path: output/busco/short_summary.specific.eukaryota_odb10.genome.fasta.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: output/busco/short_summary.specific.eukaryota_odb10.genome.fasta.txt + contains: + - "BUSCO version" + - "The lineage dataset is" + - "BUSCO was run in mode" + - "Complete BUSCOs" + - "Missing BUSCOs" + - "Dependencies and versions" + - path: output/busco/test-eukaryota_odb10-busco.batch_summary.txt + md5sum: ff6d8277e452a83ce9456bbee666feb6 + - path: 
output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/bbtools_err.log + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/bbtools_out.log + md5sum: e63debaa653f18f7405d936050abc093 + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/hmmsearch_err.log + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/hmmsearch_out.log + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/metaeuk_run1_err.log + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/metaeuk_run1_out.log + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/metaeuk_run2_err.log + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/metaeuk_run2_out.log + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/.bbtools_output/.checkpoint + contains: + - "Tool: bbtools" + - "Completed" + - "jobs" + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/busco_sequences/fragmented_busco_sequences.tar.gz + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/busco_sequences/multi_copy_busco_sequences.tar.gz + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/busco_sequences/single_copy_busco_sequences.tar.gz + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/full_table.tsv + md5sum: bd880e90b9e5620a58943a3e0f9ff16b + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/hmmer_output.tar.gz + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/.checkpoint + contains: + - "Tool: metaeuk" + - "Completed" + - "jobs" + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/combined_pred_proteins.fas + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/initial_results/genome.fasta.codon.fas + - path: 
output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/initial_results/genome.fasta.fas + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/initial_results/genome.fasta.gff + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/initial_results/genome.fasta.headersMap.tsv + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/refseq_db_rerun.faa + md5sum: d80b8fa4cb5ed0d47d63d6aa93635bc2 + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/rerun_results/genome.fasta.codon.fas + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/rerun_results/genome.fasta.fas + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/rerun_results/genome.fasta.gff + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/rerun_results/genome.fasta.headersMap.tsv + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/missing_busco_list.tsv + md5sum: 1e8e79c540fd2e69ba0d2659d9eb2988 + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/short_summary.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/short_summary.txt + contains: + - "# BUSCO version is:" + - "Results:" + - "busco:" + - path: output/busco/test-eukaryota_odb10-busco/logs/busco.log + contains: + - "DEBUG:busco.run_BUSCO" + - "Results from dataset" + - "how to cite BUSCO" + - path: output/busco/versions.yml + +- name: busco test_busco_eukaryote_augustus + command: nextflow run ./tests/modules/nf-core/busco -entry test_busco_eukaryote_augustus -c ./tests/config/nextflow.config + tags: + - busco + files: + - path: 
output/busco/short_summary.specific.eukaryota_odb10.genome.fasta.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: output/busco/short_summary.specific.eukaryota_odb10.genome.fasta.txt + contains: + - "BUSCO version" + - "The lineage dataset is" + - "BUSCO was run in mode" + - "Complete BUSCOs" + - "Missing BUSCOs" + - "Dependencies and versions" + - path: output/busco/test-eukaryota_odb10-busco.batch_summary.txt + md5sum: ff6d8277e452a83ce9456bbee666feb6 + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/bbtools_err.log + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/bbtools_out.log + md5sum: e63debaa653f18f7405d936050abc093 + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/hmmsearch_err.log + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/hmmsearch_out.log + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/metaeuk_run1_err.log + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/metaeuk_run1_out.log + contains: + - "metaeuk" + - "easy-predict" + - "Compute score and coverage" + - "Time for processing:" + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/metaeuk_run2_err.log + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/metaeuk_run2_out.log + contains: + - "metaeuk" + - "easy-predict" + - "Compute score and coverage" + - "Time for processing:" + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/.bbtools_output/.checkpoint + contains: + - "Tool: bbtools" + - "Completed" + - "jobs" + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/busco_sequences/fragmented_busco_sequences.tar.gz + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/busco_sequences/multi_copy_busco_sequences.tar.gz + - path: 
output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/busco_sequences/single_copy_busco_sequences.tar.gz + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/full_table.tsv + md5sum: bd880e90b9e5620a58943a3e0f9ff16b + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/hmmer_output.tar.gz + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/.checkpoint + contains: + - "Tool: metaeuk" + - "Completed" + - "jobs" + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/combined_pred_proteins.fas + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/initial_results/genome.fasta.codon.fas + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/initial_results/genome.fasta.fas + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/initial_results/genome.fasta.gff + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/initial_results/genome.fasta.headersMap.tsv + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/refseq_db_rerun.faa + md5sum: d80b8fa4cb5ed0d47d63d6aa93635bc2 + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/rerun_results/genome.fasta.codon.fas + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/rerun_results/genome.fasta.fas + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/rerun_results/genome.fasta.gff + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/rerun_results/genome.fasta.headersMap.tsv + - path: 
output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/missing_busco_list.tsv + md5sum: 1e8e79c540fd2e69ba0d2659d9eb2988 + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/short_summary.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/short_summary.txt + contains: + - "# BUSCO version is:" + - "Results:" + - "busco:" + - path: output/busco/test-eukaryota_odb10-busco/logs/busco.log + contains: + - "DEBUG:busco.run_BUSCO" + - "Results from dataset" + - "how to cite BUSCO" + - path: output/busco/versions.yml + +- name: busco test_busco_protein + command: nextflow run ./tests/modules/nf-core/busco -entry test_busco_protein -c ./tests/config/nextflow.config + tags: + - busco + files: + - path: output/busco/short_summary.specific.bacteria_odb10.proteome.fasta.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: output/busco/short_summary.specific.bacteria_odb10.proteome.fasta.txt + contains: + - "BUSCO version" + - "The lineage dataset is" + - "BUSCO was run in mode" + - "Complete BUSCOs" + - "Missing BUSCOs" + - "Dependencies and versions" + - path: output/busco/test-bacteria_odb10-busco.batch_summary.txt + md5sum: 7a65e6cbb6c56a2ea4e739ae0aa3297d + - path: output/busco/test-bacteria_odb10-busco/logs/busco.log + contains: + - "DEBUG:busco.run_BUSCO" + - "Results from dataset" + - "how to cite BUSCO" + - path: output/busco/test-bacteria_odb10-busco/proteome.fasta/logs/hmmsearch_err.log + - path: output/busco/test-bacteria_odb10-busco/proteome.fasta/logs/hmmsearch_out.log + contains: + - "# hmmsearch :: search profile(s) against a sequence database" + - "# target sequence database:" + - "Internal pipeline statistics summary:" + - "[ok]" + - path: output/busco/test-bacteria_odb10-busco/proteome.fasta/run_bacteria_odb10/busco_sequences/fragmented_busco_sequences.tar.gz + - path: 
output/busco/test-bacteria_odb10-busco/proteome.fasta/run_bacteria_odb10/busco_sequences/multi_copy_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/proteome.fasta/run_bacteria_odb10/busco_sequences/single_copy_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/proteome.fasta/run_bacteria_odb10/full_table.tsv + md5sum: 0e34f1011cd83ea1d5d5103ec62b8922 + - path: output/busco/test-bacteria_odb10-busco/proteome.fasta/run_bacteria_odb10/hmmer_output.tar.gz + - path: output/busco/test-bacteria_odb10-busco/proteome.fasta/run_bacteria_odb10/missing_busco_list.tsv + md5sum: 9939309df2da5419de88c32d1435c779 + - path: output/busco/test-bacteria_odb10-busco/proteome.fasta/run_bacteria_odb10/short_summary.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: output/busco/test-bacteria_odb10-busco/proteome.fasta/run_bacteria_odb10/short_summary.txt + contains: + - "# BUSCO version is:" + - "Results:" + - "busco:" + - path: output/busco/versions.yml + +- name: busco test_busco_transcriptome + command: nextflow run ./tests/modules/nf-core/busco -entry test_busco_transcriptome -c ./tests/config/nextflow.config + tags: + - busco + files: + - path: output/busco/short_summary.specific.bacteria_odb10.test1.contigs.fa.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: output/busco/short_summary.specific.bacteria_odb10.test1.contigs.fa.txt + contains: + - "BUSCO version" + - "The lineage dataset is" + - "BUSCO was run in mode" + - "Complete BUSCOs" + - "Missing BUSCOs" + - "Dependencies and versions" + - path: output/busco/test-bacteria_odb10-busco.batch_summary.txt + md5sum: 46118ecf60d1b87d22b96d80f4f03632 + - path: output/busco/test-bacteria_odb10-busco/logs/busco.log + contains: + - "DEBUG:busco.run_BUSCO" + - "Results from dataset" + - "how to cite BUSCO" + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/blast_db/.checkpoint + contains: + - "Tool: makeblastdb" + - "Completed" + 
- "jobs" + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/blast_db/test1.contigs.fa.ndb + md5sum: 3788c017fe5e6f0f58224e9cdd21822b + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/blast_db/test1.contigs.fa.nhr + md5sum: 8ecd2ce392bb5e25ddbe1d85f879582e + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/blast_db/test1.contigs.fa.nin + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/blast_db/test1.contigs.fa.njs + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/blast_db/test1.contigs.fa.not + md5sum: 0c340e376c7e85d19f82ec1a833e6a6e + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/blast_db/test1.contigs.fa.nsq + md5sum: 532d5c0a7ea00fe95ca3c97cb3be6198 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/blast_db/test1.contigs.fa.ntf + md5sum: de1250813f0c7affc6d12dac9d0fb6bb + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/blast_db/test1.contigs.fa.nto + md5sum: ff74bd41f9cc9b011c63a32c4f7693bf + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/logs/hmmsearch_err.log + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/logs/hmmsearch_out.log + contains: + - "# hmmsearch :: search profile(s) against a sequence database" + - "# target sequence database:" + - "Internal pipeline statistics summary:" + - "[ok]" + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/logs/makeblastdb_err.log + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/logs/makeblastdb_out.log + contains: + - "Building a new DB" + - "Adding sequences from FASTA" + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/logs/tblastn_err.log + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/logs/tblastn_out.log + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/.checkpoint + contains: + - "Tool: tblastn" + - "Completed" + - "jobs" + - path: 
output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/coordinates.tsv + md5sum: cc30eed321944af293452bdbcfc24292 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_101.temp + md5sum: 73e9c65fc83fedc58f57f09b08f08238 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_119.temp + md5sum: 7fa4cc7955ec0cc36330a221c579b975 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_129.temp + md5sum: 6f1601c875d019e3f6f1f98ed8e988d4 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_138.temp + md5sum: 3f8e034686cd240c2330650d791bcae2 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_143.temp + md5sum: df3dfa8e9ba30ed70cf75b5e7abf2179 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_172.temp + md5sum: 7d463e0e6cf7169bc9077d8dc776dda1 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_178.temp + md5sum: 2288edf7fa4f88f51b4cf4d94086f77e + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_188.temp + md5sum: 029906abbad6d87fc57830dd548cac24 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_195.temp + md5sum: 4937f3b348774a31b1160a00297c29cc + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_210.temp + md5sum: afcb20ba4c466479d6b91c8c62251e1f + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_232.temp + md5sum: 2e1e823ce017345bd998191a39fa9924 + - path: 
output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_268.temp + md5sum: 08c2d82c34ecffbe1c638b410349412e + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_29.temp + md5sum: cd9b63cf93524284781535c888313764 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_44.temp + md5sum: d1929b742b24ebe379bf4801ca882dca + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_58.temp + md5sum: 69215765b010c05336538cb322c900b3 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_72.temp + md5sum: 6feaa1cc3b0899a147ea9d466878f3e3 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_80.temp + md5sum: 13625eae14e860a96ce17cd4e37e9d01 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_81.temp + md5sum: e14b2484649b0dbc8926815c207b806d + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_93.temp + md5sum: 6902c93691df00e690faea914c71839e + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_97.temp + md5sum: 0a0d9d38a83acbd5ad43c29cdf429988 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/tblastn.tsv + contains: + - "TBLASTN" + - "BLAST processed" + - "queries" + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/busco_sequences/fragmented_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/busco_sequences/multi_copy_busco_sequences.tar.gz + - path: 
output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/busco_sequences/single_copy_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/full_table.tsv + md5sum: 24df25199e13c88bd892fc3e7b541ca0 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/hmmer_output.tar.gz + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/missing_busco_list.tsv + md5sum: e7232e2b8cca4fdfdd9e363b39ebbc81 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/short_summary.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/short_summary.txt + contains: + - "# BUSCO version is:" + - "Results:" + - "busco:" + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/single_copy_proteins.faa + md5sum: e04b9465733577ae6e4bccb7aa01e720 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1024388at2.faa + md5sum: 7333c39a20258f20c7019ea0cd83157c + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1054741at2.faa + md5sum: ebb481e77a824685fbe04d8a2f3a0d7d + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1093223at2.faa + md5sum: 34621c7d499034e8f8e6b92fd4020a93 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1151822at2.faa + md5sum: aa89ca381c1c70c9c4e1380351ca7c2a + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/143460at2.faa + md5sum: f2e91d78b8dd3722840378789f29e8c8 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1491686at2.faa + md5sum: 73c25aef5c9cba7f4151804941b146ea + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1504821at2.faa + md5sum: 
cda556018d1f84ebe517e89f6fc107d0 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1574817at2.faa + md5sum: a9096c9fb8b25c78a72871ab0463acdc + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1592033at2.faa + md5sum: e463d25ce186c0cebfd749474f3a4c64 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1623045at2.faa + md5sum: f2cfd241590c6d8377286d6135480937 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1661836at2.faa + md5sum: 586569546fb9861502468e3d9ba2775c + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1674344at2.faa + md5sum: 24c658bee14ad84b062d81ad96642eb8 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1698718at2.faa + md5sum: 0b8e26ddf5149bbd8805be7af125208d + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1990650at2.faa + md5sum: 159320712ee01fb2ccb31a25df44eead + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/223233at2.faa + md5sum: 812629c0b06ac3d18661c2ca78de0c08 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/402899at2.faa + md5sum: f7ff4e1591342d30b77392a2e84b57d9 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/505485at2.faa + md5sum: 7b34a24fc49c540d46fcf96ff5129564 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/665824at2.faa + md5sum: 4cff2df64f6bcaff8bc19c234c8bcccd + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/776861at2.faa + md5sum: 613af7a3fea30ea2bece66f603b9284a + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/874197at2.faa + md5sum: a7cd1b13c9ef91c7ef4e31614166f197 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/932854at2.faa 
+ md5sum: fe313ffd5efdb0fed887a04fba352552 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/95696at2.faa + md5sum: 4e1f30a2fea4dfbf9bb7fae2700622a0 + - path: output/busco/versions.yml diff --git a/modules/nf-core/busco/busco/tests/tags.yml b/modules/nf-core/busco/busco/tests/tags.yml new file mode 100644 index 00000000..7c4d2835 --- /dev/null +++ b/modules/nf-core/busco/busco/tests/tags.yml @@ -0,0 +1,2 @@ +busco/busco: + - "modules/nf-core/busco/busco/**" diff --git a/modules/nf-core/checkm2/databasedownload/main.nf b/modules/nf-core/checkm2/databasedownload/main.nf index 6144067b..aaedce2a 100644 --- a/modules/nf-core/checkm2/databasedownload/main.nf +++ b/modules/nf-core/checkm2/databasedownload/main.nf @@ -1,5 +1,3 @@ -import groovy.json.JsonSlurper - process CHECKM2_DATABASEDOWNLOAD { label 'process_single' @@ -21,7 +19,7 @@ process CHECKM2_DATABASEDOWNLOAD { script: def args = task.ext.args ?: '' zenodo_id = db_zenodo_id ?: 5571251 // Default to latest version if no ID provided - api_data = (new JsonSlurper()).parseText(file("https://zenodo.org/api/records/${zenodo_id}").text) + api_data = (new groovy.json.JsonSlurper()).parseText(file("https://zenodo.org/api/records/${zenodo_id}").text) db_version = api_data.metadata.version checksum = api_data.files[0].checksum.replaceFirst(/^md5:/, "md5=") meta = [id: 'checkm2_db', version: db_version] diff --git a/modules/nf-core/checkm2/predict/main.nf b/modules/nf-core/checkm2/predict/main.nf index 25271ba9..5d105553 100644 --- a/modules/nf-core/checkm2/predict/main.nf +++ b/modules/nf-core/checkm2/predict/main.nf @@ -13,7 +13,7 @@ process CHECKM2_PREDICT { output: tuple val(meta), path("${prefix}") , emit: checkm2_output - tuple val(meta), path("${prefix}/quality_report.tsv"), emit: checkm2_tsv + tuple val(meta), path("${prefix}_checkm2_report.tsv"), emit: checkm2_tsv path("versions.yml") , emit: versions when: @@ -31,6 +31,8 @@ process CHECKM2_PREDICT { --database_path ${db} \\ 
${args} + cp ${prefix}/quality_report.tsv ${prefix}_checkm2_report.tsv + cat <<-END_VERSIONS > versions.yml "${task.process}": checkm2: \$(checkm2 --version) @@ -38,7 +40,6 @@ process CHECKM2_PREDICT { """ stub: - def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" """ mkdir -p ${prefix}/diamond_output ${prefix}/protein_files diff --git a/modules/nf-core/checkm2/predict/meta.yml b/modules/nf-core/checkm2/predict/meta.yml index 48cc9fbc..9b8d38df 100644 --- a/modules/nf-core/checkm2/predict/meta.yml +++ b/modules/nf-core/checkm2/predict/meta.yml @@ -52,7 +52,7 @@ output: description: | Groovy Map containing sample information e.g. `[ id:'test' ]` - - ${prefix}/quality_report.tsv: + - ${prefix}_checkm2_report.tsv: type: file description: CheckM2 summary completeness statistics table pattern: "*.tsv" diff --git a/modules/nf-core/checkm2/predict/tests/main.nf.test b/modules/nf-core/checkm2/predict/tests/main.nf.test index e825f74c..de2a3c85 100644 --- a/modules/nf-core/checkm2/predict/tests/main.nf.test +++ b/modules/nf-core/checkm2/predict/tests/main.nf.test @@ -28,7 +28,7 @@ nextflow_process { } process { """ - input[0] = [ [id: 'test'], [file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true)] ] + input[0] = [ [id: 'test'], [file(params.modules_testdata_base_path + 'genomics/prokaryotes/escherichia_coli/genome/genome.fa', checkIfExists: true)] ] input[1] = CHECKM2_DATABASEDOWNLOAD.out.database """ } @@ -43,4 +43,4 @@ nextflow_process { } -} +} \ No newline at end of file diff --git a/modules/nf-core/checkm2/predict/tests/main.nf.test.snap b/modules/nf-core/checkm2/predict/tests/main.nf.test.snap index 6fd2e918..4364c12f 100644 --- a/modules/nf-core/checkm2/predict/tests/main.nf.test.snap +++ b/modules/nf-core/checkm2/predict/tests/main.nf.test.snap @@ -6,13 +6,17 @@ { "id": "test" }, - "quality_report.tsv:md5,7f05ff49d18697304575d1106a871501" + 
"test_checkm2_report.tsv:md5,77b04107300bcece1e0fb46beb9df970" ] ], [ "versions.yml:md5,088ec2d8a46efd530c11019328064bff" ] ], - "timestamp": "2024-09-16T22:43:50.787486798" + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2024-12-20T21:28:38.287346125" } } \ No newline at end of file diff --git a/modules/nf-core/csvtk/concat/environment.yml b/modules/nf-core/csvtk/concat/environment.yml new file mode 100644 index 00000000..12087beb --- /dev/null +++ b/modules/nf-core/csvtk/concat/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + +dependencies: + - bioconda::csvtk=0.31.0 diff --git a/modules/nf-core/csvtk/concat/main.nf b/modules/nf-core/csvtk/concat/main.nf new file mode 100644 index 00000000..9f17a9b1 --- /dev/null +++ b/modules/nf-core/csvtk/concat/main.nf @@ -0,0 +1,55 @@ +process CSVTK_CONCAT { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/csvtk:0.31.0--h9ee0642_0' : + 'biocontainers/csvtk:0.31.0--h9ee0642_0' }" + + input: + tuple val(meta), path(csv, name: 'inputs/csv*/*') + val in_format + val out_format + + output: + tuple val(meta), path("${prefix}.${out_extension}"), emit: csv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def delimiter = in_format == "tsv" ? "\t" : (in_format == "csv" ? "," : in_format) + def out_delimiter = out_format == "tsv" ? "\t" : (out_format == "csv" ? "," : out_format) + out_extension = out_format == "tsv" ? 
'tsv' : 'csv' + """ + csvtk \\ + concat \\ + $args \\ + --num-cpus $task.cpus \\ + --delimiter "${delimiter}" \\ + --out-delimiter "${out_delimiter}" \\ + --out-file ${prefix}.${out_extension} \\ + $csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + out_extension = out_format == "tsv" ? 'tsv' : 'csv' + """ + touch ${prefix}.${out_extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/csvtk/concat/meta.yml b/modules/nf-core/csvtk/concat/meta.yml new file mode 100644 index 00000000..27ffc1ca --- /dev/null +++ b/modules/nf-core/csvtk/concat/meta.yml @@ -0,0 +1,52 @@ +name: csvtk_concat +description: Concatenate two or more CSV (or TSV) tables into a single table +keywords: + - concatenate + - tsv + - csv +tools: + - csvtk: + description: A cross-platform, efficient, practical CSV/TSV toolkit + homepage: http://bioinf.shenwei.me/csvtk + documentation: http://bioinf.shenwei.me/csvtk + tool_dev_url: https://github.com/shenwei356/csvtk + licence: ["MIT"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - csv: + type: file + description: CSV/TSV formatted files + pattern: "*.{csv,tsv}" + - - in_format: + type: string + description: Input format (csv, tab, or a delimiting character) + pattern: "*" + - - out_format: + type: string + description: Output format (csv, tab, or a delimiting character) + pattern: "*" +output: + - csv: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - ${prefix}.${out_extension}: + type: file + description: Concatenated CSV/TSV file + pattern: "*.{csv,tsv}" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "version.yml" +authors: + - "@rpetit3" +maintainers: + - "@rpetit3" diff --git a/modules/nf-core/csvtk/concat/tests/main.nf.test b/modules/nf-core/csvtk/concat/tests/main.nf.test new file mode 100644 index 00000000..b6c1a581 --- /dev/null +++ b/modules/nf-core/csvtk/concat/tests/main.nf.test @@ -0,0 +1,72 @@ +// nf-core modules test csvtk/concat +nextflow_process { + + name "Test Process CSVTK_CONCAT" + script "../main.nf" + process "CSVTK_CONCAT" + + tag "modules" + tag "modules_nfcore" + tag "csvtk" + tag "csvtk/concat" + + test("tsv - concat - csv") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_hybrid.csv", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_long.csv", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_short.csv", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_short.csv", checkIfExists: true) + ] + ] + input[1] = "tsv" + input[2] = "csv" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("tsv - concat - csv - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_hybrid.csv", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_long.csv", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_short.csv", checkIfExists: true) + ] + ] + input[1] = "tsv" + input[2] = "csv" + """ + } + } + + then { + assertAll( + { assert 
process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/csvtk/concat/tests/main.nf.test.snap b/modules/nf-core/csvtk/concat/tests/main.nf.test.snap new file mode 100644 index 00000000..254d34a1 --- /dev/null +++ b/modules/nf-core/csvtk/concat/tests/main.nf.test.snap @@ -0,0 +1,68 @@ +{ + "tsv - concat - csv - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,c203a84cc5b289951b70302549dcf08d" + ], + "csv": [ + [ + { + "id": "test" + }, + "test.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,c203a84cc5b289951b70302549dcf08d" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-08T04:46:46.133640633" + }, + "tsv - concat - csv": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.csv:md5,bb0ed52999b6b24297bcefb3c29f0a5c" + ] + ], + "1": [ + "versions.yml:md5,c203a84cc5b289951b70302549dcf08d" + ], + "csv": [ + [ + { + "id": "test" + }, + "test.csv:md5,bb0ed52999b6b24297bcefb3c29f0a5c" + ] + ], + "versions": [ + "versions.yml:md5,c203a84cc5b289951b70302549dcf08d" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-08T04:46:31.419386462" + } +} \ No newline at end of file diff --git a/modules/nf-core/csvtk/concat/tests/tags.yml b/modules/nf-core/csvtk/concat/tests/tags.yml new file mode 100644 index 00000000..0d10e7c9 --- /dev/null +++ b/modules/nf-core/csvtk/concat/tests/tags.yml @@ -0,0 +1,2 @@ +csvtk/concat: + - "modules/nf-core/csvtk/concat/**" diff --git a/nextflow.config b/nextflow.config index 387fee34..21d7700a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -133,7 +133,6 @@ params { busco_db = null busco_auto_lineage_prok = false save_busco_db = false - busco_clean = false checkm_download_url = 
"https://zenodo.org/records/7401545/files/checkm_data_2015_01_16.tar.gz" checkm_db = null save_checkm_data = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 18307b05..ee9677d3 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -768,11 +768,6 @@ "description": "Save the used BUSCO lineage datasets provided via `--busco_db`.", "help_text": "Useful to allow reproducibility, as BUSCO datasets are frequently updated and old versions do not always remain accessible." }, - "busco_clean": { - "type": "boolean", - "description": "Enable clean-up of temporary files created during BUSCO runs.", - "help_text": "By default, BUSCO creates a large number of intermediate files every run. This may cause problems on some clusters which have file number limits in plate, particularly with large numbers of bins. Enabling this option cleans these files, reducing the total file count of the work directory." - }, "checkm_download_url": { "type": "string", "default": "https://zenodo.org/records/7401545/files/checkm_data_2015_01_16.tar.gz", diff --git a/subworkflows/local/bin_qc.nf b/subworkflows/local/bin_qc.nf index 5a83d140..dd34969e 100644 --- a/subworkflows/local/bin_qc.nf +++ b/subworkflows/local/bin_qc.nf @@ -3,18 +3,16 @@ */ include { ARIA2 as ARIA2_UNTAR } from '../../modules/nf-core/aria2/main' +include { BUSCO_BUSCO } from '../../modules/nf-core/busco/busco/main' include { CHECKM2_DATABASEDOWNLOAD } from '../../modules/nf-core/checkm2/databasedownload/main' -include { BUSCO_DB_PREPARATION } from '../../modules/local/busco_db_preparation' -include { BUSCO } from '../../modules/local/busco' -include { BUSCO_SAVE_DOWNLOAD } from '../../modules/local/busco_save_download' -include { BUSCO_SUMMARY } from '../../modules/local/busco_summary' include { CHECKM_QA } from '../../modules/nf-core/checkm/qa/main' include { CHECKM_LINEAGEWF } from '../../modules/nf-core/checkm/lineagewf/main' include { CHECKM2_PREDICT } from 
'../../modules/nf-core/checkm2/predict/main' -include { COMBINE_TSV as COMBINE_BINQC_TSV } from '../../modules/local/combine_tsv' +include { CSVTK_CONCAT as CONCAT_BINQC_TSV } from '../../modules/nf-core/csvtk/concat/main' include { GUNC_DOWNLOADDB } from '../../modules/nf-core/gunc/downloaddb/main' include { GUNC_RUN } from '../../modules/nf-core/gunc/run/main' include { GUNC_MERGECHECKM } from '../../modules/nf-core/gunc/mergecheckm/main' +include { UNTAR as BUSCO_UNTAR } from '../../modules/nf-core/untar' workflow BIN_QC { @@ -81,57 +79,33 @@ workflow BIN_QC { /* * BUSCO */ + busco_lineage = params.busco_auto_lineage_prok ? 'auto_prok' : 'auto' + if (!ch_busco_db.isEmpty()) { if (ch_busco_db.extension in ['gz', 'tgz']) { - // Expects to be tar.gz! - BUSCO_DB_PREPARATION(ch_busco_db) - ch_db_for_busco = BUSCO_DB_PREPARATION.out.db.map { meta, db -> - [[id: meta, lineage: 'Y'], db] + BUSCO_UNTAR([[id: 'busco_db'], ch_busco_db]) + + if (ch_busco_db.getSimpleName().contains('odb')) { + busco_lineage = ch_busco_db.getSimpleName() } + ch_busco_db = BUSCO_UNTAR.out.untar.map { it[1] } } else if (ch_busco_db.isDirectory()) { - // Set meta to match expected channel cardinality for BUSCO - ch_db_for_busco = Channel - .of(ch_busco_db) - .collect { db -> - def basename = db.getBaseName() - def lineage = basename.contains('odb10') ? 
'Y' : 'N' - [[id: basename, lineage: lineage], db] - } + if (ch_busco_db.name.matches(/.*odb\d+$/)) { + busco_lineage = ch_busco_db.name + } } } - else { - // Set BUSCO database to empty to allow for --auto-lineage - ch_db_for_busco = Channel - .of([[lineage: ''], []]) - .collect() - } - if (params.save_busco_db) { - // publish files downloaded by Busco - ch_downloads = BUSCO.out.busco_downloads - .groupTuple() - .map { _lin, downloads -> downloads[0] } - .toSortedList() - .flatten() - BUSCO_SAVE_DOWNLOAD(ch_downloads) - - ch_versions = ch_versions.mix(BUSCO_SAVE_DOWNLOAD.out.versions.first()) - } - - BUSCO(ch_input_bins_for_qc, ch_db_for_busco) - - BUSCO_SUMMARY( - BUSCO.out.summary_domain.collect { _meta, summary -> summary }.ifEmpty([]), - BUSCO.out.summary_specific.collect { _meta, summary -> summary }.ifEmpty([]), - BUSCO.out.failed_bin.collect { _meta, summary -> summary }.ifEmpty([]) - ) + BUSCO_BUSCO(ch_bins, 'genome', busco_lineage, ch_busco_db, []) + qc_summaries = BUSCO_BUSCO.out.batch_summary + .map { _meta, summary -> [[id: 'busco'], summary] } + .groupTuple() + ch_versions = ch_versions.mix(BUSCO_BUSCO.out.versions.first()) ch_multiqc_files = ch_multiqc_files.mix( - BUSCO.out.summary_domain.mix(BUSCO.out.summary_specific).map { _meta, summary -> summary } + BUSCO_BUSCO.out.short_summaries_txt.map { it[1] }.flatten() ) - qc_summary = BUSCO_SUMMARY.out.summary - ch_versions = ch_versions.mix(BUSCO.out.versions.first()) } else if (params.binqc_tool == "checkm") { /* @@ -158,13 +132,10 @@ workflow BIN_QC { CHECKM_QA(ch_checkmqa_input, []) - COMBINE_BINQC_TSV(CHECKM_QA.out.output.collect { summary -> summary[1] }) - - qc_summary = COMBINE_BINQC_TSV.out.combined - ch_versions = ch_versions.mix( - CHECKM_QA.out.versions.first(), - COMBINE_BINQC_TSV.out.versions - ) + qc_summaries = CHECKM_QA.out.output + .map { _meta, summary -> [[id: 'checkm'], summary] } + .groupTuple() + ch_versions = ch_versions.mix(CHECKM_QA.out.versions.first()) } else if 
(params.binqc_tool == "checkm2") { /* @@ -172,23 +143,19 @@ workflow BIN_QC { */ CHECKM2_PREDICT(ch_input_bins_for_qc.groupTuple(), ch_checkm2_db) - COMBINE_BINQC_TSV(CHECKM2_PREDICT.out.checkm2_tsv.collect { summary -> summary[1] }) - - qc_summary = COMBINE_BINQC_TSV.out.combined - ch_versions = ch_versions.mix( - CHECKM2_PREDICT.out.versions.first(), - COMBINE_BINQC_TSV.out.versions - ) + qc_summaries = CHECKM2_PREDICT.out.checkm2_tsv + .map { _meta, summary -> [[id: 'checkm2'], summary] } + .groupTuple() + ch_versions = ch_versions.mix(CHECKM2_PREDICT.out.versions.first()) } if (params.run_gunc) { /* * GUNC */ - ch_input_bins_for_gunc = ch_bins - .filter { meta, _bins -> - meta.domain != "eukarya" - } + ch_input_bins_for_gunc = ch_bins.filter { meta, _bins -> + meta.domain != "eukarya" + } if (params.gunc_db) { ch_db_for_gunc = ch_gunc_db @@ -207,7 +174,7 @@ workflow BIN_QC { .collectFile( name: "gunc_summary.tsv", keepHeader: true, - storeDir: "${params.outdir}/GenomeBinning/QC/" + storeDir: "${params.outdir}/GenomeBinning/QC/", ) if (params.binqc_tool == 'checkm') { @@ -222,11 +189,16 @@ workflow BIN_QC { .collectFile( name: "gunc_checkm_summary.tsv", keepHeader: true, - storeDir: "${params.outdir}/GenomeBinning/QC/" + storeDir: "${params.outdir}/GenomeBinning/QC/", ) } } + // Combine QC summaries (same process for all tools) + CONCAT_BINQC_TSV(qc_summaries, 'tsv', 'tsv') + qc_summary = CONCAT_BINQC_TSV.out.csv.map { _meta, summary -> summary } + ch_versions = ch_versions.mix(CONCAT_BINQC_TSV.out.versions) + emit: qc_summary = qc_summary multiqc_files = ch_multiqc_files diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf index d3d66d47..deb22e0d 100644 --- a/subworkflows/local/gtdbtk.nf +++ b/subworkflows/local/gtdbtk.nf @@ -14,40 +14,25 @@ workflow GTDBTK { gtdb_mash // channel: path main: - // Filter bins: classify only medium & high quality MAGs - ch_bin_metrics = Channel.empty() - if ( params.binqc_tool == 'busco' ){ - // Collect completeness and 
contamination metrics from busco summary - ch_bin_metrics = bin_qc_summary - .splitCsv(header: true, sep: '\t') - .map { row -> - def completeness = -1 - def contamination = -1 - def missing, duplicated - if (params.busco_db && file(params.busco_db).getBaseName().contains('odb10')) { - missing = row.'%Missing (specific)' // TODO or just take '%Complete'? - duplicated = row.'%Complete and duplicated (specific)' - } else { - missing = row.'%Missing (domain)' - duplicated = row.'%Complete and duplicated (domain)' - } - if (missing != '') completeness = 100.0 - Double.parseDouble(missing) - if (duplicated != '') contamination = Double.parseDouble(duplicated) - [row.'GenomeBin', completeness, contamination] - } - } else { - // Collect completeness and contamination metrics from CheckM/CheckM2 summary - bin_name = params.binqc_tool == 'checkm' ? 'Bin Id' : 'Name' + // Collect bin quality metrics + qc_columns = [ + busco: ['Input_file', 'Complete', 'Duplicated'], + checkm: ['Bin Id', 'Completeness', 'Contamination'], + checkm2: ['Name', 'Completeness', 'Contamination'] + ] - ch_bin_metrics = bin_qc_summary - .splitCsv(header: true, sep: '\t') - .map { row -> - def completeness = Double.parseDouble(row.'Completeness') - def contamination = Double.parseDouble(row.'Contamination') - [row[bin_name] + ".fa", completeness, contamination] + ch_bin_metrics = bin_qc_summary + .splitCsv(header: true, sep: '\t') + .map { row -> qc_columns[params.binqc_tool].collect { col -> row[col] } } + .filter { row -> row[1] != '' } + .map { row -> + row = [row[0]] + row[1..2].collect { value -> Double.parseDouble(value) } + // CheckM / CheckM2 removes the .fa extension from the bin name + if (params.binqc_tool in ['checkm', 'checkm2']) { + row[0] = row[0] + '.fa' } - } - + row + } // Filter bins based on collected metrics: completeness, contamination ch_filtered_bins = bins diff --git a/subworkflows/local/tiara.nf b/subworkflows/local/tiara.nf index ab274cc8..a5b1aeda 100644 --- 
a/subworkflows/local/tiara.nf +++ b/subworkflows/local/tiara.nf @@ -1,7 +1,7 @@ include { TIARA_TIARA } from '../../modules/nf-core/tiara/tiara/main' include { TIARA_CLASSIFY } from '../../modules/local/tiara_classify' include { DASTOOL_FASTATOCONTIG2BIN as DASTOOL_FASTATOCONTIG2BIN_TIARA } from '../../modules/nf-core/dastool/fastatocontig2bin/main' -include { COMBINE_TSV as TIARA_SUMMARY } from '../../modules/local/combine_tsv' +include { CSVTK_CONCAT as TIARA_SUMMARY } from '../../modules/nf-core/csvtk/concat/main' workflow TIARA { take: @@ -118,8 +118,11 @@ workflow TIARA { [ classification ] } .collect() + .map { classifications -> + [[:], classifications] + } - TIARA_SUMMARY(ch_bin_classifications) + TIARA_SUMMARY(ch_bin_classifications, 'tsv', 'tsv') emit: classified_bins = ch_classified_bins diff --git a/workflows/mag.nf b/workflows/mag.nf index de353a40..308098cc 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -59,7 +59,6 @@ include { CAT_DB_GENERATE } from '../modul include { CAT } from '../modules/local/cat' include { CAT_SUMMARY } from '../modules/local/cat_summary' include { BIN_SUMMARY } from '../modules/local/bin_summary' -include { COMBINE_TSV as COMBINE_SUMMARY_TSV } from '../modules/local/combine_tsv' workflow MAG { take: