diff --git a/README.md b/README.md index 766d5fe..ab66115 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,8 @@ Each row represents an input genome and the fields are: - `fasta:` fasta file for the genome - `is_masked`: yes or no to denote whether the fasta file is already masked or not + + At minimum, a file with proteins as evidence is also required. Now, you can run the pipeline using: ```bash diff --git a/docs/parameters.md b/docs/parameters.md index 1c96a8b..29fc806 100644 --- a/docs/parameters.md +++ b/docs/parameters.md @@ -1,106 +1,130 @@ + + # plant-food-research-open/genepal pipeline parameters A Nextflow pipeline for consensus, phased and pan-genome annotation. ## Input/output options -| Parameter | Description | Type | Default | Required | Hidden | -| ------------------------- | -------------------------------------------------------------------------------------------------------- | --------- | ------- | -------- | ------ | -| `input` | Target assemblies listed in a CSV sheet | `string` | | True | | -| `protein_evidence` | Protein evidence provided as a fasta file or multiple fasta files listed in a plain txt file | `string` | | True | | -| `eggnogmapper_db_dir` | Eggnogmapper database directory | `string` | | | | -| `eggnogmapper_tax_scope` | Eggnogmapper taxonomy scopre. Eukaryota: 2759, Viridiplantae: 33090, Archaea: 2157, Bacteria: 2, root: 1 | `integer` | 1 | | | -| `rna_evidence` | FASTQ/BAM samples listed in a CSV sheet | `string` | | | | -| `liftoff_annotations` | Reference annotations listed in a CSV sheet | `string` | | | | -| `orthofinder_annotations` | Additional annotations for orthology listed in a CSV sheet | `string` | | | | -| `outdir` | The output directory where the results will be saved | `string` | | True | | -| `email` | Email address for completion summary. 
| `string` | | | True | + + +| Parameter | Description | Type | Default | Required | Hidden | +|-----------|-----------|-----------|-----------|-----------|-----------| +| `input` | Target assemblies listed in a CSV sheet | `string` | | True | | +| `protein_evidence` | Protein evidence provided as a fasta file or multiple fasta files listed in a plain txt file | `string` | | True | | +| `eggnogmapper_db_dir` | Eggnogmapper database directory | `string` | | | | +| `eggnogmapper_tax_scope` | Eggnogmapper taxonomy scope. Eukaryota: 2759, Viridiplantae: 33090, Archaea: 2157, Bacteria: 2, root: 1 | `integer` | 1 | | | +| `rna_evidence` | FASTQ/BAM samples listed in a CSV sheet | `string` | | | | +| `liftoff_annotations` | Reference annotations listed in a CSV sheet | `string` | | | | +| `orthofinder_annotations` | Additional annotations for orthology listed in a CSV sheet | `string` | | | | +| `outdir` | The output directory where the results will be saved | `string` | | True | | +| `email` | Email address for completion summary. | `string` | | | True | ## Repeat annotation options -| Parameter | Description | Type | Default | Required | Hidden | -| --------------------------- | ------------------------------------------ | --------- | ------------- | -------- | ------ | -| `repeat_annotator` | 'edta' or 'repeatmodeler' | `string` | repeatmodeler | | | -| `save_annotated_te_lib` | Save annotated TE library or not? | `boolean` | | | | -| `edta_is_sensitive` | Use '--sensitive 1' flag with EDTA or not? | `boolean` | | | | -| `repeatmasker_save_outputs` | Save the repeat-masked genome or not? | `boolean` | | | | + + +| Parameter | Description | Type | Default | Required | Hidden | +|-----------|-----------|-----------|-----------|-----------|-----------| +| `repeat_annotator` | 'edta' or 'repeatmodeler' | `string` | repeatmodeler | | | +| `save_annotated_te_lib` | Save annotated TE library or not? 
| `boolean` | | | | +| `edta_is_sensitive` | Use '--sensitive 1' flag with EDTA or not? | `boolean` | | | | +| `repeatmasker_save_outputs` | Save the repeat-masked genome or not? | `boolean` | | | | ## RNASeq pre-processing options -| Parameter | Description | Type | Default | Required | Hidden | -| ------------------------ | ------------------------------------------------------------------ | --------- | ----------------------------------------- | -------- | ------ | -| `fastqc_skip` | Skip FASTQC or not? | `boolean` | True | | | -| `fastp_skip` | Skip trimming by FASTQP or not? | `boolean` | | | | -| `min_trimmed_reads` | Exclude a sample if its reads after trimming are below this number | `integer` | 10000 | | | -| `fastp_extra_args` | Extra FASTP arguments | `string` | | | | -| `save_trimmed` | Save FASTQ files after trimming or not? | `boolean` | | | | -| `remove_ribo_rna` | Remove Ribosomal RNA or not? | `boolean` | | | | -| `save_non_ribo_reads` | Save FASTQ files after Ribosomal RNA removal or not? | `boolean` | | | | -| `ribo_database_manifest` | Ribosomal RNA fastas listed in a text sheet | `string` | ${projectDir}/assets/rrna-db-defaults.txt | | | + + +| Parameter | Description | Type | Default | Required | Hidden | +|-----------|-----------|-----------|-----------|-----------|-----------| +| `fastqc_skip` | Skip FASTQC or not? | `boolean` | True | | | +| `fastp_skip` | Skip trimming by FASTP or not? | `boolean` | | | | +| `min_trimmed_reads` | Exclude a sample if its reads after trimming are below this number | `integer` | 10000 | | | +| `fastp_extra_args` | Extra FASTP arguments | `string` | | | | +| `save_trimmed` | Save FASTQ files after trimming or not? | `boolean` | | | | +| `remove_ribo_rna` | Remove Ribosomal RNA or not? | `boolean` | | | | +| `save_non_ribo_reads` | Save FASTQ files after Ribosomal RNA removal or not? 
| `boolean` | | | | +| `ribo_database_manifest` | Ribosomal RNA fastas listed in a text sheet | `string` | ${projectDir}/assets/rrna-db-defaults.txt | | | ## RNASeq alignment options -| Parameter | Description | Type | Default | Required | Hidden | -| ------------------------ | ------------------------------------------------- | --------- | ------- | -------- | ------ | -| `star_max_intron_length` | Maximum intron length for STAR alignment | `integer` | 16000 | | | -| `star_align_extra_args` | EXTRA arguments for STAR | `string` | | | | -| `star_save_outputs` | Save BAM files from STAR or not? | `boolean` | | | | -| `save_cat_bam` | SAVE a concatenated BAM file per assembly or not? | `boolean` | | | | + + +| Parameter | Description | Type | Default | Required | Hidden | +|-----------|-----------|-----------|-----------|-----------|-----------| +| `star_max_intron_length` | Maximum intron length for STAR alignment | `integer` | 16000 | | | +| `star_align_extra_args` | EXTRA arguments for STAR | `string` | | | | +| `star_save_outputs` | Save BAM files from STAR or not? | `boolean` | | | | +| `save_cat_bam` | SAVE a concatenated BAM file per assembly or not? 
| `boolean` | | | | ## Annotation options -| Parameter | Description | Type | Default | Required | Hidden | -| --------------------- | --------------------------------------------------------------------------------- | --------- | ------- | -------- | ------ | -| `braker_extra_args` | Extra arguments for BRAKER | `string` | | | | -| `liftoff_coverage` | Liftoff coverage parameter | `number` | 0.9 | | | -| `liftoff_identity` | Liftoff identity parameter | `number` | 0.9 | | | -| `eggnogmapper_evalue` | Only report alignments below or equal the e-value threshold | `number` | 1e-05 | | | -| `eggnogmapper_pident` | Only report alignments above or equal to the given percentage of identity (0-100) | `integer` | 35 | | | + + +| Parameter | Description | Type | Default | Required | Hidden | +|-----------|-----------|-----------|-----------|-----------|-----------| +| `braker_extra_args` | Extra arguments for BRAKER | `string` | | | | +| `liftoff_coverage` | Liftoff coverage parameter | `number` | 0.9 | | | +| `liftoff_identity` | Liftoff identity parameter | `number` | 0.9 | | | +| `eggnogmapper_evalue` | Only report alignments below or equal the e-value threshold | `number` | 1e-05 | | | +| `eggnogmapper_pident` | Only report alignments above or equal to the given percentage of identity (0-100) | `integer` | 35 | | | +| `min_contig_length` | Minimum length (in base pairs) of contigs to include in the analysis | `integer` | 5000 | | | + + ## Post-annotation filtering options -| Parameter | Description | Type | Default | Required | Hidden | -| ----------------------------- | ----------------------------------------------------------------- | --------- | ------- | -------- | ------ | -| `allow_isoforms` | Allow multiple isoforms for gene models | `boolean` | True | | | -| `enforce_full_intron_support` | Require every model to have external evidence for all its introns | `boolean` | True | | | -| `filter_liftoff_by_hints` | Use BRAKER hints to filter Liftoff models | 
`boolean` | True | | | -| `eggnogmapper_purge_nohits` | Purge transcripts which do not have a hit against eggnog | `boolean` | | | | + + +| Parameter | Description | Type | Default | Required | Hidden | +|-----------|-----------|-----------|-----------|-----------|-----------| +| `allow_isoforms` | Allow multiple isoforms for gene models | `boolean` | True | | | +| `enforce_full_intron_support` | Require every model to have external evidence for all its introns | `boolean` | True | | | +| `filter_liftoff_by_hints` | Use BRAKER hints to filter Liftoff models | `boolean` | True | | | +| `eggnogmapper_purge_nohits` | Purge transcripts which do not have a hit against eggnog | `boolean` | | | | ## Annotation output options -| Parameter | Description | Type | Default | Required | Hidden | -| ----------------------------- | ------------------------------------ | --------- | ------- | -------- | ------ | -| `braker_save_outputs` | Save BRAKER files | `boolean` | | | | -| `add_attrs_to_proteins_fasta` | Add gff attributes to proteins fasta | `boolean` | | | | + + +| Parameter | Description | Type | Default | Required | Hidden | +|-----------|-----------|-----------|-----------|-----------|-----------| +| `braker_save_outputs` | Save BRAKER files | `boolean` | | | | +| `add_attrs_to_proteins_fasta` | Add gff attributes to proteins fasta | `boolean` | | | | ## Evaluation options -| Parameter | Description | Type | Default | Required | Hidden | -| ------------------------ | --------------------------------------------------------------------------- | --------- | --------------- | -------- | ------ | -| `busco_skip` | Skip evaluation by BUSCO | `boolean` | | | | -| `busco_lineage_datasets` | BUSCO lineages as a space-separated list: 'fungi_odb10 microsporidia_odb10' | `string` | eukaryota_odb10 | | | + + +| Parameter | Description | Type | Default | Required | Hidden | +|-----------|-----------|-----------|-----------|-----------|-----------| +| `busco_skip` | Skip evaluation 
by BUSCO | `boolean` | | | | +| `busco_lineage_datasets` | BUSCO lineages as a space-separated list: 'fungi_odb10 microsporidia_odb10' | `string` | eukaryota_odb10 | | | ## Institutional config options Parameters used to describe centralised config profiles. These should not be edited. -| Parameter | Description | Type | Default | Required | Hidden | -| ---------------------------- | ----------------------------------------- | -------- | -------------------------------------------------------- | -------- | ------ | -| `custom_config_version` | Git commit id for Institutional configs. | `string` | master | | True | -| `custom_config_base` | Base directory for Institutional configs. | `string` | https://raw.githubusercontent.com/nf-core/configs/master | | True | -| `config_profile_name` | Institutional config name. | `string` | | | True | -| `config_profile_description` | Institutional config description. | `string` | | | True | +| Parameter | Description | Type | Default | Required | Hidden | +|-----------|-----------|-----------|-----------|-----------|-----------| +| `custom_config_version` | Git commit id for Institutional configs. | `string` | master | | True | +| `custom_config_base` | Base directory for Institutional configs. | `string` | https://raw.githubusercontent.com/nf-core/configs/master | | True | +| `config_profile_name` | Institutional config name. | `string` | | | True | +| `config_profile_description` | Institutional config description. | `string` | | | True | ## Generic options Less common options for the pipeline, typically set in a config file. -| Parameter | Description | Type | Default | Required | Hidden | -| ------------------------ | ----------------------------------------------------------------- | --------- | ------- | -------- | ------ | -| `version` | Display version and exit. | `boolean` | | | True | -| `publish_dir_mode` | Method used to save pipeline results to output directory. 
| `string` | copy | | True | -| `email_on_fail` | Email address for completion summary, only when pipeline fails. | `string` | | | True | -| `plaintext_email` | Send plain-text email instead of HTML. | `boolean` | | | True | -| `max_multiqc_email_size` | File size limit when attaching MultiQC reports to summary emails. | `string` | 25.MB | | True | -| `monochrome_logs` | Do not use coloured log outputs. | `boolean` | | | True | -| `hook_url` | Incoming hook URL for messaging service | `string` | | | True | +| Parameter | Description | Type | Default | Required | Hidden | +|-----------|-----------|-----------|-----------|-----------|-----------| +| `version` | Display version and exit. | `boolean` | | | True | +| `publish_dir_mode` | Method used to save pipeline results to output directory. | `string` | copy | | True | +| `email_on_fail` | Email address for completion summary, only when pipeline fails. | `string` | | | True | +| `plaintext_email` | Send plain-text email instead of HTML. | `boolean` | | | True | +| `max_multiqc_email_size` | File size limit when attaching MultiQC reports to summary emails. | `string` | 25.MB | | True | +| `monochrome_logs` | Do not use coloured log outputs. 
| `boolean` | | | True |
+| `hook_url` | Incoming hook URL for messaging service | `string` | | | True |
+
+
+
diff --git a/main.nf b/main.nf
index 2545848..28eca2c 100755
--- a/main.nf
+++ b/main.nf
@@ -17,6 +17,8 @@ include { GENEPAL } from './workflows/genepal'
 include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_genepal_pipeline'
 include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_genepal_pipeline'
+// Length-filter process: drops contigs shorter than params.min_contig_length
+include { SEQKIT_GET_LENGTH } from './subworkflows/yykaya/seqkit.filter.nf'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     NAMED WORKFLOWS FOR PIPELINE
@@ -48,10 +50,15 @@ workflow PLANTFOODRESEARCHOPEN_GENEPAL {
     main:
 
     //
-    // WORKFLOW: Run pipeline
+    // Filter genome assembly by minimum contig length
+    //
+    SEQKIT_GET_LENGTH(ch_target_assembly)
+
+    //
+    // Run GENEPAL main workflow using filtered FASTA
     //
     GENEPAL(
-        ch_target_assembly,
+        SEQKIT_GET_LENGTH.out.filtered_fasta.map { meta, fasta, contig_list -> [ meta, fasta ] }, // Filtered genome FASTA
         ch_tar_assm_str,
         ch_is_masked,
         ch_te_library,
@@ -68,9 +75,11 @@ workflow PLANTFOODRESEARCHOPEN_GENEPAL {
         ch_tsebra_config,
         ch_orthofinder_pep
     )
+
     emit:
     multiqc_report = GENEPAL.out.multiqc_report // channel: /path/to/multiqc_report.html
 }
+
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     RUN MAIN WORKFLOW
@@ -81,9 +90,9 @@ workflow {
     main:
 
     //
-    // SUBWORKFLOW: Run initialisation tasks
+    // SUBWORKFLOW: Run initialization tasks
     //
-    PIPELINE_INITIALISATION (
+    PIPELINE_INITIALISATION(
         params.version,
         params.monochrome_logs,
         args,
@@ -115,10 +124,11 @@ workflow {
         PIPELINE_INITIALISATION.out.tsebra_config,
         PIPELINE_INITIALISATION.out.orthofinder_pep
     )
+
     //
     // SUBWORKFLOW: Run completion tasks
     //
-    PIPELINE_COMPLETION (
+    PIPELINE_COMPLETION(
         params.email,
         params.email_on_fail,
         params.plaintext_email,
diff --git a/nextflow.config b/nextflow.config
index 665e6b3..190d6e8 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -48,7 +48,8 @@ params {
     liftoff_identity = 0.9
     eggnogmapper_evalue = 0.00001
     eggnogmapper_pident = 35
+    min_contig_length = 5000
 
     // Post-annotation filtering options
     allow_isoforms = true
     enforce_full_intron_support = true
@@ -79,7 +80,16 @@ params {
     custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}"
 }
 
-
+// Validation for the min_contig_length parameter.
+// The bound mirrors "minimum": 5000 in nextflow_schema.json; the default (5000) must pass.
+process {
+    beforeScript = """
+    if [[ ${params.min_contig_length} -lt 5000 ]]; then
+        echo "ERROR: The parameter 'min_contig_length' must be at least 5 kbp (5000 base pairs). Provided value: ${params.min_contig_length}" >&2
+        exit 1
+    fi
+    """
+}
 // Max resources
 process {
     resourceLimits = [
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 0258683..8816405 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -291,6 +291,12 @@
                     "type": "boolean",
                     "fa_icon": "fas fa-question-circle",
                     "description": "Add gff attributes to proteins fasta"
+                },
+                "min_contig_length": {
+                    "type": "integer",
+                    "default": 5000,
+                    "minimum": 5000,
+                    "description": "Minimum length (in base pairs) of contigs to include in the analysis"
                 }
             }
         },
diff --git a/subworkflows/yykaya/seqkit.filter.nf b/subworkflows/yykaya/seqkit.filter.nf
new file mode 100644
index 0000000..e03f049
--- /dev/null
+++ b/subworkflows/yykaya/seqkit.filter.nf
@@ -0,0 +1,25 @@
+// Drop contigs shorter than params.min_contig_length from a genome FASTA using seqkit.
+//   input : [ meta, genome_fasta ]
+//   output: [ meta, filtered_<meta.id>.fasta, <meta.id>_contig_list.txt ]  (emit: filtered_fasta)
+process SEQKIT_GET_LENGTH {
+    tag "${meta.id}"
+    label 'process_medium'
+    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
+        ? 'https://depot.galaxyproject.org/singularity/seqkit:2.4.0--h9ee0642_0'
+        : 'quay.io/biocontainers/seqkit:2.4.0--h9ee0642_0'}"
+
+    input:
+    tuple val(meta), path(genome_fasta)
+
+    output:
+    tuple val(meta), path("filtered_${meta.id}.fasta"), path("${meta.id}_contig_list.txt"), emit: filtered_fasta
+
+    script:
+    """
+    # Filter contigs based on length and output filtered FASTA
+    seqkit seq --min-len ${params.min_contig_length} ${genome_fasta} > filtered_${meta.id}.fasta
+
+    # Generate a list of filtered contigs
+    seqkit fx2tab --length --name filtered_${meta.id}.fasta | awk '{print \$1}' > ${meta.id}_contig_list.txt
+    """
+}