diff --git a/.nf-core.yml b/.nf-core.yml
index c269926..92703f4 100644
--- a/.nf-core.yml
+++ b/.nf-core.yml
@@ -18,4 +18,4 @@ template:
skip_features:
- igenomes
- fastqc
- version: 1.0.0dev
+ version: 1.0.0
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 76a80c3..9c1285e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,14 +3,16 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
-## v1.0.0dev - [date]
+## v1.0.0 - [2025/02/05]
Initial release of nf-core/proteinfamilies, created with the [nf-core](https://nf-co.re/) template.
### `Added`
-### `Fixed`
-
-### `Dependencies`
-
-### `Deprecated`
+- Amino acid sequence clustering (mmseqs)
+- Multiple sequence alignment (famsa, mafft, clipkit)
+- Hidden Markov Model generation (hmmer)
+- Between families redundancy removal (hmmer)
+- In-family sequence redundancy removal (mmseqs)
+- Family updating (hmmer, seqkit, mmseqs, famsa, mafft, clipkit)
+- Family statistics presentation (multiqc)
diff --git a/CITATIONS.md b/CITATIONS.md
index 3e9b54d..9cb5c87 100644
--- a/CITATIONS.md
+++ b/CITATIONS.md
@@ -10,6 +10,34 @@
## Pipeline tools
+- [MMseqs2](https://pubmed.ncbi.nlm.nih.gov/33734313/)
+
+> Mirdita M, Steinegger M, Breitwieser F, Söding J, Levy Karin E. Fast and sensitive taxonomic assignment to metagenomic contigs. Bioinformatics. 2021 Sep 15;37(18):3029-31. doi: 10.1093/bioinformatics/btab184. PubMed PMID: 33734313; PubMed Central PMCID: PMC8479651.
+
+- [FAMSA](https://pubmed.ncbi.nlm.nih.gov/27670777/)
+
+> Deorowicz S, Debudaj-Grabysz A, Gudyś A. FAMSA: Fast and accurate multiple sequence alignment of huge protein families. Scientific reports. 2016 Sep 27;6(1):33964. doi: 10.1038/srep33964. PubMed PMID: 27670777; PubMed Central PMCID: PMC5037421.
+
+- [mafft](https://pubmed.ncbi.nlm.nih.gov/23329690/)
+
+> Katoh K, Standley DM. MAFFT multiple sequence alignment software version 7: improvements in performance and usability. Molecular biology and evolution. 2013 Jan 16;30(4):772-80. doi: 10.1093/molbev/mst010. PubMed PMID: 23329690; PubMed Central PMCID: PMC3603318.
+
+- [ClipKIT](https://pubmed.ncbi.nlm.nih.gov/33264284/)
+
+> Steenwyk JL, Buida III TJ, Li Y, Shen XX, Rokas A. ClipKIT: a multiple sequence alignment trimming software for accurate phylogenomic inference. PLoS biology. 2020 Dec 2;18(12):e3001007. doi: 10.1371/journal.pbio.3001007. PubMed PMID: 33264284; PubMed Central PMCID: PMC7735675.
+
+- [hmmer](https://pubmed.ncbi.nlm.nih.gov/29905871/)
+
+> Potter SC, Luciani A, Eddy SR, Park Y, Lopez R, Finn RD. HMMER web server: 2018 update. Nucleic acids research. 2018 Jul 2;46(W1):W200-4. doi: 10.1093/nar/gky448. PubMed PMID: 29905871; PubMed Central PMCID: PMC6030962.
+
+- [SeqKit](https://pubmed.ncbi.nlm.nih.gov/38898985/)
+
+> Shen W, Sipos B, Zhao L. SeqKit2: A Swiss army knife for sequence and alignment processing. iMeta. 2024 Apr 5:e191. doi: 10.1002/imt2.191. PubMed PMID: 38898985; PubMed Central PMCID: PMC11183193.
+
+- [Biopython](https://pubmed.ncbi.nlm.nih.gov/19304878/)
+
+> Cock PJ, Antao T, Chang JT, Chapman BA, Cox CJ, Dalke A, Friedberg I, Hamelryck T, Kauff F, Wilczynski B, De Hoon MJ. Biopython: freely available Python tools for computational molecular biology and bioinformatics. Bioinformatics. 2009 Jun 1;25(11):1422-3. doi: 10.1093/bioinformatics/btp163. PubMed PMID: 19304878; PubMed Central PMCID: PMC2682512.
+
- [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/)
> Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.
diff --git a/README.md b/README.md
index dab7dda..cf6323c 100644
--- a/README.md
+++ b/README.md
@@ -19,43 +19,55 @@
## Introduction
-**nf-core/proteinfamilies** is a bioinformatics pipeline that ...
-
-
-
-
-2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))
+**nf-core/proteinfamilies** is a bioinformatics pipeline that generates protein families from amino acid sequences and/or updates existing families with new sequences.
+It takes a protein fasta file as input, clusters the sequences and then generates protein family Hidden Markov Models (HMMs) along with their multiple sequence alignments (MSAs).
+Optionally, paths to existing family HMMs and MSAs can be provided (base filenames must match one-to-one), in which case families with matching hits are updated with the new sequences.
+
+
+
+
+
+### Create families
+
+1. Cluster sequences ([`MMseqs2`](https://github.com/soedinglab/MMseqs2/))
+2. Perform multiple sequence alignment (MSA) ([`FAMSA`](https://github.com/refresh-bio/FAMSA/) or [`mafft`](https://github.com/GSLBiotech/mafft/))
+3. Optionally, trim gappy regions of the MSA ([`ClipKIT`](https://github.com/JLSteenwyk/ClipKIT/))
+4. Generate family HMMs and fish additional sequences into the family ([`hmmer`](https://github.com/EddyRivasLab/hmmer/))
+5. Optionally, remove redundant families by comparing family representative sequences against family models with ([`hmmer`](https://github.com/EddyRivasLab/hmmer/))
+6. Optionally, from the remaining families, remove in-family redundant sequences by strictly clustering with ([`MMseqs2`](https://github.com/soedinglab/MMseqs2/)) and keep cluster representatives
+7. Present statistics for remaining/updated families size distributions and representative sequence lengths ([`MultiQC`](http://multiqc.info/))
+
+### Update families
+
+1. Find which families to update by comparing the input sequences against existing family models with ([`hmmer`](https://github.com/EddyRivasLab/hmmer/))
+2. Sequences without hits proceed through the "Create families" steps above, while hit sequences and their matched families continue to step 3
+3. Extract family sequences ([`SeqKit`](https://github.com/shenwei356/seqkit/)) and concatenate with filtered hit sequences of each family
+4. Optionally, remove in-family redundant sequences by strictly clustering with ([`MMseqs2`](https://github.com/soedinglab/MMseqs2/)) and keeping cluster representatives
+5. Perform multiple sequence alignment (MSA) ([`FAMSA`](https://github.com/refresh-bio/FAMSA/) or [`mafft`](https://github.com/GSLBiotech/mafft/))
+6. Optionally, trim gappy regions of the MSA ([`ClipKIT`](https://github.com/JLSteenwyk/ClipKIT/))
+7. Update family HMM with ([`hmmer`](https://github.com/EddyRivasLab/hmmer/))
## Usage
> [!NOTE]
> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.
-
+Each row contains a fasta file with amino acid sequences (gzipped or uncompressed).
+Optionally, a row may also contain tarball archives (tar.gz) of existing families' HMM and MSA folders, to be updated.
+In this case, the HMM and MSA files must match in number and in base filename (excluding the extension).
+Families with hits will be updated, while sequences without hits will form new families.
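
For illustration, a minimal samplesheet for the two modes might look like this (sample names and file paths below are hypothetical):

```csv
sample,fasta,existing_hmms_to_update,existing_msas_to_update
new_families,/path/to/sequences.fasta.gz,,
update_families,/path/to/more_sequences.fa,/path/to/hmms.tar.gz,/path/to/msas.tar.gz
```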
Now, you can run the pipeline using:
-
-
```bash
nextflow run nf-core/proteinfamilies \
   -profile <docker/singularity/.../institute> \
@@ -80,7 +92,7 @@ nf-core/proteinfamilies was originally written by Evangelos Karatzas.
We thank the following people for their extensive assistance in the development of this pipeline:
-
+- [Martin Beracochea](https://github.com/mberacochea)
## Contributions and Support
@@ -93,8 +105,6 @@ For further information or help, don't hesitate to get in touch on the [Slack `#
-
-
An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.
You can cite the `nf-core` publication as follows:
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
index 156d48a..79c783f 100644
--- a/assets/multiqc_config.yml
+++ b/assets/multiqc_config.yml
@@ -1,7 +1,8 @@
report_comment: >
- This report has been generated by the nf-core/proteinfamilies
- analysis pipeline. For information about how to interpret these results, please see the
- documentation.
+ This report has been generated by the nf-core/proteinfamilies analysis pipeline. For information about
+ how to interpret these results, please see the documentation.
report_section_order:
"nf-core-proteinfamilies-methods-description":
order: -1000
diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
index 5f653ab..925bedb 100644
--- a/assets/samplesheet.csv
+++ b/assets/samplesheet.csv
@@ -1,3 +1,2 @@
-sample,fastq_1,fastq_2
-SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz
-SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz,
+sample,fasta,existing_hmms_to_update,existing_msas_to_update
+mgnifams_test,https://github.com/nf-core/test-datasets/raw/proteinfamilies/test_data/mgnifams_input_small.fa,,
diff --git a/assets/schema_input.json b/assets/schema_input.json
index a9cb2a2..62e2a10 100644
--- a/assets/schema_input.json
+++ b/assets/schema_input.json
@@ -13,21 +13,28 @@
"errorMessage": "Sample name must be provided and cannot contain spaces",
"meta": ["id"]
},
- "fastq_1": {
+ "fasta": {
"type": "string",
"format": "file-path",
"exists": true,
- "pattern": "^\\S+\\.f(ast)?q\\.gz$",
- "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
+ "pattern": "^\\S+\\.(fa|fasta|faa|fas)(\\.gz)?$",
+ "errorMessage": "Fasta file for amino acid sequences must be provided, cannot contain spaces and must have extension '.fa', '.fasta', '.faa', '.fas', '.fa.gz', '.fasta.gz', '.faa.gz' or '.fas.gz'"
},
- "fastq_2": {
+ "existing_hmms_to_update": {
"type": "string",
"format": "file-path",
"exists": true,
- "pattern": "^\\S+\\.f(ast)?q\\.gz$",
- "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
+ "pattern": "^\\S+\\.tar\\.gz$",
+ "description": "Gzipped tarball file containing existing protein family HMMs. These models will be used to 'fish' new sequences from the input and then be updated accordingly."
+ },
+ "existing_msas_to_update": {
+ "type": "string",
+ "format": "file-path",
+ "exists": true,
+ "pattern": "^\\S+\\.tar\\.gz$",
+ "description": "Tarball file containing multiple sequence alignments (MSAs) for the families to be updated. These alignments are essential for the update process and should match the HMM filenames one by one."
}
},
- "required": ["sample", "fastq_1"]
+ "required": ["sample", "fasta"]
}
}
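
The `pattern` fields above are standard JSON-schema regular expressions. As a quick illustrative sanity check (not part of the pipeline), the fasta pattern behaves as follows in Python (the JSON `\\` escapes become single `\` in a raw string):

```python
import re

# Same pattern as the schema's 'fasta' field, unescaped from JSON
FASTA_PATTERN = re.compile(r"^\S+\.(fa|fasta|faa|fas)(\.gz)?$")

for name in ["seqs.faa.gz", "seqs.fasta", "my seqs.fa", "seqs.fastq.gz"]:
    print(name, "->", bool(FASTA_PATTERN.match(name)))
# seqs.faa.gz -> True
# seqs.fasta -> True
# my seqs.fa -> False (contains a space)
# seqs.fastq.gz -> False (not an accepted extension)
```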
diff --git a/bin/branch_hits_fasta.py b/bin/branch_hits_fasta.py
new file mode 100755
index 0000000..64c7c6d
--- /dev/null
+++ b/bin/branch_hits_fasta.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python
+
+## Originally written by Evangelos Karatzas and released under the MIT license.
+## See git repository (https://github.com/nf-core/proteinfamilies) for full license text.
+
+import sys
+import argparse
+import os
+import gzip
+import re
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+from Bio.Seq import Seq
+
+
+def parse_args(args=None):
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-f",
+ "--fasta",
+ required=True,
+ metavar="FILE",
+ type=str,
+ help="Input fasta file.",
+ )
+ parser.add_argument(
+ "-d",
+ "--domtbl",
+ required=True,
+ metavar="FILE",
+ type=str,
+ help="Domain summary annotations result from hmmsearch.",
+ )
+ parser.add_argument(
+ "-l",
+ "--length_threshold",
+ required=True,
+ metavar="FLOAT",
+ type=float,
+ help="Minimum length percentage threshold of annotated domain (env) against query to keep.",
+ )
+ parser.add_argument(
+ "-H",
+ "--hits",
+ required=True,
+ metavar="FOLDER",
+ type=str,
+ help="Name of the output folder with hit fasta files (one file per family, where the filename is the family id).",
+ )
+ parser.add_argument(
+ "-n",
+ "--non_hits",
+ required=True,
+ metavar="FILE",
+ type=str,
+ help="Name of the output fasta file with the non hit sequences.",
+ )
+ return parser.parse_args(args)
+
+
+def filter_sequences(domtbl, length_threshold):
+ results = {}
+
+ # Open the domtbl file (supporting gzip)
+ open_func = gzip.open if domtbl.endswith(".gz") else open
+ with open_func(domtbl, "rt") as file:
+ for line in file:
+ if line.startswith("#"):
+ continue # Skip comments
+
+ columns = line.split()
+ try:
+ qlen = float(columns[5])
+ env_from = int(columns[19])
+ env_to = int(columns[20])
+ env_length = env_to - env_from + 1
+
+ if env_length >= length_threshold * qlen:
+ sequence_name = columns[0]
+ query_name = columns[3]
+
+ if query_name not in results:
+ results[query_name] = set()
+ results[query_name].add(f"{sequence_name}/{env_from}-{env_to}")
+ except (IndexError, ValueError):
+ continue # Skip malformed lines
+
+ return results
+
+
+# Open the file with gzip if it's gzipped, otherwise open normally
+def open_fasta(file_path):
+ if file_path.endswith(".gz"):
+ return gzip.open(file_path, "rt") # Open gzipped file in text mode
+ return open(file_path, "rt") # Open plain text file
+
+
+# Parse the file
+def parse_fasta(file_path):
+ with open_fasta(file_path) as file:
+ return {record.id: record for record in SeqIO.parse(file, "fasta")}
+
+
+def write_non_hit_sequences(filtered_sequences, sequences, non_hits):
+ # Determine the non-hit sequences
+    # Use rsplit so sequence ids that themselves contain '/' are not truncated
+    hit_sequence_names = {hit.rsplit("/", 1)[0] for hits in filtered_sequences.values() for hit in hits}
+ non_hit_records = [
+ record for name, record in sequences.items()
+ if name not in hit_sequence_names
+ ]
+
+ # Write the non-hit sequences to a gzipped file
+ with gzip.open(non_hits, "wt") as non_hits_file: # 'wt' mode for text writing
+ SeqIO.write(non_hit_records, non_hits_file, "fasta")
+ print(f"Written {len(non_hit_records)} non-hit sequences to {non_hits}")
+
+
+def validate_and_parse_hit_name(hit):
+ """
+ Validates and parses a hit string.
+ The hit must contain a string, at least one '/', and a valid range (integer-integer) after the last '/'.
+
+ Args:
+ hit (str): The hit string to validate and parse.
+
+ Returns:
+ tuple: (sequence_name, env_from, env_to) if the hit is valid.
+
+ Raises:
+ ValueError: If the hit is invalid.
+ """
+ # Define the regex pattern
+ pattern = r"^(.*)/(\d+)-(\d+)$"
+
+ # Match the pattern
+ match = re.match(pattern, hit)
+ if not match:
+ raise ValueError(f"Skipping hit with invalid format: {hit}.")
+
+ # Extract components
+ sequence_name = match.group(1) # Everything before the last '/'
+ env_from = int(match.group(2)) # First integer in the range
+ env_to = int(match.group(3)) # Second integer in the range
+
+ return sequence_name, env_from, env_to
+
+
+def write_family_fastas(results, sequences, output_dir):
+ # Create the output directory if it doesn't exist
+ os.makedirs(output_dir, exist_ok=True)
+
+ for family, hits in results.items():
+ family_records = []
+
+ for hit in hits:
+ try:
+ sequence_name, env_from, env_to = validate_and_parse_hit_name(hit)
+
+ # Get the original sequence
+ original_record = sequences[sequence_name]
+
+ # Extract the specific range (adjust indices for 0-based indexing)
+ extracted_seq = original_record.seq[env_from-1:env_to]
+
+ # Determine the new sequence ID
+ if len(extracted_seq) == len(original_record.seq):
+ new_id = sequence_name # Omit range if full-length
+ else:
+ new_id = f"{sequence_name}/{env_from}-{env_to}"
+
+ # Create a new SeqRecord for the extracted range
+ new_record = SeqRecord(
+ Seq(extracted_seq),
+ id=new_id,
+ description=family
+ )
+ family_records.append(new_record)
+ except KeyError:
+ print(f"Sequence {sequence_name} not found in the input FASTA.", file=sys.stderr)
+ except ValueError as e:
+ print(e, file=sys.stderr)
+
+ # Write the extracted sequences to a FASTA file for the family
+ if family_records:
+ family_fasta_path = os.path.join(output_dir, f"{family}.fasta")
+ SeqIO.write(family_records, family_fasta_path, "fasta")
+ print(f"Written {len(family_records)} sequences to {family_fasta_path}")
+
+
+def filter_recruited(fasta, domtbl, length_threshold, hits, non_hits):
+ filtered_sequences = filter_sequences(domtbl, length_threshold)
+ sequences = parse_fasta(fasta)
+ write_non_hit_sequences(filtered_sequences, sequences, non_hits)
+ write_family_fastas(filtered_sequences, sequences, hits)
+
+
+def main(args=None):
+ args = parse_args(args)
+ filter_recruited(
+ args.fasta, args.domtbl, args.length_threshold, args.hits, args.non_hits
+ )
+
+
+if __name__ == "__main__":
+ sys.exit(main())
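
`filter_sequences` above reads fixed whitespace-separated columns from `hmmsearch --domtblout` output: column 0 is the target (sequence) name, column 3 the query (model) name, column 5 the model length, and columns 19/20 the envelope coordinates. A minimal sketch of the length filter on a single fabricated line (for illustration only):

```python
# Fabricated domtbl line; real files come from `hmmsearch --domtblout`
line = "seq1 - 120 fam_3 - 100 1e-30 95.0 0.1 1 1 1e-30 1e-30 94.0 0.1 1 100 5 98 3 95 0.98 -"
cols = line.split()
target, query = cols[0], cols[3]                  # sequence and family (model) names
qlen = float(cols[5])                             # model length
env_from, env_to = int(cols[19]), int(cols[20])   # envelope coordinates (1-based, inclusive)
coverage = (env_to - env_from + 1) / qlen         # fraction of the model covered by the hit
print(target, query, coverage)                    # seq1 fam_3 0.93
```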
diff --git a/bin/chunk_clusters.py b/bin/chunk_clusters.py
new file mode 100755
index 0000000..ae99828
--- /dev/null
+++ b/bin/chunk_clusters.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+
+## Originally written by Evangelos Karatzas and released under the MIT license.
+## See git repository (https://github.com/nf-core/proteinfamilies) for full license text.
+
+import sys
+import os
+import argparse
+from collections import defaultdict
+import csv
+from Bio import SeqIO
+
+
+def parse_args(args=None):
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-c",
+ "--clustering",
+ required=True,
+ metavar="FILE",
+ type=str,
+ help="TSV clustering file input.",
+ )
+ parser.add_argument(
+ "-s",
+ "--sequences",
+ required=True,
+ metavar="FILE",
+ type=str,
+ help="Initial sequences FASTA file.",
+ )
+ parser.add_argument(
+ "-t",
+ "--threshold",
+ required=True,
+ metavar="INT",
+ type=int,
+ help="Minimum cluster size to keep.",
+ )
+ parser.add_argument(
+ "-o",
+ "--out_folder",
+ required=True,
+ metavar="FOLDER",
+ type=str,
+ help="Name of the output folder to be created.",
+ )
+ return parser.parse_args(args)
+
+
+def collect_clusters(clustering_file, threshold):
+ # Collect clusters with a size threshold, storing in a defaultdict
+ clusters = defaultdict(list)
+
+ with open(clustering_file) as f:
+ csv_reader = csv.reader(f, delimiter="\t")
+ for row in csv_reader:
+ rep, member = row
+ clusters[rep].append(member)
+
+ # Filter clusters by threshold
+ return {
+ rep: members for rep, members in clusters.items() if len(members) >= threshold
+ }
+
+
+def main(args=None):
+ args = parse_args(args)
+
+ # Create output directory if it doesn't exist
+ os.makedirs(args.out_folder, exist_ok=True)
+
+ # Collect clusters that meet the threshold
+ clusters = collect_clusters(args.clustering, int(args.threshold))
+
+ # Stream through the FASTA file and write out sequences that match clusters
+ chunk_num = 1
+ for rep, members in clusters.items():
+ output_file = os.path.join(args.out_folder, f"{chunk_num}.fasta")
+ with open(output_file, "w") as fasta_out:
+ with open(args.sequences) as seq_file:
+ for record in SeqIO.parse(seq_file, "fasta"):
+ if record.id in members:
+ SeqIO.write(record, fasta_out, "fasta")
+ chunk_num += 1
+
+
+if __name__ == "__main__":
+ sys.exit(main())
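
The loop above re-parses the FASTA file once per kept cluster, which is simple but scans the input repeatedly. If that ever became a bottleneck, one possible alternative (a sketch assuming the input FASTA is uncompressed, since `SeqIO.index` does not read gzip directly) would be to index the file once and fetch members per cluster:

```python
from Bio import SeqIO

# Index the FASTA once; `clusters` is the {representative: [members]} dict built above
index = SeqIO.index("sequences.fasta", "fasta")
for chunk_num, (rep, members) in enumerate(clusters.items(), start=1):
    records = (index[m] for m in members if m in index)
    SeqIO.write(records, f"{chunk_num}.fasta", "fasta")
```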
diff --git a/bin/clip_ends.py b/bin/clip_ends.py
new file mode 100755
index 0000000..4a01684
--- /dev/null
+++ b/bin/clip_ends.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+
+## Originally written by Evangelos Karatzas and released under the MIT license.
+## See git repository (https://github.com/nf-core/proteinfamilies) for full license text.
+
+import sys
+import numpy as np
+import argparse
+from Bio import SeqIO
+from Bio.Seq import Seq
+
+
+def parse_args(args=None):
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-a",
+ "--alignment",
+ required=True,
+ metavar="FILE",
+ type=str,
+ help="Multiple sequence alignment file in fasta format.",
+ )
+ parser.add_argument(
+ "-g",
+ "--gap_threshold",
+ required=True,
+ metavar="FLOAT",
+ type=float,
+ help="Minimum gap occupancy across sequences to keep.",
+ )
+ parser.add_argument(
+ "-o",
+ "--out_fasta",
+ required=True,
+ metavar="FILE",
+ type=str,
+ help="Name of the output fasta file with the trimmed multiple sequence alignment.",
+ )
+ return parser.parse_args(args)
+
+
+def read_fasta_to_matrix(file_path):
+ records = list(SeqIO.parse(file_path, "fasta"))
+ max_length = max(len(record.seq) for record in records)
+ matrix = np.zeros((len(records), max_length), dtype=np.dtype("U1"))
+ original_names = []
+
+ for i, record in enumerate(records):
+ original_names.append(record.id)
+ matrix[i, : len(record.seq)] = list(str(record.seq))
+
+ return matrix, original_names
+
+
+def calculate_trim_positions(sequence_matrix, gap_threshold):
+ numeric_matrix = np.where(sequence_matrix == "-", 0, 1)
+ num_rows = numeric_matrix.shape[0]
+ column_sums = np.sum(numeric_matrix, axis=0)
+ column_sums_percentage = column_sums / num_rows
+ start_position = np.argmax(column_sums_percentage > gap_threshold)
+ end_position = (
+ len(column_sums_percentage)
+ - np.argmax(column_sums_percentage[::-1] > gap_threshold)
+ - 1
+ )
+
+ return start_position, end_position
+
+
+def write_trimmed_sequences(
+ sequence_matrix_trimmed, original_sequence_names, out_fasta
+):
+ trimmed_records = []
+ for i, sequence in enumerate(sequence_matrix_trimmed):
+ trimmed_sequence = "".join(map(str, sequence))
+ original_name = original_sequence_names[i]
+ trimmed_record = SeqIO.SeqRecord(
+ Seq(trimmed_sequence), id=original_name, description=""
+ )
+ trimmed_records.append(trimmed_record)
+
+ with open(out_fasta, "w") as output_fasta:
+ SeqIO.write(trimmed_records, output_fasta, "fasta")
+
+
+def trim_msa(alignment, gap_threshold, out_fasta):
+ sequence_matrix, original_sequence_names = read_fasta_to_matrix(alignment)
+ start_position, end_position = calculate_trim_positions(
+ sequence_matrix, gap_threshold
+ )
+ sequence_matrix_trimmed = sequence_matrix[:, start_position : end_position + 1]
+ write_trimmed_sequences(sequence_matrix_trimmed, original_sequence_names, out_fasta)
+
+
+def main(args=None):
+ args = parse_args(args)
+ trim_msa(args.alignment, args.gap_threshold, args.out_fasta)
+
+
+if __name__ == "__main__":
+ sys.exit(main())
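
A toy worked example of the end-clipping logic above, with `gap_threshold = 0.5`. Note that gappy interior columns survive; only the alignment ends are trimmed:

```python
import numpy as np

msa = np.array([list("--AC-T--"),
                list("-TACGT--"),
                list("--AC-TG-")])
occupancy = np.sum(msa != "-", axis=0) / msa.shape[0]  # non-gap fraction per column
# occupancy: [0, 0.33, 1, 1, 0.33, 1, 0.33, 0]
start = np.argmax(occupancy > 0.5)                           # first kept column -> 2
end = len(occupancy) - np.argmax(occupancy[::-1] > 0.5) - 1  # last kept column -> 5
print(["".join(row) for row in msa[:, start:end + 1]])       # ['AC-T', 'ACGT', 'AC-T']
```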
diff --git a/bin/extract_family_reps.py b/bin/extract_family_reps.py
new file mode 100755
index 0000000..afae2ed
--- /dev/null
+++ b/bin/extract_family_reps.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python
+
+## Originally written by Evangelos Karatzas and released under the MIT license.
+## See git repository (https://github.com/nf-core/proteinfamilies) for full license text.
+
+import sys
+import os
+import gzip
+import argparse
+import csv
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+
+def parse_args(args=None):
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-f",
+ "--full_msa_folder",
+ required=True,
+ metavar="FOLDER",
+ type=str,
+ help="Input folder with fasta full alignments.",
+ )
+ parser.add_argument(
+ "-m",
+ "--metadata",
+ required=True,
+ metavar="FILE",
+ type=str,
+ help="Name of the output csv file with family ids, sizes and representative sequences.",
+ )
+ parser.add_argument(
+ "-o",
+ "--out_fasta",
+ required=True,
+ metavar="FILE",
+ type=str,
+ help="Name of the output fasta file with family representative sequences.",
+ )
+ return parser.parse_args(args)
+
+
+def extract_first_sequences(msa_folder, metadata_file, out_fasta):
+ # Open the output FASTA file in write mode
+ with open(out_fasta, "w") as fasta_out, open(
+ metadata_file, "w", newline=""
+ ) as csv_out:
+ # Write custom metadata lines to the metadata file
+ csv_out.write(
+ '# id: "family_metadata"\n'
+ '# section_name: "Family Metadata"\n'
+ '# description: "Family metadata table containing family ids and sizes along with representative sequences, ids and lengths."\n'
+ '# format: "csv"\n'
+ '# plot_type: "table"\n'
+ )
+ csv_writer = csv.writer(csv_out, quoting=csv.QUOTE_NONNUMERIC)
+ # Write the CSV header
+ csv_writer.writerow(
+ [
+ "Sample Name",
+ "Family Id",
+ "Size",
+ "Representative Length",
+ "Representative Id",
+ "Sequence",
+ ]
+ )
+
+ # Iterate over all files in the MSA folder
+ for filename in os.listdir(msa_folder):
+ filepath = os.path.join(msa_folder, filename)
+ # Parse the MSA fasta file and extract the first sequence
+ with (
+ gzip.open(filepath, "rt")
+ if filepath.endswith(".gz")
+ else open(filepath, "r")
+ ) as fasta_file:
+ format = "stockholm" if filepath.endswith(".sto.gz") else "fasta"
+ records = list(SeqIO.parse(fasta_file, format))
+ family_size = len(records)
+ if records:
+ first_record = records[0]
+ # Remove gaps from the sequence, and convert all to upper case
+ cleaned_sequence = (
+ str(first_record.seq).replace("-", "").replace(".", "").upper()
+ )
+ # Modify the ID to only include the part before the first space
+ cleaned_id = first_record.id.split(" ")[0]
+ # Create a new SeqRecord with the cleaned sequence and ID
+ cleaned_record = SeqRecord(
+ Seq(cleaned_sequence), id=cleaned_id, description=""
+ )
+ # Write the cleaned sequence to the FASTA file
+ SeqIO.write(cleaned_record, fasta_out, "fasta")
+ # Write the mapping to the CSV file
+ family_name = os.path.splitext(os.path.splitext(filename)[0])[0]
+ csv_writer.writerow(
+ [
+ family_name,
+ family_name,
+ family_size,
+ len(cleaned_sequence),
+ cleaned_id,
+ cleaned_sequence,
+ ]
+ )
+
+
+def main(args=None):
+ args = parse_args(args)
+ extract_first_sequences(args.full_msa_folder, args.metadata, args.out_fasta)
+
+
+if __name__ == "__main__":
+ sys.exit(main())
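
The `#`-prefixed header lines written above follow the MultiQC custom-content convention, which is why the CSV is rendered as a table in the report. For illustration, the head of the emitted metadata file would look roughly like this (family names and values are hypothetical):

```csv
# id: "family_metadata"
# section_name: "Family Metadata"
# description: "Family metadata table containing family ids and sizes along with representative sequences, ids and lengths."
# format: "csv"
# plot_type: "table"
"Sample Name","Family Id","Size","Representative Length","Representative Id","Sequence"
"sample1_1","sample1_1",42,117,"seq_007","MKTAYIAK..."
```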
diff --git a/bin/filter_non_redundant_hmms.py b/bin/filter_non_redundant_hmms.py
new file mode 100755
index 0000000..6434a95
--- /dev/null
+++ b/bin/filter_non_redundant_hmms.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+
+## Originally written by Evangelos Karatzas and released under the MIT license.
+## See git repository (https://github.com/nf-core/proteinfamilies) for full license text.
+
+import os
+import sys
+import argparse
+import shutil
+
+
+def parse_args(args=None):
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-s",
+ "--seqs",
+ required=True,
+ metavar="FOLDER",
+ type=str,
+ help="Filtered fasta files to grab names from.",
+ )
+ parser.add_argument(
+ "-m",
+ "--models",
+ required=True,
+ metavar="FOLDER",
+ type=str,
+ help="All family HMMs.",
+ )
+ parser.add_argument(
+ "-o",
+ "--out_folder",
+ required=True,
+ metavar="FOLDER",
+ type=str,
+ help="Name of the output folder file with the filtered HMMs.",
+ )
+ return parser.parse_args(args)
+
+
+def filter_non_redundant_hmms(seqs, models, out_folder):
+ seq_basenames = {
+ os.path.basename(f).split(".")[0]
+ for f in os.listdir(seqs)
+ if f.endswith(".fasta.gz") or f.endswith(".fasta")
+ }
+ # Iterate through the models folder and copy matching files
+ for model_file in os.listdir(models):
+ if model_file.endswith(".hmm.gz"):
+ model_basename = os.path.basename(model_file).split(".")[0]
+            # When no additional sequences were fished, fasta files are named by chunk id only
+            model_chunk = model_basename.split("_")[-1]
+            if model_basename in seq_basenames or model_chunk in seq_basenames:
+ src = os.path.join(models, model_file)
+ dst = os.path.join(out_folder, model_file)
+ shutil.copy(src, dst)
+
+
+def main(args=None):
+ args = parse_args(args)
+
+ os.makedirs(args.out_folder, exist_ok=True)
+ filter_non_redundant_hmms(args.seqs, args.models, args.out_folder)
+
+
+if __name__ == "__main__":
+ sys.exit(main())
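
The condition fixed above is a classic Python precedence pitfall: `in` binds tighter than `or`, so `a or b in s` parses as `a or (b in s)`, and any non-empty string on the left makes the whole expression truthy. A short demonstration:

```python
seq_basenames = {"chunk_7"}
model_basename, model_chunk = "sample_chunk_7", "chunk_7"

# Buggy form: short-circuits on the truthy string, never tests membership of model_basename
print(model_basename or model_chunk in seq_basenames)  # prints 'sample_chunk_7'

# Intended form: two explicit membership tests
print(model_basename in seq_basenames or model_chunk in seq_basenames)  # True
```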
diff --git a/bin/filter_recruited.py b/bin/filter_recruited.py
new file mode 100755
index 0000000..e9044f8
--- /dev/null
+++ b/bin/filter_recruited.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python
+
+## Originally written by Evangelos Karatzas and released under the MIT license.
+## See git repository (https://github.com/nf-core/proteinfamilies) for full license text.
+
+import sys
+import argparse
+import gzip
+import re
+from Bio import SeqIO
+
+
+def parse_args(args=None):
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-d",
+ "--domtbl",
+ required=True,
+ metavar="FILE",
+ type=str,
+ help="Domain summary annotations result from hmmsearch.",
+ )
+ parser.add_argument(
+ "-f",
+ "--fasta",
+ required=True,
+ metavar="FILE",
+ type=str,
+ help="Input fasta file containing all protein names with their amino acid sequence for mapping.",
+ )
+ parser.add_argument(
+ "-l",
+ "--length_threshold",
+ required=True,
+ metavar="FLOAT",
+ type=float,
+ help="Minimum length percentage threshold of annotated domain (env) against query to keep.",
+ )
+ parser.add_argument(
+ "-o",
+ "--out_fasta",
+ required=True,
+ metavar="FILE",
+ type=str,
+ help="Name of the output fasta file with the fasta converted sequences (no gaps).",
+ )
+ return parser.parse_args(args)
+
+
+def filter_sequences(domtbl, length_threshold):
+ filtered_sequences = []
+
+ with gzip.open(domtbl, "rt", encoding="utf-8") as file:
+ for line in file:
+ if line.startswith("#"):
+ continue # Skip comments
+
+ columns = line.split()
+ try:
+ qlen = float(columns[5])
+ env_from = int(columns[19])
+ env_to = int(columns[20])
+ env_length = env_to - env_from + 1
+
+ if env_length >= length_threshold * qlen:
+ sequence_name = columns[0]
+
+ filtered_sequences.append(f"{sequence_name}/{env_from}-{env_to}")
+ except (IndexError, ValueError):
+ continue # Skip malformed lines
+
+ return filtered_sequences
+
+
+def validate_and_parse_hit_name(hit):
+ """
+ Validates and parses a hit string.
+ The hit must contain a string, at least one '/', and a valid range (integer-integer) after the last '/'.
+
+ Args:
+ hit (str): The hit string to validate and parse.
+
+ Returns:
+ tuple: (sequence_name, env_from, env_to) if the hit is valid.
+
+ Raises:
+ ValueError: If the hit is invalid.
+ """
+ # Define the regex pattern
+ pattern = r"^(.*)/(\d+)-(\d+)$"
+
+ # Match the pattern
+ match = re.match(pattern, hit)
+ if not match:
+ raise ValueError(f"Skipping hit with invalid format: {hit}.")
+
+ # Extract components
+ sequence_name = match.group(1) # Everything before the last '/'
+ env_from = int(match.group(2)) # First integer in the range
+ env_to = int(match.group(3)) # Second integer in the range
+
+ return sequence_name, env_from, env_to
+
+
+def extract_fasta_subset(filtered_sequences, fasta, out_fasta):
+ open_func = gzip.open if fasta.endswith(".gz") else open
+ with open_func(fasta, "rt") as in_fasta:
+ fasta_dict = {record.id: str(record.seq) for record in SeqIO.parse(in_fasta, "fasta")}
+
+ with gzip.open(out_fasta, "wt") as out_file:
+ for filtered_sequence in filtered_sequences:
+ try:
+ sequence_name, env_from, env_to = validate_and_parse_hit_name(filtered_sequence)
+
+ # Get the original sequence
+ original_record = fasta_dict[sequence_name]
+
+ # Extract the specific range (adjust indices for 0-based indexing)
+ extracted_seq = original_record[env_from-1:env_to]
+
+ # Determine the new sequence ID
+ if len(extracted_seq) == len(original_record):
+ new_id = sequence_name # Omit range if full-length
+ else:
+ new_id = f"{sequence_name}/{env_from}-{env_to}"
+
+ out_file.write(f">{new_id}\n{extracted_seq}\n")
+ except KeyError:
+ print(f"Sequence {sequence_name} not found in the input FASTA.", file=sys.stderr)
+ except ValueError as e:
+ print(e, file=sys.stderr)
+
+
+def filter_recruited(domtbl, fasta, length_threshold, out_fasta):
+ filtered_sequences = filter_sequences(domtbl, length_threshold)
+ extract_fasta_subset(filtered_sequences, fasta, out_fasta)
+
+
+def main(args=None):
+ args = parse_args(args)
+ filter_recruited(args.domtbl, args.fasta, args.length_threshold, args.out_fasta)
+
+
+if __name__ == "__main__":
+ sys.exit(main())
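
Both this script and `branch_hits_fasta.py` parse hit names with the same regex; the greedy first group consumes everything up to the last `/`, so sequence ids that themselves contain `/` are handled correctly. Illustrative behaviour:

```python
import re

pattern = r"^(.*)/(\d+)-(\d+)$"  # same pattern as validate_and_parse_hit_name
for hit in ["seq_1/5-120", "tr|P1/2|x/30-80", "seq_without_range"]:
    m = re.match(pattern, hit)
    print(hit, "->", m.groups() if m else "invalid")
# seq_1/5-120 -> ('seq_1', '5', '120')
# tr|P1/2|x/30-80 -> ('tr|P1/2|x', '30', '80')
# seq_without_range -> invalid
```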
diff --git a/bin/remove_redundant_fams.py b/bin/remove_redundant_fams.py
new file mode 100755
index 0000000..29d2f81
--- /dev/null
+++ b/bin/remove_redundant_fams.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python
+
+## Originally written by Evangelos Karatzas and released under the MIT license.
+## See git repository (https://github.com/nf-core/proteinfamilies) for full license text.
+
+import os
+import sys
+import pandas as pd
+import argparse
+import shutil
+
+
+def parse_args(args=None):
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-m",
+ "--mapping",
+ required=True,
+ metavar="FILE",
+ type=str,
+ help="CSV metadata mapping input.",
+ )
+ parser.add_argument(
+ "-d",
+ "--domtbl",
+ required=True,
+ metavar="FILE",
+ type=str,
+ help="TSV hmmsearch domtbl out results for filtering.",
+ )
+ parser.add_argument(
+ "-f",
+ "--fasta_folder",
+ required=True,
+ metavar="FOLDER",
+ type=str,
+ help="Name of the input folder file with the pre-filtered fasta.",
+ )
+ parser.add_argument(
+ "-l",
+ "--length_threshold",
+ required=True,
+ metavar="FLOAT",
+ type=float,
+ help="Minimum length percentage threshold of annotated domain (env) against query to keep.",
+ )
+ parser.add_argument(
+ "-o",
+ "--out_folder",
+ required=True,
+ metavar="FOLDER",
+ type=str,
+ help="Name of the output folder file with the filtered fasta.",
+ )
+ return parser.parse_args(args)
+
+
+def remove_self_hits(domtbl_df, representative_to_family):
+ domtbl_df["target name"] = domtbl_df["target name"].map(representative_to_family)
+ domtbl_df = domtbl_df[domtbl_df["target name"] != domtbl_df["query name"]]
+
+ return domtbl_df
+
+
+def filter_by_length(domtbl_df, length_threshold):
+ domtbl_df = domtbl_df[
+ (domtbl_df["env to"] - domtbl_df["env from"] + 1) / domtbl_df["qlen"]
+ >= length_threshold
+ ]
+
+ return domtbl_df
+
+
+def remove_redundant_fams(mapping, domtbl, fasta_folder, length_threshold, out_folder):
+ mapping_df = pd.read_csv(
+ mapping, comment="#", usecols=["Family Id", "Size", "Representative Id"]
+ )
+ domtbl_df = pd.read_csv(
+ domtbl, sep=r"\s+", comment="#", header=None, usecols=[0, 3, 5, 19, 20]
+ ).rename(
+ columns={
+ 0: "target name",
+ 3: "query name",
+ 5: "qlen",
+ 19: "env from",
+ 20: "env to",
+ }
+ )
+
+ representative_to_family = dict(
+ zip(mapping_df["Representative Id"], mapping_df["Family Id"])
+ )
+ family_to_size = dict(zip(mapping_df["Family Id"], mapping_df["Size"]))
+
+ domtbl_df = remove_self_hits(domtbl_df, representative_to_family)
+ domtbl_df = filter_by_length(domtbl_df, length_threshold)
+ domtbl_df = domtbl_df.drop(columns=["qlen", "env from", "env to"])
+ domtbl_df["query size"] = domtbl_df["query name"].map(family_to_size)
+ domtbl_df["target size"] = domtbl_df["target name"].map(family_to_size)
+
+ redundant_fam_names = set()
+ for _, row in domtbl_df.iterrows():
+ if row["query size"] < row["target size"]:
+ redundant_fam_names.add(row["query name"])
+ else:
+ redundant_fam_names.add(row["target name"])
+
+ for file_name in os.listdir(fasta_folder):
+ base_name = os.path.basename(file_name).split(".")[0]
+ if base_name not in redundant_fam_names:
+ source_file = os.path.join(fasta_folder, file_name)
+ destination_file = os.path.join(out_folder, file_name)
+
+ # Check if it is a file (not a directory) and copy
+ if os.path.isfile(source_file):
+ shutil.copy2(source_file, destination_file)
+
+
+def main(args=None):
+ args = parse_args(args)
+
+ os.makedirs(args.out_folder, exist_ok=True)
+ remove_redundant_fams(
+ args.mapping,
+ args.domtbl,
+ args.fasta_folder,
+ args.length_threshold,
+ args.out_folder,
+ )
+
+
+if __name__ == "__main__":
+ sys.exit(main())
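
The redundancy rule implemented above keeps the larger of any two families whose representative hits the other's model. A toy illustration of the decision (family names and sizes are made up):

```python
import pandas as pd

hits = pd.DataFrame({"query name": ["fam_a"], "target name": ["fam_b"]})
family_to_size = {"fam_a": 40, "fam_b": 95}
hits["query size"] = hits["query name"].map(family_to_size)
hits["target size"] = hits["target name"].map(family_to_size)

redundant = {
    row["query name"] if row["query size"] < row["target size"] else row["target name"]
    for _, row in hits.iterrows()
}
print(redundant)  # {'fam_a'} -- the smaller family is dropped
```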
diff --git a/bin/remove_redundant_seqs.py b/bin/remove_redundant_seqs.py
new file mode 100755
index 0000000..57737ba
--- /dev/null
+++ b/bin/remove_redundant_seqs.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+
+## Originally written by Evangelos Karatzas and released under the MIT license.
+## See git repository (https://github.com/nf-core/proteinfamilies) for full license text.
+
+import sys
+import gzip
+import argparse
+import csv
+from Bio import SeqIO
+
+
+def parse_args(args=None):
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-c",
+ "--clustering",
+ required=True,
+ metavar="FILE",
+ type=str,
+ help="TSV clustering file input.",
+ )
+ parser.add_argument(
+ "-s",
+ "--sequences",
+ required=True,
+ metavar="FILE",
+ type=str,
+ help="Initial sequences FASTA file.",
+ )
+ parser.add_argument(
+ "-o",
+ "--out_fasta",
+ required=True,
+ metavar="FILE",
+ type=str,
+ help="Name of the output fasta file with family representative sequences.",
+ )
+ return parser.parse_args(args)
+
+
+def extract_rep_sequences(clustering, sequences, out_fasta):
+ # Read the clustering file and extract unique values from column 1
+ unique_representatives = set()
+ with open(clustering, "r") as tsv_file:
+ reader = csv.reader(tsv_file, delimiter="\t")
+ for row in reader:
+ if row: # Ensure the row is not empty
+ unique_representatives.add(row[0])
+
+ # Read the sequences file and filter for representatives
+ matching_records = []
+ with (
+ gzip.open(sequences, "rt")
+ if sequences.endswith(".gz")
+ else open(sequences, "r")
+ ) as fasta_file:
+ for record in SeqIO.parse(fasta_file, "fasta"):
+ if record.id in unique_representatives:
+ matching_records.append(record)
+
+ # Write the matching sequences to the output fasta file
+ with open(out_fasta, "w") as output_file:
+ SeqIO.write(matching_records, output_file, "fasta")
+
+
+def main(args=None):
+ args = parse_args(args)
+ extract_rep_sequences(args.clustering, args.sequences, args.out_fasta)
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/conf/base.config b/conf/base.config
index 507f2b0..d3959cc 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -10,7 +10,6 @@
process {
- // TODO nf-core: Check the defaults for all processes
cpus = { 1 * task.attempt }
memory = { 6.GB * task.attempt }
time = { 4.h * task.attempt }
@@ -24,7 +23,6 @@ process {
// These labels are used and recognised by default in DSL2 files hosted on nf-core/modules.
// If possible, it would be nice to keep the same label naming convention when
// adding in your local modules too.
- // TODO nf-core: Customise requirements for specific processes.
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
withLabel:process_single {
cpus = { 1 }
@@ -59,4 +57,12 @@ process {
errorStrategy = 'retry'
maxRetries = 2
}
+
+ withName: CHUNK_CLUSTERS {
+ memory = { 36.GB * task.attempt }
+ }
+ withName: FILTER_RECRUITED {
+ memory = { 36.GB * task.attempt }
+ }
+
}
diff --git a/conf/modules.config b/conf/modules.config
index f0b0d55..596387b 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -18,6 +18,482 @@ process {
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
+ withName: 'UNTAR_HMM' {
+ publishDir = [
+ path: { "${params.outdir}/untar/hmm" },
+ mode: params.publish_dir_mode,
+ enabled: false,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: 'UNTAR_MSA' {
+ publishDir = [
+ path: { "${params.outdir}/untar/msa" },
+ mode: params.publish_dir_mode,
+ enabled: false,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: 'CAT_HMM' {
+ publishDir = [
+ path: { "${params.outdir}/update_families/hmmer/concatenated" },
+ mode: params.publish_dir_mode,
+ enabled: false,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: 'CAT_FASTA' {
+ ext.prefix = { "${meta.id}_${meta.family}" }
+ tag = { "${meta.id}_${meta.family}" }
+ publishDir = [
+ path: { "${params.outdir}/update_families/fasta" },
+ mode: params.publish_dir_mode,
+ enabled: false,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: '.*:REMOVE_REDUNDANCY:CAT_CAT' {
+ publishDir = [
+ path: { "${params.outdir}/remove_redundancy/hmmer/concatenated" },
+ mode: params.publish_dir_mode,
+ enabled: false,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: 'SEQKIT_SEQ' {
+ ext.args = "--remove-gaps"
+ ext.prefix = { "${meta.id}_${meta.family}" }
+ tag = { "${meta.id}_${meta.family}" }
+ publishDir = [
+ path: { "${params.outdir}/seqkit/" },
+ mode: params.publish_dir_mode,
+ enabled: false,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: 'MMSEQS_CREATEDB' {
+ publishDir = [
+ path: { "${params.outdir}/mmseqs/initial_clustering/mmseqs_createdb/" },
+ mode: params.publish_dir_mode,
+ enabled: params.save_mmseqs_db,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: '.*:UPDATE_FAMILIES:EXECUTE_CLUSTERING:MMSEQS_CREATEDB' {
+ ext.prefix = { "${meta.family}" }
+ tag = { "${meta.family}_${meta.id}" }
+ publishDir = [
+ path: { "${params.outdir}/mmseqs/update_families/mmseqs_createdb/" },
+ mode: params.publish_dir_mode,
+ enabled: false,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: '.*:REMOVE_REDUNDANCY:EXECUTE_CLUSTERING:MMSEQS_CREATEDB' {
+ ext.prefix = { "${meta.id}_${meta.chunk}" }
+ tag = { "${meta.id}_${meta.chunk}" }
+ publishDir = [
+ path: { "${params.outdir}/mmseqs/redundancy_clustering/mmseqs_createdb/" },
+ mode: params.publish_dir_mode,
+ enabled: false,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: 'MMSEQS_CLUSTER' {
+ ext.args = [
+ "--min-seq-id ${params.cluster_seq_identity}",
+ "-c ${params.cluster_coverage}",
+ "--cov-mode ${params.cluster_cov_mode}",
+ ].join(' ').trim()
+ publishDir = [
+ path: { "${params.outdir}/mmseqs/initial_clustering/mmseqs_cluster/" },
+ mode: params.publish_dir_mode,
+ enabled: params.save_mmseqs_clustering,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: '.*:UPDATE_FAMILIES_CLUSTERING:EXECUTE_CLUSTERING:MMSEQS_CLUSTER' {
+ ext.prefix = { "${meta.family}" }
+ tag = { "${meta.family}_${meta.id}" }
+ ext.args = [
+ "--min-seq-id ${params.cluster_seq_identity_for_redundancy}",
+ "-c ${params.cluster_coverage_for_redundancy}",
+ "--cov-mode ${params.cluster_cov_mode_for_redundancy}",
+ ].join(' ').trim()
+ publishDir = [
+ path: { "${params.outdir}/mmseqs/update_families/mmseqs_cluster/" },
+ mode: params.publish_dir_mode,
+ enabled: false,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: '.*:REMOVE_REDUNDANCY:EXECUTE_CLUSTERING:MMSEQS_CLUSTER' {
+ ext.prefix = { "${meta.id}_${meta.chunk}" }
+ tag = { "${meta.id}_${meta.chunk}" }
+ ext.args = [
+ "--min-seq-id ${params.cluster_seq_identity_for_redundancy}",
+ "-c ${params.cluster_coverage_for_redundancy}",
+ "--cov-mode ${params.cluster_cov_mode_for_redundancy}",
+ ].join(' ').trim()
+ publishDir = [
+ path: { "${params.outdir}/mmseqs/redundancy_clustering/mmseqs_cluster/" },
+ mode: params.publish_dir_mode,
+ enabled: false,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: 'MMSEQS_LINCLUST' {
+ ext.args = [
+ "--min-seq-id ${params.cluster_seq_identity}",
+ "-c ${params.cluster_coverage}",
+ "--cov-mode ${params.cluster_cov_mode}",
+ ].join(' ').trim()
+ publishDir = [
+ path: { "${params.outdir}/mmseqs/initial_clustering/mmseqs_linclust/" },
+ mode: params.publish_dir_mode,
+ enabled: params.save_mmseqs_clustering,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: '.*:UPDATE_FAMILIES:EXECUTE_CLUSTERING:MMSEQS_LINCLUST' {
+ ext.prefix = { "${meta.family}" }
+ tag = { "${meta.family}_${meta.id}" }
+ ext.args = [
+ "--min-seq-id ${params.cluster_seq_identity_for_redundancy}",
+ "-c ${params.cluster_coverage_for_redundancy}",
+ "--cov-mode ${params.cluster_cov_mode_for_redundancy}",
+ ].join(' ').trim()
+ publishDir = [
+ path: { "${params.outdir}/mmseqs/update_families/mmseqs_linclust/" },
+ mode: params.publish_dir_mode,
+ enabled: false,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: '.*:REMOVE_REDUNDANCY:EXECUTE_CLUSTERING:MMSEQS_LINCLUST' {
+ ext.prefix = { "${meta.id}_${meta.chunk}" }
+ tag = { "${meta.id}_${meta.chunk}" }
+ ext.args = [
+ "--min-seq-id ${params.cluster_seq_identity_for_redundancy}",
+ "-c ${params.cluster_coverage_for_redundancy}",
+ "--cov-mode ${params.cluster_cov_mode_for_redundancy}",
+ ].join(' ').trim()
+ publishDir = [
+ path: { "${params.outdir}/mmseqs/redundancy_clustering/mmseqs_linclust/" },
+ mode: params.publish_dir_mode,
+ enabled: false,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: 'MMSEQS_CREATETSV' {
+ publishDir = [
+ path: { "${params.outdir}/mmseqs/initial_clustering/mmseqs_createtsv/" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: '.*:UPDATE_FAMILIES:EXECUTE_CLUSTERING:MMSEQS_CREATETSV' {
+ ext.prefix = { "${meta.family}" }
+ tag = { "${meta.family}_${meta.id}" }
+ publishDir = [
+ path: { "${params.outdir}/mmseqs/update_families/mmseqs_createtsv/" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: '.*:REMOVE_REDUNDANCY:EXECUTE_CLUSTERING:MMSEQS_CREATETSV' {
+ ext.prefix = { "${meta.id}_${meta.chunk}" }
+ tag = { "${meta.id}_${meta.chunk}" }
+ publishDir = [
+ path: { "${params.outdir}/mmseqs/redundancy_clustering/mmseqs_createtsv/" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: 'CHUNK_CLUSTERS' {
+ publishDir = [
+ path: { "${params.outdir}/mmseqs/initial_clustering/filtered_fasta_chunks/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ enabled: false,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: 'FAMSA_ALIGN' {
+ ext.prefix = { "${meta.id}_${meta.chunk}" }
+ tag = { "${meta.id}_${meta.chunk}" }
+ publishDir = [
+ path: { "${params.outdir}/seed_msa/famsa_align/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: '.*:UPDATE_FAMILIES:ALIGN_SEQUENCES:FAMSA_ALIGN' {
+ ext.prefix = { "${meta.family}" }
+ tag = { "${meta.family}_${meta.id}" }
+ publishDir = [
+ path: { "${params.outdir}/update_families/full_msa/famsa_align/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: '.*:REMOVE_REDUNDANCY:ALIGN_SEQUENCES:FAMSA_ALIGN' {
+ ext.prefix = { "${meta.id}_${meta.chunk}" }
+ tag = { "${meta.id}_${meta.chunk}" }
+ publishDir = [
+ path: { "${params.outdir}/full_msa/non_redundant/famsa_align/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: 'MAFFT_ALIGN' {
+ ext.prefix = { "${meta.id}_${meta.chunk}" }
+ tag = { "${meta.id}_${meta.chunk}" }
+ publishDir = [
+ path: { "${params.outdir}/seed_msa/mafft_align/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: '.*:UPDATE_FAMILIES:ALIGN_SEQUENCES:MAFFT_ALIGN' {
+ ext.prefix = { "${meta.family}" }
+ tag = { "${meta.family}_${meta.id}" }
+ publishDir = [
+ path: { "${params.outdir}/update_families/full_msa/mafft_align/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: '.*:REMOVE_REDUNDANCY:ALIGN_SEQUENCES:MAFFT_ALIGN' {
+ ext.prefix = { "${meta.id}_${meta.chunk}" }
+ tag = { "${meta.id}_${meta.chunk}" }
+ publishDir = [
+ path: { "${params.outdir}/full_msa/non_redundant/mafft_align/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: 'CLIPKIT' {
+ ext.args = { "-m gappy --gaps ${params.gap_threshold}" }
+ ext.prefix = { "${meta.id}_${meta.chunk}" }
+ tag = { "${meta.id}_${meta.chunk}" }
+ publishDir = [
+ path: { "${params.outdir}/seed_msa/clipkit/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: '.*:UPDATE_FAMILIES:CLIPKIT' {
+ ext.args = { "-m gappy --gaps ${params.gap_threshold}" }
+ ext.prefix = { "${meta.family}" }
+ tag = { "${meta.family}_${meta.id}" }
+ publishDir = [
+ path: { "${params.outdir}/update_families/full_msa/clipkit/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: 'CLIP_ENDS' {
+ ext.prefix = { "${meta.id}_${meta.chunk}" }
+ tag = { "${meta.id}_${meta.chunk}" }
+ publishDir = [
+ path: { "${params.outdir}/seed_msa/clip_ends/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: '.*:UPDATE_FAMILIES:CLIP_ENDS' {
+ ext.prefix = { "${meta.family}" }
+ tag = { "${meta.family}_${meta.id}" }
+ publishDir = [
+ path: { "${params.outdir}/update_families/full_msa/clip_ends/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: 'HMMER_HMMBUILD' {
+ ext.prefix = { "${meta.id}_${meta.chunk}" }
+ tag = { "${meta.id}_${meta.chunk}" }
+ publishDir = [
+ path: { "${params.outdir}/hmmer/hmmbuild/pre_non_redundant/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ pattern: "*.{hmm.gz}",
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: '.*:UPDATE_FAMILIES:HMMER_HMMBUILD' {
+ ext.prefix = { "${meta.family}" }
+ tag = { "${meta.family}_${meta.id}" }
+ publishDir = [
+ path: { "${params.outdir}/update_families/hmmer/hmmbuild/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ pattern: "*.{hmm.gz}",
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: 'HMMER_HMMSEARCH' {
+ ext.args = { "-E ${params.hmmsearch_evalue_cutoff}" }
+ ext.prefix = { "${meta.id}_${meta.chunk}" }
+ tag = { "${meta.id}_${meta.chunk}" }
+ publishDir = [
+ path: { "${params.outdir}/hmmer/hmmsearch/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ pattern: "*.domtbl.gz",
+ enabled: params.save_hmmsearch_results,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: '.*:UPDATE_FAMILIES:HMMER_HMMSEARCH' {
+ ext.args = { "-E ${params.hmmsearch_evalue_cutoff}" }
+ ext.prefix = { "${meta.id}" }
+ tag = { "${meta.id}" }
+ publishDir = [
+ path: { "${params.outdir}/update_families/hmmer/hmmsearch/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ pattern: "*.{domtbl.gz}",
+ enabled: false,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: '.*:REMOVE_REDUNDANCY:HMMER_HMMSEARCH' {
+ ext.args = { "-E ${params.hmmsearch_evalue_cutoff}" }
+ ext.prefix = { "${meta.id}" }
+ tag = { "${meta.id}" }
+ publishDir = [
+ path: { "${params.outdir}/remove_redundancy/hmmer/hmmsearch/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ pattern: "*.{domtbl.gz}",
+ enabled: false,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: 'FILTER_RECRUITED' {
+ ext.prefix = { "${meta.id}_${meta.chunk}" }
+ tag = { "${meta.id}_${meta.chunk}" }
+ publishDir = [
+ path: { "${params.outdir}/full_msa/pre_non_redundant/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ enabled: false,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: 'HMMER_HMMALIGN' {
+ ext.prefix = { "${meta.id}_${meta.chunk}" }
+ tag = { "${meta.id}_${meta.chunk}" }
+ publishDir = [
+ path: { "${params.outdir}/full_msa/pre_non_redundant/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ pattern: "*.sto.gz",
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: 'REMOVE_REDUNDANT_FAMS' {
+ publishDir = [
+ path: { "${params.outdir}/remove_redundancy/non_redundant_fams/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ enabled: false,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: 'FILTER_NON_REDUNDANT_HMMS' {
+ publishDir = [
+ path: { "${params.outdir}/hmmer/hmmbuild/non_redundant/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: 'REMOVE_REDUNDANT_SEQS' {
+ tag = { "${meta.id}_${meta.chunk}" }
+ publishDir = [
+ path: { "${params.outdir}/remove_redundancy/reps_fasta/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ enabled: false,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: '.*:UPDATE_FAMILIES:REMOVE_REDUNDANT_SEQS' {
+ tag = { "${meta.family}_${meta.id}" }
+ publishDir = [
+ path: { "${params.outdir}/update_families/reps_fasta/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ enabled: false,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: 'EXTRACT_FAMILY_REPS' {
+ publishDir = [
+ path: { "${params.outdir}/family_reps/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: '.*:UPDATE_FAMILIES:EXTRACT_FAMILY_REPS' {
+ publishDir = [
+ path: { "${params.outdir}/update_families/family_reps/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ enabled: false,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: '.*:REMOVE_REDUNDANCY:EXTRACT_FAMILY_REPS' {
+ publishDir = [
+ path: { "${params.outdir}/remove_redundancy/family_reps/${meta.id}/" },
+ mode: params.publish_dir_mode,
+ enabled: false,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: 'BRANCH_HITS_FASTA' {
+ publishDir = [
+ path: { "${params.outdir}/update_families/branch_fasta/" },
+ mode: params.publish_dir_mode,
+ enabled: false,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
withName: 'MULTIQC' {
ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' }
publishDir = [
diff --git a/conf/test.config b/conf/test.config
index 193133f..69abb3f 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -20,10 +20,10 @@ process {
params {
config_profile_name = 'Test profile'
- config_profile_description = 'Minimal test dataset to check pipeline function'
+ config_profile_description = 'Test dataset to check generic pipeline function for creating new families'
// Input data
- // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
- // TODO nf-core: Give any required params for the test so that command line flags are not needed
- input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv'
+ input = params.pipelines_testdata_base_path + 'proteinfamilies/samplesheets/samplesheet.csv'
+ // Clustering
+ cluster_size_threshold = 5
}
diff --git a/conf/test_full.config b/conf/test_full.config
index 50cab15..9d266ef 100644
--- a/conf/test_full.config
+++ b/conf/test_full.config
@@ -12,13 +12,11 @@
params {
config_profile_name = 'Full test profile'
- config_profile_description = 'Full test dataset to check pipeline function'
+    config_profile_description = 'Full test dataset to check the create and update pipeline functions simultaneously, with two input samples'
// Input data for full size test
- // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. SRA)
- // TODO nf-core: Give any required params for the test so that command line flags are not needed
- input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv'
-
- // Fasta references
- fasta = params.pipelines_testdata_base_path + 'viralrecon/genome/NC_045512.2/GCF_009858895.2_ASM985889v3_genomic.200409.fna.gz'
+ // Input data
+ input = params.pipelines_testdata_base_path + 'proteinfamilies/samplesheets/samplesheet_full.csv'
+ // Clustering
+ cluster_size_threshold = 5
}
diff --git a/conf/test_minimal.config b/conf/test_minimal.config
new file mode 100644
index 0000000..268ca85
--- /dev/null
+++ b/conf/test_minimal.config
@@ -0,0 +1,35 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Defines input files and everything required to run a fast and simple pipeline test.
+
+ Use as follows:
+        nextflow run nf-core/proteinfamilies -profile test_minimal,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+process {
+ resourceLimits = [
+ cpus: 4,
+ memory: '15.GB',
+ time: '1.h'
+ ]
+}
+
+params {
+    config_profile_name = 'Minimal test profile'
+ config_profile_description = 'Minimal test dataset to check pipeline function (no MSA clipping, no redundancy checking, no update)'
+
+ // Input data
+ input = params.pipelines_testdata_base_path + 'proteinfamilies/samplesheets/samplesheet.csv'
+ // Clustering
+ cluster_size_threshold = 5
+ // Alignment
+ trim_msa = false
+ recruit_sequences_with_models = false
+ // Redundancy
+ remove_family_redundancy = false
+ remove_sequence_redundancy = false
+}
diff --git a/conf/test_multi_sample_with_gz.config b/conf/test_multi_sample_with_gz.config
new file mode 100644
index 0000000..6deba39
--- /dev/null
+++ b/conf/test_multi_sample_with_gz.config
@@ -0,0 +1,29 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running tests with mixed fasta and fasta.gz inputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Defines input files and everything required to run a fast and simple pipeline test.
+
+ Use as follows:
+        nextflow run nf-core/proteinfamilies -profile test_multi_sample_with_gz,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+process {
+ resourceLimits = [
+ cpus: 4,
+ memory: '15.GB',
+ time: '1.h'
+ ]
+}
+
+params {
+ config_profile_name = 'Test profile'
+ config_profile_description = 'Test dataset to check pipeline function with both fasta and fasta.gz sample inputs at the same time'
+
+ // Input data
+ input = params.pipelines_testdata_base_path + 'proteinfamilies/samplesheets/samplesheet_multi_sample_with_gz.csv'
+ // Clustering
+ cluster_size_threshold = 5
+}
diff --git a/conf/test_update.config b/conf/test_update.config
new file mode 100644
index 0000000..2be1a14
--- /dev/null
+++ b/conf/test_update.config
@@ -0,0 +1,29 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running family update tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Defines input files and everything required to run a fast and simple pipeline test.
+
+ Use as follows:
+        nextflow run nf-core/proteinfamilies -profile test_update,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+process {
+ resourceLimits = [
+ cpus: 4,
+ memory: '15.GB',
+ time: '1.h'
+ ]
+}
+
+params {
+ config_profile_name = 'Test profile'
+ config_profile_description = 'Test dataset to check the update pipeline function (non hit sequences will flow into the create function)'
+
+ // Input data
+ input = params.pipelines_testdata_base_path + 'proteinfamilies/samplesheets/samplesheet_update.csv'
+ // Clustering
+ cluster_size_threshold = 5
+}
diff --git a/docs/images/proteinfamilies_workflow.png b/docs/images/proteinfamilies_workflow.png
new file mode 100644
index 0000000..4268a7d
Binary files /dev/null and b/docs/images/proteinfamilies_workflow.png differ
diff --git a/docs/images/proteinfamilies_workflow.svg b/docs/images/proteinfamilies_workflow.svg
new file mode 100644
index 0000000..f69ccd9
--- /dev/null
+++ b/docs/images/proteinfamilies_workflow.svg
@@ -0,0 +1,4 @@
diff --git a/docs/output.md b/docs/output.md
index b4467df..86eb51a 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -6,15 +6,446 @@ This document describes the output produced by the pipeline. Most of the plots a
The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory.
-
-
## Pipeline overview
The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
+Initial clustering:
+
+- [MMseqs2](#mmseqs2) initial clustering of input amino acid sequences and filtering with membership threshold
+
+Multiple sequence alignment:
+
+- [FAMSA](#famsa) aligner option. Offers the best speed/sensitivity combination for building the seed multiple sequence alignments of the families
+- [mafft](#mafft) aligner option. Fast, but not as sensitive as FAMSA, for building the seed multiple sequence alignments of the families
+- [ClipKIT](#clipkit) to optionally clip gapped portions of the multiple sequence alignment (MSA)
+
+Generating family models:
+
+- [hmmer](#hmmer) to build the family HMM (hmmbuild), to optionally 'fish' additional sequences from the input fasta file into the family with given thresholds (hmmsearch), and to build the family full MSA (hmmalign)
+
+Removing redundancy:
+
+- [hmmer](#hmmer-for-redundancy-removal) to match family representative sequences against the other family models, in order to keep only non-redundant families
+- [MMseqs2](#mmseqs2-for-redundancy-removal) to strictly cluster the sequences within each of the remaining families, capturing the evolutionary diversity within a family without keeping nearly identical sequences
+- [FAMSA](#famsa-for-redundancy-removal) aligner option. Re-align full MSA with final set of sequences
+- [mafft](#mafft-for-redundancy-removal) aligner option. Re-align full MSA with final set of sequences
+
+Updating families:
+
+- [untar](#untar) to decompress tarballs of existing hmms and msas
+- [hmmer](#hmmer-for-updating-families) to match input sequences to existing families with hmmsearch, as well as to rebuild models with newly recruited sequences using hmmbuild
+- [SeqKit](#seqkit) to extract fasta formatted family sequences from their MSA files
+- [MMseqs2](#mmseqs2-for-updating-families) to strictly cluster the sequences within each of the families to update
+- [FAMSA](#famsa-for-updating-families) aligner option. Re-align full MSA with final set of sequences
+- [mafft](#mafft-for-updating-families) aligner option. Re-align full MSA with final set of sequences
+- [ClipKIT](#clipkit-for-updating-families) to optionally clip gapped portions of the multiple sequence alignment (MSA)
+
+Reporting:
+
+- [Extract family representatives](#extract-family-representatives) to produce the final metadata file along with a fasta of all family representative sequences (can be used downstream for structural prediction).
- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
+### MMseqs2
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `mmseqs/`
+ - `initial_clustering/`
+ - `mmseqs_createtsv/`
+ - `.tsv`: tab-separated table containing 2 columns; the first one with the cluster representative sequences, and the second with the cluster members
+ - `mmseqs_createdb/`
+ - `/`
+ - `*`: (optional) mmseqs format db of fasta sequences. Can be turned on with --save_mmseqs_db
+ - `mmseqs_linclust/`
+ - `/`
+ - `*`: (optional) mmseqs format clustered db. Can be turned on with --save_mmseqs_clustering
+ - `mmseqs_cluster/`
+ - `/`
+ - `*`: (optional) mmseqs format clustered db. Can be turned on with --save_mmseqs_clustering
+ - `filtered_fasta_chunks/`
+ - `/`
+ - `chunked_fasta/`
+ - `*.fasta`: (optional) fasta files with amino acid sequences of each cluster above the membership threshold
+
+</details>
+
+The `.tsv` file under `mmseqs_createtsv/` contains the mmseqs clustering of sequences, which will then be filtered by size and split into chunks for further parallel processing.
+The optionally saved `chunked_fasta` folder contains the resulting fasta files of sequences for each cluster.
+These per-cluster fasta files act as input to produce downstream families in the next steps of the pipeline.
+The original mmseqs db and the clustered mmseqs db can be optionally saved to the output folder, but they won't be further utilised in this pipeline.
+
+[MMseqs2](https://github.com/soedinglab/MMseqs2) clusters amino acid fasta files via either the 'cluster' or the 'linclust' algorithms.
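+
+For orientation, the module calls in this step correspond broadly to the following standalone commands (a minimal sketch with hypothetical file names; the pipeline wires up its own databases, temporary directories and thresholds):
+
+```bash
+# build an mmseqs database from the input amino acid fasta
+mmseqs createdb input.fasta seqDB
+# cluster with the linear-time algorithm ('linclust'); 'cluster' is the slower, more sensitive alternative
+mmseqs linclust seqDB clusterDB tmp
+# dump the clustering as a two-column TSV of representative/member pairs
+mmseqs createtsv seqDB seqDB clusterDB clusters.tsv
+```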
+
+### FAMSA aligner
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `seed_msa/`
+ - `famsa_align/`
+ - `/`
+ - `_*.aln`: fasta files with aligned amino acid sequences
+
+</details>
+
+This folder contains the generated seed MSA family files, if `famsa` was chosen as the `--alignment_tool`.
+These MSA files only contain the original sequences of each cluster as calculated by mmseqs.
+
+[FAMSA](https://github.com/refresh-bio/FAMSA) is a progressive algorithm for large-scale multiple sequence alignments.
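+
+As a minimal illustration (hypothetical file names; the pipeline passes its own thread and parameter settings), aligning one cluster's sequences looks like:
+
+```bash
+# align the sequences of one cluster into a seed MSA, using 4 threads
+famsa -t 4 cluster_1.fasta cluster_1.aln
+```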
+
+### mafft aligner
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `seed_msa/`
+ - `mafft_align/`
+ - `/`
+ - `_*.fas`: fasta files with aligned amino acid sequences
+
+</details>
+
+This folder contains the generated seed MSA family files, if `mafft` was chosen as the `--alignment_tool`.
+These MSA files only contain the original sequences of each cluster as calculated by mmseqs.
+
+[mafft](https://github.com/GSLBiotech/mafft) is a fast but not very sensitive multiple sequence alignment tool.
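+
+A minimal sketch of the equivalent standalone call (hypothetical file names; `--auto` lets mafft pick an alignment strategy based on input size):
+
+```bash
+mafft --auto cluster_1.fasta > cluster_1.fas
+```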
+
+### ClipKIT
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `seed_msa/`
+ - `clipkit/`
+ - `/`
+ - `_*.clipkit`: gap-clipped (start, middle, end) fasta files of aligned amino acid sequences
+ - `clip_ends/`
+ - `/`
+ - `_*.clipends`: gap-clipped (only start and end) fasta files of aligned amino acid sequences
+
+</details>
+
+If the `--trim_msa` parameter was set to `true`, then depending on the `--clipping_tool` and the `--gap_threshold`, either
+`clipkit` is run and gaps (above the threshold) are removed throughout the alignment,
+or `clip_ends` is run and gaps (above the threshold) are removed only at the ends.
+Results are stored in the `seed_msa` folder.
+
+[ClipKIT](https://github.com/JLSteenwyk/ClipKIT) is a fast and flexible alignment trimming tool that keeps phylogenetically informative sites and removes others.
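+
+A minimal standalone sketch of the trimming step (hypothetical file names; the gap fraction shown is illustrative and corresponds to the pipeline's `--gap_threshold`):
+
+```bash
+# remove alignment columns whose gap fraction exceeds 0.5
+clipkit cluster_1.aln -m gappy -g 0.5 -o cluster_1.clipkit
+```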
+
+### hmmer
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `hmmer/`
+ - `hmmbuild/`
+ - `/`
+ - `_*.hmm.gz`: compressed hmm model for the family
+ - `_*.hmmbuild.txt`: (optional) hmmbuild execution log
+ - `hmmsearch/`
+ - `/`
+ - `_*.domtbl.gz`: (optional) hmmsearch results along parameters info. Can be turned on with --save_hmmsearch_results
+ - `_*.txt.gz`: (optional) hmmsearch execution log. Can be turned on with --save_hmmsearch_results
+- `full_msa/`
+ - `pre_non_redundant/`
+ - `/`
+ - `_*.sto.gz`: compressed family full MSA produced by hmmalign (before checking for redundancy)
+
+</details>
+
+The `hmmer/hmmbuild` folder contains all originally created family HMMs. These models will be used downstream to 'fish' extra sequences
+into each family if `--recruit_sequences_with_models` is set to true, and/or to remove between-families redundancy if `--remove_family_redundancy` is set to true.
+The models can also be used in the `update_families` execution mode of the pipeline,
+along with the families' respective MSAs, to recruit sequences from a new input fasta file into the families, updating both family HMM and MSA files.
+
+[hmmer](https://github.com/EddyRivasLab/hmmer) searches sequence databases for homologs using profile hidden Markov models, and can build such profiles from multiple sequence alignments.
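+
+For orientation, the three hmmer steps correspond broadly to the following standalone commands (a minimal sketch with hypothetical file names, not the pipeline's exact invocations):
+
+```bash
+# build a profile HMM from the seed alignment
+hmmbuild family_1.hmm family_1.aln
+# optionally recruit additional sequences from the full input fasta, saving per-domain hits
+hmmsearch --domtblout family_1.domtbl family_1.hmm input.fasta
+# align the family sequences back to the model, producing the full MSA in Stockholm format
+hmmalign --trim family_1.hmm family_1_sequences.fasta > family_1.sto
+```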
+
+### hmmer for redundancy removal
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `remove_redundancy/`
+ - `hmmer/`
+ - `concatenated/`
+ - `.hmm.gz`: (optional) concatenated compressed hmm model for all families in a given sample (pre redundancy removal)
+ - `hmmsearch/`
+ - `/`
+ - `_*.domtbl.gz`: (optional) hmmsearch results of family reps against families' HMMs
+ - `family_reps/`
+ - `/`
+ - `_meta_mqc.csv`: (optional) csv with metadata (Sample Name,Family Id,Size,Representative Length,Representative Id,Sequence)
+ - `_reps.fa`: (optional) fasta file of all family representative sequences (one sequence per family)
+ - `non_redundant_fams/`
+ - `/`
+ - `non_redundant/`
+ - `_*.fasta.gz`: (optional) compressed family full MSA (after checking for family redundancy)
+
+</details>
+
+If `--remove_family_redundancy` is set to true, the `hmmer/hmmsearch` module is used
+to identify family representative sequences that are highly identical to other family HMMs.
+In these cases, the smaller-sized families are deemed redundant and flagged for removal.
+These optional `remove_redundancy` folders only contain intermediate pipeline results, which are not saved in the output by default.
+
+[hmmer](https://github.com/EddyRivasLab/hmmer) searches sequence databases for homologs using profile hidden Markov models, and can build such profiles from multiple sequence alignments.
+
+### MMseqs2 for redundancy removal
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `mmseqs/`
+ - `redundancy_clustering/`
+ - `mmseqs_createtsv/`
+ - `.tsv`: tab-separated table containing 2 columns; the first one with the cluster representative sequences, and the second with the cluster members
+ - `mmseqs_createdb/`
+ - `/`
+ - `*`: (optional) mmseqs format db of fasta sequences
+ - `mmseqs_linclust/`
+ - `/`
+ - `*`: (optional) mmseqs format clustered db
+ - `mmseqs_cluster/`
+ - `/`
+ - `*`: (optional) mmseqs format clustered db
+- `remove_redundancy/`
+ - `reps_fasta/`
+ - `/`
+ - `_reps.fa`: (optional) fasta file of all family representative sequences (one sequence per family)
+
+</details>
+
+If `--remove_sequence_redundancy` is set to true, the mmseqs clustering subworkflow will be executed
+to very strictly cluster the in-family sequences (`--cluster_seq_identity_for_redundancy` = 0.97, `--cluster_coverage_for_redundancy` = 0.97,
+`--cluster_cov_mode_for_redundancy` = 0, i.e. bidirectional coverage of both query and target), keeping only cluster representatives
+before recalculating the family MSAs.
+
+[MMseqs2](https://github.com/soedinglab/MMseqs2) clusters amino acid fasta files via either the 'cluster' or the 'linclust' algorithms.
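+
+At the pipeline level, this behaviour maps to the following parameters (an illustrative command; the values shown are those quoted above):
+
+```bash
+nextflow run nf-core/proteinfamilies \
+    -profile docker \
+    --input samplesheet.csv \
+    --outdir results \
+    --remove_sequence_redundancy true \
+    --cluster_seq_identity_for_redundancy 0.97 \
+    --cluster_coverage_for_redundancy 0.97 \
+    --cluster_cov_mode_for_redundancy 0
+```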
+
+### FAMSA for redundancy removal
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `full_msa/`
+ - `non_redundant/`
+ - `famsa_align/`
+ - `/`
+ - `_*.aln`: family full MSA (after checking for sequence redundancy)
+
+</details>
+
+If `--remove_sequence_redundancy` is set to true, then the MSAs will be recalculated after in-family sequence redundancy is removed.
+If the `--alignment_tool` is `famsa`, then this `famsa_align` folder will be created, containing the final MSA files.
+
+[FAMSA](https://github.com/refresh-bio/FAMSA) is a progressive algorithm for large-scale multiple sequence alignments.
+
+### mafft for redundancy removal
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `full_msa/`
+ - `non_redundant/`
+ - `mafft_align/`
+ - `/`
+ - `_*.fas`: fasta files with aligned amino acid sequences (after checking for sequence redundancy)
+
+</details>
+
+If `--remove_sequence_redundancy` is set to true, then the MSAs will be recalculated after in-family sequence redundancy is removed.
+If the `--alignment_tool` is `mafft`, then this `mafft_align` folder will be created, containing the final MSA files.
+
+[mafft](https://github.com/GSLBiotech/mafft) is a fast but not very sensitive multiple sequence alignment tool.
+
+### untar
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `untar/`
+ - `hmm/`
+ - `/`
+ - `.{hmm.gz,hmm}`: (optional) decompressed input hmm tarball
+ - `msa/`
+ - `/`
+ - `.{aln,fas}`: (optional) decompressed input msa tarball
+
+</details>
+
+### hmmer for updating families
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `update_families/`
+ - `hmmer/`
+ - `concatenated/`
+ - `.hmm.gz`: (optional) concatenated compressed HMM models for all families in a given sample, to be used as input for hmmsearch, to determine which families will be updated with new sequences
+ - `hmmsearch/`
+ - `/`
+ - `.domtbl.gz`: (optional) hmmsearch results of input fasta file against existing families' HMMs
+ - `hmmbuild/`
+ - `/`
+ - `.hmm.gz`: (optional) compressed family HMM after the update
+ - `.hmmbuild.txt`: (optional) hmmbuild execution log
+ - `branch_fasta/`
+ - `hits/`
+ - `.fasta`: (optional) subset of the input FASTA with hit sequences for each existing family
+ - `.fasta.gz`: (optional) FASTA file that contains all remaining non-hit input sequences, which will be passed to normal execution mode to create new families
+ - `family_reps/`
+ - `/`
+ - `_meta_mqc.csv`: (optional) csv with metadata (Sample Name,Family Id,Size,Representative Length,Representative Id,Sequence)
+ - `_reps.fa`: (optional) fasta file of all family representative sequences (one sequence per family)
+
+</details>
+
+The `update_families` execution mode is run if paths to `existing_hmms_to_update` and `existing_msas_to_update` are provided in the input samplesheet.csv.
+The `hmmer/hmmsearch` module is used to match new incoming sequences against the existing family models.
+If there are hits, the new sequences are reclustered along with the existing sequences of their matching families, and new models are built with `hmmer/hmmbuild`
+from the respective new MSAs, in the `update_families/hmmer/hmmbuild` folder.
+
+[hmmer](https://github.com/EddyRivasLab/hmmer) searches sequence databases for homologs using profile hidden Markov models, and can build such profiles from multiple sequence alignments.
+
+### SeqKit
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `seqkit/`
+  - `_.fasta`: (optional) fasta formatted family sequences from the full MSA, with gaps removed
+- `update_families/`
+ - `fasta/`
+    - `_.fasta`: (optional) concatenated family fasta with newly recruited sequences
+
+</details>
+
+The seqkit module is mainly used during the `update_families` mode
+to extract sequences from family MSAs into intermediate fasta files (the `seqkit` output folder).
+The intermediate `update_families/fasta` folder contains the aggregation of existing family sequences along with their newly matching ones,
+which will together produce the updated family MSA.
+
+[SeqKit](https://github.com/shenwei356/seqkit) is a cross-platform and ultrafast toolkit for FASTA/Q file manipulation.
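+
+A minimal sketch of the degapping step (hypothetical file names; `--remove-gaps` strips alignment gaps, recovering plain fasta sequences):
+
+```bash
+seqkit seq --remove-gaps family_1.aln -o family_1.fasta
+```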
+
+### MMseqs2 for updating families
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `mmseqs/`
+ - `update_families/`
+ - `mmseqs_createtsv/`
+ - `.tsv`: tab-separated table containing 2 columns; the first one with the cluster representative sequences, and the second with the cluster members
+ - `mmseqs_createdb/`
+ - `/`
+ - `*`: (optional) mmseqs format db of fasta sequences
+ - `mmseqs_linclust/`
+ - `/`
+ - `*`: (optional) mmseqs format clustered db
+ - `mmseqs_cluster/`
+ - `/`
+ - `*`: (optional) mmseqs format clustered db
+ - `reps_fasta/`
+ - `/`
+ - `_reps.fa`: (optional) fasta file of all family representative sequences (one sequence per family)
+
+</details>
+
+Similarly to the in-family sequence redundancy removal mechanism, the mmseqs suite is used to strictly cluster
+existing family sequences along with newly recruited ones, keeping a non-redundant set.
+The new family representative sequences can then be found in the intermediate `mmseqs/reps_fasta` folder.
+
+[MMseqs2](https://github.com/soedinglab/MMseqs2) clusters amino acid fasta files via either the 'cluster' or the 'linclust' algorithms.
+
+### FAMSA for updating families
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `update_families/`
+ - `full_msa/`
+ - `famsa_align/`
+ - `/`
+ - `.aln`: family full MSA (after updating with new sequences)
+
+</details>
+
+In the `update_families` mode, if new sequences are added to an existing family,
+and after optionally removing in-family sequence redundancy (if `--remove_sequence_redundancy` is set to `true`),
+then the family MSA is recalculated.
+If the `--alignment_tool` is `famsa`, then this `famsa_align` folder will be created, containing the updated family MSA files.
+
+[FAMSA](https://github.com/refresh-bio/FAMSA) is a progressive algorithm for large-scale multiple sequence alignments.
+
+### mafft for updating families
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `update_families/`
+ - `full_msa/`
+ - `mafft_align/`
+ - `/`
+ - `.fas`: family full MSA (after updating with new sequences)
+
+</details>
+
+In the `update_families` mode, if new sequences are added to an existing family,
+and after optionally removing in-family sequence redundancy (if `--remove_sequence_redundancy` is set to `true`),
+then the family MSA is recalculated.
+If the `--alignment_tool` is `mafft`, then this `mafft_align` folder will be created, containing the updated family MSA files.
+
+[mafft](https://github.com/GSLBiotech/mafft) is a fast but not very sensitive multiple sequence alignment tool.
+
+### ClipKIT for updating families
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `update_families/`
+ - `full_msa/`
+ - `clipkit/`
+ - `/`
+ - `.clipkit`: gap-clipped (start, middle, end) fasta files of aligned amino acid sequences
+ - `clip_ends/`
+ - `/`
+ - `.clipends`: gap-clipped (only start and end) fasta files of aligned amino acid sequences
+
+</details>
+
+If the `--trim_msa` parameter was set to `true`, then depending on the `--clipping_tool` and the `--gap_threshold`, either
+`clipkit` is run and gaps (above the threshold) are removed throughout the alignment,
+or `clip_ends` is run and gaps (above the threshold) are removed only at the ends.
+Results are stored in the `update_families/full_msa` folder.
+
+[ClipKIT](https://github.com/JLSteenwyk/ClipKIT) is a fast and flexible alignment trimming tool that keeps phylogenetically informative sites and removes others.
+
+### Extract family representatives
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `family_reps/`
+ - `/`
+ - `_meta_mqc.csv`: csv with metadata to print with MultiQC (Sample Name,Family Id,Size,Representative Length,Representative Id,Sequence)
+ - `_reps.fa`: fasta file of all family representative sequences (one sequence per family)
+- `update_families/`
+ - `family_reps/`
+ - `/`
+ - `_meta_mqc.csv`: csv with metadata to print with MultiQC (Sample Name,Family Id,Size,Representative Length,Representative Id,Sequence)
+ - `_reps.fa`: fasta file of all family representative sequences (one sequence per family)
+
+</details>
+
+These files constitute the final report of the nf-core/proteinfamilies pipeline.
+The `*_meta_mqc.csv` files are used to report family metadata and statistics in the browser, via the MultiQC software.
+The `*_reps.fa` protein fasta file contains all family representative sequences in one place.
+This file can be further used as input to other pipelines, such as nf-core/proteinfold for structural prediction,
+or to fasta annotation pipelines.
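+
+As a quick sanity check before downstream use, the representatives fasta can be inspected with standard tooling, for example (hypothetical file name, assuming seqkit is available):
+
+```bash
+seqkit stats sample1_reps.fa
+```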
+
### MultiQC
@@ -31,6 +462,10 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>.
+Custom output MultiQC data includes a metadata file (`multiqc_data/multiqc_family_metadata.txt`) with family information such as: Sample,Family Id,Size,Representative Length,Representative Id,Sequence
+
+This custom metadata is presented as a data table in the MultiQC report file.
+
### Pipeline information
diff --git a/docs/usage.md b/docs/usage.md
index 555c3af..0ad52f4 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -6,51 +6,44 @@
## Introduction
-
+**nf-core/proteinfamilies** is a bioinformatics pipeline that generates protein families from amino acid sequences and/or updates existing families with new sequences.
+It takes a protein fasta file as input, clusters the sequences, and then generates protein family Hidden Markov Models (HMMs) along with their multiple sequence alignments (MSAs).
+Optionally, paths to existing family HMMs and MSAs can be given (with matching base filenames, one-to-one) in order to update them with new sequences in case of matching hits.
## Samplesheet input
-You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below.
+You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 2 mandatory and 2 optional columns, and a header row as shown in the examples below.
```bash
--input '[path to samplesheet file]'
```
-### Multiple runs of the same sample
-
-The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes:
-
-```csv title="samplesheet.csv"
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz
-CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz
-```
-
-### Full samplesheet
-
-The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below.
-
-A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice.
-
-```csv title="samplesheet.csv"
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz
-CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz
-TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz,
-TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz,
+```csv title="samplesheet.csv"
+sample,fasta,existing_hmms_to_update,existing_msas_to_update
+CONTROL_REP1,amino_acid_sequences_input.fasta,,
+CONTROL_REP2,amino_acid_sequences_extra.fa.gz,existing_hmms.tar.gz,existing_msas.tar.gz
```
-| Column | Description |
-| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
-| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
-| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
-
-An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
+| Column | Description |
+| ------------------------- | -------------------------------------------------------------------------------------------------------------------------------- |
+| `sample` | Custom sample name. Spaces in sample names are automatically converted to underscores (`_`). |
+| `fasta` | Full path to amino acid fasta file. File can be gzipped and allowed extensions include ".fasta", ".fasta.gz", ".fa" or ".fa.gz". |
+| `existing_hmms_to_update` | Full path to a compressed archive of existing family HMMs. The file must have the ".tar.gz" extension.                            |
+| `existing_msas_to_update` | Full path to a compressed archive of existing family MSAs. The file must have the ".tar.gz" extension.                            |
+
+## Parameter specifications
+
+Here we provide guidance regarding some parameter choices; an example command combining them is shown after the list.
+
+- clustering_tool ["cluster", "linclust"]: The mmseqs algorithm used for clustering.
+  The 'cluster' option is very slow and should only be used for small or medium-sized inputs.
+  The 'linclust' option is somewhat less sensitive, but extremely fast for clustering larger datasets.
+- alignment_tool ["famsa", "mafft"]: Multiple Sequence Alignment (MSA) options.
+  The 'famsa' option is generally recommended as the best time/memory/accuracy combination.
+  The 'mafft' option is also very fast and may still produce better alignments in some edge cases.
+- clipping_tool ["clip_ends", "clipkit"]: Options for MSA gap trimming.
+  The 'clipkit' option clips gaps throughout the alignment, while 'clip_ends' clips only at the ends.
+  The authors suggest using 'clip_ends', since gaps inside the sequences may still carry evolutionary significance.
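+
+For example, a run combining these choices could look like the following (illustrative values; adjust to your data):
+
+```bash
+nextflow run nf-core/proteinfamilies \
+    -profile docker \
+    --input samplesheet.csv \
+    --outdir results \
+    --clustering_tool linclust \
+    --alignment_tool famsa \
+    --trim_msa true \
+    --clipping_tool clip_ends
+```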
## Running the pipeline
diff --git a/main.nf b/main.nf
index 7525862..f228396 100644
--- a/main.nf
+++ b/main.nf
@@ -15,7 +15,7 @@
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
-include { PROTEINFAMILIES } from './workflows/proteinfamilies'
+include { PROTEINFAMILIES } from './workflows/proteinfamilies'
include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_proteinfamilies_pipeline'
include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_proteinfamilies_pipeline'
/*
diff --git a/modules.json b/modules.json
index ca041b0..35b13f3 100644
--- a/modules.json
+++ b/modules.json
@@ -5,10 +5,78 @@
"https://github.com/nf-core/modules.git": {
"modules": {
"nf-core": {
+ "cat/cat": {
+ "branch": "master",
+ "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
+ "installed_by": ["modules"]
+ },
+ "clipkit": {
+ "branch": "master",
+ "git_sha": "3ef36024bb95d306c63c7bf9014132e62c7b4755",
+ "installed_by": ["modules"]
+ },
+ "famsa/align": {
+ "branch": "master",
+ "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
+ "installed_by": ["modules"]
+ },
+ "hmmer/hmmalign": {
+ "branch": "master",
+ "git_sha": "03a9f356a1a333923c1177c2912fa7bc61bb46f3",
+ "installed_by": ["modules"]
+ },
+ "hmmer/hmmbuild": {
+ "branch": "master",
+ "git_sha": "03a9f356a1a333923c1177c2912fa7bc61bb46f3",
+ "installed_by": ["modules"]
+ },
+ "hmmer/hmmsearch": {
+ "branch": "master",
+ "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
+ "installed_by": ["modules"]
+ },
+ "mafft/align": {
+ "branch": "master",
+ "git_sha": "868cb0d7fc4862991fb7c2b4cd7289806cd53f81",
+ "installed_by": ["modules"]
+ },
+ "mmseqs/cluster": {
+ "branch": "master",
+ "git_sha": "2dc4c0474a77f5f8709eb970d890ad102e92af6f",
+ "installed_by": ["modules"],
+ "patch": "modules/nf-core/mmseqs/cluster/mmseqs-cluster.diff"
+ },
+ "mmseqs/createdb": {
+ "branch": "master",
+ "git_sha": "2dc4c0474a77f5f8709eb970d890ad102e92af6f",
+ "installed_by": ["modules"]
+ },
+ "mmseqs/createtsv": {
+ "branch": "master",
+ "git_sha": "2dc4c0474a77f5f8709eb970d890ad102e92af6f",
+ "installed_by": ["modules"],
+ "patch": "modules/nf-core/mmseqs/createtsv/mmseqs-createtsv.diff"
+ },
+ "mmseqs/linclust": {
+ "branch": "master",
+ "git_sha": "2dc4c0474a77f5f8709eb970d890ad102e92af6f",
+ "installed_by": ["modules"],
+ "patch": "modules/nf-core/mmseqs/linclust/mmseqs-linclust.diff"
+ },
"multiqc": {
"branch": "master",
"git_sha": "f0719ae309075ae4a291533883847c3f7c441dad",
"installed_by": ["modules"]
+ },
+ "seqkit/seq": {
+ "branch": "master",
+ "git_sha": "60645c2b45e56579de0a0c89416805cae44c1f46",
+ "installed_by": ["modules"]
+ },
+ "untar": {
+ "branch": "master",
+ "git_sha": "3e548877f25a5980a177cc4f81d2d2e8c24164ef",
+ "installed_by": ["modules"]
}
}
},
diff --git a/modules/local/branch_hits_fasta/environment.yml b/modules/local/branch_hits_fasta/environment.yml
new file mode 100644
index 0000000..7ef7d0b
--- /dev/null
+++ b/modules/local/branch_hits_fasta/environment.yml
@@ -0,0 +1,5 @@
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - conda-forge::biopython=1.84
diff --git a/modules/local/branch_hits_fasta/main.nf b/modules/local/branch_hits_fasta/main.nf
new file mode 100644
index 0000000..5f91985
--- /dev/null
+++ b/modules/local/branch_hits_fasta/main.nf
@@ -0,0 +1,39 @@
+process BRANCH_HITS_FASTA {
+ tag "$meta.id"
+ label 'process_single'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/eb/eb3700531c7ec639f59f084ab64c05e881d654dcf829db163539f2f0b095e09d/data' :
+ 'community.wave.seqera.io/library/biopython:1.84--3318633dad0031e7' }"
+
+ input:
+ tuple val(meta) , path(fasta)
+ tuple val(meta2), path(domtbl)
+ val(length_threshold)
+
+ output:
+ tuple val(meta), path("hits/*") , emit: hits
+ tuple val(meta), path("*.fasta.gz"), emit: non_hit_fasta
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ branch_hits_fasta.py \\
+ --fasta ${fasta} \\
+ --domtbl ${domtbl} \\
+ --length_threshold ${length_threshold} \\
+ --hits hits \\
+ --non_hits ${prefix}.fasta.gz
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ python: \$(python --version 2>&1 | sed 's/Python //g')
+ biopython: \$(python -c "import importlib.metadata; print(importlib.metadata.version('biopython'))")
+ END_VERSIONS
+ """
+}
diff --git a/modules/local/chunk_clusters/environment.yml b/modules/local/chunk_clusters/environment.yml
new file mode 100644
index 0000000..7ef7d0b
--- /dev/null
+++ b/modules/local/chunk_clusters/environment.yml
@@ -0,0 +1,5 @@
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - conda-forge::biopython=1.84
diff --git a/modules/local/chunk_clusters/main.nf b/modules/local/chunk_clusters/main.nf
new file mode 100644
index 0000000..14b6e54
--- /dev/null
+++ b/modules/local/chunk_clusters/main.nf
@@ -0,0 +1,42 @@
+process CHUNK_CLUSTERS {
+ tag "$meta.id"
+ label 'process_single'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/eb/eb3700531c7ec639f59f084ab64c05e881d654dcf829db163539f2f0b095e09d/data' :
+ 'community.wave.seqera.io/library/biopython:1.84--3318633dad0031e7' }"
+
+ input:
+ tuple val(meta) , path(clustering)
+ tuple val(meta2), path(sequences)
+ val(size_threshold)
+
+ output:
+ tuple val(meta), path("chunked_fasta/*"), emit: fasta_chunks
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def is_compressed = sequences.getName().endsWith(".gz") ? true : false
+ def fasta_name = sequences.name.replace(".gz", "")
+ """
+ if [ "$is_compressed" == "true" ]; then
+ gzip -c -d $sequences > $fasta_name
+ fi
+
+ chunk_clusters.py \\
+ --clustering ${clustering} \\
+ --sequences ${fasta_name} \\
+ --threshold ${size_threshold} \\
+ --out_folder chunked_fasta
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ python: \$(python --version 2>&1 | sed 's/Python //g')
+ biopython: \$(python -c "import importlib.metadata; print(importlib.metadata.version('biopython'))")
+ END_VERSIONS
+ """
+}
diff --git a/modules/local/clip_ends/environment.yml b/modules/local/clip_ends/environment.yml
new file mode 100644
index 0000000..7ef7d0b
--- /dev/null
+++ b/modules/local/clip_ends/environment.yml
@@ -0,0 +1,5 @@
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - conda-forge::biopython=1.84
diff --git a/modules/local/clip_ends/main.nf b/modules/local/clip_ends/main.nf
new file mode 100644
index 0000000..082cb9e
--- /dev/null
+++ b/modules/local/clip_ends/main.nf
@@ -0,0 +1,35 @@
+process CLIP_ENDS {
+ tag "$meta.id"
+ label 'process_single'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/eb/eb3700531c7ec639f59f084ab64c05e881d654dcf829db163539f2f0b095e09d/data' :
+ 'community.wave.seqera.io/library/biopython:1.84--3318633dad0031e7' }"
+
+ input:
+ tuple val(meta), path(aln)
+ val(gap_threshold)
+
+ output:
+ tuple val(meta), path("*.clipends"), emit: fas
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ clip_ends.py \\
+ --alignment ${aln} \\
+ --gap_threshold ${gap_threshold} \\
+ --out_fasta ${prefix}.clipends
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ python: \$(python --version 2>&1 | sed 's/Python //g')
+ biopython: \$(python -c "import importlib.metadata; print(importlib.metadata.version('biopython'))")
+ END_VERSIONS
+ """
+}
diff --git a/modules/local/extract_family_reps/environment.yml b/modules/local/extract_family_reps/environment.yml
new file mode 100644
index 0000000..7ef7d0b
--- /dev/null
+++ b/modules/local/extract_family_reps/environment.yml
@@ -0,0 +1,5 @@
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - conda-forge::biopython=1.84
diff --git a/modules/local/extract_family_reps/main.nf b/modules/local/extract_family_reps/main.nf
new file mode 100644
index 0000000..3cdeb09
--- /dev/null
+++ b/modules/local/extract_family_reps/main.nf
@@ -0,0 +1,35 @@
+process EXTRACT_FAMILY_REPS {
+ tag "$meta.id"
+ label 'process_single'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/eb/eb3700531c7ec639f59f084ab64c05e881d654dcf829db163539f2f0b095e09d/data' :
+ 'community.wave.seqera.io/library/biopython:1.84--3318633dad0031e7' }"
+
+ input:
+ tuple val(meta), path(aln, stageAs: "aln/*")
+
+ output:
+ tuple val(meta), path("${meta.id}_reps.fa") , emit: fasta
+ tuple val(meta), path("${meta.id}_meta_mqc.csv"), emit: map
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ extract_family_reps.py \\
+ --full_msa_folder aln \\
+ --metadata ${prefix}_meta_mqc.csv \\
+ --out_fasta ${prefix}_reps.fa
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ python: \$(python --version 2>&1 | sed 's/Python //g')
+ biopython: \$(python -c "import importlib.metadata; print(importlib.metadata.version('biopython'))")
+ END_VERSIONS
+ """
+}
diff --git a/modules/local/filter_non_redundant_hmms/environment.yml b/modules/local/filter_non_redundant_hmms/environment.yml
new file mode 100644
index 0000000..7ecb6f2
--- /dev/null
+++ b/modules/local/filter_non_redundant_hmms/environment.yml
@@ -0,0 +1,5 @@
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - conda-forge::python=3.13.1
diff --git a/modules/local/filter_non_redundant_hmms/main.nf b/modules/local/filter_non_redundant_hmms/main.nf
new file mode 100644
index 0000000..072fc16
--- /dev/null
+++ b/modules/local/filter_non_redundant_hmms/main.nf
@@ -0,0 +1,33 @@
+process FILTER_NON_REDUNDANT_HMMS {
+ tag "$meta.id"
+ label 'process_single'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/31/313e1c18a344323886cf97a151ab66d81c1a146fb129558cb9382b69a72d5532/data' :
+ 'community.wave.seqera.io/library/python:b1b4b1f458c605bb' }"
+
+ input:
+ tuple val(meta) , path(seqs, stageAs: "seqs/*")
+ tuple val(meta2), path(models, stageAs: "models/*")
+
+ output:
+ tuple val(meta), path("non_redundant/*"), emit: hmm
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ """
+ filter_non_redundant_hmms.py \\
+ --seqs seqs \\
+ --models models \\
+ --out_folder non_redundant
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ python: \$(python --version 2>&1 | sed 's/Python //g')
+ END_VERSIONS
+ """
+}
diff --git a/modules/local/filter_recruited/environment.yml b/modules/local/filter_recruited/environment.yml
new file mode 100644
index 0000000..7ef7d0b
--- /dev/null
+++ b/modules/local/filter_recruited/environment.yml
@@ -0,0 +1,5 @@
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - conda-forge::biopython=1.84
diff --git a/modules/local/filter_recruited/main.nf b/modules/local/filter_recruited/main.nf
new file mode 100644
index 0000000..92e1384
--- /dev/null
+++ b/modules/local/filter_recruited/main.nf
@@ -0,0 +1,36 @@
+process FILTER_RECRUITED {
+ tag "$meta.id"
+ label 'process_single'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/eb/eb3700531c7ec639f59f084ab64c05e881d654dcf829db163539f2f0b095e09d/data' :
+ 'community.wave.seqera.io/library/biopython:1.84--3318633dad0031e7' }"
+
+ input:
+ tuple val(meta), path(domtbl), path(fa)
+ val(length_threshold)
+
+ output:
+ tuple val(meta), path("*.fasta.gz"), emit: fasta
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ filter_recruited.py \\
+ --domtbl ${domtbl} \\
+ --fasta ${fa} \\
+ --length_threshold ${length_threshold} \\
+ --out_fasta ${prefix}.fasta.gz
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ python: \$(python --version 2>&1 | sed 's/Python //g')
+ biopython: \$(python -c "import importlib.metadata; print(importlib.metadata.version('biopython'))")
+ END_VERSIONS
+ """
+}
diff --git a/modules/local/remove_redundant_fams/environment.yml b/modules/local/remove_redundant_fams/environment.yml
new file mode 100644
index 0000000..07d55ab
--- /dev/null
+++ b/modules/local/remove_redundant_fams/environment.yml
@@ -0,0 +1,5 @@
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - conda-forge::pandas=1.4.3
diff --git a/modules/local/remove_redundant_fams/main.nf b/modules/local/remove_redundant_fams/main.nf
new file mode 100644
index 0000000..2e160fb
--- /dev/null
+++ b/modules/local/remove_redundant_fams/main.nf
@@ -0,0 +1,38 @@
+process REMOVE_REDUNDANT_FAMS {
+ tag "$meta.id"
+ label 'process_single'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/pandas:1.4.3' :
+ 'biocontainers/pandas:1.4.3' }"
+
+ input:
+ tuple val(meta) , path(mapping)
+ tuple val(meta2), path(domtbl)
+ tuple val(meta3), path(fasta, stageAs: "fasta_folder/*")
+ val(length_threshold)
+
+ output:
+ tuple val(meta), path("non_redundant/*"), emit: fasta
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ """
+ remove_redundant_fams.py \\
+ --mapping ${mapping} \\
+ --domtbl ${domtbl} \\
+ --fasta_folder fasta_folder \\
+ --length_threshold ${length_threshold} \\
+ --out_folder non_redundant
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ python: \$(python --version 2>&1 | sed 's/Python //g')
+ pandas: \$(python -c "import importlib.metadata; print(importlib.metadata.version('pandas'))")
+ END_VERSIONS
+ """
+}
diff --git a/modules/local/remove_redundant_seqs/environment.yml b/modules/local/remove_redundant_seqs/environment.yml
new file mode 100644
index 0000000..7ef7d0b
--- /dev/null
+++ b/modules/local/remove_redundant_seqs/environment.yml
@@ -0,0 +1,5 @@
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - conda-forge::biopython=1.84
diff --git a/modules/local/remove_redundant_seqs/main.nf b/modules/local/remove_redundant_seqs/main.nf
new file mode 100644
index 0000000..18ac7d8
--- /dev/null
+++ b/modules/local/remove_redundant_seqs/main.nf
@@ -0,0 +1,41 @@
+process REMOVE_REDUNDANT_SEQS {
+ tag "$meta.id"
+ label 'process_single'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/eb/eb3700531c7ec639f59f084ab64c05e881d654dcf829db163539f2f0b095e09d/data' :
+ 'community.wave.seqera.io/library/biopython:1.84--3318633dad0031e7' }"
+
+ input:
+ tuple val(meta) , path(clustering)
+ tuple val(meta2), path(sequences)
+
+ output:
+ tuple val(meta), path("${meta.id}_reps.fa"), emit: fasta
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def is_compressed = sequences.getName().endsWith(".gz") ? true : false
+ def fasta_name = sequences.name.replace(".gz", "")
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ if [ "$is_compressed" == "true" ]; then
+ gzip -c -d $sequences > $fasta_name
+ fi
+
+ remove_redundant_seqs.py \\
+ --clustering ${clustering} \\
+ --sequences ${fasta_name} \\
+ --out_fasta ${prefix}_reps.fa
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ python: \$(python --version 2>&1 | sed 's/Python //g')
+ biopython: \$(python -c "import importlib.metadata; print(importlib.metadata.version('biopython'))")
+ END_VERSIONS
+ """
+}
diff --git a/modules/nf-core/cat/cat/environment.yml b/modules/nf-core/cat/cat/environment.yml
new file mode 100644
index 0000000..9b01c86
--- /dev/null
+++ b/modules/nf-core/cat/cat/environment.yml
@@ -0,0 +1,5 @@
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - conda-forge::pigz=2.3.4
diff --git a/modules/nf-core/cat/cat/main.nf b/modules/nf-core/cat/cat/main.nf
new file mode 100644
index 0000000..2862c64
--- /dev/null
+++ b/modules/nf-core/cat/cat/main.nf
@@ -0,0 +1,78 @@
+process CAT_CAT {
+ tag "$meta.id"
+ label 'process_low'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/pigz:2.3.4' :
+ 'biocontainers/pigz:2.3.4' }"
+
+ input:
+ tuple val(meta), path(files_in)
+
+ output:
+ tuple val(meta), path("${prefix}"), emit: file_out
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+ def args2 = task.ext.args2 ?: ''
+ def file_list = files_in.collect { it.toString() }
+
+ // choose appropriate concatenation tool depending on input and output format
+
+ // | input | output | command1 | command2 |
+ // |-----------|------------|----------|----------|
+ // | gzipped | gzipped | cat | |
+ // | ungzipped | ungzipped | cat | |
+ // | gzipped | ungzipped | zcat | |
+ // | ungzipped | gzipped | cat | pigz |
+
+ // Use input file ending as default
+ prefix = task.ext.prefix ?: "${meta.id}${getFileSuffix(file_list[0])}"
+ out_zip = prefix.endsWith('.gz')
+ in_zip = file_list[0].endsWith('.gz')
+ command1 = (in_zip && !out_zip) ? 'zcat' : 'cat'
+ command2 = (!in_zip && out_zip) ? "| pigz -c -p $task.cpus $args2" : ''
+ if(file_list.contains(prefix.trim())) {
+ error "The name of the input file can't be the same as for the output prefix in the " +
+ "module CAT_CAT (currently `$prefix`). Please choose a different one."
+ }
+ """
+ $command1 \\
+ $args \\
+ ${file_list.join(' ')} \\
+ $command2 \\
+ > ${prefix}
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' )
+ END_VERSIONS
+ """
+
+ stub:
+ def file_list = files_in.collect { it.toString() }
+ prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}"
+ if(file_list.contains(prefix.trim())) {
+ error "The name of the input file can't be the same as for the output prefix in the " +
+ "module CAT_CAT (currently `$prefix`). Please choose a different one."
+ }
+ """
+ touch $prefix
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' )
+ END_VERSIONS
+ """
+}
+
+// for .gz files also include the second to last extension if it is present. E.g., .fasta.gz
+def getFileSuffix(filename) {
+ def match = filename =~ /^.*?((\.\w{1,5})?(\.\w{1,5}\.gz$))/
+ return match ? match[0][1] : filename.substring(filename.lastIndexOf('.'))
+}
diff --git a/modules/nf-core/cat/cat/meta.yml b/modules/nf-core/cat/cat/meta.yml
new file mode 100644
index 0000000..81778a0
--- /dev/null
+++ b/modules/nf-core/cat/cat/meta.yml
@@ -0,0 +1,43 @@
+name: cat_cat
+description: A module for concatenation of gzipped or uncompressed files
+keywords:
+ - concatenate
+ - gzip
+ - cat
+tools:
+ - cat:
+ description: Just concatenation
+ documentation: https://man7.org/linux/man-pages/man1/cat.1.html
+ licence: ["GPL-3.0-or-later"]
+ identifier: ""
+input:
+ - - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - files_in:
+ type: file
+ description: List of compressed / uncompressed files
+ pattern: "*"
+output:
+ - file_out:
+ - meta:
+ type: file
+ description: Concatenated file. Will be gzipped if file_out ends with ".gz"
+ pattern: "${file_out}"
+ - ${prefix}:
+ type: file
+ description: Concatenated file. Will be gzipped if file_out ends with ".gz"
+ pattern: "${file_out}"
+ - versions:
+ - versions.yml:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+authors:
+ - "@erikrikarddaniel"
+ - "@FriederikeHanssen"
+maintainers:
+ - "@erikrikarddaniel"
+ - "@FriederikeHanssen"
diff --git a/modules/nf-core/cat/cat/tests/main.nf.test b/modules/nf-core/cat/cat/tests/main.nf.test
new file mode 100644
index 0000000..9cb1617
--- /dev/null
+++ b/modules/nf-core/cat/cat/tests/main.nf.test
@@ -0,0 +1,191 @@
+nextflow_process {
+
+ name "Test Process CAT_CAT"
+ script "../main.nf"
+ process "CAT_CAT"
+ tag "modules"
+ tag "modules_nfcore"
+ tag "cat"
+ tag "cat/cat"
+
+ test("test_cat_name_conflict") {
+ when {
+ params {
+ outdir = "${outputDir}"
+ }
+ process {
+ """
+ input[0] =
+ [
+ [ id:'genome', single_end:true ],
+ [
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true),
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.sizes', checkIfExists: true)
+ ]
+ ]
+ """
+ }
+ }
+ then {
+ assertAll(
+ { assert !process.success },
+ { assert process.stdout.toString().contains("The name of the input file can't be the same as for the output prefix") },
+ { assert snapshot(process.out.versions).match() }
+ )
+ }
+ }
+
+ test("test_cat_unzipped_unzipped") {
+ when {
+ params {
+ outdir = "${outputDir}"
+ }
+ process {
+ """
+ input[0] =
+ [
+ [ id:'test', single_end:true ],
+ [
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true),
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.sizes', checkIfExists: true)
+ ]
+ ]
+ """
+ }
+ }
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+ }
+
+
+ test("test_cat_zipped_zipped") {
+ when {
+ params {
+ outdir = "${outputDir}"
+ }
+ process {
+ """
+ input[0] =
+ [
+ [ id:'test', single_end:true ],
+ [
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gff3.gz', checkIfExists: true),
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/alignment/last/contigs.genome.maf.gz', checkIfExists: true)
+ ]
+ ]
+ """
+ }
+ }
+ then {
+ def lines = path(process.out.file_out.get(0).get(1)).linesGzip
+ assertAll(
+ { assert process.success },
+ { assert snapshot(
+ lines[0..5],
+ lines.size(),
+ process.out.versions
+ ).match()
+ }
+ )
+ }
+ }
+
+ test("test_cat_zipped_unzipped") {
+ config './nextflow_zipped_unzipped.config'
+
+ when {
+ params {
+ outdir = "${outputDir}"
+ }
+ process {
+ """
+ input[0] =
+ [
+ [ id:'test', single_end:true ],
+ [
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gff3.gz', checkIfExists: true),
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/alignment/last/contigs.genome.maf.gz', checkIfExists: true)
+ ]
+ ]
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+
+ }
+
+ test("test_cat_unzipped_zipped") {
+ config './nextflow_unzipped_zipped.config'
+ when {
+ params {
+ outdir = "${outputDir}"
+ }
+ process {
+ """
+ input[0] =
+ [
+ [ id:'test', single_end:true ],
+ [
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true),
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.sizes', checkIfExists: true)
+ ]
+ ]
+ """
+ }
+ }
+ then {
+ def lines = path(process.out.file_out.get(0).get(1)).linesGzip
+ assertAll(
+ { assert process.success },
+ { assert snapshot(
+ lines[0..5],
+ lines.size(),
+ process.out.versions
+ ).match()
+ }
+ )
+ }
+ }
+
+ test("test_cat_one_file_unzipped_zipped") {
+ config './nextflow_unzipped_zipped.config'
+ when {
+ params {
+ outdir = "${outputDir}"
+ }
+ process {
+ """
+ input[0] =
+ [
+ [ id:'test', single_end:true ],
+ [
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
+ ]
+ ]
+ """
+ }
+ }
+ then {
+ def lines = path(process.out.file_out.get(0).get(1)).linesGzip
+ assertAll(
+ { assert process.success },
+ { assert snapshot(
+ lines[0..5],
+ lines.size(),
+ process.out.versions
+ ).match()
+ }
+ )
+ }
+ }
+}
diff --git a/modules/nf-core/cat/cat/tests/main.nf.test.snap b/modules/nf-core/cat/cat/tests/main.nf.test.snap
new file mode 100644
index 0000000..b7623ee
--- /dev/null
+++ b/modules/nf-core/cat/cat/tests/main.nf.test.snap
@@ -0,0 +1,147 @@
+{
+ "test_cat_unzipped_unzipped": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test",
+ "single_end": true
+ },
+ "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894"
+ ],
+ "file_out": [
+ [
+ {
+ "id": "test",
+ "single_end": true
+ },
+ "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "24.04.3"
+ },
+ "timestamp": "2023-10-16T14:32:18.500464399"
+ },
+ "test_cat_zipped_unzipped": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test",
+ "single_end": true
+ },
+ "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894"
+ ],
+ "file_out": [
+ [
+ {
+ "id": "test",
+ "single_end": true
+ },
+ "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "24.04.3"
+ },
+ "timestamp": "2023-10-16T14:32:49.642741302"
+ },
+ "test_cat_zipped_zipped": {
+ "content": [
+ [
+ "MT192765.1\tGenbank\ttranscript\t259\t29667\t.\t+\t.\tID=unknown_transcript_1;geneID=orf1ab;gene_name=orf1ab",
+ "MT192765.1\tGenbank\tgene\t259\t21548\t.\t+\t.\tParent=unknown_transcript_1",
+ "MT192765.1\tGenbank\tCDS\t259\t13461\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1",
+ "MT192765.1\tGenbank\tCDS\t13461\t21548\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1",
+ "MT192765.1\tGenbank\tCDS\t21556\t25377\t.\t+\t0\tParent=unknown_transcript_1;gbkey=CDS;gene=S;note=\"structural protein\";product=\"surface glycoprotein\";protein_id=QIK50427.1",
+ "MT192765.1\tGenbank\tgene\t21556\t25377\t.\t+\t.\tParent=unknown_transcript_1"
+ ],
+ 78,
+ [
+ "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894"
+ ]
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "24.04.3"
+ },
+ "timestamp": "2024-07-22T11:51:46.802978"
+ },
+ "test_cat_name_conflict": {
+ "content": [
+ [
+
+ ]
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "24.04.3"
+ },
+ "timestamp": "2024-07-22T11:51:29.45394"
+ },
+ "test_cat_one_file_unzipped_zipped": {
+ "content": [
+ [
+ ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome",
+ "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT",
+ "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG",
+ "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG",
+ "GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT",
+ "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG"
+ ],
+ 374,
+ [
+ "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894"
+ ]
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "24.04.3"
+ },
+ "timestamp": "2024-07-22T11:52:02.774016"
+ },
+ "test_cat_unzipped_zipped": {
+ "content": [
+ [
+ ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome",
+ "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT",
+ "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG",
+ "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG",
+ "GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT",
+ "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG"
+ ],
+ 375,
+ [
+ "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894"
+ ]
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "24.04.3"
+ },
+ "timestamp": "2024-07-22T11:51:57.581523"
+ }
+}
\ No newline at end of file
diff --git a/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config
new file mode 100644
index 0000000..ec26b0f
--- /dev/null
+++ b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config
@@ -0,0 +1,6 @@
+
+process {
+ withName: CAT_CAT {
+ ext.prefix = 'cat.txt.gz'
+ }
+}
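The config above works because the cat/cat module infers output compression from the file extension of the configured `ext.prefix`: naming the output `cat.txt.gz` is what requests gzipped output. A minimal pipeline-level sketch of the same mechanism (the selector body and file name are illustrative, not taken from this diff):

```groovy
process {
    withName: CAT_CAT {
        // a .gz suffix here makes the module gzip its concatenated output;
        // drop the suffix for a plain-text result
        ext.prefix = { "${meta.id}.merged.fasta.gz" }
    }
}
```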
diff --git a/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config
new file mode 100644
index 0000000..fbc7978
--- /dev/null
+++ b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config
@@ -0,0 +1,8 @@
+
+process {
+
+ withName: CAT_CAT {
+ ext.prefix = 'cat.txt'
+ }
+
+}
diff --git a/modules/nf-core/cat/cat/tests/tags.yml b/modules/nf-core/cat/cat/tests/tags.yml
new file mode 100644
index 0000000..37b578f
--- /dev/null
+++ b/modules/nf-core/cat/cat/tests/tags.yml
@@ -0,0 +1,2 @@
+cat/cat:
+ - modules/nf-core/cat/cat/**
diff --git a/modules/nf-core/clipkit/environment.yml b/modules/nf-core/clipkit/environment.yml
new file mode 100644
index 0000000..65c451f
--- /dev/null
+++ b/modules/nf-core/clipkit/environment.yml
@@ -0,0 +1,7 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - "bioconda::clipkit=2.3.0"
diff --git a/modules/nf-core/clipkit/main.nf b/modules/nf-core/clipkit/main.nf
new file mode 100644
index 0000000..2a9ebe3
--- /dev/null
+++ b/modules/nf-core/clipkit/main.nf
@@ -0,0 +1,46 @@
+process CLIPKIT {
+ tag "$meta.id"
+ label 'process_single'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/clipkit:2.3.0--pyhdfd78af_0':
+ 'biocontainers/clipkit:2.3.0--pyhdfd78af_0' }"
+
+ input:
+ tuple val(meta), path(aln)
+
+ output:
+ tuple val(meta), path("*.clipkit"), emit: clipkit
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ clipkit \\
+ $args \\
+ $aln \\
+ -o ${prefix}.clipkit
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ clipkit: \$(clipkit --version |& sed '1!d ; s/clipkit //')
+ END_VERSIONS
+ """
+
+ stub:
+ def args = task.ext.args ?: ''
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ touch ${prefix}.clipkit
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ clipkit: \$(clipkit --version |& sed '1!d ; s/clipkit //')
+ END_VERSIONS
+ """
+}
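The module forwards `task.ext.args` verbatim to the `clipkit` command line, so trimming behaviour is controlled entirely from configuration. A sketch of how a pipeline might select a trimming mode (the mode choice and prefix are assumptions, not part of this diff):

```groovy
process {
    withName: CLIPKIT {
        // '-m smart-gap' selects ClipKIT's smart-gap trimming mode;
        // any other supported mode string could be substituted here
        ext.args   = '-m smart-gap'
        ext.prefix = { "${meta.id}_trimmed" }
    }
}
```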
diff --git a/modules/nf-core/clipkit/meta.yml b/modules/nf-core/clipkit/meta.yml
new file mode 100644
index 0000000..5347ad5
--- /dev/null
+++ b/modules/nf-core/clipkit/meta.yml
@@ -0,0 +1,50 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+name: "clipkit"
+description: ClipKIT is a fast and flexible alignment trimming tool that keeps phylogenetically informative sites and removes those that display characteristics of poor phylogenetic signal.
+keywords:
+ - alignment
+ - trimming
+ - phylogenetics
+tools:
+ - "clipkit":
+ description: "Alignment trimming software for phylogenetics."
+ homepage: "https://jlsteenwyk.com/ClipKIT/"
+ documentation: "https://jlsteenwyk.com/ClipKIT/"
+ tool_dev_url: "https://github.com/JLSteenwyk/ClipKIT"
+ doi: "10.1371/journal.pbio.3001007"
+ licence: ["MIT"]
+ identifier: ""
+
+input:
+ - - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. `[ id:'sample1', single_end:false ]`
+ - aln:
+ type: file
+ description: Multiple sequence alignment file in various supported formats.
+ pattern: "*.{fa,fasta,fna,faa,alnfaa,aln,sto,stk,mauve,alignment,clustal}"
+
+output:
+ - clipkit:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. `[ id:'sample1', single_end:false ]`
+ - "*.clipkit":
+ type: file
+ description: Trimmed multiple sequence alignment file
+ pattern: "*.clipkit"
+ - versions:
+ - "versions.yml":
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+
+authors:
+ - "@vagkaratzas"
+maintainers:
+ - "@vagkaratzas"
diff --git a/modules/nf-core/clipkit/tests/main.nf.test b/modules/nf-core/clipkit/tests/main.nf.test
new file mode 100644
index 0000000..46cc8a0
--- /dev/null
+++ b/modules/nf-core/clipkit/tests/main.nf.test
@@ -0,0 +1,66 @@
+nextflow_process {
+
+ name "Test Process CLIPKIT"
+ script "../main.nf"
+ process "CLIPKIT"
+
+ tag "modules"
+ tag "modules_nfcore"
+ tag "clipkit"
+ tag "gunzip"
+
+ setup {
+ run("GUNZIP") {
+ script "../../gunzip/main.nf"
+ process {
+ """
+ input[0] = [
+ [ id:'PF14720' ], // meta map
+ file(params.modules_testdata_base_path + 'delete_me/hmmer/PF14720_seed.alnfaa.gz', checkIfExists: true)
+ ]
+ """
+ }
+ }
+ }
+
+ test("PF14720 - aln") {
+
+ when {
+ process {
+ """
+ input[0] = GUNZIP.out.gunzip
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+
+ }
+
+ test("PF14720 - aln - stub") {
+
+ options "-stub"
+
+ when {
+ process {
+ """
+ input[0] = GUNZIP.out.gunzip
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+
+ }
+
+}
diff --git a/modules/nf-core/clipkit/tests/main.nf.test.snap b/modules/nf-core/clipkit/tests/main.nf.test.snap
new file mode 100644
index 0000000..55f837d
--- /dev/null
+++ b/modules/nf-core/clipkit/tests/main.nf.test.snap
@@ -0,0 +1,68 @@
+{
+ "PF14720 - aln - stub": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "PF14720"
+ },
+ "PF14720.clipkit:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,1eaf2dc589bc08c1e37cbbddbedfcc80"
+ ],
+ "clipkit": [
+ [
+ {
+ "id": "PF14720"
+ },
+ "PF14720.clipkit:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,1eaf2dc589bc08c1e37cbbddbedfcc80"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.0",
+ "nextflow": "23.04.5"
+ },
+ "timestamp": "2024-11-14T13:11:21.109400968"
+ },
+ "PF14720 - aln": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "PF14720"
+ },
+ "PF14720.clipkit:md5,97e4328b5990f5e5a6241c225ed6d922"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,1eaf2dc589bc08c1e37cbbddbedfcc80"
+ ],
+ "clipkit": [
+ [
+ {
+ "id": "PF14720"
+ },
+ "PF14720.clipkit:md5,97e4328b5990f5e5a6241c225ed6d922"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,1eaf2dc589bc08c1e37cbbddbedfcc80"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.0",
+ "nextflow": "23.04.5"
+ },
+ "timestamp": "2024-11-14T13:11:14.278212183"
+ }
+}
\ No newline at end of file
diff --git a/modules/nf-core/famsa/align/environment.yml b/modules/nf-core/famsa/align/environment.yml
new file mode 100644
index 0000000..08b6f88
--- /dev/null
+++ b/modules/nf-core/famsa/align/environment.yml
@@ -0,0 +1,5 @@
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - bioconda::famsa=2.2.2
diff --git a/modules/nf-core/famsa/align/main.nf b/modules/nf-core/famsa/align/main.nf
new file mode 100644
index 0000000..096d8ff
--- /dev/null
+++ b/modules/nf-core/famsa/align/main.nf
@@ -0,0 +1,53 @@
+
+
+process FAMSA_ALIGN {
+ tag "$meta.id"
+ label 'process_medium'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/famsa:2.2.2--h9f5acd7_0':
+ 'biocontainers/famsa:2.2.2--h9f5acd7_0' }"
+
+ input:
+ tuple val(meta) , path(fasta)
+ tuple val(meta2), path(tree)
+ val(compress)
+
+ output:
+ tuple val(meta), path("*.aln{.gz,}"), emit: alignment
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+ def compress_args = compress ? '-gz' : ''
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ def options_tree = tree ? "-gt import $tree" : ""
+ """
+ famsa $options_tree \\
+ $compress_args \\
+ $args \\
+ -t ${task.cpus} \\
+ ${fasta} \\
+ ${prefix}.aln${compress ? '.gz':''}
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ famsa: \$( famsa -help 2>&1 | head -n 2 | tail -n 1 | sed 's/ version //g' )
+ END_VERSIONS
+ """
+
+ stub:
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ touch ${prefix}.aln${compress ? '.gz' : ''}
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ famsa: \$( famsa -help 2>&1 | head -n 2 | tail -n 1 | sed 's/ version //g' )
+ END_VERSIONS
+ """
+}
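FAMSA_ALIGN takes three inputs: the sequences, an optional guide tree, and a boolean compression flag. When no tree is supplied, the convention (used by the module's own tests below) is to pass an empty meta map with an empty file list. A minimal usage sketch with a hypothetical input file:

```groovy
include { FAMSA_ALIGN } from './modules/nf-core/famsa/align/main'

workflow {
    // hypothetical input channel: one protein family per tuple
    ch_fasta = Channel.of([ [ id:'family1' ], file('family1.fasta') ])

    // no guide tree: empty meta map plus empty file list; the trailing
    // 'true' asks the module to emit a gzipped alignment (family1.aln.gz)
    FAMSA_ALIGN(ch_fasta, [ [:], [] ], true)
}
```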
diff --git a/modules/nf-core/famsa/align/meta.yml b/modules/nf-core/famsa/align/meta.yml
new file mode 100644
index 0000000..c12a99d
--- /dev/null
+++ b/modules/nf-core/famsa/align/meta.yml
@@ -0,0 +1,63 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json
+name: "famsa_align"
+description: Aligns sequences using FAMSA
+keywords:
+ - alignment
+ - MSA
+ - genomics
+tools:
+ - "famsa":
+ description: "Algorithm for large-scale multiple sequence alignments"
+ homepage: "https://github.com/refresh-bio/FAMSA"
+ documentation: "https://github.com/refresh-bio/FAMSA"
+ tool_dev_url: "https://github.com/refresh-bio/FAMSA"
+ doi: "10.1038/srep33964"
+ licence: ["GPL v3"]
+ identifier: biotools:famsa
+input:
+ - - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. `[ id:'test']`
+ - fasta:
+ type: file
+ description: Input sequences in FASTA format
+ pattern: "*.{fa,fasta}"
+ - - meta2:
+ type: map
+ description: |
+ Groovy Map containing tree information
+ e.g. `[ id:'test_tree']`
+ - tree:
+ type: file
+ description: Input guide tree in Newick format
+ pattern: "*.{dnd}"
+ - - compress:
+ type: boolean
+ description: Boolean controlling whether the output MSA is compressed
+ (true enables, false disables). Compression is handled by passing '-gz'
+ to FAMSA along with any other options specified in task.ext.args.
+output:
+ - alignment:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. `[ id:'test']`
+ - "*.aln{.gz,}":
+ type: file
+ description: Alignment file, in FASTA format. May be gzipped or uncompressed,
+ depending on whether compress is set to true or false
+ pattern: "*.aln{.gz,}"
+ - versions:
+ - versions.yml:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+authors:
+ - "@luisas"
+ - "@JoseEspinosa"
+maintainers:
+ - "@luisas"
+ - "@JoseEspinosa"
diff --git a/modules/nf-core/famsa/align/tests/main.nf.test b/modules/nf-core/famsa/align/tests/main.nf.test
new file mode 100644
index 0000000..8e91c30
--- /dev/null
+++ b/modules/nf-core/famsa/align/tests/main.nf.test
@@ -0,0 +1,96 @@
+nextflow_process {
+
+ name "Test Process FAMSA_ALIGN"
+ script "../main.nf"
+ process "FAMSA_ALIGN"
+
+ tag "modules"
+ tag "modules_nfcore"
+ tag "famsa"
+ tag "famsa/align"
+ tag "famsa/guidetree"
+
+ test("sarscov2 - fasta - uncompressed") {
+
+ when {
+ process {
+ """
+ input[0] = [ [ id:'test' ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true)
+ ]
+ input[1] = [[:],[]]
+ input[2] = false
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out.alignment).match("alignment_uncompressed")},
+ { assert snapshot(process.out.versions).match("versions0") }
+ )
+ }
+
+ }
+
+ test("sarscov2 - fasta - compressed") {
+
+ when {
+ process {
+ """
+ input[0] = [ [ id:'test' ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true)
+ ]
+ input[1] = [[:],[]]
+ input[2] = true
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out.alignment).match("alignment_compressed")},
+ { assert snapshot(process.out.versions).match("versions1") }
+ )
+ }
+
+ }
+
+ test("sarscov2 - fasta - guide_tree") {
+
+ setup {
+ run("FAMSA_GUIDETREE") {
+ script "../../guidetree/main.nf"
+ process {
+ """
+ input[0] = [ [ id:'test' ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true)
+ ]
+ """
+ }
+ }
+ }
+
+ when {
+ process {
+ """
+ input[0] = [ [ id:'test' ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true)
+ ]
+ input[1] = FAMSA_GUIDETREE.out.tree.collect{ meta, tree -> tree }.map{ tree -> [[ id: 'test_summary'], tree]}
+ input[2] = true
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out.alignment).match("with_guide_tree_alignment")},
+ { assert snapshot(process.out.versions).match("with_guide_tree_versions") }
+ )
+ }
+ }
+}
\ No newline at end of file
diff --git a/modules/nf-core/famsa/align/tests/main.nf.test.snap b/modules/nf-core/famsa/align/tests/main.nf.test.snap
new file mode 100644
index 0000000..95bbbf1
--- /dev/null
+++ b/modules/nf-core/famsa/align/tests/main.nf.test.snap
@@ -0,0 +1,57 @@
+{
+ "alignment_uncompressed": {
+ "content": [
+ [
+ [
+ {
+ "id": "test"
+ },
+ "test.aln:md5,7cf7375f2ba360814ea978731838b972"
+ ]
+ ]
+ ],
+ "timestamp": "2024-02-09T19:08:43.577982822"
+ },
+ "versions": {
+ "content": [
+ [
+ "versions.yml:md5,7d9e0a8c263fa6d9017075fe88c9e9dc"
+ ]
+ ],
+ "timestamp": "2024-02-09T19:08:43.670136799"
+ },
+ "with_guide_tree_alignment": {
+ "content": [
+ [
+ [
+ {
+ "id": "test"
+ },
+ "test.aln.gz:md5,7cf7375f2ba360814ea978731838b972"
+ ]
+ ]
+ ],
+ "timestamp": "2024-02-09T19:10:05.167368314"
+ },
+ "alignment_compressed": {
+ "content": [
+ [
+ [
+ {
+ "id": "test"
+ },
+ "test.aln.gz:md5,7cf7375f2ba360814ea978731838b972"
+ ]
+ ]
+ ],
+ "timestamp": "2024-02-09T19:09:25.819156831"
+ },
+ "with_guide_tree_versions": {
+ "content": [
+ [
+ "versions.yml:md5,7d9e0a8c263fa6d9017075fe88c9e9dc"
+ ]
+ ],
+ "timestamp": "2024-02-09T19:10:05.231995851"
+ }
+}
\ No newline at end of file
diff --git a/modules/nf-core/famsa/align/tests/tags.yml b/modules/nf-core/famsa/align/tests/tags.yml
new file mode 100644
index 0000000..d010f3b
--- /dev/null
+++ b/modules/nf-core/famsa/align/tests/tags.yml
@@ -0,0 +1,2 @@
+famsa/align:
+ - "modules/nf-core/famsa/align/**"
diff --git a/modules/nf-core/hmmer/hmmalign/environment.yml b/modules/nf-core/hmmer/hmmalign/environment.yml
new file mode 100644
index 0000000..c5ddec5
--- /dev/null
+++ b/modules/nf-core/hmmer/hmmalign/environment.yml
@@ -0,0 +1,5 @@
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - bioconda::hmmer=3.4
diff --git a/modules/nf-core/hmmer/hmmalign/main.nf b/modules/nf-core/hmmer/hmmalign/main.nf
new file mode 100644
index 0000000..39b17c7
--- /dev/null
+++ b/modules/nf-core/hmmer/hmmalign/main.nf
@@ -0,0 +1,46 @@
+process HMMER_HMMALIGN {
+ tag "$meta.id"
+ label 'process_single'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/07/07c4cbd91c4459dc86b13b5cd799cacba96b27d66c276485550d299c7a4c6f8a/data' :
+ 'community.wave.seqera.io/library/hmmer:3.4--cb5d2dd2e85974ca' }"
+
+ input:
+ tuple val(meta), path(fasta)
+ path hmm
+
+ output:
+ tuple val(meta), path("*.sto.gz"), emit: sto
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ hmmalign \\
+ $args \\
+ $hmm \\
+ $fasta | gzip -c > ${prefix}.sto.gz
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ hmmer: \$(hmmalign -h | grep -o '^# HMMER [0-9.]*' | sed 's/^# HMMER *//')
+ END_VERSIONS
+ """
+
+ stub:
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ echo | gzip > ${prefix}.sto.gz
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ hmmer: \$(hmmalign -h | grep -o '^# HMMER [0-9.]*' | sed 's/^# HMMER *//')
+ END_VERSIONS
+ """
+}
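hmmalign runs single-threaded here (`process_single`) and the module pipes its Stockholm output straight through gzip. A minimal usage sketch, with hypothetical file names:

```groovy
include { HMMER_HMMALIGN } from './modules/nf-core/hmmer/hmmalign/main'

workflow {
    // hypothetical inputs: gzipped family sequences plus a gzipped profile
    ch_seqs = Channel.of([ [ id:'fam1' ], file('fam1.faa.gz') ])
    HMMER_HMMALIGN(ch_seqs, file('fam1.hmm.gz'))

    HMMER_HMMALIGN.out.sto.view() // one gzipped Stockholm alignment per family
}
```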
diff --git a/modules/nf-core/hmmer/hmmalign/meta.yml b/modules/nf-core/hmmer/hmmalign/meta.yml
new file mode 100644
index 0000000..fb16ba6
--- /dev/null
+++ b/modules/nf-core/hmmer/hmmalign/meta.yml
@@ -0,0 +1,54 @@
+name: hmmer_hmmalign
+description: hmmalign from the HMMER suite aligns sequences to an HMM profile
+keywords:
+ - alignment
+ - HMMER
+ - profile
+ - amino acid
+ - nucleotide
+tools:
+ - hmmer:
+ description: Biosequence analysis using profile hidden Markov models
+ homepage: http://hmmer.org/
+ documentation: http://hmmer.org/documentation.html
+ doi: "10.1371/journal.pcbi.1002195"
+ licence: ["BSD-3-Clause"]
+ identifier: ""
+input:
+ - - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test' ]
+ - fasta:
+ type: file
+ description: Gzip-compressed amino acid or nucleotide FASTA file
+ pattern: "*.{fna.gz,faa.gz,fasta.gz,fa.gz}"
+ - - hmm:
+ type: file
+ description: A gzipped HMM file
+ pattern: "*.hmm.gz"
+output:
+ - sto:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - "*.sto.gz":
+ type: file
+ description: Multiple alignment in gzipped Stockholm format
+ pattern: "*.sto.gz"
+ - versions:
+ - versions.yml:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+authors:
+ - "@erikrikarddaniel"
+ - "@jfy133"
+maintainers:
+ - "@erikrikarddaniel"
+ - "@jfy133"
+ - "@vagkaratzas"
diff --git a/modules/nf-core/hmmer/hmmalign/tests/main.nf.test b/modules/nf-core/hmmer/hmmalign/tests/main.nf.test
new file mode 100644
index 0000000..7b0368f
--- /dev/null
+++ b/modules/nf-core/hmmer/hmmalign/tests/main.nf.test
@@ -0,0 +1,58 @@
+
+nextflow_process {
+
+ name "Test Process HMMER_HMMALIGN"
+ script "../main.nf"
+ process "HMMER_HMMALIGN"
+
+ tag "modules"
+ tag "modules_nfcore"
+ tag "hmmer"
+ tag "hmmer/hmmalign"
+
+ test("test-hmmer-hmmalign") {
+
+ when {
+ process {
+ """
+ input[0] = [
+ [ id: 'test' ], // meta map
+ file(params.modules_testdata_base_path + 'delete_me/hmmer/e_coli_k12_16s.fna.gz', checkIfExists: true)
+ ]
+ input[1] = file(params.modules_testdata_base_path + 'delete_me/hmmer/bac.16S_rRNA.hmm.gz', checkIfExists: true)
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+ }
+
+ test("test-hmmer-hmmalign-stub") {
+ options '-stub'
+
+ when {
+ process {
+ """
+ input[0] = [
+ [ id: 'test' ], // meta map
+ file(params.modules_testdata_base_path + 'delete_me/hmmer/e_coli_k12_16s.fna.gz', checkIfExists: true)
+ ]
+ input[1] = file(params.modules_testdata_base_path + 'delete_me/hmmer/bac.16S_rRNA.hmm.gz', checkIfExists: true)
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+ }
+
+}
diff --git a/modules/nf-core/hmmer/hmmalign/tests/main.nf.test.snap b/modules/nf-core/hmmer/hmmalign/tests/main.nf.test.snap
new file mode 100644
index 0000000..ae65cac
--- /dev/null
+++ b/modules/nf-core/hmmer/hmmalign/tests/main.nf.test.snap
@@ -0,0 +1,68 @@
+{
+ "test-hmmer-hmmalign": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test"
+ },
+ "test.sto.gz:md5,4ae989d5ade2aaae9578cb88ba031e8f"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,601e13e7a1e6057766d862a828d501c4"
+ ],
+ "sto": [
+ [
+ {
+ "id": "test"
+ },
+ "test.sto.gz:md5,4ae989d5ade2aaae9578cb88ba031e8f"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,601e13e7a1e6057766d862a828d501c4"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.0",
+ "nextflow": "23.04.5"
+ },
+ "timestamp": "2024-11-13T12:37:26.551497424"
+ },
+ "test-hmmer-hmmalign-stub": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test"
+ },
+ "test.sto.gz:md5,68b329da9893e34099c7d8ad5cb9c940"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,601e13e7a1e6057766d862a828d501c4"
+ ],
+ "sto": [
+ [
+ {
+ "id": "test"
+ },
+ "test.sto.gz:md5,68b329da9893e34099c7d8ad5cb9c940"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,601e13e7a1e6057766d862a828d501c4"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.0",
+ "nextflow": "23.04.5"
+ },
+ "timestamp": "2024-11-13T12:37:32.244343836"
+ }
+}
\ No newline at end of file
diff --git a/modules/nf-core/hmmer/hmmbuild/environment.yml b/modules/nf-core/hmmer/hmmbuild/environment.yml
new file mode 100644
index 0000000..c5ddec5
--- /dev/null
+++ b/modules/nf-core/hmmer/hmmbuild/environment.yml
@@ -0,0 +1,5 @@
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - bioconda::hmmer=3.4
diff --git a/modules/nf-core/hmmer/hmmbuild/main.nf b/modules/nf-core/hmmer/hmmbuild/main.nf
new file mode 100644
index 0000000..8eed7fc
--- /dev/null
+++ b/modules/nf-core/hmmer/hmmbuild/main.nf
@@ -0,0 +1,56 @@
+process HMMER_HMMBUILD {
+ tag "$meta.id"
+ label 'process_low'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/07/07c4cbd91c4459dc86b13b5cd799cacba96b27d66c276485550d299c7a4c6f8a/data' :
+ 'community.wave.seqera.io/library/hmmer:3.4--cb5d2dd2e85974ca' }"
+
+ input:
+ tuple val(meta), path(alignment)
+ path mxfile
+
+ output:
+ tuple val(meta), path("*.hmm.gz"), emit: hmm
+ path "*.hmmbuild.txt", emit: hmmbuildout
+ path "versions.yml", emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ def mxfileopt = mxfile ? "--mxfile ${mxfile}" : ""
+
+ """
+ hmmbuild \\
+ $args \\
+ --cpu $task.cpus \\
+ -n ${prefix} \\
+ -o ${prefix}.hmmbuild.txt \\
+ ${mxfileopt} \\
+ ${prefix}.hmm \\
+ $alignment
+
+ gzip ${prefix}.hmm
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ hmmer: \$(echo \$(hmmbuild -h | grep HMMER | sed 's/# HMMER //' | sed 's/ .*//' 2>&1))
+ END_VERSIONS
+ """
+
+ stub:
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ echo | gzip > ${prefix}.hmm.gz
+ touch ${prefix}.hmmbuild.txt
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ hmmer: \$(echo \$(hmmbuild -h | grep HMMER | sed 's/# HMMER //' | sed 's/ .*//' 2>&1))
+ END_VERSIONS
+ """
+}
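The second input is the optional substitution score matrix, passed to `hmmbuild` as `--mxfile` when present and omitted otherwise; it is only meaningful together with `--singlemx`. A minimal usage sketch with a hypothetical alignment:

```groovy
include { HMMER_HMMBUILD } from './modules/nf-core/hmmer/hmmbuild/main'

workflow {
    // hypothetical multiple sequence alignment for one family
    ch_aln = Channel.of([ [ id:'fam1' ], file('fam1.aln') ])

    // empty list: no custom substitution matrix, so --mxfile is not passed
    HMMER_HMMBUILD(ch_aln, [])
}
```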
diff --git a/modules/nf-core/hmmer/hmmbuild/meta.yml b/modules/nf-core/hmmer/hmmbuild/meta.yml
new file mode 100644
index 0000000..9be14fe
--- /dev/null
+++ b/modules/nf-core/hmmer/hmmbuild/meta.yml
@@ -0,0 +1,56 @@
+name: "hmmer_hmmbuild"
+description: Create an HMM profile from a multiple sequence alignment
+keywords:
+ - search
+ - hidden Markov model
+ - HMM
+ - hmmer
+ - hmmsearch
+tools:
+ - "hmmer":
+ description: "Biosequence analysis using profile hidden Markov models"
+ homepage: "http://hmmer.org"
+ documentation: "http://hmmer.org/documentation.html"
+ tool_dev_url: "https://github.com/EddyRivasLab/hmmer"
+ doi: "10.1371/journal.pcbi.1002195"
+ licence: ["BSD"]
+ identifier: ""
+input:
+ - - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - alignment:
+ type: file
+ description: multiple sequence alignment in fasta, clustal, stockholm or phylip
+ format
+ pattern: "*"
+ - - mxfile:
+ type: file
+ description: read substitution score matrix, for use when building profiles
+ from single sequences (--singlemx option)
+ pattern: "*"
+output:
+ - hmm:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - "*.hmm.gz":
+ type: file
+ description: Gzipped HMM file
+ pattern: "*.{hmm.gz}"
+ - hmmbuildout:
+ - "*.hmmbuild.txt":
+ type: file
+ description: HMM build output
+ - versions:
+ - versions.yml:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+authors:
+ - "@erikrikarddaniel"
+maintainers:
+ - "@erikrikarddaniel"
diff --git a/modules/nf-core/hmmer/hmmbuild/tests/main.nf.test b/modules/nf-core/hmmer/hmmbuild/tests/main.nf.test
new file mode 100644
index 0000000..635f5b6
--- /dev/null
+++ b/modules/nf-core/hmmer/hmmbuild/tests/main.nf.test
@@ -0,0 +1,66 @@
+
+nextflow_process {
+
+ name "Test Process HMMER_HMMBUILD"
+ script "../main.nf"
+ process "HMMER_HMMBUILD"
+
+ tag "modules"
+ tag "modules_nfcore"
+ tag "hmmer"
+ tag "hmmer/hmmbuild"
+
+ test("test-hmmer-hmmbuild") {
+
+ when {
+ process {
+ """
+ input[0] = [
+ [ id: 'PF14720' ], // meta map
+ file(params.modules_testdata_base_path + 'delete_me/hmmer/PF14720_seed.alnfaa.gz', checkIfExists: true)
+ ]
+ input[1] = []
+
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert file(process.out.hmmbuildout[0]).text.contains('CPU time:') },
+ { assert snapshot(
+ file(process.out.hmm[0][1]).name, // unstable
+ file(process.out.hmmbuildout[0]).name, // unstable
+ process.out.versions
+ ).match()
+ }
+ )
+ }
+ }
+
+ test("test-hmmer-hmmbuild-stub") {
+ options '-stub'
+
+ when {
+ process {
+ """
+ input[0] = [
+ [ id: 'PF14720' ], // meta map
+ file(params.modules_testdata_base_path + 'delete_me/hmmer/PF14720_seed.alnfaa.gz', checkIfExists: true)
+ ]
+ input[1] = []
+
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+ }
+
+}
diff --git a/modules/nf-core/hmmer/hmmbuild/tests/main.nf.test.snap b/modules/nf-core/hmmer/hmmbuild/tests/main.nf.test.snap
new file mode 100644
index 0000000..79317b8
--- /dev/null
+++ b/modules/nf-core/hmmer/hmmbuild/tests/main.nf.test.snap
@@ -0,0 +1,55 @@
+{
+ "test-hmmer-hmmbuild-stub": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "PF14720"
+ },
+ "PF14720.hmm.gz:md5,68b329da9893e34099c7d8ad5cb9c940"
+ ]
+ ],
+ "1": [
+ "PF14720.hmmbuild.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ],
+ "2": [
+ "versions.yml:md5,f8a0bffcbbc58404752849403812905b"
+ ],
+ "hmm": [
+ [
+ {
+ "id": "PF14720"
+ },
+ "PF14720.hmm.gz:md5,68b329da9893e34099c7d8ad5cb9c940"
+ ]
+ ],
+ "hmmbuildout": [
+ "PF14720.hmmbuild.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ],
+ "versions": [
+ "versions.yml:md5,f8a0bffcbbc58404752849403812905b"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.0",
+ "nextflow": "23.04.5"
+ },
+ "timestamp": "2024-11-13T12:52:29.794123574"
+ },
+ "test-hmmer-hmmbuild": {
+ "content": [
+ "PF14720.hmm.gz",
+ "PF14720.hmmbuild.txt",
+ [
+ "versions.yml:md5,f8a0bffcbbc58404752849403812905b"
+ ]
+ ],
+ "meta": {
+ "nf-test": "0.9.0",
+ "nextflow": "23.04.5"
+ },
+ "timestamp": "2024-11-13T12:52:23.95935055"
+ }
+}
\ No newline at end of file
diff --git a/modules/nf-core/hmmer/hmmsearch/environment.yml b/modules/nf-core/hmmer/hmmsearch/environment.yml
new file mode 100644
index 0000000..c5ddec5
--- /dev/null
+++ b/modules/nf-core/hmmer/hmmsearch/environment.yml
@@ -0,0 +1,5 @@
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - bioconda::hmmer=3.4
diff --git a/modules/nf-core/hmmer/hmmsearch/main.nf b/modules/nf-core/hmmer/hmmsearch/main.nf
new file mode 100644
index 0000000..603a865
--- /dev/null
+++ b/modules/nf-core/hmmer/hmmsearch/main.nf
@@ -0,0 +1,70 @@
+process HMMER_HMMSEARCH {
+ tag "$meta.id"
+ label 'process_medium'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/hmmer:3.4--hdbdd923_1' :
+ 'biocontainers/hmmer:3.4--hdbdd923_1' }"
+
+ input:
+ tuple val(meta), path(hmmfile), path(seqdb), val(write_align), val(write_target), val(write_domain)
+
+ output:
+ tuple val(meta), path('*.txt.gz') , emit: output
+ tuple val(meta), path('*.sto.gz') , emit: alignments , optional: true
+ tuple val(meta), path('*.tbl.gz') , emit: target_summary, optional: true
+ tuple val(meta), path('*.domtbl.gz'), emit: domain_summary, optional: true
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ output = "${prefix}.txt"
+ alignment = write_align ? "-A ${prefix}.sto" : ''
+ target_summary = write_target ? "--tblout ${prefix}.tbl" : ''
+ domain_summary = write_domain ? "--domtblout ${prefix}.domtbl" : ''
+ """
+ hmmsearch \\
+ $args \\
+ --cpu $task.cpus \\
+ -o $output \\
+ $alignment \\
+ $target_summary \\
+ $domain_summary \\
+ $hmmfile \\
+ $seqdb
+
+ gzip --no-name *.txt \\
+ ${write_align ? '*.sto' : ''} \\
+ ${write_target ? '*.tbl' : ''} \\
+ ${write_domain ? '*.domtbl' : ''}
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ hmmer: \$(hmmsearch -h | grep -o '^# HMMER [0-9.]*' | sed 's/^# HMMER *//')
+ END_VERSIONS
+ """
+
+ stub:
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ touch "${prefix}.txt"
+ ${write_align ? "touch ${prefix}.sto" : ''} \\
+ ${write_target ? "touch ${prefix}.tbl" : ''} \\
+ ${write_domain ? "touch ${prefix}.domtbl" : ''}
+
+ gzip --no-name *.txt \\
+ ${write_align ? '*.sto' : ''} \\
+ ${write_target ? '*.tbl' : ''} \\
+ ${write_domain ? '*.domtbl' : ''}
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ hmmer: \$(hmmsearch -h | grep -o '^# HMMER [0-9.]*' | sed 's/^# HMMER *//')
+ END_VERSIONS
+ """
+}
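Unlike the other hmmer modules, hmmsearch bundles everything into a single input tuple: the profile, the sequence database, and three booleans that toggle the optional `-A`, `--tblout` and `--domtblout` outputs respectively. A minimal usage sketch with hypothetical files:

```groovy
include { HMMER_HMMSEARCH } from './modules/nf-core/hmmer/hmmsearch/main'

workflow {
    ch_input = Channel.of([
        [ id:'fam1' ],             // meta map
        file('fam1.hmm.gz'),       // hypothetical profile(s)
        file('proteins.faa.gz'),   // hypothetical sequence database
        true,                      // write the -A alignment
        true,                      // write the --tblout per-target summary
        true                       // write the --domtblout per-domain summary
    ])
    HMMER_HMMSEARCH(ch_input)
}
```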
diff --git a/modules/nf-core/hmmer/hmmsearch/meta.yml b/modules/nf-core/hmmer/hmmsearch/meta.yml
new file mode 100644
index 0000000..0e07865
--- /dev/null
+++ b/modules/nf-core/hmmer/hmmsearch/meta.yml
@@ -0,0 +1,92 @@
+name: hmmer_hmmsearch
+description: search profile(s) against a sequence database
+keywords:
+ - Hidden Markov Model
+ - HMM
+ - hmmer
+ - hmmsearch
+tools:
+ - hmmer:
+ description: Biosequence analysis using profile hidden Markov models
+ homepage: http://hmmer.org/
+ documentation: http://hmmer.org/documentation.html
+ tool_dev_url: https://github.com/EddyRivasLab/hmmer
+ doi: "10.1371/journal.pcbi.1002195"
+ licence: ["BSD"]
+ identifier: ""
+input:
+ - - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - hmmfile:
+ type: file
+ description: One or more HMM profiles created with hmmbuild
+ pattern: "*.{hmm,hmm.gz}"
+ - seqdb:
+ type: file
+ description: Database of sequences in FASTA format
+ pattern: "*.{fasta,fna,faa,fa,fasta.gz,fna.gz,faa.gz,fa.gz}"
+ - write_align:
+ type: boolean
+ description: Flag to save optional alignment output. Specify with 'true' to
+ save.
+ - write_target:
+ type: boolean
+ description: Flag to save optional per target summary. Specify with 'true' to
+ save.
+ - write_domain:
+ type: boolean
+ description: Flag to save optional per domain summary. Specify with 'true' to
+ save.
+output:
+ - output:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - "*.txt.gz":
+ type: file
+ description: Human-readable output summarizing hmmsearch results
+ pattern: "*.{txt.gz}"
+ - alignments:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - "*.sto.gz":
+ type: file
+ description: Optional multiple sequence alignment (MSA) in Stockholm format
+ pattern: "*.{sto.gz}"
+ - target_summary:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - "*.tbl.gz":
+ type: file
+ description: Optional tabular (space-delimited) summary of per-target output
+ pattern: "*.{tbl.gz}"
+ - domain_summary:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - "*.domtbl.gz":
+ type: file
+ description: Optional tabular (space-delimited) summary of per-domain output
+ pattern: "*.{domtbl.gz}"
+ - versions:
+ - versions.yml:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+authors:
+ - "@Midnighter"
+maintainers:
+ - "@Midnighter"
diff --git a/modules/nf-core/hmmer/hmmsearch/tests/main.nf.test b/modules/nf-core/hmmer/hmmsearch/tests/main.nf.test
new file mode 100644
index 0000000..f1b59e9
--- /dev/null
+++ b/modules/nf-core/hmmer/hmmsearch/tests/main.nf.test
@@ -0,0 +1,126 @@
+nextflow_process {
+
+ name "Test Process HMMER_HMMSEARCH"
+ script "../main.nf"
+ process "HMMER_HMMSEARCH"
+
+ tag "modules"
+ tag "modules_nfcore"
+ tag "hmmer"
+ tag "hmmer/hmmsearch"
+
+ test("hmmer/hmmsearch") {
+
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test', single_end:false ],
+ file('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/delete_me/hmmer/bac.16S_rRNA.hmm.gz', checkIfExists: true),
+ file('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/delete_me/hmmer/e_coli_k12_16s.fna.gz', checkIfExists: true),
+ false,
+ false,
+ false
+ ]
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert path(process.out.output[0][1]).linesGzip.toString().contains('[ok]') },
+ { assert snapshot(process.out.versions).match() }
+ )
+ }
+
+ }
+
+ test("hmmer/hmmsearch - optional") {
+
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test', single_end:false ],
+ file('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/delete_me/hmmer/bac.16S_rRNA.hmm.gz', checkIfExists: true),
+ file('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/delete_me/hmmer/e_coli_k12_16s.fna.gz', checkIfExists: true),
+ true,
+ true,
+ true
+ ]
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert path(process.out.output.get(0).get(1)).linesGzip.toString().contains('[ok]') },
+ { assert path(process.out.target_summary.get(0).get(1)).linesGzip.toString().contains('[ok]') },
+ { assert snapshot(
+ process.out.alignments +
+ process.out.versions
+ ).match() }
+ )
+ }
+
+ }
+
+ test("hmmer/hmmsearch - stub") {
+
+ options "-stub"
+
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test', single_end:false ],
+ file('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/delete_me/hmmer/bac.16S_rRNA.hmm.gz', checkIfExists: true),
+ file('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/delete_me/hmmer/e_coli_k12_16s.fna.gz', checkIfExists: true),
+ false,
+ false,
+ false
+ ]
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+
+ }
+
+ test("hmmer/hmmsearch - optional - stub") {
+
+ options "-stub"
+
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test', single_end:false ],
+ file('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/delete_me/hmmer/bac.16S_rRNA.hmm.gz', checkIfExists: true),
+ file('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/delete_me/hmmer/e_coli_k12_16s.fna.gz', checkIfExists: true),
+ true,
+ true,
+ true
+ ]
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+
+ }
+
+}
diff --git a/modules/nf-core/hmmer/hmmsearch/tests/main.nf.test.snap b/modules/nf-core/hmmer/hmmsearch/tests/main.nf.test.snap
new file mode 100644
index 0000000..e6b2277
--- /dev/null
+++ b/modules/nf-core/hmmer/hmmsearch/tests/main.nf.test.snap
@@ -0,0 +1,175 @@
+{
+ "hmmer/hmmsearch": {
+ "content": [
+ [
+ "versions.yml:md5,37393b1da5a14113d3290ab8b3b4c40f"
+ ]
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "23.10.1"
+ },
+ "timestamp": "2024-03-28T12:18:47.293093635"
+ },
+ "hmmer/hmmsearch - stub": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.txt.gz:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "1": [
+
+ ],
+ "2": [
+
+ ],
+ "3": [
+
+ ],
+ "4": [
+ "versions.yml:md5,37393b1da5a14113d3290ab8b3b4c40f"
+ ],
+ "alignments": [
+
+ ],
+ "domain_summary": [
+
+ ],
+ "output": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.txt.gz:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "target_summary": [
+
+ ],
+ "versions": [
+ "versions.yml:md5,37393b1da5a14113d3290ab8b3b4c40f"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "23.10.1"
+ },
+ "timestamp": "2024-03-28T12:18:57.862047944"
+ },
+ "hmmer/hmmsearch - optional - stub": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.txt.gz:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "1": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.sto.gz:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "2": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.tbl.gz:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "3": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.domtbl.gz:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "4": [
+ "versions.yml:md5,37393b1da5a14113d3290ab8b3b4c40f"
+ ],
+ "alignments": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.sto.gz:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "domain_summary": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.domtbl.gz:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "output": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.txt.gz:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "target_summary": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.tbl.gz:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,37393b1da5a14113d3290ab8b3b4c40f"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "23.10.1"
+ },
+ "timestamp": "2024-03-28T12:19:03.49192788"
+ },
+ "hmmer/hmmsearch - optional": {
+ "content": [
+ [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.sto.gz:md5,5c44c289b9e36aa1f7f3afae2005fbb7"
+ ],
+ "versions.yml:md5,37393b1da5a14113d3290ab8b3b4c40f"
+ ]
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "23.10.1"
+ },
+ "timestamp": "2024-03-28T12:18:52.725638562"
+ }
+}
\ No newline at end of file
diff --git a/modules/nf-core/hmmer/hmmsearch/tests/tags.yml b/modules/nf-core/hmmer/hmmsearch/tests/tags.yml
new file mode 100644
index 0000000..1776d21
--- /dev/null
+++ b/modules/nf-core/hmmer/hmmsearch/tests/tags.yml
@@ -0,0 +1,2 @@
+hmmer/hmmsearch:
+ - "modules/nf-core/hmmer/hmmsearch/**"
diff --git a/modules/nf-core/mafft/align/environment.yml b/modules/nf-core/mafft/align/environment.yml
new file mode 100644
index 0000000..97a13e6
--- /dev/null
+++ b/modules/nf-core/mafft/align/environment.yml
@@ -0,0 +1,6 @@
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - bioconda::mafft=7.520
+ - conda-forge::pigz=2.8
diff --git a/modules/nf-core/mafft/align/main.nf b/modules/nf-core/mafft/align/main.nf
new file mode 100644
index 0000000..6031dd4
--- /dev/null
+++ b/modules/nf-core/mafft/align/main.nf
@@ -0,0 +1,75 @@
+process MAFFT_ALIGN {
+ tag "$meta.id"
+ label 'process_high'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-12eba4a074f913c639117640936668f5a6a01da6:425707898cf4f85051b77848be253b88f1d2298a-0':
+ 'biocontainers/mulled-v2-12eba4a074f913c639117640936668f5a6a01da6:425707898cf4f85051b77848be253b88f1d2298a-0' }"
+
+ input:
+ tuple val(meta) , path(fasta)
+ tuple val(meta2), path(add)
+ tuple val(meta3), path(addfragments)
+ tuple val(meta4), path(addfull)
+ tuple val(meta5), path(addprofile)
+ tuple val(meta6), path(addlong)
+ val(compress)
+
+ output:
+ tuple val(meta), path("*.fas{.gz,}"), emit: fas
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ def add = add ? "--add <(unpigz -cdf ${add})" : ''
+ def addfragments = addfragments ? "--addfragments <(unpigz -cdf ${addfragments})" : ''
+ def addfull = addfull ? "--addfull <(unpigz -cdf ${addfull})" : ''
+ def addprofile = addprofile ? "--addprofile <(unpigz -cdf ${addprofile})" : ''
+ def addlong = addlong ? "--addlong <(unpigz -cdf ${addlong})" : ''
+ def write_output = compress ? " | pigz -cp ${task.cpus} > ${prefix}.fas.gz" : "> ${prefix}.fas"
+ // this will not preserve MAFFT's return value, but MAFFT crashes when it receives a process substitution
+ if ("$fasta" == "${prefix}.fas" ) error "Input and output names are the same, set prefix in module configuration to disambiguate!"
+ """
+ mafft \\
+ --thread ${task.cpus} \\
+ ${add} \\
+ ${addfragments} \\
+ ${addfull} \\
+ ${addprofile} \\
+ ${addlong} \\
+ ${args} \\
+ ${fasta} \\
+ ${write_output}
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ mafft: \$(mafft --version 2>&1 | sed 's/^v//' | sed 's/ (.*)//')
+ pigz: \$(echo \$(pigz --version 2>&1) | sed 's/^.*pigz\\w*//' )
+ END_VERSIONS
+ """
+
+ stub:
+ def args = task.ext.args ?: ''
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ def add = add ? "--add ${add}" : ''
+ def addfragments = addfragments ? "--addfragments ${addfragments}" : ''
+ def addfull = addfull ? "--addfull ${addfull}" : ''
+ def addprofile = addprofile ? "--addprofile ${addprofile}" : ''
+ def addlong = addlong ? "--addlong ${addlong}" : ''
+ if ("$fasta" == "${prefix}.fas" ) error "Input and output names are the same, set prefix in module configuration to disambiguate!"
+ """
+ touch ${prefix}.fas${compress ? '.gz' : ''}
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ mafft: \$(mafft --version 2>&1 | sed 's/^v//' | sed 's/ (.*)//')
+ pigz: \$(echo \$(pigz --version 2>&1) | sed 's/^.*pigz\\w*//' )
+ END_VERSIONS
+ """
+
+}
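MAFFT_ALIGN exposes one primary FASTA input plus five optional `--add*` inputs and a compression flag, so a plain alignment call has to fill the unused slots with empty placeholders, exactly as the tests below do. A minimal sketch with a hypothetical input:

```groovy
include { MAFFT_ALIGN } from './modules/nf-core/mafft/align/main'

workflow {
    // hypothetical input: align the primary FASTA only
    ch_fasta = Channel.of([ [ id:'fam1' ], file('fam1.fasta') ])

    // empty placeholders for --add, --addfragments, --addfull,
    // --addprofile and --addlong; 'false' keeps the output uncompressed
    MAFFT_ALIGN(ch_fasta, [[:], []], [[:], []], [[:], []], [[:], []], [[:], []], false)
}
```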
diff --git a/modules/nf-core/mafft/align/meta.yml b/modules/nf-core/mafft/align/meta.yml
new file mode 100644
index 0000000..30d8053
--- /dev/null
+++ b/modules/nf-core/mafft/align/meta.yml
@@ -0,0 +1,108 @@
+name: mafft_align
+description: Multiple sequence alignment using MAFFT
+keywords:
+ - fasta
+ - msa
+ - multiple sequence alignment
+tools:
+ - "mafft":
+ description: Multiple alignment program for amino acid or nucleotide sequences
+ based on fast Fourier transform
+ homepage: https://mafft.cbrc.jp/alignment/software/
+ documentation: https://mafft.cbrc.jp/alignment/software/manual/manual.html
+ tool_dev_url: https://mafft.cbrc.jp/alignment/software/source.html
+ doi: "10.1093/nar/gkf436"
+ licence: ["BSD"]
+ identifier: biotools:MAFFT
+ - "pigz":
+ description: "Parallel implementation of the gzip algorithm."
+ homepage: "https://zlib.net/pigz/"
+ documentation: "https://zlib.net/pigz/pigz.pdf"
+ identifier: ""
+input:
+ - - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - fasta:
+ type: file
+ description: FASTA file containing the sequences to align. May be gzipped or
+ uncompressed.
+ pattern: "*.{fa,fasta}{.gz,}"
+ - - meta2:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - add:
+ type: file
+ description: FASTA file containing sequences to align to the sequences in `fasta`
+ using `--add`. May be gzipped or uncompressed.
+ pattern: "*.{fa,fasta}{.gz,}"
+ - - meta3:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - addfragments:
+ type: file
+ description: FASTA file containing sequences to align to the sequences in `fasta`
+ using `--addfragments`. May be gzipped or uncompressed.
+ pattern: "*.{fa,fasta}{.gz,}"
+ - - meta4:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - addfull:
+ type: file
+ description: FASTA file containing sequences to align to the sequences in `fasta`
+ using `--addfull`. May be gzipped or uncompressed.
+ pattern: "*.{fa,fasta}{.gz,}"
+ - - meta5:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - addprofile:
+ type: file
+ description: FASTA file containing sequences to align to the sequences in `fasta`
+ using `--addprofile`. May be gzipped or uncompressed.
+ pattern: "*.{fa,fasta}{.gz,}"
+ - - meta6:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - addlong:
+ type: file
+ description: FASTA file containing sequences to align to the sequences in `fasta`
+ using `--addlong`. May be gzipped or uncompressed.
+ pattern: "*.{fa,fasta}{.gz,}"
+ - - compress:
+ type: boolean
+ description: Boolean controlling whether the output MSA is compressed
+ (true enables, false disables). Compression is done with pigz and is
+ multithreaded.
+output:
+ - fas:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - "*.fas{.gz,}":
+ type: file
+ description: Aligned sequences in FASTA format. May be gzipped or uncompressed.
+ pattern: "*.fas{.gz,}"
+ - versions:
+ - versions.yml:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+authors:
+ - "@MillironX"
+maintainers:
+ - "@MillironX"
+ - "@Joon-Klaps"
diff --git a/modules/nf-core/mafft/align/tests/main.nf.test b/modules/nf-core/mafft/align/tests/main.nf.test
new file mode 100644
index 0000000..660a897
--- /dev/null
+++ b/modules/nf-core/mafft/align/tests/main.nf.test
@@ -0,0 +1,249 @@
+nextflow_process {
+
+ name "Test Process MAFFT_ALIGN"
+ script "../main.nf"
+ process "MAFFT_ALIGN"
+ tag "modules"
+ tag "modules_nfcore"
+ tag "mafft"
+ tag "mafft/align"
+
+ test("SARS-CoV-2 scaffolds fasta - uncompressed") {
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test', single_end:false ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/scaffolds.fasta', checkIfExists: true)
+ ]
+ input[1] = [[:], []]
+ input[2] = [[:], []]
+ input[3] = [[:], []]
+ input[4] = [[:], []]
+ input[5] = [[:], []]
+ input[6] = false
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match("SARS-CoV-2 scaffolds fasta - uncompressed")}
+ )
+ }
+
+ }
+
+ test("SARS-CoV-2 scaffolds fasta - compressed") {
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test', single_end:false ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/scaffolds.fasta', checkIfExists: true)
+ ]
+ input[1] = [[:], []]
+ input[2] = [[:], []]
+ input[3] = [[:], []]
+ input[4] = [[:], []]
+ input[5] = [[:], []]
+ input[6] = true
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match("SARS-CoV-2 scaffolds fasta - compressed")}
+ )
+ }
+
+ }
+
+ test("SARS-CoV-2 scaffolds fasta - add informative sites fasta normal") {
+
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test', single_end:false ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
+ ]
+ input[1] = [[ id:'test', single_end:false ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/alignment/informative_sites.fas', checkIfExists: true)
+ ]
+ input[2] = [[:], []]
+ input[3] = [[:], []]
+ input[4] = [[:], []]
+ input[5] = [[:], []]
+ input[6] = true
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match("SARS-CoV-2 scaffolds fasta - add informative sites fasta normal") }
+ )
+ }
+ }
+
+ test("SARS-CoV-2 scaffolds fasta - add informative sites fasta fragments") {
+
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test', single_end:false ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
+ ]
+ input[1] = [[:], []]
+ input[2] = [
+ [ id:'test', single_end:false ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/alignment/informative_sites.fas', checkIfExists: true)
+ ]
+ input[3] = [[:], []]
+ input[4] = [[:], []]
+ input[5] = [[:], []]
+ input[6] = true
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match("SARS-CoV-2 scaffolds fasta - add informative sites fasta fragments") }
+ )
+ }
+ }
+
+ test("SARS-CoV-2 scaffolds fasta - add informative sites fasta full") {
+
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test', single_end:false ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
+ ]
+ input[1] = [[:], []]
+ input[2] = [[:], []]
+ input[3] = [[ id:'test', single_end:false ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/alignment/informative_sites.fas', checkIfExists: true)
+ ]
+ input[4] = [[:], []]
+ input[5] = [[:], []]
+ input[6] = true
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match("SARS-CoV-2 scaffolds fasta - add informative sites fasta full") }
+ )
+ }
+
+ }
+
+ test("SARS-CoV-2 scaffolds fasta - add informative sites fasta profile") {
+
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test', single_end:false ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
+ ]
+ input[1] = [[:], []]
+ input[2] = [[:], []]
+ input[3] = [[:], []]
+ input[4] = [[ id:'test', single_end:false ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/alignment/informative_sites.fas', checkIfExists: true)
+ ]
+ input[5] = [[:], []]
+ input[6] = true
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match("SARS-CoV-2 scaffolds fasta - add informative sites fasta profile") }
+ )
+ }
+
+ }
+
+ test("SARS-CoV-2 scaffolds fasta - add informative sites fasta long") {
+
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test', single_end:false ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
+ ]
+ input[1] = [[:], []]
+ input[2] = [[:], []]
+ input[3] = [[:], []]
+ input[4] = [[:], []]
+ input[5] = [
+ [ id:'test', single_end:false ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/alignment/informative_sites.fas', checkIfExists: true)
+ ]
+ input[6] = true
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match("SARS-CoV-2 scaffolds fasta - add informative sites fasta long") }
+ )
+ }
+
+ }
+
+ test("SARS-CoV-2 scaffolds fasta - add informative sites all sites fasta multiple") {
+
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test', single_end:false ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
+ ]
+ input[1] = [
+ [ id:'test', single_end:false ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/alignment/all_sites.fas', checkIfExists: true)
+ ]
+ input[2] = [
+ [ id:'test', single_end:false ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/alignment/informative_sites.fas', checkIfExists: true)
+ ]
+ input[3] = [[:], []]
+ input[4] = [[:], []]
+ input[5] = [[:], []]
+ input[6] = true
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match("SARS-CoV-2 scaffolds fasta - add informative sites fasta multiple") }
+ )
+ }
+
+ }
+
+}
\ No newline at end of file
diff --git a/modules/nf-core/mafft/align/tests/main.nf.test.snap b/modules/nf-core/mafft/align/tests/main.nf.test.snap
new file mode 100644
index 0000000..fd1c213
--- /dev/null
+++ b/modules/nf-core/mafft/align/tests/main.nf.test.snap
@@ -0,0 +1,282 @@
+{
+ "SARS-CoV-2 scaffolds fasta - uncompressed": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.fas:md5,23426611f4a0df532b6708f072bd445b"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,87ac79c217c88dbdc575ad66e868c8c0"
+ ],
+ "fas": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.fas:md5,23426611f4a0df532b6708f072bd445b"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,87ac79c217c88dbdc575ad66e868c8c0"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "24.10.1"
+ },
+ "timestamp": "2024-11-26T16:35:37.370628782"
+ },
+ "SARS-CoV-2 scaffolds fasta - add informative sites fasta multiple": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.fas.gz:md5,aed7f866c3a20dc9d2f2b4ad73515961"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,87ac79c217c88dbdc575ad66e868c8c0"
+ ],
+ "fas": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.fas.gz:md5,aed7f866c3a20dc9d2f2b4ad73515961"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,87ac79c217c88dbdc575ad66e868c8c0"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "24.10.1"
+ },
+ "timestamp": "2024-11-26T16:37:18.354500948"
+ },
+ "SARS-CoV-2 scaffolds fasta - add informative sites fasta normal": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.fas.gz:md5,a57a34f1c566dea114dc1b13416536d4"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,87ac79c217c88dbdc575ad66e868c8c0"
+ ],
+ "fas": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.fas.gz:md5,a57a34f1c566dea114dc1b13416536d4"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,87ac79c217c88dbdc575ad66e868c8c0"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "24.10.1"
+ },
+ "timestamp": "2024-11-26T16:36:15.023267596"
+ },
+ "SARS-CoV-2 scaffolds fasta - add informative sites fasta long": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.fas.gz:md5,e8868da70d1f3050a8daaee0e53b2fd9"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,87ac79c217c88dbdc575ad66e868c8c0"
+ ],
+ "fas": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.fas.gz:md5,e8868da70d1f3050a8daaee0e53b2fd9"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,87ac79c217c88dbdc575ad66e868c8c0"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "24.10.1"
+ },
+ "timestamp": "2024-11-26T16:37:05.79514229"
+ },
+ "SARS-CoV-2 scaffolds fasta - add informative sites fasta profile": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.fas.gz:md5,c2b5caf39beff4473878e6aa4036ad43"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,87ac79c217c88dbdc575ad66e868c8c0"
+ ],
+ "fas": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.fas.gz:md5,c2b5caf39beff4473878e6aa4036ad43"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,87ac79c217c88dbdc575ad66e868c8c0"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "24.10.1"
+ },
+ "timestamp": "2024-11-26T16:36:52.893313726"
+ },
+ "SARS-CoV-2 scaffolds fasta - add informative sites fasta fragments": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.fas.gz:md5,aed7f866c3a20dc9d2f2b4ad73515961"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,87ac79c217c88dbdc575ad66e868c8c0"
+ ],
+ "fas": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.fas.gz:md5,aed7f866c3a20dc9d2f2b4ad73515961"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,87ac79c217c88dbdc575ad66e868c8c0"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "24.10.1"
+ },
+ "timestamp": "2024-11-26T16:36:27.678238997"
+ },
+ "SARS-CoV-2 scaffolds fasta - add informative sites fasta full": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.fas.gz:md5,611cb0a65195a282f110f7f56e310c66"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,87ac79c217c88dbdc575ad66e868c8c0"
+ ],
+ "fas": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.fas.gz:md5,611cb0a65195a282f110f7f56e310c66"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,87ac79c217c88dbdc575ad66e868c8c0"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "24.10.1"
+ },
+ "timestamp": "2024-11-26T16:36:41.02801897"
+ },
+ "SARS-CoV-2 scaffolds fasta - compressed": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.fas.gz:md5,23426611f4a0df532b6708f072bd445b"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,87ac79c217c88dbdc575ad66e868c8c0"
+ ],
+ "fas": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.fas.gz:md5,23426611f4a0df532b6708f072bd445b"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,87ac79c217c88dbdc575ad66e868c8c0"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "24.10.1"
+ },
+ "timestamp": "2024-11-26T16:36:03.570717213"
+ }
+}
\ No newline at end of file
diff --git a/modules/nf-core/mafft/align/tests/tags.yml b/modules/nf-core/mafft/align/tests/tags.yml
new file mode 100644
index 0000000..97b6666
--- /dev/null
+++ b/modules/nf-core/mafft/align/tests/tags.yml
@@ -0,0 +1,2 @@
+mafft/align:
+ - modules/nf-core/mafft/align/**
diff --git a/modules/nf-core/mmseqs/cluster/environment.yml b/modules/nf-core/mmseqs/cluster/environment.yml
new file mode 100644
index 0000000..d356134
--- /dev/null
+++ b/modules/nf-core/mmseqs/cluster/environment.yml
@@ -0,0 +1,5 @@
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - bioconda::mmseqs2=17.b804f
diff --git a/modules/nf-core/mmseqs/cluster/main.nf b/modules/nf-core/mmseqs/cluster/main.nf
new file mode 100644
index 0000000..5e894fc
--- /dev/null
+++ b/modules/nf-core/mmseqs/cluster/main.nf
@@ -0,0 +1,60 @@
+process MMSEQS_CLUSTER {
+ tag "$meta.id"
+ label 'process_high'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/mmseqs2:17.b804f--hd6d6fdc_1':
+ 'biocontainers/mmseqs2:17.b804f--hd6d6fdc_1' }"
+
+ input:
+ tuple val(meta), path(db_input, stageAs: "db_input")
+
+ output:
+ tuple val(meta), path("${prefix}/"), emit: db_cluster
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+ def args2 = task.ext.args2 ?: "*.dbtype"
+ prefix = task.ext.prefix ?: "${meta.id}"
+ if ("$db_input" == "${prefix}") error "Input and output names of databases are the same, set prefix in module configuration to disambiguate!"
+
+ """
+ mkdir -p ${prefix}
+    # Find the database files matching the args2 suffix pattern, strip the suffix, then keep the longest common prefix of the names
+ DB_INPUT_PATH_NAME=\$(find -L "$db_input/" -maxdepth 1 -name "$args2" | sed 's/\\.[^.]*\$//' | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' )
+
+ mmseqs \\
+ cluster \\
+ \$DB_INPUT_PATH_NAME \\
+ ${prefix}/${prefix} \\
+ tmp1 \\
+ $args \\
+ --threads ${task.cpus}
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //')
+ END_VERSIONS
+ """
+
+ stub:
+ def args = task.ext.args ?: ''
+ prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ mkdir -p ${prefix}
+
+ touch ${prefix}/${prefix}.{0..9}
+ touch ${prefix}/${prefix}.dbtype
+ touch ${prefix}/${prefix}.index
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //')
+ END_VERSIONS
+ """
+}
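Note: the prefix guard above exists because the clustered database is written into a directory named after `prefix`, which must not collide with the staged `db_input` directory. A minimal configuration sketch (the selector, prefix and argument values are illustrative, not this pipeline's defaults):

    process {
        withName: 'MMSEQS_CLUSTER' {
            ext.prefix = { "${meta.id}_clu" }                   // keep distinct from the staged "db_input" dir
            ext.args   = '--min-seq-id 0.5 -c 0.9 --cov-mode 0' // mmseqs cluster flags; values illustrative
        }
    }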
diff --git a/modules/nf-core/mmseqs/cluster/meta.yml b/modules/nf-core/mmseqs/cluster/meta.yml
new file mode 100644
index 0000000..ec97485
--- /dev/null
+++ b/modules/nf-core/mmseqs/cluster/meta.yml
@@ -0,0 +1,48 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json
+name: "mmseqs_cluster"
+description: Cluster sequences using the MMseqs2 cluster algorithm.
+keywords:
+ - protein sequence
+ - databases
+ - clustering
+ - searching
+ - indexing
+ - mmseqs2
+tools:
+ - "mmseqs":
+ description: "MMseqs2: ultra fast and sensitive sequence search and clustering
+ suite"
+ homepage: "https://github.com/soedinglab/MMseqs2"
+ documentation: "https://mmseqs.com/latest/userguide.pdf"
+ tool_dev_url: "https://github.com/soedinglab/MMseqs2"
+ doi: "10.1093/bioinformatics/btw006"
+ licence: ["GPL v3"]
+ identifier: biotools:mmseqs
+input:
+ - - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. `[ id:'test', single_end:false ]`
+ - db_input:
+        type: directory
+        description: Input MMseqs2 database
+output:
+ - db_cluster:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. `[ id:'test', single_end:false ]`
+ - ${prefix}/:
+          type: directory
+          description: The clustered MMseqs2 database
+ - versions:
+ - versions.yml:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+authors:
+ - "@Joon-Klaps"
+maintainers:
+ - "@Joon-Klaps"
diff --git a/modules/nf-core/mmseqs/cluster/mmseqs-cluster.diff b/modules/nf-core/mmseqs/cluster/mmseqs-cluster.diff
new file mode 100644
index 0000000..acde641
--- /dev/null
+++ b/modules/nf-core/mmseqs/cluster/mmseqs-cluster.diff
@@ -0,0 +1,20 @@
+Changes in component 'nf-core/mmseqs/cluster'
+'modules/nf-core/mmseqs/cluster/meta.yml' is unchanged
+'modules/nf-core/mmseqs/cluster/environment.yml' is unchanged
+Changes in 'mmseqs/cluster/main.nf':
+--- modules/nf-core/mmseqs/cluster/main.nf
++++ modules/nf-core/mmseqs/cluster/main.nf
+@@ -8,7 +8,7 @@
+ 'biocontainers/mmseqs2:17.b804f--hd6d6fdc_1' }"
+
+ input:
+- tuple val(meta), path(db_input)
++ tuple val(meta), path(db_input, stageAs: "db_input")
+
+ output:
+ tuple val(meta), path("${prefix}/"), emit: db_cluster
+
+'modules/nf-core/mmseqs/cluster/tests/main.nf.test.snap' is unchanged
+'modules/nf-core/mmseqs/cluster/tests/nextflow.config' is unchanged
+'modules/nf-core/mmseqs/cluster/tests/main.nf.test' is unchanged
+************************************************************
diff --git a/modules/nf-core/mmseqs/cluster/tests/main.nf.test b/modules/nf-core/mmseqs/cluster/tests/main.nf.test
new file mode 100644
index 0000000..0788221
--- /dev/null
+++ b/modules/nf-core/mmseqs/cluster/tests/main.nf.test
@@ -0,0 +1,56 @@
+nextflow_process {
+
+ name "Test Process MMSEQS_CLUSTER"
+ script "../main.nf"
+ process "MMSEQS_CLUSTER"
+ config "./nextflow.config"
+ tag "modules"
+ tag "modules_nfcore"
+ tag "mmseqs"
+ tag "mmseqs/cluster"
+ tag "untar"
+
+ test("Should cluster an mmseqs database") {
+
+ setup {
+ run("UNTAR") {
+ script "modules/nf-core/untar/main.nf"
+ process {
+ """
+ input[0] = [ [id:'test'], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/mmseqs.tar.gz', checkIfExists: true) ]
+ """
+ }
+ }
+ }
+
+ when {
+ process {
+ """
+ input[0] = UNTAR.out.untar
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert process.out.db_cluster.size() == 1 },
+ {
+ def all_files = file(process.out.db_cluster[0][1]).listFiles()
+ def all_file_names = all_files.collect { it.name }.toSorted()
+ def stable_file_names = [
+ 'test_output_cluster.dbtype',
+ 'test_output_cluster.index'
+ ]
+ def stable_files = all_files.findAll { it.name in stable_file_names }.toSorted()
+
+ assert snapshot(
+ all_file_names,
+ stable_files,
+ process.out.versions[0]
+ ).match()
+ }
+ )
+ }
+ }
+}
diff --git a/modules/nf-core/mmseqs/cluster/tests/main.nf.test.snap b/modules/nf-core/mmseqs/cluster/tests/main.nf.test.snap
new file mode 100644
index 0000000..09bc83f
--- /dev/null
+++ b/modules/nf-core/mmseqs/cluster/tests/main.nf.test.snap
@@ -0,0 +1,22 @@
+{
+ "Should cluster an mmseqs database": {
+ "content": [
+ [
+ "test_output_cluster.0",
+ "test_output_cluster.1",
+ "test_output_cluster.dbtype",
+ "test_output_cluster.index"
+ ],
+ [
+ "test_output_cluster.dbtype:md5,b9d9c6dbc098c97ae446f612efd8eafd",
+ "test_output_cluster.index:md5,9848b52b6df827d80a04f7c71c50056b"
+ ],
+ "versions.yml:md5,8cd8fdc3e9d128b0a4dc634b8748c213"
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "24.10.3"
+ },
+ "timestamp": "2025-01-20T14:07:52.454357"
+ }
+}
\ No newline at end of file
diff --git a/modules/nf-core/mmseqs/cluster/tests/nextflow.config b/modules/nf-core/mmseqs/cluster/tests/nextflow.config
new file mode 100644
index 0000000..a43cb18
--- /dev/null
+++ b/modules/nf-core/mmseqs/cluster/tests/nextflow.config
@@ -0,0 +1,10 @@
+process {
+ withName: UNTAR {
+ publishDir = [ enabled : false ]
+ }
+
+ withName: MMSEQS_CLUSTER {
+ ext.prefix = "test_output_cluster"
+ ext.args = '--remove-tmp-files 1 -v 3 '
+ }
+}
diff --git a/modules/nf-core/mmseqs/createdb/environment.yml b/modules/nf-core/mmseqs/createdb/environment.yml
new file mode 100644
index 0000000..d356134
--- /dev/null
+++ b/modules/nf-core/mmseqs/createdb/environment.yml
@@ -0,0 +1,5 @@
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - bioconda::mmseqs2=17.b804f
diff --git a/modules/nf-core/mmseqs/createdb/main.nf b/modules/nf-core/mmseqs/createdb/main.nf
new file mode 100644
index 0000000..6f8d5b1
--- /dev/null
+++ b/modules/nf-core/mmseqs/createdb/main.nf
@@ -0,0 +1,64 @@
+process MMSEQS_CREATEDB {
+ tag "$meta.id"
+ label 'process_low'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/mmseqs2:17.b804f--hd6d6fdc_1':
+ 'biocontainers/mmseqs2:17.b804f--hd6d6fdc_1' }"
+
+ input:
+ tuple val(meta), path(sequence)
+
+ output:
+ tuple val(meta), path("${prefix}/"), emit: db
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+ prefix = task.ext.prefix ?: "${meta.id}"
+ def is_compressed = sequence.getExtension() == "gz" ? true : false
+ def sequence_name = is_compressed ? sequence.getBaseName() : sequence
+ """
+ if [ "${is_compressed}" == "true" ]; then
+ gzip -c -d ${sequence} > ${sequence_name}
+ fi
+
+ mkdir -p ${prefix}
+
+ mmseqs \\
+ createdb \\
+ ${sequence_name} \\
+ ${prefix}/${prefix} \\
+ $args
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //')
+ END_VERSIONS
+ """
+
+ stub:
+ def args = task.ext.args ?: ''
+ prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ mkdir -p ${prefix}
+
+ touch ${prefix}/${prefix}
+ touch ${prefix}/${prefix}.dbtype
+ touch ${prefix}/${prefix}.index
+ touch ${prefix}/${prefix}.lookup
+ touch ${prefix}/${prefix}.source
+ touch ${prefix}/${prefix}_h
+ touch ${prefix}/${prefix}_h.dbtype
+ touch ${prefix}/${prefix}_h.index
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //')
+ END_VERSIONS
+ """
+}
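Note: the module decompresses gzipped input transparently, so callers only supply a meta map and a FASTA/Q path. A hypothetical wiring sketch (the channel name and params.fasta are assumptions):

    ch_seqs = Channel.of([ [ id:'sample1' ], file(params.fasta) ])  // hypothetical input
    MMSEQS_CREATEDB(ch_seqs)
    MMSEQS_CREATEDB.out.db.view()  // emits [ meta, <prefix>/ ] containing the MMseqs2 database files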
diff --git a/modules/nf-core/mmseqs/createdb/meta.yml b/modules/nf-core/mmseqs/createdb/meta.yml
new file mode 100644
index 0000000..c392a36
--- /dev/null
+++ b/modules/nf-core/mmseqs/createdb/meta.yml
@@ -0,0 +1,51 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json
+name: "mmseqs_createdb"
+description: Create an MMseqs2 database from an existing FASTA/Q file
+keywords:
+ - protein sequence
+ - databases
+ - clustering
+ - searching
+ - indexing
+ - mmseqs2
+tools:
+ - "mmseqs":
+ description: "MMseqs2: ultra fast and sensitive sequence search and clustering
+ suite"
+ homepage: "https://github.com/soedinglab/MMseqs2"
+ documentation: "https://mmseqs.com/latest/userguide.pdf"
+ tool_dev_url: "https://github.com/soedinglab/MMseqs2"
+ doi: "10.1093/bioinformatics/btw006"
+ licence: ["GPL v3"]
+ identifier: biotools:mmseqs
+input:
+ - - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. `[ id:'test', single_end:false ]`
+ - sequence:
+ type: file
+        description: Input sequences in FASTA/Q (gzipped or uncompressed) format to parse
+ into an mmseqs database
+ pattern: "*.{fasta,fasta.gz,fa,fa.gz,fna,fna.gz,fastq,fastq.gz,fq,fq.gz}"
+output:
+ - db:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. `[ id:'test', single_end:false ]`
+ - ${prefix}/:
+ type: directory
+ description: The created MMseqs2 database
+ - versions:
+ - versions.yml:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+authors:
+ - "@Joon-Klaps"
+maintainers:
+ - "@Joon-Klaps"
+ - "@vagkaratzas"
diff --git a/modules/nf-core/mmseqs/createdb/tests/main.nf.test b/modules/nf-core/mmseqs/createdb/tests/main.nf.test
new file mode 100644
index 0000000..d4a4f0c
--- /dev/null
+++ b/modules/nf-core/mmseqs/createdb/tests/main.nf.test
@@ -0,0 +1,64 @@
+nextflow_process {
+
+ name "Test Process MMSEQS_CREATEDB"
+ script "../main.nf"
+ process "MMSEQS_CREATEDB"
+ tag "modules"
+ tag "modules_nfcore"
+ tag "mmseqs"
+ tag "mmseqs/createdb"
+
+ test("Should build an mmseqs db from a contigs fasta file") {
+
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test', single_end:false ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true)
+ ]
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(
+ process.out.db,
+ process.out.versions
+ ).match()
+ }
+ )
+ }
+
+ }
+
+ test("Should build an mmseqs db from a zipped amino acid sequence file") {
+
+ when {
+ process {
+ """
+
+ input[0] = [
+ [ id:'test' ],
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta.gz', checkIfExists: true)
+ ]
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(
+ process.out.db,
+ process.out.versions
+ ).match()
+ }
+ )
+ }
+
+ }
+
+}
diff --git a/modules/nf-core/mmseqs/createdb/tests/main.nf.test.snap b/modules/nf-core/mmseqs/createdb/tests/main.nf.test.snap
new file mode 100644
index 0000000..9eee149
--- /dev/null
+++ b/modules/nf-core/mmseqs/createdb/tests/main.nf.test.snap
@@ -0,0 +1,61 @@
+{
+ "Should build an mmseqs db from a contigs fasta file": {
+ "content": [
+ [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ [
+ "test:md5,a2cda8768736a7a317a09d61556194bd",
+ "test.dbtype:md5,4352d88a78aa39750bf70cd6f27bcaa5",
+ "test.index:md5,4ba298b011e2472ce9f6b99fe6b6e3d5",
+ "test.lookup:md5,32f88756dbcb6aaf7b239b0d61730f1b",
+ "test.source:md5,9ada5b3ea6e1a7e16c4418eb98ae8d9d",
+ "test_h:md5,21c399702a071bdeecce09f9d1df4531",
+ "test_h.dbtype:md5,740bab4f9ec8808aedb68d6b1281aeb2",
+ "test_h.index:md5,d767fb43b37c0a644c676b00f9f93477"
+ ]
+ ]
+ ],
+ [
+ "versions.yml:md5,c62b08152082097334109fe08ec6333a"
+ ]
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "24.10.3"
+ },
+ "timestamp": "2025-01-20T14:11:57.883871"
+ },
+ "Should build an mmseqs db from a zipped amino acid sequence file": {
+ "content": [
+ [
+ [
+ {
+ "id": "test"
+ },
+ [
+ "test:md5,1162504bc65aacf734abdcb0cdbe87de",
+ "test.dbtype:md5,f1d3ff8443297732862df21dc4e57262",
+ "test.index:md5,8cdcbc06c2b99fdb09f3d1735a76def9",
+ "test.lookup:md5,3e27cb93d9ee875ad42a6f32f5651bdc",
+ "test.source:md5,eaa64fc8a5f7ec1ee49b0dcbd1a72e9d",
+ "test_h:md5,f258f8cc04f83c270a75e8b00a6d2d89",
+ "test_h.dbtype:md5,740bab4f9ec8808aedb68d6b1281aeb2",
+ "test_h.index:md5,844bf1950bcd37284fdc5d7117ee4241"
+ ]
+ ]
+ ],
+ [
+ "versions.yml:md5,c62b08152082097334109fe08ec6333a"
+ ]
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "24.10.3"
+ },
+ "timestamp": "2025-01-20T14:12:10.986433"
+ }
+}
\ No newline at end of file
diff --git a/modules/nf-core/mmseqs/createtsv/environment.yml b/modules/nf-core/mmseqs/createtsv/environment.yml
new file mode 100644
index 0000000..d356134
--- /dev/null
+++ b/modules/nf-core/mmseqs/createtsv/environment.yml
@@ -0,0 +1,5 @@
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - bioconda::mmseqs2=17.b804f
diff --git a/modules/nf-core/mmseqs/createtsv/main.nf b/modules/nf-core/mmseqs/createtsv/main.nf
new file mode 100644
index 0000000..f3e3b95
--- /dev/null
+++ b/modules/nf-core/mmseqs/createtsv/main.nf
@@ -0,0 +1,62 @@
+
+process MMSEQS_CREATETSV {
+ tag "$meta.id"
+ label 'process_single'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/mmseqs2:17.b804f--hd6d6fdc_1':
+ 'biocontainers/mmseqs2:17.b804f--hd6d6fdc_1' }"
+
+ input:
+ tuple val(meta), path(db_result)
+ tuple val(meta2), path(db_query, stageAs: "db_query")
+ tuple val(meta3), path(db_target, stageAs: "db_target")
+
+ output:
+ tuple val(meta), path("*.tsv"), emit: tsv
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+    def args2 = task.ext.args2 ?: "*.dbtype" // database generated by mmseqs cluster | search | taxonomy | ...
+    def args3 = task.ext.args3 ?: "*.dbtype" // database generated by mmseqs/createdb
+    def args4 = task.ext.args4 ?: "*.dbtype" // database generated by mmseqs/createdb
+ def prefix = task.ext.prefix ?: "${meta.id}"
+
+ """
+    # Find the database files matching the args-specified suffix patterns, strip the suffix, then keep the longest common prefix of the names
+ DB_RESULT_PATH_NAME=\$(find -L "$db_result/" -maxdepth 1 -name "$args2" | sed 's/\\.[^.]*\$//' | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' )
+ DB_QUERY_PATH_NAME=\$(find -L "$db_query/" -maxdepth 1 -name "$args3" | sed 's/\\.[^.]*\$//' | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' )
+ DB_TARGET_PATH_NAME=\$(find -L "$db_target/" -maxdepth 1 -name "$args4" | sed 's/\\.[^.]*\$//' | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' )
+
+ mmseqs \\
+ createtsv \\
+ \$DB_QUERY_PATH_NAME \\
+ \$DB_TARGET_PATH_NAME \\
+ \$DB_RESULT_PATH_NAME \\
+ ${prefix}.tsv \\
+ $args \\
+ --threads ${task.cpus}
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //')
+ END_VERSIONS
+ """
+
+ stub:
+ def args = task.ext.args ?: ''
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ touch ${prefix}.tsv
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //')
+ END_VERSIONS
+ """
+}
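Note: the three inputs mirror the positional arguments of `mmseqs createtsv` (query, target, result); as the taxonomy test below shows, an empty tuple `[[:],[]]` can stand in for a database that is not needed. A hedged wiring sketch for the clustering case, where query and target are the same sequence database:

    MMSEQS_CREATETSV(
        MMSEQS_LINCLUST.out.db_cluster,  // result database
        MMSEQS_CREATEDB.out.db,          // query database
        MMSEQS_CREATEDB.out.db           // target database
    )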
diff --git a/modules/nf-core/mmseqs/createtsv/meta.yml b/modules/nf-core/mmseqs/createtsv/meta.yml
new file mode 100644
index 0000000..5a50ff3
--- /dev/null
+++ b/modules/nf-core/mmseqs/createtsv/meta.yml
@@ -0,0 +1,70 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json
+name: "mmseqs_createtsv"
+description: Create a TSV file from query, target and result MMseqs2 databases
+keywords:
+ - protein sequence
+ - databases
+ - clustering
+ - searching
+ - indexing
+ - mmseqs2
+ - tsv
+tools:
+ - "mmseqs":
+ description: "MMseqs2: ultra fast and sensitive sequence search and clustering
+ suite"
+ homepage: "https://github.com/soedinglab/MMseqs2"
+ documentation: "https://mmseqs.com/latest/userguide.pdf"
+ tool_dev_url: "https://github.com/soedinglab/MMseqs2"
+ doi: "10.1093/bioinformatics/btw006"
+ licence: ["GPL v3"]
+ identifier: biotools:mmseqs
+input:
+ # Only when we have meta
+ - - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. `[ id:'test', single_end:false ]`
+ - db_result:
+ type: directory
+ description: an MMseqs2 database with result data
+ - - meta2:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. `[ id:'test', single_end:false ]`
+ - db_query:
+ type: directory
+ description: an MMseqs2 database with query data
+ - - meta3:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. `[ id:'test', single_end:false ]`
+ - db_target:
+ type: directory
+ description: an MMseqs2 database with target data
+output:
+  # Only when we have meta
+ - tsv:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. `[ id:'test', single_end:false ]`
+ - "*.tsv":
+ type: file
+          description: The resulting TSV file created from the query, target and result
+            MMseqs2 databases
+ pattern: "*.{tsv}"
+ - versions:
+ - versions.yml:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+authors:
+ - "@Joon-Klaps"
+maintainers:
+ - "@Joon-Klaps"
diff --git a/modules/nf-core/mmseqs/createtsv/mmseqs-createtsv.diff b/modules/nf-core/mmseqs/createtsv/mmseqs-createtsv.diff
new file mode 100644
index 0000000..bcdb48a
--- /dev/null
+++ b/modules/nf-core/mmseqs/createtsv/mmseqs-createtsv.diff
@@ -0,0 +1,23 @@
+Changes in component 'nf-core/mmseqs/createtsv'
+'modules/nf-core/mmseqs/createtsv/meta.yml' is unchanged
+'modules/nf-core/mmseqs/createtsv/environment.yml' is unchanged
+Changes in 'mmseqs/createtsv/main.nf':
+--- modules/nf-core/mmseqs/createtsv/main.nf
++++ modules/nf-core/mmseqs/createtsv/main.nf
+@@ -10,8 +10,8 @@
+
+ input:
+ tuple val(meta), path(db_result)
+- tuple val(meta2), path(db_query)
+- tuple val(meta3), path(db_target)
++ tuple val(meta2), path(db_query, stageAs: "db_query")
++ tuple val(meta3), path(db_target, stageAs: "db_target")
+
+ output:
+ tuple val(meta), path("*.tsv"), emit: tsv
+
+'modules/nf-core/mmseqs/createtsv/tests/taxonomy.nextflow.config' is unchanged
+'modules/nf-core/mmseqs/createtsv/tests/cluster.nextflow.config' is unchanged
+'modules/nf-core/mmseqs/createtsv/tests/main.nf.test.snap' is unchanged
+'modules/nf-core/mmseqs/createtsv/tests/main.nf.test' is unchanged
+************************************************************
diff --git a/modules/nf-core/mmseqs/createtsv/tests/cluster.nextflow.config b/modules/nf-core/mmseqs/createtsv/tests/cluster.nextflow.config
new file mode 100644
index 0000000..48fee16
--- /dev/null
+++ b/modules/nf-core/mmseqs/createtsv/tests/cluster.nextflow.config
@@ -0,0 +1,6 @@
+process {
+
+ withName: MMSEQS_CREATETSV {
+ ext.args2 = '*_clu.dbtype'
+ }
+}
diff --git a/modules/nf-core/mmseqs/createtsv/tests/main.nf.test b/modules/nf-core/mmseqs/createtsv/tests/main.nf.test
new file mode 100644
index 0000000..1aa7463
--- /dev/null
+++ b/modules/nf-core/mmseqs/createtsv/tests/main.nf.test
@@ -0,0 +1,247 @@
+nextflow_process {
+
+ name "Test Process MMSEQS_CREATETSV"
+ script "../main.nf"
+ process "MMSEQS_CREATETSV"
+
+ tag "modules"
+ tag "modules_nfcore"
+ tag "mmseqs"
+ tag "mmseqs/taxonomy"
+ tag "mmseqs/createdb"
+ tag "mmseqs/databases"
+ tag "untar"
+ tag "mmseqs/createtsv"
+
+ test("mmseqs/createtsv - bacteroides_fragilis - taxonomy") {
+
+ config "./taxonomy.nextflow.config"
+
+ setup {
+ run("MMSEQS_CREATEDB", alias: "MMSEQS_TAXA") {
+ script "../../createdb/main.nf"
+ process {
+ """
+ input[0] = [
+ [ id:'test_query', single_end:false ],
+ file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz', checkIfExists: true)
+ ]
+ """
+ }
+ }
+ run("MMSEQS_DATABASES") {
+ script "../../databases/main.nf"
+ process {
+ """
+ input[0] = 'SILVA'
+ """
+ }
+ }
+ run("MMSEQS_TAXONOMY") {
+ script "../../taxonomy/main.nf"
+ process {
+ """
+ input[0] = MMSEQS_TAXA.out.db
+ input[1] = MMSEQS_DATABASES.out.database
+ """
+ }
+ }
+ }
+ when {
+ process {
+ """
+ input[0] = MMSEQS_TAXONOMY.out.db_taxonomy
+ input[1] = [[:],[]]
+ input[2] = MMSEQS_TAXA.out.db
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+ }
+
+ test("mmseqs/createtsv - sarscov2 - cluster") {
+
+ config "./cluster.nextflow.config"
+
+ setup {
+ run("UNTAR", alias: "UNTAR_QUERY") {
+ script "../../../untar/main.nf"
+ process {
+ """
+ input[0] = [
+ [ id:'test_query', single_end:true ],
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/mmseqs.tar.gz', checkIfExists: true),
+ ]
+ """
+ }
+ }
+ run("UNTAR", alias: "UNTAR_TARGET") {
+ script "../../../untar/main.nf"
+ process {
+ """
+ input[0] = [
+ [ id:'test_target', single_end:true ],
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/mmseqs.tar.gz', checkIfExists: true),
+ ]
+ """
+ }
+ }
+ run("UNTAR", alias: "UNTAR_RESULT") {
+ script "../../../untar/main.nf"
+ process {
+ """
+ input[0] = [
+ [ id:'test_result', single_end:true ],
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/mmseqs.tar.gz', checkIfExists: true),
+ ]
+ """
+ }
+ }
+ }
+
+ when {
+
+ process {
+ """
+ ch_query = UNTAR_QUERY.out.untar
+ ch_target = UNTAR_TARGET.out.untar
+ ch_result = UNTAR_RESULT.out.untar
+
+ input[0] = ch_result
+ input[1] = ch_query
+ input[2] = ch_target
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+ }
+
+ test("mmseqs/createtsv - bacteroides_fragilis - taxonomy - stub") {
+
+ options "-stub"
+ config "./taxonomy.nextflow.config"
+
+ setup {
+ run("MMSEQS_CREATEDB", alias: "MMSEQS_TAXA") {
+ script "../../createdb/main.nf"
+ process {
+ """
+ input[0] = [
+ [ id:'test_query', single_end:false ],
+ file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz', checkIfExists: true)
+ ]
+ """
+ }
+ }
+ run("MMSEQS_DATABASES") {
+ script "../../databases/main.nf"
+ process {
+ """
+ input[0] = 'SILVA'
+ """
+ }
+ }
+ run("MMSEQS_TAXONOMY") {
+ script "../../taxonomy/main.nf"
+ process {
+ """
+ input[0] = MMSEQS_TAXA.out.db
+ input[1] = MMSEQS_DATABASES.out.database
+ """
+ }
+ }
+ }
+ when {
+ process {
+ """
+ input[0] = MMSEQS_TAXONOMY.out.db_taxonomy
+ input[1] = [[:],[]]
+ input[2] = MMSEQS_TAXA.out.db
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+ }
+
+ test("mmseqs/createtsv - sarscov2 - cluster - stub") {
+
+ options "-stub"
+ config "./cluster.nextflow.config"
+
+ setup {
+ run("UNTAR", alias: "UNTAR_QUERY") {
+ script "../../../untar/main.nf"
+ process {
+ """
+ input[0] = [
+ [ id:'test_query', single_end:true ],
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/mmseqs.tar.gz', checkIfExists: true),
+ ]
+ """
+ }
+ }
+ run("UNTAR", alias: "UNTAR_TARGET") {
+ script "../../../untar/main.nf"
+ process {
+ """
+ input[0] = [
+ [ id:'test_target', single_end:true ],
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/mmseqs.tar.gz', checkIfExists: true),
+ ]
+ """
+ }
+ }
+ run("UNTAR", alias: "UNTAR_RESULT") {
+ script "../../../untar/main.nf"
+ process {
+ """
+ input[0] = [
+ [ id:'test_result', single_end:true ],
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/mmseqs.tar.gz', checkIfExists: true),
+ ]
+ """
+ }
+ }
+ }
+
+ when {
+
+ process {
+ """
+ ch_query = UNTAR_QUERY.out.untar
+ ch_target = UNTAR_TARGET.out.untar
+ ch_result = UNTAR_RESULT.out.untar
+
+ input[0] = ch_result
+ input[1] = ch_query
+ input[2] = ch_target
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+ }
+}
\ No newline at end of file
diff --git a/modules/nf-core/mmseqs/createtsv/tests/main.nf.test.snap b/modules/nf-core/mmseqs/createtsv/tests/main.nf.test.snap
new file mode 100644
index 0000000..a70f839
--- /dev/null
+++ b/modules/nf-core/mmseqs/createtsv/tests/main.nf.test.snap
@@ -0,0 +1,142 @@
+{
+ "mmseqs/createtsv - bacteroides_fragilis - taxonomy - stub": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test_query",
+ "single_end": false
+ },
+ "test_query.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,ce808eb9a57e201a48afec56168f9e77"
+ ],
+ "tsv": [
+ [
+ {
+ "id": "test_query",
+ "single_end": false
+ },
+ "test_query.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,ce808eb9a57e201a48afec56168f9e77"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "24.10.3"
+ },
+ "timestamp": "2025-01-20T17:29:15.220926"
+ },
+ "mmseqs/createtsv - sarscov2 - cluster - stub": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test_result",
+ "single_end": true
+ },
+ "test_result.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,ce808eb9a57e201a48afec56168f9e77"
+ ],
+ "tsv": [
+ [
+ {
+ "id": "test_result",
+ "single_end": true
+ },
+ "test_result.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,ce808eb9a57e201a48afec56168f9e77"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "24.10.3"
+ },
+ "timestamp": "2025-01-20T17:29:32.089204"
+ },
+ "mmseqs/createtsv - bacteroides_fragilis - taxonomy": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test_query",
+ "single_end": false
+ },
+ "test_query.tsv:md5,9179f5c85b8b87a4dc998c9d17840161"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,ce808eb9a57e201a48afec56168f9e77"
+ ],
+ "tsv": [
+ [
+ {
+ "id": "test_query",
+ "single_end": false
+ },
+ "test_query.tsv:md5,9179f5c85b8b87a4dc998c9d17840161"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,ce808eb9a57e201a48afec56168f9e77"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "24.10.3"
+ },
+ "timestamp": "2025-01-20T17:28:41.472818"
+ },
+ "mmseqs/createtsv - sarscov2 - cluster": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test_result",
+ "single_end": true
+ },
+ "test_result.tsv:md5,c81449fb936b76aad6f925b965e84bc5"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,ce808eb9a57e201a48afec56168f9e77"
+ ],
+ "tsv": [
+ [
+ {
+ "id": "test_result",
+ "single_end": true
+ },
+ "test_result.tsv:md5,c81449fb936b76aad6f925b965e84bc5"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,ce808eb9a57e201a48afec56168f9e77"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "24.10.3"
+ },
+ "timestamp": "2025-01-20T17:28:58.633976"
+ }
+}
\ No newline at end of file
diff --git a/modules/nf-core/mmseqs/createtsv/tests/taxonomy.nextflow.config b/modules/nf-core/mmseqs/createtsv/tests/taxonomy.nextflow.config
new file mode 100644
index 0000000..f08205d
--- /dev/null
+++ b/modules/nf-core/mmseqs/createtsv/tests/taxonomy.nextflow.config
@@ -0,0 +1,7 @@
+process {
+
+ withName: MMSEQS_TAXONOMY {
+ ext.args = '--search-type 2'
+ }
+
+}
diff --git a/modules/nf-core/mmseqs/linclust/environment.yml b/modules/nf-core/mmseqs/linclust/environment.yml
new file mode 100644
index 0000000..d356134
--- /dev/null
+++ b/modules/nf-core/mmseqs/linclust/environment.yml
@@ -0,0 +1,5 @@
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - bioconda::mmseqs2=17.b804f
diff --git a/modules/nf-core/mmseqs/linclust/main.nf b/modules/nf-core/mmseqs/linclust/main.nf
new file mode 100644
index 0000000..128453f
--- /dev/null
+++ b/modules/nf-core/mmseqs/linclust/main.nf
@@ -0,0 +1,60 @@
+process MMSEQS_LINCLUST {
+ tag "$meta.id"
+ label 'process_high'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/mmseqs2:17.b804f--hd6d6fdc_1':
+ 'biocontainers/mmseqs2:17.b804f--hd6d6fdc_1' }"
+
+ input:
+ tuple val(meta), path(db_input, stageAs: "db_input")
+
+ output:
+ tuple val(meta), path("${prefix}/"), emit: db_cluster
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+ def args2 = task.ext.args2 ?: "*.dbtype"
+ prefix = task.ext.prefix ?: "${meta.id}"
+ if ("$db_input" == "${prefix}") error "Input and output names of databases are the same, set prefix in module configuration to disambiguate!"
+
+ """
+ mkdir -p ${prefix}
+    # Find the database files matching the args2 suffix pattern, strip the suffix, then keep the longest common prefix of the names
+ DB_INPUT_PATH_NAME=\$(find -L "$db_input/" -maxdepth 1 -name "$args2" | sed 's/\\.[^.]*\$//' | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' )
+
+ mmseqs \\
+ linclust \\
+ \$DB_INPUT_PATH_NAME \\
+ ${prefix}/${prefix} \\
+ tmp1 \\
+ $args \\
+ --threads ${task.cpus}
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //')
+ END_VERSIONS
+ """
+
+ stub:
+ def args = task.ext.args ?: ''
+ prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ mkdir -p ${prefix}
+
+ touch ${prefix}/${prefix}.{0..9}
+ touch ${prefix}/${prefix}.dbtype
+ touch ${prefix}/${prefix}.index
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //')
+ END_VERSIONS
+ """
+}
diff --git a/modules/nf-core/mmseqs/linclust/meta.yml b/modules/nf-core/mmseqs/linclust/meta.yml
new file mode 100644
index 0000000..986e2d8
--- /dev/null
+++ b/modules/nf-core/mmseqs/linclust/meta.yml
@@ -0,0 +1,48 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json
+name: "mmseqs_linclust"
+description: Cluster sequences in linear time using MMseqs2 linclust.
+keywords:
+ - protein sequence
+ - databases
+ - clustering
+ - searching
+ - indexing
+ - mmseqs2
+tools:
+ - "mmseqs":
+ description: "MMseqs2: ultra fast and sensitive sequence search and clustering
+ suite"
+ homepage: "https://github.com/soedinglab/MMseqs2"
+ documentation: "https://mmseqs.com/latest/userguide.pdf"
+ tool_dev_url: "https://github.com/soedinglab/MMseqs2"
+ doi: "10.1093/bioinformatics/btw006"
+ licence: ["GPL v3"]
+ identifier: biotools:mmseqs
+input:
+ - - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. `[ id:'test', single_end:false ]`
+ - db_input:
+        type: directory
+        description: Input MMseqs2 database
+output:
+ - db_cluster:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. `[ id:'test', single_end:false ]`
+ - ${prefix}/:
+ type: directory
+          description: The clustered MMseqs2 database
+ - versions:
+ - versions.yml:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+authors:
+ - "@vagkaratzas"
+maintainers:
+ - "@vagkaratzas"
diff --git a/modules/nf-core/mmseqs/linclust/mmseqs-linclust.diff b/modules/nf-core/mmseqs/linclust/mmseqs-linclust.diff
new file mode 100644
index 0000000..de5a13e
--- /dev/null
+++ b/modules/nf-core/mmseqs/linclust/mmseqs-linclust.diff
@@ -0,0 +1,20 @@
+Changes in component 'nf-core/mmseqs/linclust'
+'modules/nf-core/mmseqs/linclust/meta.yml' is unchanged
+'modules/nf-core/mmseqs/linclust/environment.yml' is unchanged
+Changes in 'mmseqs/linclust/main.nf':
+--- modules/nf-core/mmseqs/linclust/main.nf
++++ modules/nf-core/mmseqs/linclust/main.nf
+@@ -8,7 +8,7 @@
+ 'biocontainers/mmseqs2:17.b804f--hd6d6fdc_1' }"
+
+ input:
+- tuple val(meta), path(db_input)
++ tuple val(meta), path(db_input, stageAs: "db_input")
+
+ output:
+ tuple val(meta), path("${prefix}/"), emit: db_cluster
+
+'modules/nf-core/mmseqs/linclust/tests/main.nf.test.snap' is unchanged
+'modules/nf-core/mmseqs/linclust/tests/nextflow.config' is unchanged
+'modules/nf-core/mmseqs/linclust/tests/main.nf.test' is unchanged
+************************************************************
diff --git a/modules/nf-core/mmseqs/linclust/tests/main.nf.test b/modules/nf-core/mmseqs/linclust/tests/main.nf.test
new file mode 100644
index 0000000..9a3b6de
--- /dev/null
+++ b/modules/nf-core/mmseqs/linclust/tests/main.nf.test
@@ -0,0 +1,56 @@
+nextflow_process {
+
+ name "Test Process MMSEQS_LINCLUST"
+ script "../main.nf"
+ process "MMSEQS_LINCLUST"
+ config "./nextflow.config"
+ tag "modules"
+ tag "modules_nfcore"
+ tag "mmseqs"
+ tag "mmseqs/linclust"
+ tag "untar"
+
+ test("Should cluster an mmseqs database") {
+
+ setup {
+ run("UNTAR") {
+ script "modules/nf-core/untar/main.nf"
+ process {
+ """
+ input[0] = [ [id:'test'], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/mmseqs.tar.gz', checkIfExists: true) ]
+ """
+ }
+ }
+ }
+
+ when {
+ process {
+ """
+ input[0] = UNTAR.out.untar
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert process.out.db_cluster.size() == 1 },
+ {
+ def all_files = file(process.out.db_cluster[0][1]).listFiles()
+ def all_file_names = all_files.collect { it.name }.toSorted()
+ def stable_file_names = [
+ 'test_output_cluster.dbtype',
+ 'test_output_cluster.index'
+ ]
+ def stable_files = all_files.findAll { it.name in stable_file_names }.toSorted()
+
+ assert snapshot(
+ all_file_names,
+ stable_files,
+ process.out.versions[0]
+ ).match()
+ }
+ )
+ }
+ }
+}
diff --git a/modules/nf-core/mmseqs/linclust/tests/main.nf.test.snap b/modules/nf-core/mmseqs/linclust/tests/main.nf.test.snap
new file mode 100644
index 0000000..ab999b3
--- /dev/null
+++ b/modules/nf-core/mmseqs/linclust/tests/main.nf.test.snap
@@ -0,0 +1,22 @@
+{
+ "Should cluster an mmseqs database": {
+ "content": [
+ [
+ "test_output_cluster.0",
+ "test_output_cluster.1",
+ "test_output_cluster.dbtype",
+ "test_output_cluster.index"
+ ],
+ [
+ "test_output_cluster.dbtype:md5,b9d9c6dbc098c97ae446f612efd8eafd",
+ "test_output_cluster.index:md5,a0b78f31aee2c327d72f32919814baf1"
+ ],
+ "versions.yml:md5,acd9806d081cf15f08f26263bd9dfe80"
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "24.10.3"
+ },
+ "timestamp": "2025-01-20T18:13:05.966684"
+ }
+}
\ No newline at end of file
diff --git a/modules/nf-core/mmseqs/linclust/tests/nextflow.config b/modules/nf-core/mmseqs/linclust/tests/nextflow.config
new file mode 100644
index 0000000..c9259b8
--- /dev/null
+++ b/modules/nf-core/mmseqs/linclust/tests/nextflow.config
@@ -0,0 +1,10 @@
+process {
+ withName: UNTAR {
+ publishDir = [ enabled : false ]
+ }
+
+ withName: MMSEQS_LINCLUST {
+ ext.prefix = "test_output_cluster"
+ ext.args = '--remove-tmp-files 1 -v 0 '
+ }
+}
diff --git a/modules/nf-core/seqkit/seq/environment.yml b/modules/nf-core/seqkit/seq/environment.yml
new file mode 100644
index 0000000..160a67c
--- /dev/null
+++ b/modules/nf-core/seqkit/seq/environment.yml
@@ -0,0 +1,7 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - "bioconda::seqkit=2.9.0"
diff --git a/modules/nf-core/seqkit/seq/main.nf b/modules/nf-core/seqkit/seq/main.nf
new file mode 100644
index 0000000..9d76da2
--- /dev/null
+++ b/modules/nf-core/seqkit/seq/main.nf
@@ -0,0 +1,63 @@
+process SEQKIT_SEQ {
+ tag "$meta.id"
+ label 'process_low'
+ // File IO can be a bottleneck. See: https://bioinf.shenwei.me/seqkit/usage/#parallelization-of-cpu-intensive-jobs
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/seqkit:2.9.0--h9ee0642_0':
+ 'biocontainers/seqkit:2.9.0--h9ee0642_0' }"
+
+ input:
+ tuple val(meta), path(fastx)
+
+ output:
+ tuple val(meta), path("${prefix}.*") , emit: fastx
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+ def args2 = task.ext.args2 ?: ''
+ prefix = task.ext.prefix ?: "${meta.id}"
+ def extension = "fastq"
+ if ("$fastx" ==~ /.+\.fasta|.+\.fasta.gz|.+\.fa|.+\.fa.gz|.+\.fas|.+\.fas.gz|.+\.fna|.+\.fna.gz|.+\.fsa|.+\.fsa.gz/ ) {
+ extension = "fasta"
+ }
+ extension = fastx.toString().endsWith('.gz') ? "${extension}.gz" : extension
+ def call_gzip = extension.endsWith('.gz') ? "| gzip -c $args2" : ''
+ if("${prefix}.${extension}" == "$fastx") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
+ """
+ seqkit \\
+ seq \\
+ --threads $task.cpus \\
+ $args \\
+ $fastx \\
+ $call_gzip \\
+ > ${prefix}.${extension}
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ seqkit: \$(seqkit version | cut -d' ' -f2)
+ END_VERSIONS
+ """
+
+ stub:
+ prefix = task.ext.prefix ?: "${meta.id}"
+ def extension = "fastq"
+ if ("$fastx" ==~ /.+\.fasta|.+\.fasta.gz|.+\.fa|.+\.fa.gz|.+\.fas|.+\.fas.gz|.+\.fna|.+\.fna.gz|.+\.fsa|.+\.fsa.gz/ ) {
+ extension = "fasta"
+ }
+ extension = fastx.toString().endsWith('.gz') ? "${extension}.gz" : extension
+ if("${prefix}.${extension}" == "$fastx") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
+ """
+ touch ${prefix}.${extension}
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ seqkit: \$(seqkit version | cut -d' ' -f2)
+ END_VERSIONS
+ """
+}
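Note: behaviour is controlled through `ext.args` (passed to `seqkit seq`) and `ext.args2` (appended to the optional gzip call). A configuration sketch that keeps only sequences of length 100 or more, assuming seqkit's standard `--min-len` flag (values illustrative):

    process {
        withName: 'SEQKIT_SEQ' {
            ext.prefix = { "${meta.id}.filtered" }  // must differ from the input file name
            ext.args   = '--min-len 100'            // drop sequences shorter than 100
        }
    }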
diff --git a/modules/nf-core/seqkit/seq/meta.yml b/modules/nf-core/seqkit/seq/meta.yml
new file mode 100644
index 0000000..7d32aba
--- /dev/null
+++ b/modules/nf-core/seqkit/seq/meta.yml
@@ -0,0 +1,51 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+name: "seqkit_seq"
+description: Transforms sequences (extract ID, filter by length, remove gaps, reverse
+ complement...)
+keywords:
+ - genomics
+ - fasta
+ - fastq
+ - transform
+ - filter
+ - gaps
+ - complement
+tools:
+ - "seqkit":
+ description: "A cross-platform and ultrafast toolkit for FASTA/Q file manipulation"
+ homepage: "https://bioinf.shenwei.me/seqkit/"
+ documentation: "https://bioinf.shenwei.me/seqkit/usage/"
+ tool_dev_url: "https://github.com/shenwei356/seqkit"
+ doi: "10.1371/journal.pone.0163962"
+ licence: ["MIT"]
+ identifier: biotools:seqkit
+input:
+ - - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. `[ id:'sample1' ]`
+ - fastx:
+ type: file
+ description: Input fasta/fastq file
+ pattern: "*.{fsa,fas,fa,fasta,fastq,fq,fsa.gz,fas.gz,fa.gz,fasta.gz,fastq.gz,fq.gz}"
+output:
+ - fastx:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. `[ id:'sample1' ]`
+ - ${prefix}.*:
+ type: file
+ description: Output fasta/fastq file
+ pattern: "*.{fasta,fasta.gz,fastq,fastq.gz}"
+ - versions:
+ - versions.yml:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+authors:
+ - "@GallVp"
+maintainers:
+ - "@GallVp"
diff --git a/modules/nf-core/seqkit/seq/tests/main.nf.test b/modules/nf-core/seqkit/seq/tests/main.nf.test
new file mode 100644
index 0000000..9fd1c08
--- /dev/null
+++ b/modules/nf-core/seqkit/seq/tests/main.nf.test
@@ -0,0 +1,145 @@
+nextflow_process {
+
+ name "Test Process SEQKIT_SEQ"
+ script "../main.nf"
+ process "SEQKIT_SEQ"
+ config './nextflow.config'
+
+ tag "modules"
+ tag "modules_nfcore"
+ tag "seqkit"
+ tag "seqkit/seq"
+
+ test("sarscov2-genome_fasta") {
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test' ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
+ ]
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() },
+ )
+ }
+
+ }
+
+ test("sarscov2-genome_fasta_gz") {
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test' ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true)
+ ]
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() },
+ )
+ }
+
+ }
+
+ test("sarscov2-test_1_fastq_gz") {
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test' ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true)
+ ]
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() },
+ )
+ }
+
+ }
+
+ test("file_name_conflict-fail_with_error") {
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test_1' ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true)
+ ]
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert !process.success },
+ { assert process.stdout.toString().contains("Input and output names are the same") }
+ )
+ }
+
+ }
+
+ test("sarscov2-genome_fasta-stub") {
+
+ options '-stub'
+
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test' ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
+ ]
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() },
+ )
+ }
+
+ }
+
+ test("file_name_conflict-fail_with_error-stub") {
+
+ options '-stub'
+
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'genome' ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
+ ]
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert !process.success },
+ { assert process.stdout.toString().contains("Input and output names are the same") }
+ )
+ }
+
+ }
+
+}
diff --git a/modules/nf-core/seqkit/seq/tests/main.nf.test.snap b/modules/nf-core/seqkit/seq/tests/main.nf.test.snap
new file mode 100644
index 0000000..6817193
--- /dev/null
+++ b/modules/nf-core/seqkit/seq/tests/main.nf.test.snap
@@ -0,0 +1,134 @@
+{
+ "sarscov2-genome_fasta-stub": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test"
+ },
+ "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,eeb475e557ef671d4b58e11f82d2448e"
+ ],
+ "fastx": [
+ [
+ {
+ "id": "test"
+ },
+ "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,eeb475e557ef671d4b58e11f82d2448e"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "24.10.3"
+ },
+ "timestamp": "2025-01-15T15:13:34.513457"
+ },
+ "sarscov2-test_1_fastq_gz": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test"
+ },
+ "test.fastq.gz:md5,4161df271f9bfcd25d5845a1e220dbec"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,eeb475e557ef671d4b58e11f82d2448e"
+ ],
+ "fastx": [
+ [
+ {
+ "id": "test"
+ },
+ "test.fastq.gz:md5,4161df271f9bfcd25d5845a1e220dbec"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,eeb475e557ef671d4b58e11f82d2448e"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "24.10.3"
+ },
+ "timestamp": "2025-01-15T15:13:27.316329"
+ },
+ "sarscov2-genome_fasta": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test"
+ },
+ "test.fasta:md5,483f4a5dfe60171c86ee9b7e6dff908b"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,eeb475e557ef671d4b58e11f82d2448e"
+ ],
+ "fastx": [
+ [
+ {
+ "id": "test"
+ },
+ "test.fasta:md5,483f4a5dfe60171c86ee9b7e6dff908b"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,eeb475e557ef671d4b58e11f82d2448e"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "24.10.3"
+ },
+ "timestamp": "2025-01-15T15:13:18.463038"
+ },
+ "sarscov2-genome_fasta_gz": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test"
+ },
+ "test.fasta.gz:md5,483f4a5dfe60171c86ee9b7e6dff908b"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,eeb475e557ef671d4b58e11f82d2448e"
+ ],
+ "fastx": [
+ [
+ {
+ "id": "test"
+ },
+ "test.fasta.gz:md5,483f4a5dfe60171c86ee9b7e6dff908b"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,eeb475e557ef671d4b58e11f82d2448e"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "24.10.3"
+ },
+ "timestamp": "2025-01-15T15:13:22.960973"
+ }
+}
\ No newline at end of file
diff --git a/modules/nf-core/seqkit/seq/tests/nextflow.config b/modules/nf-core/seqkit/seq/tests/nextflow.config
new file mode 100644
index 0000000..d8e3c66
--- /dev/null
+++ b/modules/nf-core/seqkit/seq/tests/nextflow.config
@@ -0,0 +1,3 @@
+process {
+ ext.args2 = '-n'
+}
diff --git a/modules/nf-core/seqkit/seq/tests/tags.yml b/modules/nf-core/seqkit/seq/tests/tags.yml
new file mode 100644
index 0000000..5eeca7e
--- /dev/null
+++ b/modules/nf-core/seqkit/seq/tests/tags.yml
@@ -0,0 +1,2 @@
+seqkit/seq:
+ - "modules/nf-core/seqkit/seq/**"
diff --git a/modules/nf-core/untar/environment.yml b/modules/nf-core/untar/environment.yml
new file mode 100644
index 0000000..ae4fa45
--- /dev/null
+++ b/modules/nf-core/untar/environment.yml
@@ -0,0 +1,10 @@
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - conda-forge::coreutils=9.5
+ - conda-forge::grep=3.11
+ - conda-forge::gzip=1.13
+ - conda-forge::lbzip2=2.5
+ - conda-forge::sed=4.8
+ - conda-forge::tar=1.34
diff --git a/modules/nf-core/untar/main.nf b/modules/nf-core/untar/main.nf
new file mode 100644
index 0000000..e712ebe
--- /dev/null
+++ b/modules/nf-core/untar/main.nf
@@ -0,0 +1,84 @@
+process UNTAR {
+ tag "${archive}"
+ label 'process_single'
+
+ conda "${moduleDir}/environment.yml"
+ container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
+ ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/52/52ccce28d2ab928ab862e25aae26314d69c8e38bd41ca9431c67ef05221348aa/data'
+ : 'community.wave.seqera.io/library/coreutils_grep_gzip_lbzip2_pruned:838ba80435a629f8'}"
+
+ input:
+ tuple val(meta), path(archive)
+
+ output:
+ tuple val(meta), path("${prefix}"), emit: untar
+ path "versions.yml", emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+ def args2 = task.ext.args2 ?: ''
+ prefix = task.ext.prefix ?: (meta.id ? "${meta.id}" : archive.baseName.toString().replaceFirst(/\.tar$/, ""))
+
+ """
+ mkdir ${prefix}
+
+    ## Ensure --strip-components is only applied when the top level of the tar contents is a single directory
+ ## If just files or multiple directories, place all in prefix
+ if [[ \$(tar -taf ${archive} | grep -o -P "^.*?\\/" | uniq | wc -l) -eq 1 ]]; then
+ tar \\
+ -C ${prefix} --strip-components 1 \\
+ -xavf \\
+ ${args} \\
+ ${archive} \\
+ ${args2}
+ else
+ tar \\
+ -C ${prefix} \\
+ -xavf \\
+ ${args} \\
+ ${archive} \\
+ ${args2}
+ fi
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//')
+ END_VERSIONS
+ """
+
+ stub:
+ prefix = task.ext.prefix ?: (meta.id ? "${meta.id}" : archive.toString().replaceFirst(/\.[^\.]+(.gz)?$/, ""))
+ """
+ mkdir ${prefix}
+    ## Dry-run untarring the archive to list the files and place them all in prefix
+ if [[ \$(tar -taf ${archive} | grep -o -P "^.*?\\/" | uniq | wc -l) -eq 1 ]]; then
+ for i in `tar -tf ${archive}`;
+ do
+ if [[ \$(echo "\${i}" | grep -E "/\$") == "" ]];
+ then
+ touch \${i}
+ else
+ mkdir -p \${i}
+ fi
+ done
+ else
+ for i in `tar -tf ${archive}`;
+ do
+ if [[ \$(echo "\${i}" | grep -E "/\$") == "" ]];
+ then
+ touch ${prefix}/\${i}
+ else
+ mkdir -p ${prefix}/\${i}
+ fi
+ done
+ fi
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//')
+ END_VERSIONS
+ """
+}
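Note: the single-top-level-directory check above gives callers one predictable output directory no matter how the archive was packed. A hypothetical usage sketch (the archive path is illustrative):

    ch_archive = Channel.of([ [ id:'mmseqs_db' ], file('mmseqs.tar.gz') ])  // hypothetical archive
    UNTAR(ch_archive)
    UNTAR.out.untar.view()  // emits [ meta, mmseqs_db/ ] with the extracted contents inside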
diff --git a/modules/nf-core/untar/meta.yml b/modules/nf-core/untar/meta.yml
new file mode 100644
index 0000000..3a37bb3
--- /dev/null
+++ b/modules/nf-core/untar/meta.yml
@@ -0,0 +1,52 @@
+name: untar
+description: Extract files.
+keywords:
+ - untar
+ - uncompress
+ - extract
+tools:
+ - untar:
+ description: |
+ Extract tar.gz files.
+ documentation: https://www.gnu.org/software/tar/manual/
+ licence: ["GPL-3.0-or-later"]
+ identifier: ""
+input:
+ - - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - archive:
+ type: file
+      description: File to be untarred
+ pattern: "*.{tar}.{gz}"
+output:
+ - untar:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - ${prefix}:
+          type: directory
+          description: Directory containing the extracted contents of the archive
+ pattern: "*/"
+ - versions:
+ - versions.yml:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+authors:
+ - "@joseespinosa"
+ - "@drpatelh"
+ - "@matthdsm"
+ - "@jfy133"
+maintainers:
+ - "@joseespinosa"
+ - "@drpatelh"
+ - "@matthdsm"
+ - "@jfy133"
diff --git a/modules/nf-core/untar/tests/main.nf.test b/modules/nf-core/untar/tests/main.nf.test
new file mode 100644
index 0000000..c957517
--- /dev/null
+++ b/modules/nf-core/untar/tests/main.nf.test
@@ -0,0 +1,85 @@
+nextflow_process {
+
+ name "Test Process UNTAR"
+ script "../main.nf"
+ process "UNTAR"
+ tag "modules"
+ tag "modules_nfcore"
+ tag "untar"
+
+ test("test_untar") {
+
+ when {
+ process {
+ """
+ input[0] = [ [], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/kraken2.tar.gz', checkIfExists: true) ]
+ """
+ }
+ }
+
+ then {
+ assertAll (
+ { assert process.success },
+ { assert snapshot(process.out).match() },
+ )
+ }
+ }
+
+ test("test_untar_onlyfiles") {
+
+ when {
+ process {
+ """
+ input[0] = [ [], file(params.modules_testdata_base_path + 'generic/tar/hello.tar.gz', checkIfExists: true) ]
+ """
+ }
+ }
+
+ then {
+ assertAll (
+ { assert process.success },
+ { assert snapshot(process.out).match() },
+ )
+ }
+ }
+
+ test("test_untar - stub") {
+
+ options "-stub"
+
+ when {
+ process {
+ """
+ input[0] = [ [], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/kraken2.tar.gz', checkIfExists: true) ]
+ """
+ }
+ }
+
+ then {
+ assertAll (
+ { assert process.success },
+ { assert snapshot(process.out).match() },
+ )
+ }
+ }
+
+ test("test_untar_onlyfiles - stub") {
+
+ options "-stub"
+
+ when {
+ process {
+ """
+ input[0] = [ [], file(params.modules_testdata_base_path + 'generic/tar/hello.tar.gz', checkIfExists: true) ]
+ """
+ }
+ }
+
+ then {
+ assertAll (
+ { assert process.success },
+ { assert snapshot(process.out).match() },
+ )
+ }
+ }
+}
diff --git a/modules/nf-core/untar/tests/main.nf.test.snap b/modules/nf-core/untar/tests/main.nf.test.snap
new file mode 100644
index 0000000..ceb91b7
--- /dev/null
+++ b/modules/nf-core/untar/tests/main.nf.test.snap
@@ -0,0 +1,158 @@
+{
+ "test_untar_onlyfiles": {
+ "content": [
+ {
+ "0": [
+ [
+ [
+
+ ],
+ [
+ "hello.txt:md5,e59ff97941044f85df5297e1c302d260"
+ ]
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,6063247258c56fd271d076bb04dd7536"
+ ],
+ "untar": [
+ [
+ [
+
+ ],
+ [
+ "hello.txt:md5,e59ff97941044f85df5297e1c302d260"
+ ]
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,6063247258c56fd271d076bb04dd7536"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "24.04.3"
+ },
+ "timestamp": "2024-07-10T12:04:28.231047"
+ },
+ "test_untar_onlyfiles - stub": {
+ "content": [
+ {
+ "0": [
+ [
+ [
+
+ ],
+ [
+ "hello.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,6063247258c56fd271d076bb04dd7536"
+ ],
+ "untar": [
+ [
+ [
+
+ ],
+ [
+ "hello.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,6063247258c56fd271d076bb04dd7536"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "24.04.3"
+ },
+ "timestamp": "2024-07-10T12:04:45.773103"
+ },
+ "test_untar - stub": {
+ "content": [
+ {
+ "0": [
+ [
+ [
+
+ ],
+ [
+ "hash.k2d:md5,d41d8cd98f00b204e9800998ecf8427e",
+ "opts.k2d:md5,d41d8cd98f00b204e9800998ecf8427e",
+ "taxo.k2d:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,6063247258c56fd271d076bb04dd7536"
+ ],
+ "untar": [
+ [
+ [
+
+ ],
+ [
+ "hash.k2d:md5,d41d8cd98f00b204e9800998ecf8427e",
+ "opts.k2d:md5,d41d8cd98f00b204e9800998ecf8427e",
+ "taxo.k2d:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,6063247258c56fd271d076bb04dd7536"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "24.04.3"
+ },
+ "timestamp": "2024-07-10T12:04:36.777441"
+ },
+ "test_untar": {
+ "content": [
+ {
+ "0": [
+ [
+ [
+
+ ],
+ [
+ "hash.k2d:md5,8b8598468f54a7087c203ad0190555d9",
+ "opts.k2d:md5,a033d00cf6759407010b21700938f543",
+ "taxo.k2d:md5,094d5891cdccf2f1468088855c214b2c"
+ ]
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,6063247258c56fd271d076bb04dd7536"
+ ],
+ "untar": [
+ [
+ [
+
+ ],
+ [
+ "hash.k2d:md5,8b8598468f54a7087c203ad0190555d9",
+ "opts.k2d:md5,a033d00cf6759407010b21700938f543",
+ "taxo.k2d:md5,094d5891cdccf2f1468088855c214b2c"
+ ]
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,6063247258c56fd271d076bb04dd7536"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "24.04.3"
+ },
+ "timestamp": "2024-07-10T12:04:19.377674"
+ }
+}
\ No newline at end of file
diff --git a/modules/nf-core/untar/tests/tags.yml b/modules/nf-core/untar/tests/tags.yml
new file mode 100644
index 0000000..feb6f15
--- /dev/null
+++ b/modules/nf-core/untar/tests/tags.yml
@@ -0,0 +1,2 @@
+untar:
+ - modules/nf-core/untar/**
diff --git a/nextflow.config b/nextflow.config
index 011f7d4..e81d18f 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -9,10 +9,37 @@
// Global default params, used in configs
params {
- // TODO nf-core: Specify your pipeline's command line flags
// Input options
input = null
+ // Pipeline parameters
+ // Clustering
+ save_mmseqs_db = false
+ clustering_tool = 'linclust'
+ cluster_seq_identity = 0.5
+ cluster_coverage = 0.9
+ cluster_cov_mode = 0
+ save_mmseqs_clustering = false
+ cluster_size_threshold = 25
+ // Alignment
+ alignment_tool = 'famsa'
+ trim_msa = true
+ clipping_tool = 'clip_ends'
+ gap_threshold = 0.5
+ recruit_sequences_with_models = true
+ hmmsearch_write_target = false
+ hmmsearch_write_domain = true
+ hmmsearch_evalue_cutoff = 0.001
+ save_hmmsearch_results = false
+ hmmsearch_query_length_threshold = 0.8
+ // Redundancy
+ remove_family_redundancy = true
+ hmmsearch_family_length_threshold = 0.8
+ remove_sequence_redundancy = true
+ cluster_seq_identity_for_redundancy = 0.97
+ cluster_coverage_for_redundancy = 0.97
+ cluster_cov_mode_for_redundancy = 0
+
// MultiQC options
multiqc_config = null
multiqc_title = null
@@ -155,16 +182,17 @@ profiles {
]
}
}
- test { includeConfig 'conf/test.config' }
- test_full { includeConfig 'conf/test_full.config' }
+ test { includeConfig 'conf/test.config' }
+ test_minimal { includeConfig 'conf/test_minimal.config' }
+ test_multi_sample_with_gz { includeConfig 'conf/test_multi_sample_with_gz.config' }
+ test_update { includeConfig 'conf/test_update.config' }
+ test_full { includeConfig 'conf/test_full.config' }
}
// Load nf-core custom profiles from different Institutions
includeConfig !System.getenv('NXF_OFFLINE') && params.custom_config_base ? "${params.custom_config_base}/nfcore_custom.config" : "/dev/null"
-
// Load nf-core/proteinfamilies custom profiles from different institutions.
-// TODO nf-core: Optionally, you can add a pipeline-specific nf-core config at https://github.com/nf-core/configs
-// includeConfig !System.getenv('NXF_OFFLINE') && params.custom_config_base ? "${params.custom_config_base}/pipeline/proteinfamilies.config" : "/dev/null"
+includeConfig !System.getenv('NXF_OFFLINE') && params.custom_config_base ? "${params.custom_config_base}/pipeline/proteinfamilies.config" : "/dev/null"
// Set default registry for Apptainer, Docker, Podman, Charliecloud and Singularity independent of -profile
// Will not be used unless Apptainer / Docker / Podman / Charliecloud / Singularity are enabled
@@ -222,22 +250,28 @@ manifest {
name = 'nf-core/proteinfamilies'
author = """Evangelos Karatzas""" // The author field is deprecated from Nextflow version 24.10.0, use contributors instead
contributors = [
- // TODO nf-core: Update the field with the details of the contributors to your pipeline. New with Nextflow version 24.10.0
[
name: 'Evangelos Karatzas',
- affiliation: '',
- email: '',
- github: '',
- contribution: [], // List of contribution types ('author', 'maintainer' or 'contributor')
- orcid: ''
+ affiliation: 'EMBL-EBI',
+ email: 'vangelis@ebi.ac.uk',
+ github: 'https://github.com/vagkaratzas',
+ contribution: ['author'], // List of contribution types ('author', 'maintainer' or 'contributor')
+ orcid: '0000-0001-9132-8981'
],
+ [
+ name: 'Martin Beracochea',
+ affiliation: 'EMBL-EBI',
+ github: 'https://github.com/mberacochea',
+ contribution: ['contributor'], // List of contribution types ('author', 'maintainer' or 'contributor')
+ orcid: '0000-0003-3472-3736'
+ ]
]
homePage = 'https://github.com/nf-core/proteinfamilies'
description = """Generate protein family level models (MSAs, HMMs) starting from a FASTA amino acid sequence file."""
mainScript = 'main.nf'
defaultBranch = 'master'
nextflowVersion = '!>=24.04.2'
- version = '1.0.0dev'
+ version = '1.0.0'
doi = ''
}
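The new defaults above can be tuned per run. As a minimal sketch, in the style of the `conf/test_*.config` profiles registered above, the clustering behaviour could be tightened or relaxed like this (parameter names come from the `params` block in this diff; the values are illustrative only, not recommendations):

```nextflow
// Hypothetical custom config: example values only.
params {
    clustering_tool        = 'cluster' // more sensitive than the default 'linclust'
    cluster_seq_identity   = 0.4       // admit more divergent family members
    cluster_size_threshold = 50        // seed MSAs only from larger clusters
}
```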
diff --git a/nextflow_schema.json b/nextflow_schema.json
index e7a8e3d..ce0d670 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -187,6 +187,170 @@
"hidden": true
}
}
+ },
+ "clustering_params": {
+ "title": "Clustering parameters",
+ "type": "object",
+ "description": "Use these parameters to control the flow of the clustering subworkflow execution.",
+ "properties": {
+ "save_mmseqs_db": {
+ "type": "boolean",
+ "description": "Save the db output folder of mmseqs createdb",
+ "help_text": "Specify to save the mmseqs formatted database of input fasta sequences."
+ },
+ "clustering_tool": {
+ "type": "string",
+ "description": "Choose clustering algorithm. Either simple 'cluster' for medium size inputs, or 'linclust' for less sensitive clustering of larger datasets.",
+ "help_text": "mmseqs algorithms available: 'cluster' or 'linclust'.",
+ "enum": ["linclust", "cluster"]
+ },
+ "cluster_seq_identity": {
+ "type": "number",
+ "default": 0.5,
+ "description": "mmseqs parameter for minimum sequence identity",
+ "help_text": "Minimum sequence identity required for clustering. The mmseqs default is 0.9, but protein families contain way more diverse sequences."
+ },
+ "cluster_coverage": {
+ "type": "number",
+ "default": 0.9,
+ "description": "mmseqs parameter for minimum sequence coverage ratio",
+ "help_text": "Minimum sequence length coverage required for clustering"
+ },
+ "cluster_cov_mode": {
+ "type": "integer",
+ "default": 0,
+ "description": "mmseqs parameter for coverage mode: 0 for both, 1 for target and 2 for query sequence",
+ "help_text": "Defined by the number of aligned residue pairs divided by 0: the maximum of the length of query/centre and target/non-centre sequences, 1: the length of the target/non-centre sequence, 2: the length of the query/centre"
+ },
+ "save_mmseqs_clustering": {
+ "type": "boolean",
+ "description": "Save the clustering output folder of mmseqs cluster or linclust",
+ "help_text": "Specify to save the mmseqs formatted result clustering."
+ },
+ "cluster_size_threshold": {
+ "type": "integer",
+ "default": 25,
+ "description": "Minimum clustering chunk size threshold to create seed multiple sequence alignments upon.",
+ "help_text": "This is an initial filter for the mmseqs clustering results. The lower the threshold, the more families will be generated. Additional sequences may be later recruited in the families."
+ }
+ }
+ },
+ "alignment_params": {
+ "title": "Alignment parameters",
+ "type": "object",
+ "description": "Use these parameters to control the multiple sequence alignment subworkflow execution.",
+ "properties": {
+ "alignment_tool": {
+ "type": "string",
+ "description": "Choose alignment tool. FAMSA is recommended as best time-memory-accuracy combination option.",
+ "help_text": "Available tools: 'famsa' or 'mafft'.",
+ "enum": ["famsa", "mafft"]
+ },
+ "trim_msa": {
+ "type": "boolean",
+ "description": "Boolean whether to trim the MSA gaps",
+ "default": true,
+ "fa_icon": "fas fa-check-square",
+ "hidden": true
+ },
+ "clipping_tool": {
+ "type": "string",
+ "description": "Choose clipping tool. ClipKIT clips gaps throughout the sequence while clip_ends only at the ends.",
+ "help_text": "Available tools: local module 'clip_ends' or nf-core 'clipkit'.",
+ "enum": ["clip_ends", "clipkit"]
+ },
+ "gap_threshold": {
+ "type": "number",
+ "default": 0.5,
+ "description": "MSA positions with gappiness greater than this threshold will be trimmed"
+ },
+ "recruit_sequences_with_models": {
+ "type": "boolean",
+ "description": "Set to true to recruit additional sequences from the input FASTA file using the family HMMs to refine the alignments",
+ "default": true,
+ "fa_icon": "fas fa-check-square",
+ "hidden": true,
+ "help_text": "If this is set to true, the hmmer/hmmsearch module will be used to recruit additional sequences from the input fasta file into the family (above the hmmsearch_query_length_threshold), resulting in bigger but better described families"
+ },
+ "hmmsearch_write_target": {
+ "type": "boolean",
+ "description": "Boolean whether to generate target results file of hmmsearch",
+ "default": false,
+ "fa_icon": "fas fa-check-square",
+ "hidden": true,
+ "help_text": "Specify to calculate the hmmsearch tabular (space-delimited) summary of per-target output (*.tbl.gz)."
+ },
+ "hmmsearch_write_domain": {
+ "type": "boolean",
+ "description": "Boolean whether to generate domain results file of hmmsearch",
+ "default": true,
+ "fa_icon": "fas fa-check-square",
+ "hidden": true,
+ "help_text": "Specify to calculate the hmmsearch tabular (space-delimited) summary of per-domain output (*.domtbl.gz)."
+ },
+ "hmmsearch_evalue_cutoff": {
+ "type": "number",
+ "default": 0.001,
+ "description": "hmmsearch e-value cutoff threshold for reported results"
+ },
+ "save_hmmsearch_results": {
+ "type": "boolean",
+ "description": "Save the output of hmmsearch (.domtbl.gz and .tbl.gz)",
+ "help_text": "Specify to save the .domtbl.gz and .tbl.gz files generated by running hmmsearch on the family model against the input set of sequences."
+ },
+ "hmmsearch_query_length_threshold": {
+ "type": "number",
+ "default": 0.8,
+ "description": "hmmsearch minimum length percentage filter of hit env vs query length",
+ "help_text": "This length threshold should be quite high to make sure that small, fragmented sequences are not recruited in the family models"
+ }
+ }
+ },
+ "redundancy_params": {
+ "title": "Redundancy removal parameters",
+ "type": "object",
+ "description": "Use these parameters to control the redundancy removal subworkflow execution.",
+ "properties": {
+ "remove_family_redundancy": {
+ "type": "boolean",
+ "description": "Removal of between-family redundancy via hmmsearch.",
+ "default": true,
+ "fa_icon": "fas fa-question-circle",
+ "hidden": true,
+ "help_text": "Similar families should not be further processed, for better resource management and to avoid duplication of families."
+ },
+ "hmmsearch_family_length_threshold": {
+ "type": "number",
+ "default": 0.8,
+ "description": "hmmsearch minimum length percentage filter of hit env vs query length, for redundant family removal"
+ },
+ "remove_sequence_redundancy": {
+ "type": "boolean",
+ "description": "Removal of inside-family redundancy of sequences via mmseqs clustering.",
+ "default": true,
+ "fa_icon": "fas fa-question-circle",
+ "hidden": true,
+ "help_text": "Highly similar sequences within the same family should be removed, while making sure that the protein family diversity is still well captured."
+ },
+ "cluster_seq_identity_for_redundancy": {
+ "type": "number",
+ "default": 0.97,
+ "description": "mmseqs parameter for minimum sequence identity",
+ "help_text": "Need to be quite high, to make sure that the diversity is still well captured, well highly similar sequences are removed"
+ },
+ "cluster_coverage_for_redundancy": {
+ "type": "number",
+ "default": 0.97,
+ "description": "mmseqs parameter for minimum sequence coverage ratio",
+ "help_text": "Need to be quite high, to make sure that the diversity is still well captured, well highly similar sequences are removed"
+ },
+ "cluster_cov_mode_for_redundancy": {
+ "type": "integer",
+ "default": 0,
+ "description": "mmseqs parameter for coverage mode: 0 for both, 1 for target and 2 for query sequence",
+ "help_text": "Defined by the number of aligned residue pairs divided by 0: the maximum of the length of query/centre and target/non-centre sequences, 1: the length of the target/non-centre sequence, 2: the length of the query/centre"
+ }
+ }
}
},
"allOf": [
@@ -198,6 +362,15 @@
},
{
"$ref": "#/$defs/generic_options"
+ },
+ {
+ "$ref": "#/$defs/clustering_params"
+ },
+ {
+ "$ref": "#/$defs/alignment_params"
+ },
+ {
+ "$ref": "#/$defs/redundancy_params"
}
]
}
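The two coverage-mode help texts in the schema are easier to check as arithmetic. Below is a hedged Groovy sketch of the three definitions, derived only from the help text above (not from mmseqs source):

```groovy
// Coverage under each mmseqs coverage mode, per the schema help text.
// aligned = number of aligned residue pairs; qlen/tlen = sequence lengths.
def coverage(int aligned, int qlen, int tlen, int covMode) {
    switch (covMode) {
        case 0: return aligned / Math.max(qlen, tlen) // both sequences covered
        case 1: return aligned / tlen                 // target/non-centre only
        case 2: return aligned / qlen                 // query/centre only
        default: throw new IllegalArgumentException("Unknown coverage mode: ${covMode}")
    }
}

// A 90-residue alignment between a 100 aa query and a 120 aa target:
assert coverage(90, 100, 120, 0) == 0.75 // fails the default 0.9 cluster_coverage
assert coverage(90, 100, 120, 2) == 0.9  // passes when only query coverage counts
```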
diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json
index 1236db2..cb51440 100644
--- a/ro-crate-metadata.json
+++ b/ro-crate-metadata.json
@@ -21,9 +21,9 @@
{
"@id": "./",
"@type": "Dataset",
- "creativeWorkStatus": "InProgress",
- "datePublished": "2025-01-27T14:48:14+00:00",
- "description": "
\n \n \n \n \n
\n\n[![GitHub Actions CI Status](https://github.com/nf-core/proteinfamilies/actions/workflows/ci.yml/badge.svg)](https://github.com/nf-core/proteinfamilies/actions/workflows/ci.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/proteinfamilies/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/proteinfamilies/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/proteinfamilies/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A524.04.2-23aa62.svg)](https://www.nextflow.io/)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/proteinfamilies)\n\n[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23proteinfamilies-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/proteinfamilies)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/proteinfamilies** is a bioinformatics pipeline that ...\n\n\n\n\n2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\n\n\nNow, you can run the pipeline using:\n\n\n\n```bash\nnextflow run nf-core/proteinfamilies \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/proteinfamilies/usage) and the [parameter documentation](https://nf-co.re/proteinfamilies/parameters).\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/proteinfamilies/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/proteinfamilies/output).\n\n## Credits\n\nnf-core/proteinfamilies was originally written by Evangelos Karatzas.\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#proteinfamilies` channel](https://nfcore.slack.com/channels/proteinfamilies) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\n\n\n\n\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n",
+ "creativeWorkStatus": "Stable",
+ "datePublished": "2025-02-03T09:44:15+00:00",
+ "description": "
\n \n \n \n \n
\n\n[![GitHub Actions CI Status](https://github.com/nf-core/proteinfamilies/actions/workflows/ci.yml/badge.svg)](https://github.com/nf-core/proteinfamilies/actions/workflows/ci.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/proteinfamilies/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/proteinfamilies/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/proteinfamilies/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A524.04.2-23aa62.svg)](https://www.nextflow.io/)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/proteinfamilies)\n\n[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23proteinfamilies-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/proteinfamilies)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/proteinfamilies** is a bioinformatics pipeline that generates protein families from amino acid sequences and/or updates existing families with new sequences.\nIt takes a protein fasta file as input, clusters the sequences and then generates protein family Hidden Markov Models (HMMs) along with their multiple sequence alignments (MSAs).\nOptionally, paths to existing family HMMs and MSAs can be given (must have matching base filenames one-to-one) in order to update with new sequences in case of matching hits.\n\n\n
\n \n
\n\n### Create families\n\n1. Cluster sequences ([`MMseqs2`](https://github.com/soedinglab/MMseqs2/))\n2. Perform multiple sequence alignment (MSA) ([`FAMSA`](https://github.com/refresh-bio/FAMSA/) or [`mafft`](https://github.com/GSLBiotech/mafft/))\n3. Optionally, clip gap parts of the MSA ([`ClipKIT`](https://github.com/JLSteenwyk/ClipKIT/))\n4. Generate family HMMs and fish additional sequences into the family ([`hmmer`](https://github.com/EddyRivasLab/hmmer/))\n5. Optionally, remove redundant families by comparing family representative sequences against family models with ([`hmmer`](https://github.com/EddyRivasLab/hmmer/))\n6. Optionally, from the remaining families, remove in-family redundant sequences by strictly clustering with ([`MMseqs2`](https://github.com/soedinglab/MMseqs2/)) and keep cluster representatives\n7. Present statistics for remaining/updated families size distributions and representative sequence lengths ([`MultiQC`](http://multiqc.info/))\n\n### Update families\n\n1. Find which families to update by comparing the input sequences against existing family models with ([`hmmer`](https://github.com/EddyRivasLab/hmmer/))\n2. For non hit sequences continue with the above: A. Create families. For hit sequences and families continue to: 3\n3. Extract family sequences ([`SeqKit`](https://github.com/shenwei356/seqkit/)) and concatenate with filtered hit sequences of each family\n4. Optionally, remove in-family redundant sequences by strictly clustering with ([`MMseqs2`](https://github.com/soedinglab/MMseqs2/)) and keeping cluster representatives\n5. Perform multiple sequence alignment (MSA) ([`FAMSA`](https://github.com/refresh-bio/FAMSA/) or [`mafft`](https://github.com/GSLBiotech/mafft/))\n6. Optionally, clip gap parts of the MSA ([`ClipKIT`](https://github.com/JLSteenwyk/ClipKIT/))\n7. Update family HMM with ([`hmmer`](https://github.com/EddyRivasLab/hmmer/))\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n`samplesheet.csv`:\n\n```csv\nsample,fasta,existing_hmms_to_update,existing_msas_to_update\nCONTROL_REP1,input/mgnifams_input_small.fa,,\n```\n\nEach row contains a fasta file with amino acid sequences (can be zipped or unzipped).\nOptionally, a row may contain tarball archives (tar.gz) of existing families' HMM and MSA folders, in order to be updated.\nIn this case, the HMM and MSA files must be matching in numbers and in base filenames (not the extension).\nHit families/sequences will be updated, while no hit sequences will create new families.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run nf-core/proteinfamilies \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/proteinfamilies/usage) and the [parameter documentation](https://nf-co.re/proteinfamilies/parameters).\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/proteinfamilies/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/proteinfamilies/output).\n\n## Credits\n\nnf-core/proteinfamilies was originally written by Evangelos Karatzas.\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n- [Martin Beracochea](https://github.com/mberacochea)\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#proteinfamilies` channel](https://nfcore.slack.com/channels/proteinfamilies) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\n\n\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n",
"hasPart": [
{
"@id": "main.nf"
@@ -31,6 +31,9 @@
{
"@id": "assets/"
},
+ {
+ "@id": "bin/"
+ },
{
"@id": "conf/"
},
@@ -43,6 +46,9 @@
{
"@id": "modules/"
},
+ {
+ "@id": "modules/local/"
+ },
{
"@id": "modules/nf-core/"
},
@@ -99,7 +105,7 @@
},
"mentions": [
{
- "@id": "#b16a2c17-506e-47b5-828a-ab5eecd6fcce"
+ "@id": "#a37271bb-3044-4545-9788-69aae39e1c58"
}
],
"name": "nf-core/proteinfamilies"
@@ -121,21 +127,40 @@
},
{
"@id": "main.nf",
- "@type": ["File", "SoftwareSourceCode", "ComputationalWorkflow"],
+ "@type": [
+ "File",
+ "SoftwareSourceCode",
+ "ComputationalWorkflow"
+ ],
"dateCreated": "",
- "dateModified": "2025-01-27T14:48:14Z",
+ "dateModified": "2025-02-03T09:44:15Z",
"dct:conformsTo": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE/",
- "keywords": ["nf-core", "nextflow", "metagenomics", "protein-families", "proteomics"],
- "license": ["MIT"],
- "name": ["nf-core/proteinfamilies"],
+ "keywords": [
+ "nf-core",
+ "nextflow",
+ "metagenomics",
+ "protein-families",
+ "proteomics"
+ ],
+ "license": [
+ "MIT"
+ ],
+ "name": [
+ "nf-core/proteinfamilies"
+ ],
"programmingLanguage": {
"@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow"
},
"sdPublisher": {
"@id": "https://nf-co.re/"
},
- "url": ["https://github.com/nf-core/proteinfamilies", "https://nf-co.re/proteinfamilies/dev/"],
- "version": ["1.0.0dev"]
+ "url": [
+ "https://github.com/nf-core/proteinfamilies",
+ "https://nf-co.re/proteinfamilies/1.0.0/"
+ ],
+ "version": [
+ "1.0.0"
+ ]
},
{
"@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow",
@@ -150,11 +175,11 @@
"version": "!>=24.04.2"
},
{
- "@id": "#b16a2c17-506e-47b5-828a-ab5eecd6fcce",
+ "@id": "#a37271bb-3044-4545-9788-69aae39e1c58",
"@type": "TestSuite",
"instance": [
{
- "@id": "#f760361b-0f06-46b2-9386-2773235ea937"
+ "@id": "#959646ff-6477-433a-9da5-35d4cbeb15c8"
}
],
"mainEntity": {
@@ -163,7 +188,7 @@
"name": "Test suite for nf-core/proteinfamilies"
},
{
- "@id": "#f760361b-0f06-46b2-9386-2773235ea937",
+ "@id": "#959646ff-6477-433a-9da5-35d4cbeb15c8",
"@type": "TestInstance",
"name": "GitHub Actions workflow for testing nf-core/proteinfamilies",
"resource": "repos/nf-core/proteinfamilies/actions/workflows/ci.yml",
@@ -185,6 +210,11 @@
"@type": "Dataset",
"description": "Additional files"
},
+ {
+ "@id": "bin/",
+ "@type": "Dataset",
+ "description": "Scripts that must be callable from a pipeline process"
+ },
{
"@id": "conf/",
"@type": "Dataset",
@@ -205,6 +235,11 @@
"@type": "Dataset",
"description": "Modules used by the pipeline"
},
+ {
+ "@id": "modules/local/",
+ "@type": "Dataset",
+ "description": "Pipeline-specific modules"
+ },
{
"@id": "modules/nf-core/",
"@type": "Dataset",
@@ -292,4 +327,4 @@
"url": "https://nf-co.re/"
}
]
-}
+}
\ No newline at end of file
diff --git a/subworkflows/local/align_sequences/main.nf b/subworkflows/local/align_sequences/main.nf
new file mode 100644
index 0000000..2239b33
--- /dev/null
+++ b/subworkflows/local/align_sequences/main.nf
@@ -0,0 +1,29 @@
+/*
+ MULTIPLE SEQUENCE ALIGNMENT
+*/
+
+include { FAMSA_ALIGN } from '../../../modules/nf-core/famsa/align/main'
+include { MAFFT_ALIGN } from '../../../modules/nf-core/mafft/align/main'
+
+workflow ALIGN_SEQUENCES {
+ take:
+ sequences // tuple val(meta), path(fasta)
+
+ main:
+ ch_versions = Channel.empty()
+ ch_alignments = Channel.empty()
+
+ if (params.alignment_tool == 'famsa') {
+ alignment_res = FAMSA_ALIGN( sequences, [[:],[]], false )
+ ch_versions = ch_versions.mix( FAMSA_ALIGN.out.versions )
+ ch_alignments = alignment_res.alignment
+ } else { // fallback: mafft
+ alignment_res = MAFFT_ALIGN( sequences, [[:], []], [[:], []], [[:], []], [[:], []], [[:], []], false )
+ ch_versions = ch_versions.mix( MAFFT_ALIGN.out.versions )
+ ch_alignments = alignment_res.fas
+ }
+
+ emit:
+ versions = ch_versions
+ alignments = ch_alignments
+}
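For orientation, a hypothetical caller for this subworkflow; the tuple shape matches the `take:` declaration, and the aligner is selected through `params.alignment_tool` (file names are placeholders):

```nextflow
include { ALIGN_SEQUENCES } from './subworkflows/local/align_sequences'

workflow {
    // One [ meta, fasta ] tuple per family chunk to be aligned
    ch_fasta = Channel.of( [ [id: 'fam1'], file('fam1.fasta') ] )
    ALIGN_SEQUENCES( ch_fasta )               // FAMSA by default
    ALIGN_SEQUENCES.out.alignments.view()     // emits [ meta, alignment ]
}
```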
diff --git a/subworkflows/local/execute_clustering/main.nf b/subworkflows/local/execute_clustering/main.nf
new file mode 100644
index 0000000..5626c02
--- /dev/null
+++ b/subworkflows/local/execute_clustering/main.nf
@@ -0,0 +1,53 @@
+/*
+ SEQUENCE CLUSTERING
+*/
+
+include { MMSEQS_CREATEDB } from '../../../modules/nf-core/mmseqs/createdb/main'
+include { MMSEQS_CLUSTER } from '../../../modules/nf-core/mmseqs/cluster/main'
+include { MMSEQS_LINCLUST } from '../../../modules/nf-core/mmseqs/linclust/main'
+include { MMSEQS_CREATETSV } from '../../../modules/nf-core/mmseqs/createtsv/main'
+
+workflow EXECUTE_CLUSTERING {
+ take:
+ sequences // tuple val(meta), path(fasta)
+
+ main:
+ ch_versions = Channel.empty()
+ ch_clustering_tsv = Channel.empty()
+
+ MMSEQS_CREATEDB( sequences )
+ ch_versions = ch_versions.mix( MMSEQS_CREATEDB.out.versions )
+
+ if (params.clustering_tool == 'cluster') {
+ cluster_res = MMSEQS_CLUSTER( MMSEQS_CREATEDB.out.db )
+ ch_versions = ch_versions.mix( MMSEQS_CLUSTER.out.versions )
+ } else { // fallback: linclust
+ cluster_res = MMSEQS_LINCLUST( MMSEQS_CREATEDB.out.db )
+ ch_versions = ch_versions.mix( MMSEQS_LINCLUST.out.versions )
+ }
+
+ // Join to ensure in sync
+ ch_input_for_createtsv = MMSEQS_CREATEDB.out.db
+ .join(cluster_res.db_cluster)
+ .multiMap { meta, db, db_cluster ->
+ db: [ meta, db ]
+ db_cluster: [ meta, db_cluster ]
+ }
+
+ MMSEQS_CREATETSV(ch_input_for_createtsv.db_cluster, ch_input_for_createtsv.db, ch_input_for_createtsv.db)
+ ch_versions = ch_versions.mix( MMSEQS_CREATETSV.out.versions )
+ ch_clustering_tsv = MMSEQS_CREATETSV.out.tsv
+
+ // Join to ensure in sync
+ ch_clustering_output = sequences
+ .join(MMSEQS_CREATETSV.out.tsv)
+ .multiMap { meta, seqs, clusters ->
+ seqs: [meta, seqs]
+ clusters: [meta, clusters]
+ }
+
+ emit:
+ versions = ch_versions
+ seqs = ch_clustering_output.seqs
+ clusters = ch_clustering_output.clusters
+}
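The `join` + `multiMap` idiom used twice above is what keeps the createtsv inputs in lockstep: joining on the meta key guarantees that per-sample emissions travel together before being fanned back out into parallel, order-synchronised channels. A standalone sketch with placeholder values:

```nextflow
workflow {
    ch_db  = Channel.of( [ [id: 's1'], 'db_s1' ] )
    ch_clu = Channel.of( [ [id: 's1'], 'clu_s1' ] )

    synced = ch_db
        .join( ch_clu )                  // pair emissions by meta
        .multiMap { meta, db, clu ->
            db:      [ meta, db ]        // split back into two channels
            cluster: [ meta, clu ]       // that now emit in the same order
        }

    synced.db.view()      // [ [id:s1], db_s1 ]
    synced.cluster.view() // [ [id:s1], clu_s1 ]
}
```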
diff --git a/subworkflows/local/generate_families/main.nf b/subworkflows/local/generate_families/main.nf
new file mode 100644
index 0000000..b610698
--- /dev/null
+++ b/subworkflows/local/generate_families/main.nf
@@ -0,0 +1,88 @@
+/*
+ FAMILY MODEL GENERATION
+*/
+
+include { ALIGN_SEQUENCES } from '../../../subworkflows/local/align_sequences'
+include { CLIPKIT } from '../../../modules/nf-core/clipkit/main'
+include { CLIP_ENDS } from '../../../modules/local/clip_ends/main'
+include { HMMER_HMMBUILD } from '../../../modules/nf-core/hmmer/hmmbuild/main'
+include { HMMER_HMMSEARCH } from '../../../modules/nf-core/hmmer/hmmsearch/main'
+include { FILTER_RECRUITED } from '../../../modules/local/filter_recruited/main'
+include { HMMER_HMMALIGN } from '../../../modules/nf-core/hmmer/hmmalign/main'
+
+workflow GENERATE_FAMILIES {
+ take:
+ sequences // tuple val(meta), path(fasta)
+ fasta_chunks
+
+ main:
+ ch_versions = Channel.empty()
+ ch_msa = Channel.empty()
+ ch_fasta = Channel.empty()
+ ch_hmm = Channel.empty()
+
+ ch_fasta = fasta_chunks
+ .transpose()
+ .map { meta, file_path ->
+ [ [id: meta.id, chunk: file(file_path, checkIfExists: true).baseName], file_path ]
+ }
+
+ ALIGN_SEQUENCES( ch_fasta )
+ ch_versions = ch_versions.mix( ALIGN_SEQUENCES.out.versions )
+ ch_msa = ALIGN_SEQUENCES.out.alignments
+
+ if (params.trim_msa) {
+ if (params.clipping_tool == 'clipkit') {
+ CLIPKIT( ch_msa )
+ ch_versions = ch_versions.mix( CLIPKIT.out.versions )
+ ch_msa = CLIPKIT.out.clipkit
+ } else { // fallback: local module clip_ends
+ CLIP_ENDS( ch_msa, params.gap_threshold )
+ ch_versions = ch_versions.mix( CLIP_ENDS.out.versions )
+ ch_msa = CLIP_ENDS.out.fas
+ }
+ }
+
+ HMMER_HMMBUILD( ch_msa, [] )
+ ch_versions = ch_versions.mix( HMMER_HMMBUILD.out.versions )
+ ch_hmm = HMMER_HMMBUILD.out.hmm
+
+ // Combine with same id to ensure in sync
+ ch_input_for_hmmsearch = ch_hmm
+ .map { meta, hmm -> [ [id: meta.id], meta, hmm ] }
+ .combine(sequences, by: 0)
+ .map { id, meta, hmm, seqs -> [ meta, hmm, seqs, false, params.hmmsearch_write_target, params.hmmsearch_write_domain ] }
+
+ if (params.recruit_sequences_with_models) {
+ HMMER_HMMSEARCH( ch_input_for_hmmsearch )
+ ch_versions = ch_versions.mix( HMMER_HMMSEARCH.out.versions )
+
+ // Combine with same id to ensure in sync
+ ch_input_for_filter_recruited = HMMER_HMMSEARCH.out.domain_summary
+ .map { meta, domtbl -> [ [id: meta.id], meta, domtbl ] }
+ .combine(sequences, by: 0)
+ .map { id, meta, domtbl, seqs -> [ meta, domtbl, seqs ] }
+
+ FILTER_RECRUITED( ch_input_for_filter_recruited, params.hmmsearch_query_length_threshold )
+ ch_versions = ch_versions.mix( FILTER_RECRUITED.out.versions )
+ ch_fasta = FILTER_RECRUITED.out.fasta
+
+ // Join to ensure in sync
+ ch_input_for_hmmalign = ch_fasta
+ .join(ch_hmm)
+ .multiMap { meta, seqs, hmms ->
+ seq: [ meta, seqs ]
+ hmm: [ hmms ]
+ }
+
+ HMMER_HMMALIGN( ch_input_for_hmmalign.seq, ch_input_for_hmmalign.hmm )
+ ch_versions = ch_versions.mix( HMMER_HMMALIGN.out.versions )
+ ch_msa = HMMER_HMMALIGN.out.sto
+ }
+
+ emit:
+ versions = ch_versions
+ msa = ch_msa
+ fasta = ch_fasta
+ hmm = ch_hmm
+}
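One note on the six-element hmmsearch input assembled above: judging from the values passed here, the three trailing booleans of the nf-core hmmer/hmmsearch module toggle its optional outputs, presumably the aligned hits, the per-target table and the per-domain table, which is why only the domain summary is consumed downstream. A sketch of one such element (paths are placeholders; the flag order is an assumption based on this diff):

```groovy
// Shape of a single HMMER_HMMSEARCH input element as built in GENERATE_FAMILIES.
def hmmsearch_input = [
    [id: 'sample1', chunk: 'chunk_0'], // per-chunk meta
    'chunk_0.hmm',                     // HMM built from the chunk MSA
    'sample1.fasta',                   // full sample sequence set to recruit from
    false,                             // assumed write_align: skip aligned-hits output
    false,                             // write_target: skip per-target table
    true                               // write_domain: keep the .domtbl.gz summary
]
```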
diff --git a/subworkflows/local/remove_redundancy/main.nf b/subworkflows/local/remove_redundancy/main.nf
new file mode 100644
index 0000000..0df5582
--- /dev/null
+++ b/subworkflows/local/remove_redundancy/main.nf
@@ -0,0 +1,93 @@
+/*
+ REMOVAL OF REDUNDANT SEQUENCES AND FAMILIES
+*/
+
+include { EXTRACT_FAMILY_REPS } from '../../../modules/local/extract_family_reps/main'
+include { CAT_CAT } from '../../../modules/nf-core/cat/cat'
+include { HMMER_HMMSEARCH } from '../../../modules/nf-core/hmmer/hmmsearch/main'
+include { REMOVE_REDUNDANT_FAMS } from '../../../modules/local/remove_redundant_fams/main'
+include { FILTER_NON_REDUNDANT_HMMS } from '../../../modules/local/filter_non_redundant_hmms/main'
+include { EXECUTE_CLUSTERING } from '../../../subworkflows/local/execute_clustering'
+include { REMOVE_REDUNDANT_SEQS } from '../../../modules/local/remove_redundant_seqs/main'
+include { ALIGN_SEQUENCES } from '../../../subworkflows/local/align_sequences'
+
+workflow REMOVE_REDUNDANCY {
+ take:
+ msa // tuple val(meta), path(fas)
+ fasta // tuple val(meta), path(fasta)
+ hmm // tuple val(meta), path(hmm)
+
+ main:
+ ch_versions = Channel.empty()
+
+ if (params.remove_family_redundancy) {
+ ch_msa = msa
+ .map { meta, aln -> [[id: meta.id], aln] }
+ .groupTuple(by: 0)
+ EXTRACT_FAMILY_REPS( ch_msa )
+ ch_versions = ch_versions.mix( EXTRACT_FAMILY_REPS.out.versions )
+
+ ch_hmm = hmm
+ .map { meta, model -> [[id: meta.id], model] }
+ .groupTuple(by: 0)
+ CAT_CAT( ch_hmm )
+ ch_versions = ch_versions.mix( CAT_CAT.out.versions )
+
+ ch_input_for_hmmsearch = CAT_CAT.out.file_out
+ .combine(EXTRACT_FAMILY_REPS.out.fasta, by: 0)
+ .map { meta, model, seqs -> [meta, model, seqs, false, false, true] }
+
+ HMMER_HMMSEARCH( ch_input_for_hmmsearch )
+ ch_versions = ch_versions.mix( HMMER_HMMSEARCH.out.versions )
+
+ fasta = fasta
+ .map { meta, fas -> [[id: meta.id], fas] }
+ .groupTuple(by: 0)
+
+ // Join to ensure in sync
+ ch_input_for_fam_removal = EXTRACT_FAMILY_REPS.out.map
+ .join(HMMER_HMMSEARCH.out.domain_summary)
+ .join(fasta)
+ .multiMap { meta, map, domtbl, seqs ->
+ map: [meta, map]
+ domtbl: [meta, domtbl]
+ seqs: [meta, seqs]
+ }
+
+ REMOVE_REDUNDANT_FAMS( ch_input_for_fam_removal.map, ch_input_for_fam_removal.domtbl, ch_input_for_fam_removal.seqs, params.hmmsearch_family_length_threshold )
+ ch_versions = ch_versions.mix( REMOVE_REDUNDANT_FAMS.out.versions )
+ fasta = REMOVE_REDUNDANT_FAMS.out.fasta
+
+ // Join to ensure in sync
+ ch_input_for_hmm_filtering = fasta
+ .join(ch_hmm)
+ .multiMap { meta, seqs, models ->
+ seqs: [meta, seqs]
+ models: [meta, models]
+ }
+ FILTER_NON_REDUNDANT_HMMS( ch_input_for_hmm_filtering.seqs, ch_input_for_hmm_filtering.models )
+ ch_versions = ch_versions.mix( FILTER_NON_REDUNDANT_HMMS.out.versions )
+
+ fasta = fasta
+ .transpose()
+ .map { meta, file ->
+ [[id: meta.id, chunk: file.getSimpleName().split('_')[-1]], file]
+ }
+ }
+
+ if (params.remove_sequence_redundancy) {
+ EXECUTE_CLUSTERING( fasta )
+ ch_versions = ch_versions.mix( EXECUTE_CLUSTERING.out.versions )
+
+ REMOVE_REDUNDANT_SEQS( EXECUTE_CLUSTERING.out.clusters, EXECUTE_CLUSTERING.out.seqs )
+ ch_versions = ch_versions.mix( REMOVE_REDUNDANT_SEQS.out.versions )
+
+ ALIGN_SEQUENCES( REMOVE_REDUNDANT_SEQS.out.fasta )
+ ch_versions = ch_versions.mix( ALIGN_SEQUENCES.out.versions )
+ msa = ALIGN_SEQUENCES.out.alignments
+ }
+
+ emit:
+ versions = ch_versions
+ msa = msa
+}
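The `getSimpleName().split('_')[-1]` step above recovers the chunk id from each non-redundant family fasta before re-clustering. A small Groovy sanity check, assuming a hypothetical `<sample>_<chunk>.fasta` naming scheme:

```groovy
// getSimpleName() yields the file name without its extension(s); the chunk id
// is then the last underscore-separated token (the example name is made up).
def base = 'sample1_chunk_7.fasta'.replaceFirst(/\..*$/, '')
assert base == 'sample1_chunk_7'
assert base.split('_')[-1] == '7'
```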
diff --git a/subworkflows/local/update_families/main.nf b/subworkflows/local/update_families/main.nf
new file mode 100644
index 0000000..df1120d
--- /dev/null
+++ b/subworkflows/local/update_families/main.nf
@@ -0,0 +1,143 @@
+include { UNTAR as UNTAR_HMM } from '../../../modules/nf-core/untar/main'
+include { UNTAR as UNTAR_MSA } from '../../../modules/nf-core/untar/main'
+include { validateMatchingFolders } from '../../../subworkflows/local/utils_nfcore_proteinfamilies_pipeline'
+include { CAT_CAT as CAT_HMM } from '../../../modules/nf-core/cat/cat/main'
+include { HMMER_HMMSEARCH } from '../../../modules/nf-core/hmmer/hmmsearch/main'
+include { BRANCH_HITS_FASTA } from '../../../modules/local/branch_hits_fasta'
+include { SEQKIT_SEQ } from '../../../modules/nf-core/seqkit/seq/main'
+include { CAT_CAT as CAT_FASTA } from '../../../modules/nf-core/cat/cat/main'
+include { EXECUTE_CLUSTERING } from '../../../subworkflows/local/execute_clustering'
+include { REMOVE_REDUNDANT_SEQS } from '../../../modules/local/remove_redundant_seqs/main'
+include { ALIGN_SEQUENCES } from '../../../subworkflows/local/align_sequences'
+include { CLIPKIT } from '../../../modules/nf-core/clipkit/main'
+include { CLIP_ENDS } from '../../../modules/local/clip_ends/main'
+include { HMMER_HMMBUILD } from '../../../modules/nf-core/hmmer/hmmbuild/main'
+include { EXTRACT_FAMILY_REPS } from '../../../modules/local/extract_family_reps/main'
+
+workflow UPDATE_FAMILIES {
+ take:
+ ch_samplesheet_for_update // channel: [meta, sequences, existing_hmms_to_update, existing_msas_to_update]
+
+ main:
+ ch_versions = Channel.empty()
+ ch_updated_family_reps = Channel.empty()
+ ch_no_hit_seqs = Channel.empty()
+
+ ch_input_for_untar = ch_samplesheet_for_update
+ .multiMap { meta, _fasta, existing_hmms_to_update, existing_msas_to_update ->
+ hmm: [ meta, existing_hmms_to_update ]
+ msa: [ meta, existing_msas_to_update ]
+ }
+
+ UNTAR_HMM( ch_input_for_untar.hmm )
+ ch_versions = ch_versions.mix( UNTAR_HMM.out.versions )
+
+ UNTAR_MSA( ch_input_for_untar.msa )
+ ch_versions = ch_versions.mix( UNTAR_MSA.out.versions )
+
+ // check that the HMMs and the MSAs match
+ // join to ensure in sync
+ ch_folders_to_validate = UNTAR_HMM.out.untar
+ .join(UNTAR_MSA.out.untar)
+ .multiMap { meta, folder1, folder2 ->
+ hmm_folder_ch: [meta, folder1]
+ msa_folder_ch: [meta, folder2]
+ }
+ validateMatchingFolders(ch_folders_to_validate.hmm_folder_ch, ch_folders_to_validate.msa_folder_ch)
+
+ // Squeeze the HMMs into a single file
+ CAT_HMM( UNTAR_HMM.out.untar.map { meta, folder -> [meta, file("${folder.toUriString()}/*", checkIfExists: true)] } )
+ ch_versions = ch_versions.mix( CAT_HMM.out.versions )
+
+ // Prepare the sequences to search against the concatenated HMM model of the families
+ ch_input_for_hmmsearch = CAT_HMM.out.file_out
+ .combine(ch_samplesheet_for_update, by: 0)
+ .map { meta, concatenated_hmm, fasta, _existing_hmms_to_update, _existing_msas_to_update -> [meta, concatenated_hmm, fasta, false, false, true] }
+
+ HMMER_HMMSEARCH( ch_input_for_hmmsearch )
+ ch_versions = ch_versions.mix( HMMER_HMMSEARCH.out.versions )
+
+ ch_input_for_branch_hits = HMMER_HMMSEARCH.out.domain_summary
+ .join(ch_samplesheet_for_update)
+ .multiMap { meta, domtbl, fasta, _existing_hmms_to_update, _existing_msas_to_update ->
+ domtbl: [ meta, domtbl ]
+ fasta: [ meta, fasta ]
+ }
+
+ // Separate fasta proteins with family hits from proteins without any hits
+ BRANCH_HITS_FASTA ( ch_input_for_branch_hits.fasta, ch_input_for_branch_hits.domtbl, params.hmmsearch_query_length_threshold )
+ ch_versions = ch_versions.mix( BRANCH_HITS_FASTA.out.versions )
+ ch_no_hit_seqs = BRANCH_HITS_FASTA.out.non_hit_fasta
+
+ ch_hits_fasta = BRANCH_HITS_FASTA.out.hits
+ .transpose()
+ .map { meta, file ->
+ [[id: meta.id, family: file.getSimpleName()], file]
+ }
+
+ ch_family_msas = UNTAR_MSA.out.untar
+ .map { meta, folder ->
+ [meta, file("${folder.toUriString()}/*", checkIfExists: true)]
+ }
+ .transpose()
+ .map { meta, file ->
+ [[id: meta.id, family: file.getSimpleName()], file]
+ }
+
+ // Recover each family's fasta sequences by removing alignment gaps from its MSA
+ SEQKIT_SEQ( ch_family_msas )
+ ch_versions = ch_versions.mix(SEQKIT_SEQ.out.versions)
+
+ // Match newly recruited sequences with existing ones for each family
+ ch_input_for_cat = SEQKIT_SEQ.out.fastx
+ .combine(ch_hits_fasta, by: 0)
+ .map { meta, family_fasta, new_fasta ->
+ [meta, [family_fasta, new_fasta]]
+ }
+
+ // Aggregate each family's MSA sequences with the newly recruited ones
+ CAT_FASTA( ch_input_for_cat )
+ ch_versions = ch_versions.mix( CAT_FASTA.out.versions )
+ fasta_ch = CAT_FASTA.out.file_out
+
+ if (params.remove_sequence_redundancy) {
+ // Strict clustering to remove redundancy
+ EXECUTE_CLUSTERING( fasta_ch )
+ ch_versions = ch_versions.mix( EXECUTE_CLUSTERING.out.versions )
+
+ REMOVE_REDUNDANT_SEQS( EXECUTE_CLUSTERING.out.clusters, EXECUTE_CLUSTERING.out.seqs )
+ ch_versions = ch_versions.mix( REMOVE_REDUNDANT_SEQS.out.versions )
+ fasta_ch = REMOVE_REDUNDANT_SEQS.out.fasta
+ }
+
+ ALIGN_SEQUENCES( fasta_ch )
+ ch_versions = ch_versions.mix( ALIGN_SEQUENCES.out.versions )
+ ch_msa = ALIGN_SEQUENCES.out.alignments
+
+ if (params.trim_msa) {
+ if (params.clipping_tool == 'clipkit') {
+ CLIPKIT( ch_msa )
+ ch_versions = ch_versions.mix( CLIPKIT.out.versions )
+ ch_msa = CLIPKIT.out.clipkit
+ } else { // fallback: local module clip_ends
+ CLIP_ENDS( ch_msa, params.gap_threshold )
+ ch_versions = ch_versions.mix( CLIP_ENDS.out.versions )
+ ch_msa = CLIP_ENDS.out.fas
+ }
+ }
+
+ HMMER_HMMBUILD( ch_msa, [] )
+ ch_versions = ch_versions.mix( HMMER_HMMBUILD.out.versions )
+
+ ch_msa = ch_msa
+ .map { meta, aln -> [ [id: meta.id], aln ] }
+ .groupTuple(by: 0)
+ EXTRACT_FAMILY_REPS( ch_msa )
+ ch_versions = ch_versions.mix( EXTRACT_FAMILY_REPS.out.versions )
+ ch_updated_family_reps = ch_updated_family_reps.mix( EXTRACT_FAMILY_REPS.out.map )
+
+ emit:
+ versions = ch_versions
+ no_hit_seqs = ch_no_hit_seqs
+ updated_family_reps = ch_updated_family_reps
+}
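For orientation, the tuple shape this subworkflow consumes, i.e. one parsed row of the update-mode samplesheet. A hedged sketch with placeholder file names; the two tarballs must contain files whose base names match one-to-one:

```nextflow
include { UPDATE_FAMILIES } from './subworkflows/local/update_families'

workflow {
    ch_samplesheet_for_update = Channel.of(
        [ [id: 'sample1'],
          file('new_sequences.fa.gz'),   // sequences to place into existing families
          file('existing_hmms.tar.gz'),  // tarball of family HMMs
          file('existing_msas.tar.gz') ] // tarball of the matching family MSAs
    )
    UPDATE_FAMILIES( ch_samplesheet_for_update )
}
```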
diff --git a/subworkflows/local/utils_nfcore_proteinfamilies_pipeline/main.nf b/subworkflows/local/utils_nfcore_proteinfamilies_pipeline/main.nf
index c3fd4aa..c3f887b 100644
--- a/subworkflows/local/utils_nfcore_proteinfamilies_pipeline/main.nf
+++ b/subworkflows/local/utils_nfcore_proteinfamilies_pipeline/main.nf
@@ -67,25 +67,8 @@ workflow PIPELINE_INITIALISATION {
// Create channel from input file provided through params.input
//
- Channel
+ ch_samplesheet = Channel
.fromList(samplesheetToList(params.input, "${projectDir}/assets/schema_input.json"))
- .map {
- meta, fastq_1, fastq_2 ->
- if (!fastq_2) {
- return [ meta.id, meta + [ single_end:true ], [ fastq_1 ] ]
- } else {
- return [ meta.id, meta + [ single_end:false ], [ fastq_1, fastq_2 ] ]
- }
- }
- .groupTuple()
- .map { samplesheet ->
- validateInputSamplesheet(samplesheet)
- }
- .map {
- meta, fastqs ->
- return [ meta, fastqs.flatten() ]
- }
- .set { ch_samplesheet }
emit:
samplesheet = ch_samplesheet
@@ -164,7 +147,6 @@ def validateInputSamplesheet(input) {
// Generate methods description for MultiQC
//
def toolCitationText() {
- // TODO nf-core: Optionally add in-text citation tools to this list.
// Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 2023)" : "",
// Uncomment function in methodsDescriptionText to render in MultiQC report
def citation_text = [
@@ -177,7 +159,6 @@ def toolCitationText() {
}
def toolBibliographyText() {
- // TODO nf-core: Optionally add bibliographic entries to this list.
// Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "<li>Author (2023) Pub name, Journal, DOI</li>" : "",
// Uncomment function in methodsDescriptionText to render in MultiQC report
def reference_text = [
@@ -211,7 +192,7 @@ def methodsDescriptionText(mqc_methods_yaml) {
meta["tool_citations"] = ""
meta["tool_bibliography"] = ""
- // TODO nf-core: Only uncomment below if logic in toolCitationText/toolBibliographyText has been filled!
+ // Only uncomment below if logic in toolCitationText/toolBibliographyText has been filled!
// meta["tool_citations"] = toolCitationText().replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".")
// meta["tool_bibliography"] = toolBibliographyText()
@@ -224,3 +205,29 @@ def methodsDescriptionText(mqc_methods_yaml) {
return description_html.toString()
}
+//
+// Validate that two folders (HMMs and MSAs for update) contain the same number of files and matching base filenames
+//
+def validateMatchingFolders(channel1, channel2) {
+ // Fetch the contents of the channels
+ channel1
+ .join(channel2)
+ .map { meta, folder1, folder2 ->
+ def files1 = folder1.listFiles()
+ def files2 = folder2.listFiles()
+
+ // Check if the number of files matches
+ if (files1.size() != files2.size()) {
+ error("[nf-core/proteinfamilies] ERROR: Folder mismatch: ${folder1} has ${files1.size()} files, but ${folder2} has ${files2.size()} files.")
+ }
+
+ // Extract base filenames (without extensions) and sort
+ def baseNames1 = files1.collect { it.getSimpleName() }.sort()
+ def baseNames2 = files2.collect { it.getSimpleName() }.sort()
+
+ // Check if base filenames match one to one
+ if (baseNames1 != baseNames2) {
+ error("[nf-core/proteinfamilies] ERROR: Filename mismatch: Expected matching files in ${folder1} and ${folder2}. Base filenames do not match.")
+ }
+ }
+}
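The validation above compares extension-stripped names, so the two folders may use different extensions as long as the stems pair up. A quick Groovy illustration of what passes:

```groovy
// Accepted: same stems with different extensions; any stem mismatch errors out.
def stem = { String name -> name.replaceFirst(/\..*$/, '') }
def hmms = ['famA.hmm', 'famB.hmm']
def msas = ['famB.aln', 'famA.aln']
assert hmms.collect(stem).sort() == msas.collect(stem).sort()
```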
diff --git a/workflows/proteinfamilies.nf b/workflows/proteinfamilies.nf
index 77d8b29..18ddcff 100644
--- a/workflows/proteinfamilies.nf
+++ b/workflows/proteinfamilies.nf
@@ -9,6 +9,26 @@ include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pi
include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline'
include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_proteinfamilies_pipeline'
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ IMPORT LOCAL MODULES/SUBWORKFLOWS
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+//
+// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
+//
+include { UPDATE_FAMILIES } from '../subworkflows/local/update_families'
+include { EXECUTE_CLUSTERING } from '../subworkflows/local/execute_clustering'
+include { GENERATE_FAMILIES } from '../subworkflows/local/generate_families'
+include { REMOVE_REDUNDANCY } from '../subworkflows/local/remove_redundancy'
+
+//
+// MODULE: Local to the pipeline
+//
+include { CHUNK_CLUSTERS } from '../modules/local/chunk_clusters/main'
+include { EXTRACT_FAMILY_REPS } from '../modules/local/extract_family_reps/main'
+
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
RUN MAIN WORKFLOW
@@ -16,13 +36,71 @@ include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_prot
*/
workflow PROTEINFAMILIES {
-
take:
ch_samplesheet // channel: samplesheet read in from --input
+
main:
- ch_versions = Channel.empty()
+ ch_versions = Channel.empty()
ch_multiqc_files = Channel.empty()
+ ch_family_reps = Channel.empty()
+
+ ch_samplesheet_for_create = Channel.empty()
+ ch_samplesheet_for_update = Channel.empty()
+
+ ch_branch_result = ch_samplesheet
+ .branch { _meta, _fasta, existing_hmms_to_update, existing_msas_to_update ->
+ to_create: !existing_hmms_to_update?.size() && !existing_msas_to_update?.size()
+ to_update: existing_hmms_to_update?.size() && existing_msas_to_update?.size()
+ }
+
+ /************************************/
+ /* Splitting the samplesheet into 2 */
+ /* - Entries to create new families */
+ /* (they only have sequences) */
+ /* - Entries to update existing */
+ /* families (existing HMM models */
+ /* and MSAs) */
+ /************************************/
+ ch_samplesheet_for_create = ch_branch_result.to_create
+ .map { meta, fasta, _existing_hmms, _existing_msas ->
+ [meta, fasta]
+ }
+ ch_samplesheet_for_update = ch_branch_result.to_update
+
+ // Updating existing families
+ if (ch_branch_result.to_update) {
+ UPDATE_FAMILIES( ch_samplesheet_for_update )
+ ch_versions = ch_versions.mix( UPDATE_FAMILIES.out.versions )
+
+ ch_family_reps = ch_family_reps.mix( UPDATE_FAMILIES.out.updated_family_reps )
+ ch_samplesheet_for_create = ch_samplesheet_for_create.mix( UPDATE_FAMILIES.out.no_hit_seqs )
+ }
+
+ // Creating new families
+ // Clustering
+ EXECUTE_CLUSTERING( ch_samplesheet_for_create )
+ ch_versions = ch_versions.mix( EXECUTE_CLUSTERING.out.versions )
+
+ CHUNK_CLUSTERS( EXECUTE_CLUSTERING.out.clusters, EXECUTE_CLUSTERING.out.seqs, params.cluster_size_threshold )
+ ch_versions = ch_versions.mix( CHUNK_CLUSTERS.out.versions )
+
+ // Family generation (multiple sequence alignment, optional trimming, HMM building and sequence recruitment)
+ GENERATE_FAMILIES( ch_samplesheet_for_create, CHUNK_CLUSTERS.out.fasta_chunks )
+ ch_versions = ch_versions.mix( GENERATE_FAMILIES.out.versions )
+
+ // Remove redundant sequences and families
+ REMOVE_REDUNDANCY( GENERATE_FAMILIES.out.msa, GENERATE_FAMILIES.out.fasta, GENERATE_FAMILIES.out.hmm )
+ ch_versions = ch_versions.mix( REMOVE_REDUNDANCY.out.versions )
+
+ // Post-processing
+ ch_msa = REMOVE_REDUNDANCY.out.msa
+ .map { meta, aln -> [ [id: meta.id], aln ] }
+ .groupTuple(by: 0)
+
+ EXTRACT_FAMILY_REPS( ch_msa )
+ ch_versions = ch_versions.mix( EXTRACT_FAMILY_REPS.out.versions )
+ ch_family_reps = ch_family_reps.mix( EXTRACT_FAMILY_REPS.out.map )
//
// Collate and save software versions
@@ -30,12 +108,11 @@ workflow PROTEINFAMILIES {
softwareVersionsToYAML(ch_versions)
.collectFile(
storeDir: "${params.outdir}/pipeline_info",
- name: 'nf_core_' + 'proteinfamilies_software_' + 'mqc_' + 'versions.yml',
+ name: 'nf_core_pipeline_software_mqc_versions.yml',
sort: true,
newLine: true
).set { ch_collated_versions }
-
//
// MODULE: MultiQC
//
@@ -67,6 +144,8 @@ workflow PROTEINFAMILIES {
)
)
+ ch_multiqc_files = ch_multiqc_files.mix(ch_family_reps.collect { it[1] }.ifEmpty([]))
+
MULTIQC (
ch_multiqc_files.collect(),
ch_multiqc_config.toList(),
@@ -76,9 +155,9 @@ workflow PROTEINFAMILIES {
[]
)
- emit:multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html
- versions = ch_versions // channel: [ path(versions.yml) ]
-
+ emit:
+ multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html
+ versions = ch_versions // channel: [ path(versions.yml) ]
}
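Finally, the samplesheet branch at the top of this workflow routes each row by whether both tarball columns are populated. A compact sketch of that routing with placeholder rows:

```nextflow
workflow {
    Channel.of(
            [ [id: 'a'], 'a.fa', [], [] ],                    // no tarballs: create
            [ [id: 'b'], 'b.fa', ['h.tar.gz'], ['m.tar.gz'] ] // both tarballs: update
        )
        .branch { _meta, _fasta, hmms, msas ->
            to_create: !hmms?.size() && !msas?.size()
            to_update: hmms?.size() && msas?.size()
        }
        .set { demo }

    demo.to_create.view { "create: ${it[0].id}" }
    demo.to_update.view { "update: ${it[0].id}" }
}
```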
/*