diff --git a/scripts/mirna_extension.py b/scripts/mirna_extension.py index 01303a7..7ba749a 100755 --- a/scripts/mirna_extension.py +++ b/scripts/mirna_extension.py @@ -1,239 +1,389 @@ #!/usr/bin/env python - -"""Extend miRNAs start and end coordinates by n nucleotides. - -This script uses the class MirnaExtension to extend the start and end -coordiantes of the mature miRNAs.This class is build upon two methods. The -first one, turns a GFF3 file into an in-memory database using gffutils. The -second method, iterares over all primary miRNAS in the database to extend -its mature forms' end and start cooridnates by n nucleotides without exceeding -the chromosome boundaries. Thereafter, the primary miRNAs star/end coordinates -will also be extened if and only if, the mature miRNA coordinates exceeds the -primary miRNA ones; this extension will be recorded as _-y and/or _+x being the -the extension on the 5' and 3' respectively. Finally, two different annotation -files will be created; one for the primary miRNAs and one for the mature forms. - -Usage: - mirna_extension.py GFF3 [--outdir Path] [-e int] [--chr Path] - GFF3 > mirna_extension.py [--outdir Path] [-e int] [--chr Path] -""" +"""Extend miRNA start and end coordinates and ensure name uniqueness.""" import argparse from pathlib import Path -import sys -from typing import Dict, Optional +from sqlite3 import InterfaceError +from typing import Optional + import gffutils # type: ignore +class AnnotationException(BaseException): + """A custom exception class for MirnaExtension class.""" + + class MirnaExtension: - """Class to extend miRNAs start and end coordinates. + """Class for updating miRNA annotated coordinates and names. Attributes: - db: - in-memory database of the GFF3 file. - - Methods: - load_gff_file: - creates an in-memory db from an input file - extend_mirnas: - extends the start and end of the mature miRNAs by n nucleotides + db: In-memory database of the input GFF3 records. + db_out: In-memory database of the updated GFF3 records. + seq_lengths: Dictionary mapping reference sequence IDs to their + lengths. """ def __init__(self) -> None: """Initialize class.""" - self.db = None + self.db: Optional[gffutils.FeatureDB] = None + self.db_out: Optional[gffutils.FeatureDB] = None + self.seq_lengths: Optional[dict[str, int]] = None - def load_gff_file(self, gff_file: Optional[Path] = None) -> None: - """Load GFF3 file. - - This method uses the gffutils package to create and in-memory database - of the GFF3 file. + def set_db(self, path: Path) -> None: + """Load GFF3 file into `gffutils.FeatureDB`. Args: - gff_file: - path to the input GFF3 file. If None, input is taken from the - standard input (stdin). + gff_file: Path to a GFF3 file. """ - if gff_file is None: + try: self.db = gffutils.create_db( - sys.stdin, dbfn=":memory:", force=True, keep_order=True + str(path), dbfn=":memory:", force=True, keep_order=True ) - else: - self.db = gffutils.create_db( - str(gff_file), dbfn=":memory:", force=True, keep_order=True + except gffutils.exceptions.EmptyInputError: + pass + except InterfaceError as err: + raise AnnotationException( + "\n\n" + "Illegal coordinate: The provided GFF3 miRNA annotation file" + " contains at least one miRNA species starting at position 0." + " Please, check that all entries start at least at position 1" + "\n" + ) from err + + def set_seq_lengths(self, path: Optional[Path] = None) -> None: + """Set the reference sequence lengths. + + Args: + path: Path to a tabulated file containing the names of reference + sequences and the corresponding total length, in base pairs. + """ + self.seq_lengths = {} + anno_lengths = {} + + if not isinstance(self.db, gffutils.FeatureDB): + return + + for _id in self.db.seqids(): # type: ignore + anno_lengths[_id] = max( + rec.end for rec in self.db.region(_id) # type: ignore ) - def extend_mirnas( - self, - primir_out: Path, - mir_out: Path, - n: int = 6, - seq_lengths: Optional[dict[str, int]] = None, + if path is None: + self.seq_lengths = anno_lengths + else: + with open(path, encoding="utf-8") as _file: + for line in _file: + ref_seq, length = line.strip().split("\t") + + try: + max_len = anno_lengths[ref_seq] + except KeyError: + max_len = 0 + + try: + self.seq_lengths[ref_seq] = int(length) + except ValueError as exc: + raise ValueError( + f'Invalid length: "{length}" for sequence' + f' "{ref_seq}"; integer value expected' + ) from exc + + if max_len > int(length): + raise AnnotationException( + "The provided GFF3 miRNA annotations and" + " reference sequence lengths are incompatible:" + f" end coordinate {max_len} exceeds length of" + f' reference sequence "{ref_seq}" ({length} nt)' + ) + + def adjust_names( + self, precursor: gffutils.Feature, matures: list[gffutils.Feature] ) -> None: - """Extend miRNAs start and end coordinates. + """Adjust miRNAs 'Name' attribute. + + This method adjusts the miRNAs 'Name' attribute to account for the + different genomic locations the miRNA sequence is annotated on and + ensure their uniqueness. + + In precursors, the suffix indicating its replica number, is taken + either from the 'Name' or the 'ID' attribute. Mature miRNAs take + this number as an infix/suffix to construct its name. If the annotated + name already has an infix/suffix, it is replaced by the precursor one. + + Precursor names have the format: + 'SPECIES-mir-NAME-#' + Mature miRNA names present one of the following formats depending on + whether the arm is specified or not: + - 'SPECIES-miR-NAME-#-ARM' + - 'SPECIES-miR-NAME-#' + where # is the integer indicating the replica the miRNAs are. + + If a precursor has a replica but its number is set in the 'ID' + attribute, the first instance does not has a suffix and but the other + one do. + + If a precursor has no other occurrences, no modifications are made. + + Args: + precursor: 'miRNA primary transcript' feature entry + matures: list with the corresponding 'mature miRNA' feature(s) + entry(s) + """ + replica = None + precursor_name = precursor.attributes["Name"][0].split("-") + precursor_id = precursor.attributes["ID"][0].split("_") + + if len(precursor_name) == 4: + replica = precursor_name[-1] + + elif len(precursor_name) == 3 and len(precursor_id) == 2: + replica = precursor_id[-1] + precursor_name.append(replica) + precursor.attributes["Name"][0] = "-".join(precursor_name) + + if replica is not None: + for mir in matures: + mir_name = mir.attributes["Name"][0].split("-") + + if len(mir_name) == 5: + mir_name[3] = replica + + elif len(mir_name) == 4 and mir_name[-1] != replica: + mir_name.insert(3, replica) + + elif len(mir_name) == 3: + mir_name.append(replica) + + mir.attributes["Name"][0] = "-".join(mir_name) + + def process_precursor( + self, precursor: gffutils.Feature, n: int = 6 + ) -> list[gffutils.Feature]: + """Extend miRNAs start and end coordinates and ensure name uniqueness. This method elongates the start and end coordinates of mature miRNAs - by n nucleotides. In the case that this extension makes the start/end + by 'n' nucleotides. In the case that this extension makes the start/end coordinates to exceed the corresponding primary miRNA boundaries, these will be extended as far as the miRNA coordinates. If provided, the elongation will take into account the chromosome size. + In addition, the method `adjust_names` is called to ensure uniqueness + in the `Name` attribute for both the precursor and its arms. + Args: - primir_out: - path to the output extended precursor miRNA file. - mir_out: - path to the output extended mature miRNA file. - n: - number of nucleotides to extend miRs start and end coordinates. - seq_lengths: - a dictionary that maps reference sequence IDs to their lengths - (in nucleotides). If None, sequence lengths are inferred from - the input GFF3 file. + precursor: 'miRNA primary transcript' feature entry + n: Number of nucleotides to extend miRs start and end coordinates. """ - assert isinstance(self.db, gffutils.interface.FeatureDB) - - # Set end boundary - if seq_lengths is None: - seq_lengths = {} - for seqid in self.db.seqids(): - seq_lengths[seqid] = max( - rec.end for rec in self.db.region(seqid) - ) - - with ( - open(primir_out, "w", encoding="utf-8") as primir, - open(mir_out, "w", encoding="utf-8") as mirna, + assert isinstance(self.seq_lengths, dict) + + pre_start = precursor.start + pre_end = precursor.end + + matures = list( + self.db.region( # type: ignore + strand=precursor.strand, + seqid=precursor.seqid, + start=precursor.start, + end=precursor.end, + featuretype="miRNA", + completely_within=True, + ) + ) + + self.adjust_names(precursor=precursor, matures=matures) + + for mir in matures: + try: + mir.start = max(mir.start - n, 1) + mir.end = min(mir.end + n, self.seq_lengths[mir.seqid]) + except KeyError as exc: + raise KeyError( + "The provided GFF3 miRNA annotations and reference" + "sequence lengths are incompatible: reference sequence" + f' "{mir.seqid}" of miRNA "{mir.id}" is not available in' + " the provided reference sequence lengths table" + ) from exc + + precursor.start = min( + precursor.start, min(mir.start for mir in matures) + ) + precursor.end = max(precursor.end, max(mir.end for mir in matures)) + precursor.attributes["Name"][0] += f"_-{pre_start - precursor.start}" + precursor.attributes["Name"][0] += f"_+{precursor.end - pre_end}" + + return [precursor] + matures + + def update_db( + self, + n: int = 6, + ) -> None: + """Update miRNA annotations in the local database. + + Using the method `process_precursor` annotated coordinates are + extended by `n` nucleotides and, if needed, the `Name` attribute is + modified to contain the sequence replica number. + + Args: + n: Number of nucleotides to extend miRs start and end coordinates. + """ + if not isinstance(self.db, gffutils.FeatureDB): + return + + feats_updated: list[gffutils.Feature] = [] + + for pre in self.db.features_of_type( # type: ignore + "miRNA_primary_transcript" ): - for primary_mirna in self.db.features_of_type( - "miRNA_primary_transcript" - ): - seqid = primary_mirna.seqid - start = int(primary_mirna.start) - end = int(primary_mirna.end) - - mature_miRNAs = list( - self.db.region( - seqid=seqid, - start=start, - end=end, - featuretype="miRNA", - completely_within=True, - ) - ) - - if mature_miRNAs: - for mir in mature_miRNAs: - if mir.start - n > 0: - mir.start -= n - else: - mir.start = 0 - - if mir.end + n < seq_lengths[seqid]: - mir.end += n - else: - mir.end = seq_lengths[seqid] - - if mir.start < start: - primary_mirna.start = mir.start - - if mir.end > end: - primary_mirna.end = mir.end - - mirna.write(str(mir) + "\n") - - start_diff = start - primary_mirna.start - end_diff = primary_mirna.end - end - primary_mirna.attributes["Name"][0] += f"_-{start_diff}" - primary_mirna.attributes["Name"][0] += f"_+{end_diff}" - - primir.write(str(primary_mirna) + "\n") + feats_updated.extend(self.process_precursor(precursor=pre, n=n)) + + self.db_out = gffutils.create_db( + feats_updated, + dbfn=":memory:", + force=True, + keep_order=True, + ) + + def write_gff( + self, path: Path, feature_type: Optional[str] = None + ) -> None: + """Write features to a GFF3 file. + + Args: + path: Path to the output file. + feature_type: Feature type to write. If `None`, all features will + be written. + """ + with open(path, "w", encoding="utf-8") as _file: + if not isinstance(self.db_out, gffutils.FeatureDB): + _file.write("") + elif feature_type is None: + for feature in self.db_out.all_features(): # type: ignore + _file.write(str(feature) + "\n") + else: + for feature in self.db_out.features_of_type( # type: ignore + feature_type + ): + _file.write(str(feature) + "\n") def parse_arguments(): - """Command-line arguments parser.""" + """Parse command-line arguments.""" + description = """Extend miRNA annotated regions and ensure name uniqueness. + +Adjust the miRNAs 'Name' attribute to account for the different genomic +locations the miRNA sequence is annotated on and ensure their uniqueness. +In precursors, the suffix indicating its replica number, is taken either from +the 'Name' or the 'ID' attribute. Mature miRNAs take this number as an +infix/suffix to construct its name. If the annotated name already has an +infix/suffix, it is replaced by the precursor one. + +Precursor names have the format: + 'SPECIES-mir-NAME-#' +Mature miRNA names present one of the following formats depending on whether +the arm is specified or not: + - 'SPECIES-miR-NAME-#-ARM' + - 'SPECIES-miR-NAME-#' +where # is the integer indicating the replica the miRNAs are. + +If a precursor has a replica but its number is set in the 'ID' attribute, the +first instance does not has a suffix and but the other one do. +If a precursor has no other occurrences, no modifications are made. + +Extend mature miRNA start and end coordinates in a GFF3 file by the indicated +number of nucleotides without exceeding chromosome boundaries. If the +extension causes the mature miRNA coordinates to exceed the boundaries of the +corresponding precursor (see note in "Constraints" below), the precursor +coordinates are extended accordingly. The precursor name will be appended +with '_-n' and/or '_+m', where n and m represent the extensions on the 5' and +3' end, respectively (or 0 otherwise). The modified mature miRNA and precursor +annotations will be written to separate GFF3 files. + +Constraints: +The script was written for GFF3 files obtained from miRBase +(http://www.mirbase.org/). + +The following assumptions are made: +- The input GFF3 file contains miRNA annotations for a single species. +- The input GFF3 file contains features of type "miRNA_primary_transcript" + (referred to as "precursors") and "miRNA" (referred to as "mature miRNAs"). +- The input GFF3 file contains a Name attribute for each precursor. +- Each precursor contains one or more mature miRNAs. +- Each mature miRNA is a child of exactly one precursor and is completely + within the boundaries of the precursor. +""" parser = argparse.ArgumentParser( - description="Script to extend miRNAs start and end coordinates." + description=description, + formatter_class=argparse.RawDescriptionHelpFormatter, ) parser.add_argument( "-v", "--version", action="version", - version="%(prog)s 1.0", - help="Show program's version number and exit", + version="%(prog)s 1.1.0", + help="show program's version number and exit", ) parser.add_argument( "input", - help="Path to the GFF3 annotation file. If not provided, the input \ - will be read from the standard input.", - type=Path, - ) - parser.add_argument( - "--outdir", - help="Path to the output directory. Default: %(default)s.", - default=Path.cwd(), + help="path to the GFF3 annotation file", type=Path, ) parser.add_argument( "-e", "--extension", - help="Number of nucleotides to extend the coordinates. Default: \ - %(default)d.", + help=( + "number of nucleotides by which to extend the miRNA boundaries;" + " default: %(default)d" + ), default=6, type=int, + metavar="INT", ) parser.add_argument( "--chr", - help="Path to the tabulated file containing the chromosome and its \ - length in basepairs. If not provided, the length will be set to \ - the biggest coordinate of the last miRNA primary transcript.", + help=( + "path to a tabulated file containing the chromosome names and" + " their lengths, in basepairs; if not provided, the sequence" + " lengths will be set to the most distal 3' boundary of the miRNA" + " precursors annotated for each chromosome in the input GFF3 file;" + " default: %(default)s" + ), default=None, type=Path, + metavar="PATH", + ) + parser.add_argument( + "--outdir", + help="path to the output directory; default: %(default)s", + default=Path.cwd(), + type=Path, + metavar="PATH", ) - return parser -def main(arguments) -> None: +def main(args) -> None: """Extend miRNAs start/end coordinates.""" - outdir = Path(arguments.outdir) - outdir.mkdir(parents=True, exist_ok=True) + args.outdir.mkdir(parents=True, exist_ok=True) + mirna_extension = MirnaExtension() - primir_out = outdir / ( - f"extended_primir_annotation_{arguments.extension}_nt.gff3" - ) - mir_out = ( - outdir / f"extended_mirna_annotation_{arguments.extension}_nt.gff3" - ) + mirna_extension.set_db(path=args.input) - with open(arguments.input, encoding="utf-8") as in_file: - if len(in_file.read()) == 0: - with ( - open(primir_out, "w", encoding="utf-8") as primir, - open(mir_out, "w", encoding="utf-8") as mir, - ): - primir.write("") - mir.write("") - return - - # Create dictionary with the ref. sequence length - seq_lengths: Optional[Dict] = None - if arguments.chr: - seq_lengths = {} - with open(arguments.chr, encoding="utf-8") as f: - for line in f: - ref_seq, length = line.strip().split("\t") - seq_lengths[ref_seq] = int(length) - - m = MirnaExtension() - m.load_gff_file(arguments.input) - m.extend_mirnas( - n=arguments.extension, - seq_lengths=seq_lengths, - primir_out=primir_out, - mir_out=mir_out, + if isinstance(mirna_extension.db, gffutils.FeatureDB): + mirna_extension.set_seq_lengths(path=args.chr) + mirna_extension.update_db(n=args.extension) + + mirna_extension.write_gff( + path=args.outdir + / f"extended_primir_annotation_{args.extension}_nt.gff3", + feature_type="miRNA_primary_transcript", + ) + mirna_extension.write_gff( + path=args.outdir + / f"extended_mirna_annotation_{args.extension}_nt.gff3", + feature_type="miRNA", ) if __name__ == "__main__": - args = parse_arguments().parse_args() # pragma: no cover - main(args) # pragma: no cover + arguments = parse_arguments().parse_args() # pragma: no cover + main(arguments) # pragma: no cover diff --git a/scripts/mirna_quantification.py b/scripts/mirna_quantification.py index 9f4ed88..8b44cfb 100755 --- a/scripts/mirna_quantification.py +++ b/scripts/mirna_quantification.py @@ -242,7 +242,7 @@ def collapsed_nh_contribution(aln: pysam.AlignedSegment) -> float: The contribution is computed as the ratio of the number of reads collapsed in the alignment and the NH value. It is assumed that the alignment query - name contians the number of collapsed reads as well as the NH value in the + name contains the number of collapsed reads as well as the NH value in the format NAME-COUNT_NH. Args: @@ -250,9 +250,10 @@ def collapsed_nh_contribution(aln: pysam.AlignedSegment) -> float: Alignment to which the overall contribution is calculated Returns: - the conrtibution of the alignment to the overall count + Contribution of alignment to overall count """ name = str(aln.query_name) + values = [] try: if val := re.search(r"\d+_\d+$", name): values = val.group().split("_") @@ -274,7 +275,7 @@ def collapsed_contribution(aln: pysam.AlignedSegment) -> float: The contribution is computed as the ratio of the number of reads collapsed in the alignment and the value stored in the NH tag. If the tag is missing, - the NH value is 1. It is assumed that the alignment query name contians + the NH value is 1. It is assumed that the alignment query name contains the number of collapsed reads in the format NAME-COUNT. Args: @@ -282,9 +283,10 @@ def collapsed_contribution(aln: pysam.AlignedSegment) -> float: Alignment to which the overall contribution is calculated Returns: - the conrtibution of the alignment to the overall count + Contribution of alignment to overall count """ name = str(aln.query_name) + collapsed = 0.0 try: if coll := re.search(r"\d+$", name): collapsed = float(coll.group()) @@ -312,7 +314,7 @@ def nh_contribution(aln: pysam.AlignedSegment) -> float: The contribution is computed as the ratio of the number of reads collapsed in the alignment and the value stored in the NH tag. If the tag is missing, - the NH value is 1. It is assumed that the alignment query name contians the + the NH value is 1. It is assumed that the alignment query name contains the NH value in the format NAME_NH. Args: @@ -320,9 +322,10 @@ def nh_contribution(aln: pysam.AlignedSegment) -> float: Alignment to which the overall contribution is calculated Returns: - the conrtibution of the alignment to the overall count + Contribution of alignment to overall count """ name = str(aln.query_name) + nh_val = 0.0 try: if cont := re.search(r"\d+$", name): nh_val = float(cont.group()) @@ -352,7 +355,7 @@ def contribution(aln: pysam.AlignedSegment) -> float: Alignment to which the overall contribution is calculated Returns: - the conrtibution of the alignment to the overall count + Contribution of alignment to overall count """ try: return 1 / float(aln.get_tag("NH")) @@ -362,7 +365,7 @@ def contribution(aln: pysam.AlignedSegment) -> float: def get_name(pre_name: str) -> list[str]: - """Get the final name for the spieces name. + """Get the final name for the species name. Take a string and processes it to obtain the final name for the species and the type of miRNA the string belongs to. Only the feat_name is diff --git a/scripts/tests/files/chr_size.txt b/scripts/tests/files/chr_size.tsv similarity index 100% rename from scripts/tests/files/chr_size.txt rename to scripts/tests/files/chr_size.tsv diff --git a/scripts/tests/files/extreme_chr_mir_anno.gff3 b/scripts/tests/files/extreme_chr_mir_anno.gff3 index ca2928b..8e2eb24 100644 --- a/scripts/tests/files/extreme_chr_mir_anno.gff3 +++ b/scripts/tests/files/extreme_chr_mir_anno.gff3 @@ -1,3 +1,2 @@ -19 . miRNA 0 80 . + . ID=MIMAT0002822;Alias=MIMAT0002822;Name=hsa-miR-512-5p;Derives_from=MI0003140 -19 . miRNA 83 124 . + . ID=MIMAT0002823;Alias=MIMAT0002823;Name=hsa-miR-512-3p;Derives_from=MI0003140 +19 . miRNA 1 80 . + . ID=MIMAT0002822;Alias=MIMAT0002822;Name=hsa-miR-512-1-5p;Derives_from=MI0003140 19 . miRNA 599982 600000 . + . ID=MIMAT0004978;Alias=MIMAT0004978;Name=hsa-miR-935;Derives_from=MI0005757 diff --git a/scripts/tests/files/extreme_chr_primir_anno.gff3 b/scripts/tests/files/extreme_chr_primir_anno.gff3 index 05e7924..c9a9369 100644 --- a/scripts/tests/files/extreme_chr_primir_anno.gff3 +++ b/scripts/tests/files/extreme_chr_primir_anno.gff3 @@ -1,2 +1,2 @@ -19 . miRNA_primary_transcript 0 124 . + . ID=MI0003140;Alias=MI0003140;Name=hsa-mir-512-1_-2_+2 +19 . miRNA_primary_transcript 1 122 . + . ID=MI0003140;Alias=MI0003140;Name=hsa-mir-512-1_-1_+0 19 . miRNA_primary_transcript 515667 600000 . + . ID=MI0005757;Alias=MI0005757;Name=hsa-mir-935_-0_+3 diff --git a/scripts/tests/files/extreme_mir_anno.gff3 b/scripts/tests/files/extreme_mir_anno.gff3 index 21e8ba6..87c9dba 100644 --- a/scripts/tests/files/extreme_mir_anno.gff3 +++ b/scripts/tests/files/extreme_mir_anno.gff3 @@ -1,3 +1,3 @@ -19 . miRNA 6 80 . + . ID=MIMAT0002822;Alias=MIMAT0002822;Name=hsa-miR-512-5p;Derives_from=MI0003140 -19 . miRNA 83 124 . + . ID=MIMAT0002823;Alias=MIMAT0002823;Name=hsa-miR-512-3p;Derives_from=MI0003140 -19 . miRNA 315716 315757 . + . ID=MIMAT0004978;Alias=MIMAT0004978;Name=hsa-miR-935;Derives_from=MI0005757 +19 . miRNA 10 76 . + . ID=MIMAT0002822;Alias=MIMAT0002822;Name=hsa-miR-512-1-5p;Derives_from=MI0003140 +19 . miRNA 87 120 . + . ID=MIMAT0002823;Alias=MIMAT0002823;Name=hsa-miR-512-1-3p;Derives_from=MI0003140 +19 . miRNA 315720 315756 . + . ID=MIMAT0004978;Alias=MIMAT0004978;Name=hsa-miR-935;Derives_from=MI0005757 diff --git a/scripts/tests/files/extreme_primir_anno.gff3 b/scripts/tests/files/extreme_primir_anno.gff3 index 17e3040..238be43 100644 --- a/scripts/tests/files/extreme_primir_anno.gff3 +++ b/scripts/tests/files/extreme_primir_anno.gff3 @@ -1,2 +1,2 @@ -19 . miRNA_primary_transcript 6 124 . + . ID=MI0003140;Alias=MI0003140;Name=hsa-mir-512-1_-3_+2 +19 . miRNA_primary_transcript 9 122 . + . ID=MI0003140;Alias=MI0003140;Name=hsa-mir-512-1_-0_+0 19 . miRNA_primary_transcript 315667 315757 . + . ID=MI0005757;Alias=MI0005757;Name=hsa-mir-935_-0_+0 diff --git a/scripts/tests/files/in_illegal_mirna_anno.gff3 b/scripts/tests/files/in_illegal_mirna_anno.gff3 new file mode 100644 index 0000000..3e3fa04 --- /dev/null +++ b/scripts/tests/files/in_illegal_mirna_anno.gff3 @@ -0,0 +1,3 @@ +19 . miRNA_primary_transcript 2 122 . + . ID=MI0003140;Alias=MI0003140;Name=hsa-mir-512-1 +19 . miRNA 3 74 . + . ID=MIMAT0002822;Alias=MIMAT0002822;Name=hsa-miR-512-5p;Derives_from=MI0003140 +19 . miRNA 0 24 . + . ID=MIMAT2002822;Alias=MIMAT2002822;Name=hsa-miR-512-3p;Derives_from=MI0003140 diff --git a/scripts/tests/files/in_mirna_anno.gff3 b/scripts/tests/files/in_mirna_anno.gff3 index 5b96e95..330021b 100644 --- a/scripts/tests/files/in_mirna_anno.gff3 +++ b/scripts/tests/files/in_mirna_anno.gff3 @@ -1,5 +1,6 @@ -19 . miRNA_primary_transcript 2517 2614 . + . ID=MI0003141;Alias=MI0003141;Name=hsa-mir-512-2 -19 . miRNA 2536 2558 . + . ID=MIMAT0002822_1;Alias=MIMAT0002822;Name=hsa-miR-512-5p;Derives_from=MI0003141 -19 . miRNA 2573 2594 . + . ID=MIMAT0002823_1;Alias=MIMAT0002823;Name=hsa-miR-512-3p;Derives_from=MI0003141 -19 . miRNA_primary_transcript 5328 5400 . + . ID=MI0003786;Alias=MI0003786;Name=hsa-mir-1323 -19 . miRNA 5338 5359 . + . ID=MIMAT0005795;Alias=MIMAT0005795;Name=hsa-miR-1323;Derives_from=MI0003786 +19 . miRNA_primary_transcript 121035 121101 . + . ID=MI0000779;Alias=MI0000779;Name=hsa-mir-371a +19 . miRNA 121040 121059 . + . ID=MIMAT0004687;Alias=MIMAT0004687;Name=hsa-miR-371a-5p;Derives_from=MI0000779 +19 . miRNA 121076 121098 . + . ID=MIMAT0000723;Alias=MIMAT0000723;Name=hsa-miR-371a-3p;Derives_from=MI0000779 +19 . miRNA_primary_transcript 121037 121102 . - . ID=MI0017393;Alias=MI0017393;Name=hsa-mir-371b +19 . miRNA 121075 121096 . - . ID=MIMAT0019892;Alias=MIMAT0019892;Name=hsa-miR-371b-5p;Derives_from=MI0017393 +19 . miRNA 121038 121060 . - . ID=MIMAT0019893;Alias=MIMAT0019893;Name=hsa-miR-371b-3p;Derives_from=MI0017393 diff --git a/scripts/tests/files/in_mirna_extreme_chr_mirs.gff3 b/scripts/tests/files/in_mirna_extreme_chr_mirs.gff3 index 16434b8..144fff9 100644 --- a/scripts/tests/files/in_mirna_extreme_chr_mirs.gff3 +++ b/scripts/tests/files/in_mirna_extreme_chr_mirs.gff3 @@ -1,5 +1,4 @@ 19 . miRNA_primary_transcript 2 122 . + . ID=MI0003140;Alias=MI0003140;Name=hsa-mir-512-1 19 . miRNA 3 74 . + . ID=MIMAT0002822;Alias=MIMAT0002822;Name=hsa-miR-512-5p;Derives_from=MI0003140 -19 . miRNA 89 118 . + . ID=MIMAT0002823;Alias=MIMAT0002823;Name=hsa-miR-512-3p;Derives_from=MI0003140 19 . miRNA_primary_transcript 515667 599997 . + . ID=MI0005757;Alias=MI0005757;Name=hsa-mir-935 -19 . miRNA 599988 599996 . + . ID=MIMAT0004978;Alias=MIMAT0004978;Name=hsa-miR-935;Derives_from=MI0005757 \ No newline at end of file +19 . miRNA 599988 599996 . + . ID=MIMAT0004978;Alias=MIMAT0004978;Name=hsa-miR-935;Derives_from=MI0005757 diff --git a/scripts/tests/files/in_mirna_extreme_mirs.gff3 b/scripts/tests/files/in_mirna_extreme_mirs.gff3 index d3589b0..da9adac 100644 --- a/scripts/tests/files/in_mirna_extreme_mirs.gff3 +++ b/scripts/tests/files/in_mirna_extreme_mirs.gff3 @@ -2,4 +2,4 @@ 19 . miRNA 12 74 . + . ID=MIMAT0002822;Alias=MIMAT0002822;Name=hsa-miR-512-5p;Derives_from=MI0003140 19 . miRNA 89 118 . + . ID=MIMAT0002823;Alias=MIMAT0002823;Name=hsa-miR-512-3p;Derives_from=MI0003140 19 . miRNA_primary_transcript 315667 315757 . + . ID=MI0005757;Alias=MI0005757;Name=hsa-mir-935 -19 . miRNA 315722 315754 . + . ID=MIMAT0004978;Alias=MIMAT0004978;Name=hsa-miR-935;Derives_from=MI0005757 \ No newline at end of file +19 . miRNA 315722 315754 . + . ID=MIMAT0004978;Alias=MIMAT0004978;Name=hsa-miR-935;Derives_from=MI0005757 diff --git a/scripts/tests/files/in_replica_mirna_anno.gff3 b/scripts/tests/files/in_replica_mirna_anno.gff3 new file mode 100644 index 0000000..405965f --- /dev/null +++ b/scripts/tests/files/in_replica_mirna_anno.gff3 @@ -0,0 +1,18 @@ +3 . miRNA_primary_transcript 160404745 160404825 . + . ID=MI0000115;Alias=MI0000115;Name=hsa-mir-16-2 +3 . miRNA 160404754 160404775 . + . ID=MIMAT0000069_1;Alias=MIMAT0000069;Name=hsa-miR-16-5p;Derives_from=MI0000115 +3 . miRNA 160404797 160404818 . + . ID=MIMAT0004518;Alias=MIMAT0004518;Name=hsa-miR-16-2-3p;Derives_from=MI0000115 +13 . miRNA_primary_transcript 50048973 50049061 . - . ID=MI0000070;Alias=MI0000070;Name=hsa-mir-16-1 +13 . miRNA 50049027 50049048 . - . ID=MIMAT0000069;Alias=MIMAT0000069;Name=hsa-miR-16-5p;Derives_from=MI0000070 +13 . miRNA 50048985 50049006 . - . ID=MIMAT0004489;Alias=MIMAT0004489;Name=hsa-miR-16-1-3p;Derives_from=MI0000070 +20 . miRNA_primary_transcript 63919449 63919520 . + . ID=MI0005763;Alias=MI0005763;Name=hsa-mir-941-1 +20 . miRNA 63919495 63919517 . + . ID=MIMAT0004984;Alias=MIMAT0004984;Name=hsa-miR-941;Derives_from=MI0005763 +20 . miRNA_primary_transcript 63919505 63919576 . + . ID=MI0005764;Alias=MI0005764;Name=hsa-mir-941-2 +20 . miRNA 63919551 63919573 . + . ID=MIMAT0004984_1;Alias=MIMAT0004984;Name=hsa-miR-941;Derives_from=MI0005764 +20 . miRNA_primary_transcript 63919561 63919632 . + . ID=MI0005765;Alias=MI0005765;Name=hsa-mir-941-3 +20 . miRNA 63919607 63919629 . + . ID=MIMAT0004984_2;Alias=MIMAT0004984;Name=hsa-miR-941;Derives_from=MI0005765 +21 . miRNA_primary_transcript 8206563 8206618 . + . ID=MI0033425;Alias=MI0033425;Name=hsa-mir-10401 +21 . miRNA 8206563 8206582 . + . ID=MIMAT0041633;Alias=MIMAT0041633;Name=hsa-miR-10401-5p;Derives_from=MI0033425 +21 . miRNA 8206598 8206618 . + . ID=MIMAT0041634;Alias=MIMAT0041634;Name=hsa-miR-10401-3p;Derives_from=MI0033425 +21 . miRNA_primary_transcript 8250772 8250827 . + . ID=MI0033425_2;Alias=MI0033425;Name=hsa-mir-10401 +21 . miRNA 8250772 8250791 . + . ID=MIMAT0041633_1;Alias=MIMAT0041633;Name=hsa-miR-10401-5p;Derives_from=MI0033425 +21 . miRNA 8250807 8250827 . + . ID=MIMAT0041634_1;Alias=MIMAT0041634;Name=hsa-miR-10401-3p;Derives_from=MI0033425 diff --git a/scripts/tests/files/mir_anno.gff3 b/scripts/tests/files/mir_anno.gff3 index e763c0c..649f48e 100644 --- a/scripts/tests/files/mir_anno.gff3 +++ b/scripts/tests/files/mir_anno.gff3 @@ -1,3 +1,6 @@ -19 . miRNA 2530 2564 . + . ID=MIMAT0002822_1;Alias=MIMAT0002822;Name=hsa-miR-512-5p;Derives_from=MI0003141 -19 . miRNA 2567 2600 . + . ID=MIMAT0002823_1;Alias=MIMAT0002823;Name=hsa-miR-512-3p;Derives_from=MI0003141 -19 . miRNA 5332 5365 . + . ID=MIMAT0005795;Alias=MIMAT0005795;Name=hsa-miR-1323;Derives_from=MI0003786 +19 . miRNA_primary_transcript 121034 121104 . + . ID=MI0000779;Alias=MI0000779;Name=hsa-mir-371a_-1_+3 +19 . miRNA 121034 121065 . + . ID=MIMAT0004687;Alias=MIMAT0004687;Name=hsa-miR-371a-5p;Derives_from=MI0000779 +19 . miRNA 121070 121104 . + . ID=MIMAT0000723;Alias=MIMAT0000723;Name=hsa-miR-371a-3p;Derives_from=MI0000779 +19 . miRNA_primary_transcript 121032 121102 . - . ID=MI0017393;Alias=MI0017393;Name=hsa-mir-371b_-5_+0 +19 . miRNA 121032 121066 . - . ID=MIMAT0019893;Alias=MIMAT0019893;Name=hsa-miR-371b-3p;Derives_from=MI0017393 +19 . miRNA 121069 121102 . - . ID=MIMAT0019892;Alias=MIMAT0019892;Name=hsa-miR-371b-5p;Derives_from=MI0017393 diff --git a/scripts/tests/files/primir_anno.gff3 b/scripts/tests/files/primir_anno.gff3 index 1c05165..e69de29 100644 --- a/scripts/tests/files/primir_anno.gff3 +++ b/scripts/tests/files/primir_anno.gff3 @@ -1,2 +0,0 @@ -19 . miRNA_primary_transcript 2517 2614 . + . ID=MI0003141;Alias=MI0003141;Name=hsa-mir-512-2_-0_+0 -19 . miRNA_primary_transcript 5328 5400 . + . ID=MI0003786;Alias=MI0003786;Name=hsa-mir-1323_-0_+0 diff --git a/scripts/tests/files/replica_mirna_anno.gff3 b/scripts/tests/files/replica_mirna_anno.gff3 new file mode 100644 index 0000000..c029bb2 --- /dev/null +++ b/scripts/tests/files/replica_mirna_anno.gff3 @@ -0,0 +1,18 @@ +3 . miRNA_primary_transcript 160404745 160404825 . + . ID=MI0000115;Alias=MI0000115;Name=hsa-mir-16-2 +3 . miRNA 160404754 160404775 . + . ID=MIMAT0000069_1;Alias=MIMAT0000069;Name=hsa-miR-16-2-5p;Derives_from=MI0000115 +3 . miRNA 160404797 160404818 . + . ID=MIMAT0004518;Alias=MIMAT0004518;Name=hsa-miR-16-2-3p;Derives_from=MI0000115 +13 . miRNA_primary_transcript 50048973 50049061 . - . ID=MI0000070;Alias=MI0000070;Name=hsa-mir-16-1 +13 . miRNA 50049027 50049048 . - . ID=MIMAT0000069;Alias=MIMAT0000069;Name=hsa-miR-16-1-5p;Derives_from=MI0000070 +13 . miRNA 50048985 50049006 . - . ID=MIMAT0004489;Alias=MIMAT0004489;Name=hsa-miR-16-1-3p;Derives_from=MI0000070 +20 . miRNA_primary_transcript 63919449 63919520 . + . ID=MI0005763;Alias=MI0005763;Name=hsa-mir-941-1 +20 . miRNA 63919495 63919517 . + . ID=MIMAT0004984;Alias=MIMAT0004984;Name=hsa-miR-941-1;Derives_from=MI0005763 +20 . miRNA_primary_transcript 63919505 63919576 . + . ID=MI0005764;Alias=MI0005764;Name=hsa-mir-941-2 +20 . miRNA 63919551 63919573 . + . ID=MIMAT0004984_1;Alias=MIMAT0004984;Name=hsa-miR-941-2;Derives_from=MI0005764 +20 . miRNA_primary_transcript 63919561 63919632 . + . ID=MI0005765;Alias=MI0005765;Name=hsa-mir-941-3 +20 . miRNA 63919607 63919629 . + . ID=MIMAT0004984_2;Alias=MIMAT0004984;Name=hsa-miR-941-3;Derives_from=MI0005765 +21 . miRNA_primary_transcript 8206563 8206618 . + . ID=MI0033425;Alias=MI0033425;Name=hsa-mir-10401 +21 . miRNA 8206563 8206582 . + . ID=MIMAT0041633;Alias=MIMAT0041633;Name=hsa-miR-10401-5p;Derives_from=MI0033425 +21 . miRNA 8206598 8206618 . + . ID=MIMAT0041634;Alias=MIMAT0041634;Name=hsa-miR-10401-3p;Derives_from=MI0033425 +21 . miRNA_primary_transcript 8250772 8250827 . + . ID=MI0033425_2;Alias=MI0033425;Name=hsa-mir-10401-2 +21 . miRNA 8250772 8250791 . + . ID=MIMAT0041633_1;Alias=MIMAT0041633;Name=hsa-miR-10401-2-5p;Derives_from=MI0033425 +21 . miRNA 8250807 8250827 . + . ID=MIMAT0041634_1;Alias=MIMAT0041634;Name=hsa-miR-10401-2-3p;Derives_from=MI0033425 diff --git a/scripts/tests/test_mirna_extension.py b/scripts/tests/test_mirna_extension.py index 16f8309..b7466af 100644 --- a/scripts/tests/test_mirna_extension.py +++ b/scripts/tests/test_mirna_extension.py @@ -1,13 +1,14 @@ """Unit tests for module 'mirna_extension.py'.""" import argparse +from pathlib import Path import sys -from pathlib import Path -import gffutils +import gffutils # type: ignore import pytest -from ..mirna_extension import main, MirnaExtension, parse_arguments +from ..mirna_extension import (AnnotationException, MirnaExtension, main, + parse_arguments) @pytest.fixture @@ -19,18 +20,34 @@ def gff_empty(): @pytest.fixture -def gff_no_extremes(): +def gff_illegal_coords(): + """Import path to miRNA annotation file with illegal coordinates.""" + in_illegal = Path("files/in_illegal_mirna_anno.gff3") + + return in_illegal + + +@pytest.fixture +def gff_replicas(): + """Import path to miRNA annotation files with replicas.""" + in_replica = Path("files/in_replica_mirna_anno.gff3") + out_replica = Path("files/replica_mirna_anno.gff3") + + return in_replica, out_replica + + +@pytest.fixture +def gff_diff_strand(): """Import path to miRNA annotation files.""" - in_no_extreme = Path("files/in_mirna_anno.gff3") - out_primir = Path("files/primir_anno.gff3") + in_diff_strand = Path("files/in_mirna_anno.gff3") out_mir = Path("files/mir_anno.gff3") - return in_no_extreme, out_primir, out_mir + return in_diff_strand, out_mir @pytest.fixture def gff_extremes(): - """Import path to mirna annotation files with extreme miRNA coords.""" + """Import path to miRNA annotation files with extreme miRNA coords.""" in_extremes = Path("files/in_mirna_extreme_mirs.gff3") out_primir = Path("files/extreme_primir_anno.gff3") out_mir = Path("files/extreme_mir_anno.gff3") @@ -40,13 +57,471 @@ def gff_extremes(): @pytest.fixture def gff_extremes_chr(): - """Import path to mirna annotation files and chr size.""" - chr_size = Path("files/chr_size.txt") + """Import path to miRNA annotation files.""" in_chr_extremes = Path("files/in_mirna_extreme_chr_mirs.gff3") out_primir = Path("files/extreme_chr_primir_anno.gff3") out_mir = Path("files/extreme_chr_mir_anno.gff3") - return chr_size, in_chr_extremes, out_primir, out_mir + return in_chr_extremes, out_primir, out_mir + + +@pytest.fixture +def seq_len_tbl(): + """Import path to sequence lengths table.""" + correct_tbl = Path("files/chr_size.tsv") + + return correct_tbl + + +class TestSetDb: + """Test for the 'set_db' method.""" + + def test_set_db_file(self, gff_diff_strand): + """Test setting local db from file.""" + in_file, exp_out = gff_diff_strand + + miR_obj = MirnaExtension() + miR_obj.set_db(path=in_file) + + assert miR_obj is not None + assert isinstance(miR_obj.db, gffutils.FeatureDB) + assert ( + len(list(miR_obj.db.features_of_type("miRNA_primary_transcript"))) + == 2 + ) + assert len(list(miR_obj.db.features_of_type("miRNA"))) == 4 + + def test_set_db_empty_file(self, gff_empty): + """Test setting local db from empty file.""" + in_file = gff_empty + + miR_obj = MirnaExtension() + miR_obj.set_db(path=in_file) + + assert miR_obj.db is None + + def test_set_db_illegal_coord(self, gff_illegal_coords): + """Test setting local db from file with illegal start coordinate.""" + in_file = gff_illegal_coords + + miR_obj = MirnaExtension() + + with pytest.raises(AnnotationException, + match=r".* position 0. .*"): + miR_obj.set_db(path=in_file) + + +class TestSetSeqLengths: + """Test for the 'set_seq_lengths' method.""" + + def test_set_lengths_no_tbl(self, gff_diff_strand): + """Test create sequence lengths dictionary from GFF3 file.""" + in_file, exp_out = gff_diff_strand + + miR_obj = MirnaExtension() + miR_obj.set_db(path=in_file) + miR_obj.set_seq_lengths() + + assert len(miR_obj.seq_lengths.keys()) == 1 + assert miR_obj.seq_lengths["19"] == 121102 + + def test_set_lengths_wrong_type_tbl( + self, tmp_path, gff_extremes, seq_len_tbl + ): + """Test create sequence lengths dictionary from wrong table.""" + in_file, pre_exp, mir_exp = gff_extremes + + table = tmp_path / "files/tbl.tsv" + table.parent.mkdir() + table.touch() + table.write_text("19\t600000bp") + + miR_obj = MirnaExtension() + miR_obj.set_db(path=in_file) + + with pytest.raises(ValueError, match=r".* integer .*"): + miR_obj.set_seq_lengths(path=table) + + def test_set_lengths_wrong_len_tbl( + self, tmp_path, gff_extremes_chr, seq_len_tbl + ): + """Test create sequence lengths dictionary from wrong table.""" + in_file, pre_exp, mir_exp = gff_extremes_chr + + table = tmp_path / "files/tbl.tsv" + table.parent.mkdir() + table.touch() + table.write_text("19\t315700") + + miR_obj = MirnaExtension() + miR_obj.set_db(path=in_file) + + with pytest.raises(AnnotationException, match=r".* exceeds .*"): + miR_obj.set_seq_lengths(path=table) + + def test_set_lengths_correct_tbl(self, gff_extremes_chr, seq_len_tbl): + """Test create sequence lengths dictionary from correct table.""" + in_file, pre_exp, mir_exp = gff_extremes_chr + correct_tbl = seq_len_tbl + + miR_obj = MirnaExtension() + miR_obj.set_db(path=in_file) + miR_obj.set_seq_lengths(path=correct_tbl) + + assert len(miR_obj.seq_lengths.keys()) == 1 + + +class TestAdjustNames: + """Test for the 'adjust_names' method.""" + + def test_adjust_names_no_replicas(self, gff_replicas): + """Test adjusting names when no replicas are present.""" + in_file, out_file = gff_replicas + out_mir = ["hsa-miR-10401-3p", "hsa-miR-10401-5p"] + + miR_obj = MirnaExtension() + miR_obj.set_db(path=in_file) + + precursor = miR_obj.db["MI0033425"] + matures = [miR_obj.db["MIMAT0041633"], miR_obj.db["MIMAT0041634"]] + + miR_obj.adjust_names(precursor=precursor, matures=matures) + + assert precursor.attributes["Name"][0] == "hsa-mir-10401" + assert matures[0].attributes["Name"][0] in out_mir + assert matures[1].attributes["Name"][0] in out_mir + + def test_adjust_names_id_replica(self, gff_replicas): + """Test adjusting names when replica integer is in the ID.""" + in_file, out_file = gff_replicas + out_mir = ["hsa-miR-10401-2-3p", "hsa-miR-10401-2-5p"] + + miR_obj = MirnaExtension() + miR_obj.set_db(path=in_file) + + precursor = miR_obj.db["MI0033425_2"] + matures = [miR_obj.db["MIMAT0041633_1"], miR_obj.db["MIMAT0041634_1"]] + + miR_obj.adjust_names(precursor=precursor, matures=matures) + + assert precursor.attributes["Name"][0] == "hsa-mir-10401-2" + assert matures[0].attributes["Name"][0] in out_mir + assert matures[1].attributes["Name"][0] in out_mir + + def test_adjust_names_name_replica(self, gff_replicas): + """Test adjusting names when replica integer is in the Name.""" + in_file, out_file = gff_replicas + out_mir = ["hsa-miR-16-1-3p", "hsa-miR-16-1-5p"] + + miR_obj = MirnaExtension() + miR_obj.set_db(path=in_file) + + precursor = miR_obj.db["MI0000070"] + matures = [miR_obj.db["MIMAT0000069"], miR_obj.db["MIMAT0004489"]] + + miR_obj.adjust_names(precursor=precursor, matures=matures) + + assert precursor.attributes["Name"][0] == "hsa-mir-16-1" + assert matures[0].attributes["Name"][0] in out_mir + assert matures[1].attributes["Name"][0] in out_mir + + def test_adjust_names_single(self, gff_replicas): + """Test adjusting names when a single mature miR is present.""" + in_file, out_file = gff_replicas + + miR_obj = MirnaExtension() + miR_obj.set_db(path=in_file) + + precursor = miR_obj.db["MI0005764"] + matures = [miR_obj.db["MIMAT0004984_1"]] + + miR_obj.adjust_names(precursor=precursor, matures=matures) + + assert precursor.attributes["Name"][0] == "hsa-mir-941-2" + assert matures[0].attributes["Name"][0] == "hsa-miR-941-2" + + def test_adjust_names_replace_replica(self, gff_replicas): + """Test adjusting names when mature miRs have the replica integer.""" + in_file, out_file = gff_replicas + out_mir = ["hsa-miR-16-2-3p", "hsa-miR-16-2-5p"] + + miR_obj = MirnaExtension() + miR_obj.set_db(path=in_file) + + precursor = miR_obj.db["MI0000115"] + matures = [miR_obj.db["MIMAT0000069"], miR_obj.db["MIMAT0004518"]] + + miR_obj.adjust_names(precursor=precursor, matures=matures) + + assert precursor.attributes["Name"][0] == "hsa-mir-16-2" + assert matures[0].attributes["Name"][0] in out_mir + assert matures[1].attributes["Name"][0] in out_mir + + +class TestProcessPrecursor: + """Test for the 'process_precursor' method.""" + + def test_process_prec_diff_strand_same_coords( + self, + gff_diff_strand, + seq_len_tbl, + ): + """Test processing precursor on different strands, similar coords.""" + in_file, out_file = gff_diff_strand + correct_tbl = seq_len_tbl + + exp_mir_obj = MirnaExtension() + exp_mir_obj.set_db(path=out_file) + + miR_obj = MirnaExtension() + miR_obj.set_db(path=in_file) + miR_obj.set_seq_lengths(correct_tbl) + + out_pre_1 = miR_obj.process_precursor( + precursor=miR_obj.db["MI0000779"], n=6 + ) + out_pre_2 = miR_obj.process_precursor( + precursor=miR_obj.db["MI0017393"], n=6 + ) + + assert out_pre_1[0] == exp_mir_obj.db["MI0000779"] + assert out_pre_2[0] == exp_mir_obj.db["MI0017393"] + + for mir in out_pre_1[1:]: + assert mir in list( + exp_mir_obj.db.region( + seqid="19", + strand="+", + start=121034, + end=121104, + featuretype="miRNA", + ) + ) + for mir in out_pre_2[1:]: + assert mir in list( + exp_mir_obj.db.region( + seqid="19", + strand="-", + start=121032, + end=121102, + featuretype="miRNA", + ) + ) + + def test_process_prec_miR_out_seq_boundaries( + self, + gff_extremes_chr, + seq_len_tbl, + ): + """Test processing precursor for outside seq boundaries miRNAs.""" + in_file, pre_out, mir_out = gff_extremes_chr + correct_tbl = seq_len_tbl + + exp_pre_obj = MirnaExtension() + exp_pre_obj.set_db(path=pre_out) + + miR_obj = MirnaExtension() + miR_obj.set_db(path=in_file) + miR_obj.set_seq_lengths(correct_tbl) + + out_pre_1 = miR_obj.process_precursor( + precursor=miR_obj.db["MI0005757"], n=6 + ) + out_pre_2 = miR_obj.process_precursor( + precursor=miR_obj.db["MI0003140"], n=6 + ) + + assert out_pre_1[0] == exp_pre_obj.db["MI0005757"] + assert out_pre_2[0] == exp_pre_obj.db["MI0003140"] + assert out_pre_1[1].end == miR_obj.seq_lengths["19"] + assert out_pre_2[1].start == 1 + + def test_process_prec_unknown_seq(self, gff_extremes, tmp_path): + """Test processing precursor with annotated seq ID not in len dict.""" + in_file, pre_out, mir_out = gff_extremes + + table = tmp_path / "files/tbl.tsv" + table.parent.mkdir() + table.touch() + table.write_text("13\t600000") + + miR_obj = MirnaExtension() + miR_obj.set_db(path=in_file) + miR_obj.set_seq_lengths(table) + + with pytest.raises(KeyError, match=r".* not available .*"): + miR_obj.process_precursor(precursor=miR_obj.db["MI0005757"]) + + def test_process_prec_no_precursor_extension(self, gff_extremes): + """Test processing precursor without precursor extension.""" + in_file, pre_out, mir_out = gff_extremes + + exp_pre_obj = MirnaExtension() + exp_mir_obj = MirnaExtension() + exp_pre_obj.set_db(path=pre_out) + exp_mir_obj.set_db(path=mir_out) + + miR_obj = MirnaExtension() + miR_obj.set_db(path=in_file) + miR_obj.set_seq_lengths() + + out_pre = miR_obj.process_precursor( + precursor=miR_obj.db["MI0003140"], n=2 + ) + + assert out_pre[0] == exp_pre_obj.db["MI0003140"] + assert out_pre[1:] == list( + exp_mir_obj.db.region( + seqid="19", start=6, end=124, featuretype="miRNA" + ) + ) + + +class TestUpdateDb: + """Test for the 'update_db' method.""" + + def test_update_db_no_extension_no_names(self, gff_diff_strand): + """Test not extending miRNA coordinates nor changing names.""" + in_file, out_file = gff_diff_strand + + miR_obj = MirnaExtension() + miR_obj.set_db(path=in_file) + miR_obj.set_seq_lengths() + miR_obj.update_db(n=0) + + for mir in miR_obj.db.features_of_type("miRNA"): + exp_mir = miR_obj.db_out[mir.id] + assert mir.attributes["Name"][0] == exp_mir.attributes["Name"][0] + + def test_update_db_no_extension_unique_names(self, gff_replicas): + """Test not extending miRNA coordinates but changing names.""" + in_file, out_file = gff_replicas + + exp_mir_obj = MirnaExtension() + exp_mir_obj.set_db(path=out_file) + + miR_obj = MirnaExtension() + miR_obj.set_db(path=in_file) + miR_obj.set_seq_lengths() + miR_obj.update_db(n=0) + + for mir in miR_obj.db_out.features_of_type("miRNA"): + exp_mir = exp_mir_obj.db[mir.id] + assert mir.attributes["Name"][0] == exp_mir.attributes["Name"][0] + + def test_update_db_extend_6_nts_no_names( + self, gff_diff_strand, seq_len_tbl + ): + """Test extending miRNA coordinates 6 nts but not changing names.""" + in_file, out_file = gff_diff_strand + correct_tbl = seq_len_tbl + + exp_mir_obj = MirnaExtension() + exp_mir_obj.set_db(path=out_file) + + miR_obj = MirnaExtension() + miR_obj.set_db(path=in_file) + miR_obj.set_seq_lengths(correct_tbl) + miR_obj.update_db(n=6) + + for mir in miR_obj.db_out.all_features(): + exp_mir = exp_mir_obj.db[mir.id] + assert mir.astuple() == exp_mir.astuple() + + def test_update_db_extend_6_nts_unique_names(self, gff_replicas): + """Test extending miRNA coordinates 6 nts and changing names.""" + in_file, out_file = gff_replicas + + exp_mir_obj = MirnaExtension() + exp_mir_obj.set_db(path=out_file) + + miR_obj = MirnaExtension() + miR_obj.set_db(path=in_file) + miR_obj.set_seq_lengths() + miR_obj.update_db(n=6) + + for mir in miR_obj.db_out.features_of_type("miRNA"): + exp_mir = exp_mir_obj.db[mir.id] + assert mir.attributes["Name"][0] == exp_mir.attributes["Name"][0] + + +class TestWriteGFF: + """Test for the 'write_gff' method.""" + + def test_write_empty_file(self, gff_empty, tmp_path): + """Test writing an empty file.""" + empty_file = gff_empty + + empty_out = tmp_path / "empty.gff3" + + miR_obj = MirnaExtension() + miR_obj.set_db(path=empty_file) + miR_obj.set_seq_lengths() + miR_obj.update_db() + miR_obj.write_gff(path=empty_out) + + with open(empty_out, encoding="utf-8") as output, open( + empty_file, encoding="utf-8" + ) as expected: + assert output.read() == expected.read() + + def test_write_precursor_file(self, gff_extremes, seq_len_tbl, tmp_path): + """Test writing only precursor GFF3.""" + in_file, pre_exp, mir_exp = gff_extremes + correct_tbl = seq_len_tbl + + pre_out = tmp_path / "extended_premir_annotation_2_nt.gff3" + + miR_obj = MirnaExtension() + miR_obj.set_db(path=in_file) + miR_obj.set_seq_lengths(correct_tbl) + miR_obj.update_db(n=2) + miR_obj.write_gff( + path=pre_out, feature_type="miRNA_primary_transcript" + ) + + with open(pre_out, encoding="utf-8") as output, open( + pre_exp, encoding="utf-8" + ) as expected: + assert output.read() == expected.read() + + def test_write_mature_mir_file(self, gff_extremes, seq_len_tbl, tmp_path): + """Test writing only mature miRNA GFF3.""" + in_file, pre_exp, mir_exp = gff_extremes + correct_tbl = seq_len_tbl + + mir_out = tmp_path / "extended_mir_annotation_2_nt.gff3" + + miR_obj = MirnaExtension() + miR_obj.set_db(path=in_file) + miR_obj.set_seq_lengths(correct_tbl) + miR_obj.update_db(n=2) + miR_obj.write_gff(path=mir_out, feature_type="miRNA") + + with open(mir_out, encoding="utf-8") as output, open( + mir_exp, encoding="utf-8" + ) as expected: + assert output.read() == expected.read() + + def test_write_both_features_file( + self, tmp_path, gff_diff_strand, seq_len_tbl + ): + """Test writing both precursor and mature miRNA GFF3.""" + in_file, exp_out = gff_diff_strand + correct_tbl = seq_len_tbl + + out_file = tmp_path / "extended_mirna_annotation_6_nt.gff3" + + miR_obj = MirnaExtension() + miR_obj.set_db(path=in_file) + miR_obj.set_seq_lengths(correct_tbl) + miR_obj.update_db() + miR_obj.write_gff(path=out_file) + + with open(out_file, encoding="utf-8") as output, open( + exp_out, encoding="utf-8" + ) as expected: + assert output.read() == expected.read() class TestParseArguments: @@ -57,6 +532,7 @@ def test_no_files(self, monkeypatch): with pytest.raises(SystemExit) as sysex: monkeypatch.setattr(sys, "argv", ["mirna_extension"]) parse_arguments().parse_args() + assert sysex.value.code == 2 def test_in_files(self, monkeypatch, gff_empty, tmp_path): @@ -77,9 +553,12 @@ def test_in_files(self, monkeypatch, gff_empty, tmp_path): args = parse_arguments().parse_args() assert isinstance(args, argparse.Namespace) - def test_all_arguments(self, monkeypatch, gff_extremes_chr, tmp_path): + def test_all_arguments( + self, monkeypatch, gff_extremes_chr, seq_len_tbl, tmp_path + ): """Call with all the arguments.""" - chr_size, gff_in, gff_pre_out, gff_mir_out = gff_extremes_chr + gff_in, gff_pre_out, gff_mir_out = gff_extremes_chr + correct_tbl = seq_len_tbl monkeypatch.setattr( sys, @@ -90,7 +569,7 @@ def test_all_arguments(self, monkeypatch, gff_extremes_chr, tmp_path): "--outdir", str(tmp_path), "--chr", - str(chr_size), + str(correct_tbl), "--extension", "6", ], @@ -105,7 +584,7 @@ class TestMain: def test_main_empty_file(self, monkeypatch, gff_empty, tmp_path): """Test main function with an empty file.""" - gff_empty = gff_empty + in_empty = gff_empty primir_out = tmp_path / "extended_primir_annotation_6_nt.gff3" mir_out = tmp_path / "extended_mirna_annotation_6_nt.gff3" @@ -115,7 +594,7 @@ def test_main_empty_file(self, monkeypatch, gff_empty, tmp_path): "argv", [ "mirna_extension", - str(gff_empty), + str(in_empty), "--outdir", str(tmp_path), ], @@ -123,61 +602,54 @@ def test_main_empty_file(self, monkeypatch, gff_empty, tmp_path): args = parse_arguments().parse_args() main(args) - with open(gff_empty, "r") as expected, open(primir_out, "r") as output: + with open(in_empty, encoding="utf-8") as expected, open( + primir_out, encoding="utf-8" + ) as output: assert output.read() == expected.read() - with open(gff_empty, "r") as expected, open(mir_out, "r") as output: + with open(in_empty, encoding="utf-8") as expected, open( + mir_out, encoding="utf-8" + ) as output: assert output.read() == expected.read() - def test_main_no_extreme_coords( - self, monkeypatch, tmp_path, gff_no_extremes - ): + def test_main_no_extreme_coords(self, monkeypatch, tmp_path, gff_extremes): """Test main function with no extreme coords.""" - in_gff, pre_gff, mir_gff = gff_no_extremes - - primir_out = tmp_path / "extended_primir_annotation_6_nt.gff3" - mir_out = tmp_path / "extended_mirna_annotation_6_nt.gff3" - - monkeypatch.setattr( - sys, - "argv", - ["mirna_extension", str(in_gff), "--outdir", str(tmp_path)], - ) - args = parse_arguments().parse_args() - main(args) - - with open(pre_gff, "r") as expected, open(primir_out, "r") as output: - assert output.read() == expected.read() - - with open(mir_gff, "r") as expected, open(mir_out, "r") as output: - assert output.read() == expected.read() - - def test_main_extreme_coords(self, monkeypatch, tmp_path, gff_extremes): - """Test main function with extreme coords.""" in_gff, pre_gff, mir_gff = gff_extremes - primir_out = tmp_path / "extended_primir_annotation_6_nt.gff3" - mir_out = tmp_path / "extended_mirna_annotation_6_nt.gff3" + primir_out = tmp_path / "extended_primir_annotation_2_nt.gff3" + mir_out = tmp_path / "extended_mirna_annotation_2_nt.gff3" monkeypatch.setattr( sys, "argv", - ["mirna_extension", str(in_gff), "--outdir", str(tmp_path)], + [ + "mirna_extension", + str(in_gff), + "--outdir", + str(tmp_path), + "--extension", + "2", + ], ) args = parse_arguments().parse_args() main(args) - with open(pre_gff, "r") as expected, open(primir_out, "r") as output: + with open(pre_gff, encoding="utf-8") as expected, open( + primir_out, encoding="utf-8" + ) as output: assert output.read() == expected.read() - with open(mir_gff, "r") as expected, open(mir_out, "r") as output: + with open(mir_gff, encoding="utf-8") as expected, open( + mir_out, encoding="utf-8" + ) as output: assert output.read() == expected.read() def test_main_extreme_coords_limit_size( - self, monkeypatch, tmp_path, gff_extremes_chr + self, monkeypatch, tmp_path, gff_extremes_chr, seq_len_tbl ): """Test main function with extreme coords and limited by chr size.""" - chr_size, in_gff, pre_gff, mir_gff = gff_extremes_chr + in_gff, pre_gff, mir_gff = gff_extremes_chr + correct_tbl = seq_len_tbl primir_out = tmp_path / "extended_primir_annotation_6_nt.gff3" mir_out = tmp_path / "extended_mirna_annotation_6_nt.gff3" @@ -191,122 +663,18 @@ def test_main_extreme_coords_limit_size( "--outdir", str(tmp_path), "--chr", - str(chr_size), + str(correct_tbl), ], ) args = parse_arguments().parse_args() main(args) - with open(pre_gff, "r") as expected, open(primir_out, "r") as output: - assert output.read() == expected.read() - - with open(mir_gff, "r") as expected, open(mir_out, "r") as output: - assert output.read() == expected.read() - - -class TestLoadGffFile: - """Test for the 'load_gff_file' method.""" - - def test_load_gff_file(self, gff_no_extremes): - """Test input loading from file.""" - in_file, pre_exp, mir_exp = gff_no_extremes - - mirnaObject = MirnaExtension() - mirnaObject.load_gff_file(str(in_file)) - - assert mirnaObject is not None - assert isinstance(mirnaObject.db, gffutils.FeatureDB) - assert ( - len( - list( - mirnaObject.db.features_of_type("miRNA_primary_transcript") - ) - ) - == 2 - ) - assert len(list(mirnaObject.db.features_of_type("miRNA"))) == 3 - - def test_load_gff_file_std(self, monkeypatch, gff_no_extremes): - """Test input loading from standard input.""" - in_file, pre_exp, mir_exp = gff_no_extremes - monkeypatch.setattr(sys, "stdin", str(in_file)) - - mirnaObject = MirnaExtension() - mirnaObject.load_gff_file() - - assert mirnaObject is not None - assert isinstance(mirnaObject.db, gffutils.FeatureDB) - assert ( - len( - list( - mirnaObject.db.features_of_type("miRNA_primary_transcript") - ) - ) - == 2 - ) - assert len(list(mirnaObject.db.features_of_type("miRNA"))) == 3 - - -class TestExtendMirnas: - """Test for the 'extend_mirnas' method.""" - - def test_extend_mirnas_no_extreme_coords(self, tmp_path, gff_no_extremes): - """Test miRNA extension with no extreme coordinates.""" - in_file, pre_exp, mir_exp = gff_no_extremes - - primir_out = tmp_path / "extended_primir_annotation_6_nt.gff3" - mir_out = tmp_path / "extended_mirna_annotation_6_nt.gff3" - - mirnaObject = MirnaExtension() - mirnaObject.load_gff_file(str(in_file)) - mirnaObject.extend_mirnas(primir_out=primir_out, mir_out=mir_out) - - with open(primir_out, "r") as output, open(pre_exp, "r") as expected: - assert output.read() == expected.read() - - with open(mir_out, "r") as output, open(mir_exp, "r") as expected: - assert output.read() == expected.read() - - def test_extend_mirnas_extreme_coords(self, tmp_path, gff_extremes): - """Test miRNA extension with miRNAs having extreme coordinates.""" - in_file, pre_exp, mir_exp = gff_extremes - - primir_out = tmp_path / "extended_primir_annotation_6_nt.gff3" - mir_out = tmp_path / "extended_mirna_annotation_6_nt.gff3" - - mirnaObject = MirnaExtension() - mirnaObject.load_gff_file(str(in_file)) - mirnaObject.extend_mirnas(primir_out=primir_out, mir_out=mir_out) - - with open(primir_out, "r") as output, open(pre_exp, "r") as expected: - assert output.read() == expected.read() - - with open(mir_out, "r") as output, open(mir_exp, "r") as expected: - assert output.read() == expected.read() - - def test_extend_mirnas_extreme_coords_chr_boundaries( - self, tmp_path, gff_extremes_chr - ): - """Test miRNA extension with extreme coordinates and chr boundaries.""" - chr_size, in_file, pre_exp, mir_exp = gff_extremes_chr - - primir_out = tmp_path / "extended_primir_annotation_6_nt.gff3" - mir_out = tmp_path / "extended_mirna_annotation_6_nt.gff3" - - len_dict = {} - with open(chr_size, "r") as f: - for line in f: - line = line.strip().split("\t") - len_dict[line[0]] = int(line[1]) - - mirnaObject = MirnaExtension() - mirnaObject.load_gff_file(str(in_file)) - mirnaObject.extend_mirnas( - primir_out=primir_out, mir_out=mir_out, seq_lengths=len_dict - ) - - with open(primir_out, "r") as output, open(pre_exp, "r") as expected: + with open(pre_gff, encoding="utf-8") as expected, open( + primir_out, encoding="utf-8" + ) as output: assert output.read() == expected.read() - with open(mir_out, "r") as output, open(mir_exp, "r") as expected: + with open(mir_gff, encoding="utf-8") as expected, open( + mir_out, encoding="utf-8" + ) as output: assert output.read() == expected.read() diff --git a/scripts/validation_fasta.py b/scripts/validation_fasta.py index 5d78615..3960965 100755 --- a/scripts/validation_fasta.py +++ b/scripts/validation_fasta.py @@ -128,7 +128,7 @@ def __init__(self): sys.stdout.write("DONE\n") # ID FILTER LIST # - +id_filter = [] if args.filter: sys.stdout.write("Filtering FASTA file...") diff --git a/test/expected_output.md5 b/test/expected_output.md5 index 3ae1d5b..1a2161a 100644 --- a/test/expected_output.md5 +++ b/test/expected_output.md5 @@ -45,7 +45,7 @@ fe5388094985e9604a302d39d2abc82c results/intermediates/test_lib/oligomap_transc d41d8cd98f00b204e9800998ecf8427e results/intermediates/test_lib/oligomap_transcriptome_sorted.oligomap 96a178aabc41d3020992efd6acbecb5a results/intermediates/genome_processed.fa.bz d34fc868b861b1bc46db07a397dc0f10 results/intermediates/genome_processed.fa.bz.fai -be7a0d92e57480190de57eb30baffa36 results/intermediates/extended_mirna_annotation_6_nt.gff3 +05542e2fdb7e8c7abd4374b0c63b5d1e results/intermediates/extended_mirna_annotation_6_nt.gff3 8148cd880602255be166beb59bbed95a results/intermediates/genome_header.sam 09e24a504bfec37fee3d5ff1b5c7738e results/intermediates/exons.bed 4fb453846e88593d0cac13220ec2d685 results/intermediates/segemehl_genome_index.idx @@ -54,7 +54,7 @@ f5749f5d2096386444a3d021e0a01817 results/intermediates/genome_processed.fa.bz.g d34fc868b861b1bc46db07a397dc0f10 results/intermediates/genome_processed.fa.fai 21e102e4ebd3508bb06f46366a3d578d results/intermediates/exons.gtf 003b92b245ac336e3d70a513033e1cee results/intermediates/transcriptome_trimmed_id.fa -cc5c3512dab0e269d82bd625de74198e results/intermediates/extended_primir_annotation_6_nt.gff3 +8fc8e961a10b73ab7901c406a0679634 results/intermediates/extended_primir_annotation_6_nt.gff3 f28cc0143ab6659bef3de3a7afa1dccc results/intermediates/mirna_annotations.gff3 2d437f8681f4248d4f2075f86debb920 results/intermediates/transcriptome.fa 7eb64c112830266bcf416ded60b4cf77 results/intermediates/segemehl_transcriptome_index.idx