Skip to content

Commit

Permalink
Deprecate set_pair_info and _set_mate_info for set_mate_info
Browse files Browse the repository at this point in the history
  • Loading branch information
clintval committed Dec 27, 2024
1 parent 4ee5878 commit 56d9692
Show file tree
Hide file tree
Showing 6 changed files with 399 additions and 372 deletions.
103 changes: 84 additions & 19 deletions fgpyo/sam/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@
from pathlib import Path
from typing import IO
from typing import Any
from typing import Callable
from typing import Dict
from typing import Iterable
from typing import Iterator
Expand All @@ -176,6 +177,7 @@
from pysam import AlignedSegment
from pysam import AlignmentFile as SamFile
from pysam import AlignmentHeader as SamHeader
from typing_extensions import deprecated

import fgpyo.io
from fgpyo.collections import PeekableIterator
Expand Down Expand Up @@ -670,7 +672,7 @@ def build(
"""The default orientations for properly paired reads."""


def is_properly_paired(
def is_proper_pair(
r1: AlignedSegment,
r2: Optional[AlignedSegment] = None,
max_insert_size: int = 1000,
Expand Down Expand Up @@ -802,38 +804,101 @@ def isize(r1: AlignedSegment, r2: AlignedSegment) -> int:
return r2_pos - r1_pos


def set_pair_info(r1: AlignedSegment, r2: AlignedSegment, proper_pair: bool = True) -> None:
"""Resets mate pair information between reads in a pair. Requires that both r1
and r2 are mapped. Can be handed reads that already have pairing flags setup or
independent R1 and R2 records that are currently flagged as SE reads.
def sum_of_base_qualities(rec: AlignedSegment, min_quality_score: int = 15) -> int:
"""Calculate the sum of base qualities score for an alignment record.
This function is useful for calculating the "mate score" as implemented in samtools fixmate.
Args:
rec: The alignment record to calculate the sum of base qualities from.
min_quality_score: The minimum base quality score to use for summation.
See:
[`calc_sum_of_base_qualities()`](https://github.com/samtools/samtools/blob/4f3a7397a1f841020074c0048c503a01a52d5fa2/bam_mate.c#L227-L238)
[`MD_MIN_QUALITY`](https://github.com/samtools/samtools/blob/4f3a7397a1f841020074c0048c503a01a52d5fa2/bam_mate.c#L42)
"""
score: int = sum(qual for qual in rec.query_qualities if qual >= min_quality_score)
return score


def set_mate_info(
r1: AlignedSegment,
r2: AlignedSegment,
is_proper_pair: Callable[[AlignedSegment, AlignedSegment], bool] = is_proper_pair,
) -> None:
"""Resets mate pair information between reads in a pair.
Args:
r1: Read 1 (first read in the template).
r2: Read 2 with the same query name as r1 (second read in the template).
is_proper_pair: A function that takes the two alignments and determines proper pair status.
"""
if r1.query_name != r2.query_name:
raise ValueError("Cannot set mate info on alignments with different query names!")

Check warning on line 834 in fgpyo/sam/__init__.py

View check run for this annotation

Codecov / codecov/patch

fgpyo/sam/__init__.py#L834

Added line #L834 was not covered by tests

for src, dest in [(r1, r2), (r2, r1)]:
dest.next_reference_id = src.reference_id
dest.next_reference_name = src.reference_name
dest.next_reference_start = src.reference_start
dest.mate_is_forward = src.is_forward
dest.mate_is_mapped = src.is_mapped
dest.set_tag("MC", src.cigarstring)
dest.set_tag("MQ", src.mapping_quality)

r1.set_tag("ms", sum_of_base_qualities(r2))
r2.set_tag("ms", sum_of_base_qualities(r1))

template_length = isize(r1, r2)
r1.template_length = template_length
r2.template_length = -template_length

proper_pair = is_proper_pair(r1, r2)
r1.is_proper_pair = proper_pair
r2.is_proper_pair = proper_pair


def set_as_pairs(
r1: AlignedSegment,
r2: AlignedSegment,
is_proper_pair: Callable[[AlignedSegment, AlignedSegment], bool] = is_proper_pair,
) -> None:
"""Forces the two reads to become pairs as long as they share the same query name.
This function will take two reads, as long as they have the same query name, and make them
pairs. The reads are marked as pairs and then first and second read flags are set. Finally, all
mate information is reset upon both reads.
This function is useful for taking once-single-end reads and making them pairs.
Args:
r1: read 1
r2: read 2 with the same queryname as r1
r1: Read 1 (first read in the template).
r2: Read 2 with the same query name as r1 (second read in the template).
is_proper_pair: A function that takes the two reads and determines proper pair status.
"""
if r1.query_name != r2.query_name:
raise ValueError("Cannot set pair info on reads with different query names!")
raise ValueError("Cannot pair reads with different query names!")

Check warning on line 876 in fgpyo/sam/__init__.py

View check run for this annotation

Codecov / codecov/patch

fgpyo/sam/__init__.py#L876

Added line #L876 was not covered by tests

for r in [r1, r2]:
r.is_paired = True
r.is_proper_pair = proper_pair

r1.is_read1 = True
r1.is_read2 = False
r2.is_read2 = True
r2.is_read1 = False

for src, dest in [(r1, r2), (r2, r1)]:
dest.next_reference_id = src.reference_id
dest.next_reference_start = src.reference_start
dest.mate_is_reverse = src.is_reverse
dest.mate_is_unmapped = src.mate_is_unmapped
dest.set_tag("MC", src.cigarstring)
dest.set_tag("MQ", src.mapping_quality)
set_mate_info(r1=r1, r2=r2, is_proper_pair=is_proper_pair)

Check warning on line 886 in fgpyo/sam/__init__.py

View check run for this annotation

Codecov / codecov/patch

fgpyo/sam/__init__.py#L886

Added line #L886 was not covered by tests

insert_size = isize(r1, r2)
r1.template_length = insert_size
r2.template_length = -insert_size

@deprecated("Use `set_mate_info()` instead. Deprecated as of fgpyo 0.8.0.")
def set_pair_info(r1: AlignedSegment, r2: AlignedSegment, proper_pair: bool = True) -> None:
"""Resets mate pair information between reads in a pair.
Requires that both r1 and r2 are mapped. Can be handed reads that already have pairing flags
setup or independent R1 and R2 records that are currently flagged as SE reads.
Args:
r1: Read 1 (first read in the template).
r2: Read 2 with the same query name as r1 (second read in the template).
proper_pair: whether the pair is proper or not.
"""
set_as_pairs(r1, r2, lambda r1, r2: proper_pair)

Check warning on line 901 in fgpyo/sam/__init__.py

View check run for this annotation

Codecov / codecov/patch

fgpyo/sam/__init__.py#L901

Added line #L901 was not covered by tests


@attr.s(frozen=True, auto_attribs=True)
Expand Down
78 changes: 11 additions & 67 deletions fgpyo/sam/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,70 +264,6 @@ def _set_length_dependent_fields(
if not rec.is_unmapped:
rec.cigarstring = cigar if cigar else f"{length}M"

def _set_mate_info(self, r1: pysam.AlignedSegment, r2: pysam.AlignedSegment) -> None:
"""Sets the mate information on a pair of sam records.
Handles cases where both reads are mapped, one of the two reads is unmapped or both reads
are unmapped.
Args:
r1: the first read in the pair
r2: the sceond read in the pair
"""
for rec in r1, r2:
rec.template_length = 0
rec.is_proper_pair = False

if r1.is_unmapped and r2.is_unmapped:
# If they're both unmapped just clean the records up
for rec, other in [(r1, r2), (r2, r1)]:
rec.reference_id = sam.NO_REF_INDEX
rec.next_reference_id = sam.NO_REF_INDEX
rec.reference_start = sam.NO_REF_POS
rec.next_reference_start = sam.NO_REF_POS
rec.is_unmapped = True
rec.mate_is_unmapped = True
rec.is_proper_pair = False
rec.mate_is_reverse = other.is_reverse

elif r1.is_unmapped or r2.is_unmapped:
# If only one is mapped/unmapped copy over the relevant stuff
(m, u) = (r1, r2) if r2.is_unmapped else (r2, r1)
u.reference_id = m.reference_id
u.reference_start = m.reference_start
u.next_reference_id = m.reference_id
u.next_reference_start = m.reference_start
u.mate_is_reverse = m.is_reverse
u.mate_is_unmapped = False
u.set_tag("MC", m.cigarstring)

m.next_reference_id = u.reference_id
m.next_reference_start = u.reference_start
m.mate_is_reverse = u.is_reverse
m.mate_is_unmapped = True

else:
# Else they are both mapped
for rec, other in [(r1, r2), (r2, r1)]:
rec.next_reference_id = other.reference_id
rec.next_reference_start = other.reference_start
rec.mate_is_reverse = other.is_reverse
rec.mate_is_unmapped = False
rec.set_tag("MC", other.cigarstring)

if r1.reference_id == r2.reference_id:
r1p = r1.reference_end if r1.is_reverse else r1.reference_start
r2p = r2.reference_end if r2.is_reverse else r2.reference_start
r1.template_length = r2p - r1p
r2.template_length = r1p - r2p

# Arbitrarily set proper pair if the we have an FR pair with isize <= 1000
if r1.is_reverse != r2.is_reverse and abs(r1.template_length) <= 1000:
fpos, rpos = (r2p, r1p) if r1.is_reverse else (r1p, r2p)
if fpos < rpos:
r1.is_proper_pair = True
r2.is_proper_pair = True

def rg(self) -> Dict[str, Any]:
"""Returns the single read group that is defined in the header."""
# The `RG` field contains a list of read group mappings
Expand Down Expand Up @@ -444,8 +380,16 @@ def add_pair(
raise ValueError("Cannot use chrom in combination with chrom1 or chrom2")

chrom = sam.NO_REF_NAME if chrom is None else chrom
chrom1 = next(c for c in (chrom1, chrom) if c is not None)
chrom2 = next(c for c in (chrom2, chrom) if c is not None)

if start1 != sam.NO_REF_POS:
chrom1 = next(c for c in (chrom1, chrom) if c is not None)
else:
chrom1 = sam.NO_REF_NAME

if start2 != sam.NO_REF_POS:
chrom2 = next(c for c in (chrom2, chrom) if c is not None)
else:
chrom2 = sam.NO_REF_NAME

if chrom1 == sam.NO_REF_NAME and start1 != sam.NO_REF_POS:
raise ValueError("start1 cannot be used on its own - specify chrom or chrom1")
Expand All @@ -468,7 +412,7 @@ def add_pair(
)

# Sync up mate info and we're done!
self._set_mate_info(r1, r2)
sam.set_mate_info(r1, r2)
self._records.append(r1)
self._records.append(r2)
return r1, r2
Expand Down
Loading

0 comments on commit 56d9692

Please sign in to comment.