Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Check against HN tag instead of is_unmapped property #69

Merged
merged 7 commits into from
Oct 17, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 24 additions & 13 deletions prymer/offtarget/bwa.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,8 +165,10 @@ def __str__(self) -> str:

@dataclass(init=True, frozen=True)
class BwaResult:
"""Represents zero or more hits or alignments found by BWA for the given query. The number of
hits found may be more than the number of hits listed in the `hits` attribute.
"""
Represents zero or more hits or alignments found by BWA for the given query.

The number of hits found may be more than the number of hits listed in the `hits` attribute.

Attributes:
query: the query (as given, no RC'ing)
Expand Down Expand Up @@ -359,7 +361,8 @@ def map_all(self, queries: list[Query]) -> list[BwaResult]:
return results

def _to_result(self, query: Query, rec: pysam.AlignedSegment) -> BwaResult:
"""Converts the query and alignment to a result.
"""
Convert the query and alignment to a result.

Args:
query: the original query
Expand All @@ -370,17 +373,20 @@ def _to_result(self, query: Query, rec: pysam.AlignedSegment) -> BwaResult:
"Query and Results are out of order" f"Query=${query.id}, Result=${rec.query_name}"
)

num_hits: Optional[int] = int(rec.get_tag("HN")) if rec.has_tag("HN") else None
if rec.is_unmapped:
if num_hits is not None and num_hits > 0:
raise ValueError(f"Read was unmapped but num_hits > 0: {rec}")
return BwaResult(query=query, hit_count=0, hits=[])
elif num_hits > self.max_hits:
return BwaResult(query=query, hit_count=num_hits, hits=[])
else:
num_hits: int = int(rec.get_tag("HN")) if rec.has_tag("HN") else 0
# `to_hits()` removes artifactual hits which span the boundary between concatenated
# reference sequences. If we are reporting a list of hits, the number of hits should match
# the size of this list. Otherwise, we either have zero hits, or more than the maximum
# number of hits. In both of the latter cases, we have to rely on the count reported in the
# `HN` tag.
hits: list[BwaHit]
if 0 < num_hits <= self.max_hits:
hits = self.to_hits(rec=rec)
hit_count = num_hits if len(hits) == 0 else len(hits)
return BwaResult(query=query, hit_count=hit_count, hits=hits)
num_hits = len(hits)
else:
hits = []

return BwaResult(query=query, hit_count=num_hits, hits=hits)

def to_hits(self, rec: AlignedSegment) -> list[BwaHit]:
"""Extracts the hits from the given alignment. Beyond the current alignment
Expand Down Expand Up @@ -425,4 +431,9 @@ def to_hits(self, rec: AlignedSegment) -> list[BwaHit]:
if not self.include_alt_hits:
hits = [hit for hit in hits if not hit.refname.endswith("_alt")]

# Remove hits that extend beyond the end of a contig - these are artifacts of `bwa aln`'s
# alignment process, which concatenates all reference sequences and reports hits which span
# across contigs.
hits = [hit for hit in hits if hit.end <= self.header.get_reference_length(hit.refname)]

return hits
18 changes: 13 additions & 5 deletions tests/offtarget/test_bwa.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,13 +262,21 @@ def test_to_result_out_of_order(ref_fasta: Path) -> None:
def test_to_result_num_hits_on_unmapped(ref_fasta: Path) -> None:
with BwaAlnInteractive(ref=ref_fasta, max_hits=1) as bwa:
query = Query(bases="GATTACA", id="foo")
# Exception: HN cannot be non-zero

# HN *can* be non-zero
rec = SamBuilder().add_single(name=query.id, attrs={"HN": 42})
with pytest.raises(ValueError, match="Read was unmapped but num_hits > 0"):
bwa._to_result(query=query, rec=rec)
result = bwa._to_result(query=query, rec=rec)
assert result.hit_count == 42
assert result.hits == []

# OK: HN tag is zero
rec = SamBuilder().add_single(name=query.id, attrs={"HN": 0})
bwa._to_result(query=query, rec=rec)
result = bwa._to_result(query=query, rec=rec)
assert result.hit_count == 0
assert result.hits == []

# OK: no HN tag
rec = SamBuilder().add_single(name=query.id)
bwa._to_result(query=query, rec=rec)
result = bwa._to_result(query=query, rec=rec)
assert result.hit_count == 0
assert result.hits == []
Loading