Skip to content

Commit

Permalink
Add IntervalSet and replace use of cgranges
Browse files Browse the repository at this point in the history
  • Loading branch information
TedBrookings committed Dec 3, 2024
1 parent 8ebed61 commit 25201bd
Show file tree
Hide file tree
Showing 3 changed files with 133 additions and 36 deletions.
92 changes: 90 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

76 changes: 42 additions & 34 deletions pybedlite/overlap_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,13 @@
from typing import List
from typing import Optional
from typing import Protocol
from typing import Set
from typing import Type
from typing import TypeVar
from typing import Union

import attr
from superintervals import IntervalSet

import cgranges as cr
from pybedlite.bed_record import BedRecord
from pybedlite.bed_record import BedStrand
from pybedlite.bed_source import BedSource
Expand Down Expand Up @@ -269,7 +268,7 @@ class OverlapDetector(Generic[SpanType], Iterable[SpanType]):

def __init__(self, intervals: Optional[Iterable[SpanType]] = None) -> None:
# A mapping from the contig/chromosome name to the associated interval tree
self._refname_to_tree: Dict[str, cr.cgranges] = {} # type: ignore
self._refname_to_tree: Dict[str, IntervalSet] = {}
self._refname_to_indexed: Dict[str, bool] = {}
self._refname_to_intervals: Dict[str, List[SpanType]] = {}
if intervals is not None:
Expand All @@ -286,7 +285,7 @@ def add(self, interval: SpanType) -> None:
interval: the interval to add to this detector
"""
if interval.refname not in self._refname_to_tree:
self._refname_to_tree[interval.refname] = cr.cgranges() # type: ignore
self._refname_to_tree[interval.refname] = IntervalSet()
self._refname_to_indexed[interval.refname] = False
self._refname_to_intervals[interval.refname] = []

Expand All @@ -295,9 +294,10 @@ def add(self, interval: SpanType) -> None:
interval_idx: int = len(self._refname_to_intervals[interval.refname])
self._refname_to_intervals[interval.refname].append(interval)

# Add the interval to the tree
# Add the interval to the tree. Note that IntervalSet uses closed intervals whereas we are
# using half-open intervals, so add 1 to start
tree = self._refname_to_tree[interval.refname]
tree.add(interval.refname, interval.start, interval.end, interval_idx)
tree.add(interval.start + 1, interval.end, interval_idx)

# Flag this tree as needing to be indexed after adding a new interval, but defer
# indexing
Expand All @@ -322,18 +322,38 @@ def overlaps_any(self, interval: Span) -> bool:
True if and only if the given interval overlaps with any interval in this
detector.
"""
tree = self._refname_to_tree.get(interval.refname)
tree = self._refname_to_tree.get(interval.refname, None)
if tree is None:
return False
else:
if not self._refname_to_indexed[interval.refname]:
tree.index()
try:
next(iter(tree.overlap(interval.refname, interval.start, interval.end)))
except StopIteration:
return False
else:
return True
self._refname_to_indexed[interval.refname] = True
# IntervalSet uses closed intervals whereas we are using half-open intervals, so add 1
# to start
return tree.any_overlaps(interval.start + 1, interval.end)

def iter_overlaps(self, interval: Span) -> Iterator[SpanType]:
    """Lazily yield the intervals held by this detector that overlap a query interval.

    Args:
        interval: the query interval to check against this detector

    Yields:
        Intervals in this detector that overlap the given interval, in insertion order.
    """
    refname = interval.refname
    tree = self._refname_to_tree.get(refname)
    if tree is None:
        # No tree is registered under this reference name, so there is nothing to yield.
        return

    # Indexing is deferred when intervals are added; build the index on first query and
    # record that it has been done so later queries skip this step.
    if not self._refname_to_indexed[refname]:
        tree.index()
        self._refname_to_indexed[refname] = True

    stored: List[SpanType] = self._refname_to_intervals[refname]

    # IntervalSet works with closed intervals while this detector uses half-open
    # intervals, hence the +1 applied to the query start.
    hit_indices = tree.find_overlaps(interval.start + 1, interval.end)

    # IntervalSet reports indices in reverse insertion order; walk them backwards so the
    # caller sees intervals in insertion order.
    yield from (stored[idx] for idx in reversed(hit_indices))

def get_overlaps(self, interval: Span) -> List[SpanType]:
"""Returns any intervals in this detector that overlap the given interval.
Expand All @@ -351,27 +371,15 @@ def get_overlaps(self, interval: Span) -> List[SpanType]:
* The interval's strand, positive or negative (assumed to be positive if undefined)
* The interval's reference sequence name (lexicographically)
"""
tree = self._refname_to_tree.get(interval.refname)
if tree is None:
return []
else:
if not self._refname_to_indexed[interval.refname]:
tree.index()
ref_intervals: List[SpanType] = self._refname_to_intervals[interval.refname]
# NB: only return unique instances of intervals
intervals: Set[SpanType] = {
ref_intervals[index]
for _, _, index in tree.overlap(interval.refname, interval.start, interval.end)
}
return sorted(
intervals,
key=lambda intv: (
intv.start,
intv.end,
self._negative(intv),
intv.refname,
),
)
return sorted(
set(self.iter_overlaps(interval)),
key=lambda intv: (
intv.start,
intv.end,
self._negative(intv),
intv.refname,
),
)

@staticmethod
def _negative(interval: Span) -> bool:
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ packages = [{ include = "pybedlite" }, { include = "cgranges" }]
python = "^3.8.0"
attrs = "^23.0.0"
sphinx = { version = "^7.0.0", optional = true }
superintervals = "0.2.2"

[tool.poetry.dev-dependencies]
pytest = "^7.0.0"
Expand Down

0 comments on commit 25201bd

Please sign in to comment.