From 25201bd24fc4605ca8ad4127e7e7be1759ea4cd0 Mon Sep 17 00:00:00 2001 From: Ted Brookings Date: Tue, 3 Dec 2024 09:48:39 -0500 Subject: [PATCH] Add IntervalSet and replace use of cgranges --- poetry.lock | 92 ++++++++++++++++++++++++++++++++++- pybedlite/overlap_detector.py | 76 ++++++++++++++++------------- pyproject.toml | 1 + 3 files changed, 133 insertions(+), 36 deletions(-) diff --git a/poetry.lock b/poetry.lock index f88b3bd..7fb94dc 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.0 and should not be changed by hand. [[package]] name = "alabaster" @@ -295,6 +295,81 @@ tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.1 [package.extras] toml = ["tomli"] +[[package]] +name = "cython" +version = "3.0.11" +description = "The Cython compiler for writing C extensions in the Python language." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" +files = [ + {file = "Cython-3.0.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:44292aae17524abb4b70a25111fe7dec1a0ad718711d47e3786a211d5408fdaa"}, + {file = "Cython-3.0.11-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a75d45fbc20651c1b72e4111149fed3b33d270b0a4fb78328c54d965f28d55e1"}, + {file = "Cython-3.0.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d89a82937ce4037f092e9848a7bbcc65bc8e9fc9aef2bb74f5c15e7d21a73080"}, + {file = "Cython-3.0.11-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2a8ea2e7e2d3bc0d8630dafe6c4a5a89485598ff8a61885b74f8ed882597efd5"}, + {file = "Cython-3.0.11-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cee29846471ce60226b18e931d8c1c66a158db94853e3e79bc2da9bd22345008"}, + {file = "Cython-3.0.11-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:eeb6860b0f4bfa402de8929833fe5370fa34069c7ebacb2d543cb017f21fb891"}, + {file = "Cython-3.0.11-cp310-cp310-win32.whl", hash = "sha256:3699391125ab344d8d25438074d1097d9ba0fb674d0320599316cfe7cf5f002a"}, + {file = "Cython-3.0.11-cp310-cp310-win_amd64.whl", hash = "sha256:d02f4ebe15aac7cdacce1a628e556c1983f26d140fd2e0ac5e0a090e605a2d38"}, + {file = "Cython-3.0.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:75ba1c70b6deeaffbac123856b8d35f253da13552207aa969078611c197377e4"}, + {file = "Cython-3.0.11-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:af91497dc098718e634d6ec8f91b182aea6bb3690f333fc9a7777bc70abe8810"}, + {file = "Cython-3.0.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3999fb52d3328a6a5e8c63122b0a8bd110dfcdb98dda585a3def1426b991cba7"}, + {file = "Cython-3.0.11-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d566a4e09b8979be8ab9f843bac0dd216c81f5e5f45661a9b25cd162ed80508c"}, + {file = "Cython-3.0.11-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:46aec30f217bdf096175a1a639203d44ac73a36fe7fa3dd06bd012e8f39eca0f"}, + {file = "Cython-3.0.11-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ddd1fe25af330f4e003421636746a546474e4ccd8f239f55d2898d80983d20ed"}, + {file = "Cython-3.0.11-cp311-cp311-win32.whl", hash = "sha256:221de0b48bf387f209003508e602ce839a80463522fc6f583ad3c8d5c890d2c1"}, + {file = "Cython-3.0.11-cp311-cp311-win_amd64.whl", hash = "sha256:3ff8ac1f0ecd4f505db4ab051e58e4531f5d098b6ac03b91c3b902e8d10c67b3"}, + {file = "Cython-3.0.11-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:11996c40c32abf843ba652a6d53cb15944c88d91f91fc4e6f0028f5df8a8f8a1"}, + {file = "Cython-3.0.11-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:63f2c892e9f9c1698ecfee78205541623eb31cd3a1b682668be7ac12de94aa8e"}, + {file = "Cython-3.0.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8b14c24f1dc4c4c9d997cca8d1b7fb01187a218aab932328247dcf5694a10102"}, + {file = "Cython-3.0.11-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c8eed5c015685106db15dd103fd040948ddca9197b1dd02222711815ea782a27"}, + {file = "Cython-3.0.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:780f89c95b8aec1e403005b3bf2f0a2afa060b3eba168c86830f079339adad89"}, + {file = "Cython-3.0.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a690f2ff460682ea985e8d38ec541be97e0977fa0544aadc21efc116ff8d7579"}, + {file = "Cython-3.0.11-cp312-cp312-win32.whl", hash = "sha256:2252b5aa57621848e310fe7fa6f7dce5f73aa452884a183d201a8bcebfa05a00"}, + {file = "Cython-3.0.11-cp312-cp312-win_amd64.whl", hash = "sha256:da394654c6da15c1d37f0b7ec5afd325c69a15ceafee2afba14b67a5df8a82c8"}, + {file = "Cython-3.0.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4341d6a64d47112884e0bcf31e6c075268220ee4cd02223047182d4dda94d637"}, + {file = "Cython-3.0.11-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:351955559b37e6c98b48aecb178894c311be9d731b297782f2b78d111f0c9015"}, + {file = "Cython-3.0.11-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c02361af9bfa10ff1ccf967fc75159e56b1c8093caf565739ed77a559c1f29f"}, + {file = "Cython-3.0.11-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6823aef13669a32caf18bbb036de56065c485d9f558551a9b55061acf9c4c27f"}, + {file = "Cython-3.0.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6fb68cef33684f8cc97987bee6ae919eee7e18ee6a3ad7ed9516b8386ef95ae6"}, + {file = "Cython-3.0.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:790263b74432cb997740d73665f4d8d00b9cd1cecbdd981d93591ddf993d4f12"}, + {file = "Cython-3.0.11-cp313-cp313-win32.whl", hash = "sha256:e6dd395d1a704e34a9fac00b25f0036dce6654c6b898be6f872ac2bb4f2eda48"}, + {file = "Cython-3.0.11-cp313-cp313-win_amd64.whl", hash = "sha256:52186101d51497519e99b60d955fd5cb3bf747c67f00d742e70ab913f1e42d31"}, + {file = "Cython-3.0.11-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:c69d5cad51388522b98a99b4be1b77316de85b0c0523fa865e0ea58bbb622e0a"}, + {file = "Cython-3.0.11-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8acdc87e9009110adbceb7569765eb0980129055cc954c62f99fe9f094c9505e"}, + {file = "Cython-3.0.11-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1dd47865f4c0a224da73acf83d113f93488d17624e2457dce1753acdfb1cc40c"}, + {file = "Cython-3.0.11-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:301bde949b4f312a1c70e214b0c3bc51a3f955d466010d2f68eb042df36447b0"}, + {file = "Cython-3.0.11-cp36-cp36m-musllinux_1_2_aarch64.whl", hash = "sha256:f3953d2f504176f929862e5579cfc421860c33e9707f585d70d24e1096accdf7"}, + {file = "Cython-3.0.11-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:3f2b062f6df67e8a56c75e500ca330cf62c85ac26dd7fd006f07ef0f83aebfa3"}, + {file = "Cython-3.0.11-cp36-cp36m-win32.whl", hash = "sha256:c3d68751668c66c7a140b6023dba5d5d507f72063407bb609d3a5b0f3b8dfbe4"}, + {file = "Cython-3.0.11-cp36-cp36m-win_amd64.whl", hash = "sha256:bcd29945fafd12484cf37b1d84f12f0e7a33ba3eac5836531c6bd5283a6b3a0c"}, + {file = "Cython-3.0.11-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:4e9a8d92978b15a0c7ca7f98447c6c578dc8923a0941d9d172d0b077cb69c576"}, + {file = "Cython-3.0.11-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:421017466e9260aca86823974e26e158e6358622f27c0f4da9c682f3b6d2e624"}, + {file = "Cython-3.0.11-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d80a7232938d523c1a12f6b1794ab5efb1ae77ad3fde79de4bb558d8ab261619"}, + {file = "Cython-3.0.11-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bfa550d9ae39e827a6e7198076df763571cb53397084974a6948af558355e028"}, + {file = "Cython-3.0.11-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:aedceb6090a60854b31bf9571dc55f642a3fa5b91f11b62bcef167c52cac93d8"}, + {file = "Cython-3.0.11-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:473d35681d9f93ce380e6a7c8feb2d65fc6333bd7117fbc62989e404e241dbb0"}, + {file = "Cython-3.0.11-cp37-cp37m-win32.whl", hash = "sha256:3379c6521e25aa6cd7703bb7d635eaca75c0f9c7f1b0fdd6dd15a03bfac5f68d"}, + {file = "Cython-3.0.11-cp37-cp37m-win_amd64.whl", hash = "sha256:14701edb3107a5d9305a82d9d646c4f28bfecbba74b26cc1ee2f4be08f602057"}, + {file = "Cython-3.0.11-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:598699165cfa7c6d69513ee1bffc9e1fdd63b00b624409174c388538aa217975"}, + {file = "Cython-3.0.11-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0583076c4152b417a3a8a5d81ec02f58c09b67d3f22d5857e64c8734ceada8c"}, + {file = "Cython-3.0.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52205347e916dd65d2400b977df4c697390c3aae0e96275a438cc4ae85dadc08"}, + {file = "Cython-3.0.11-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:989899a85f0d9a57cebb508bd1f194cb52f0e3f7e22ac259f33d148d6422375c"}, + {file = "Cython-3.0.11-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:53b6072a89049a991d07f42060f65398448365c59c9cb515c5925b9bdc9d71f8"}, + {file = "Cython-3.0.11-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:f988f7f8164a6079c705c39e2d75dbe9967e3dacafe041420d9af7b9ee424162"}, + {file = "Cython-3.0.11-cp38-cp38-win32.whl", hash = "sha256:a1f4cbc70f6b7f0c939522118820e708e0d490edca42d852fa8004ec16780be2"}, + {file = "Cython-3.0.11-cp38-cp38-win_amd64.whl", hash = "sha256:187685e25e037320cae513b8cc4bf9dbc4465c037051aede509cbbf207524de2"}, + {file = "Cython-3.0.11-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0fc6fdd6fa493be7bdda22355689d5446ac944cd71286f6f44a14b0d67ee3ff5"}, + {file = "Cython-3.0.11-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b1d1f6f94cc5d42a4591f6d60d616786b9cd15576b112bc92a23131fcf38020"}, + {file = "Cython-3.0.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4ab2b92a3e6ed552adbe9350fd2ef3aa0cc7853cf91569f9dbed0c0699bbeab"}, + {file = "Cython-3.0.11-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:104d6f2f2c827ccc5e9e42c80ef6773a6aa94752fe6bc5b24a4eab4306fb7f07"}, + {file = "Cython-3.0.11-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:13062ce556a1e98d2821f7a0253b50569fdc98c36efd6653a65b21e3f8bbbf5f"}, + {file = "Cython-3.0.11-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:525d09b3405534763fa73bd78c8e51ac8264036ce4c16d37dfd1555a7da6d3a7"}, + {file = "Cython-3.0.11-cp39-cp39-win32.whl", hash = "sha256:b8c7e514075696ca0f60c337f9e416e61d7ccbc1aa879a56c39181ed90ec3059"}, + {file = "Cython-3.0.11-cp39-cp39-win_amd64.whl", hash = "sha256:8948802e1f5677a673ea5d22a1e7e273ca5f83e7a452786ca286eebf97cee67c"}, + {file = "Cython-3.0.11-py2.py3-none-any.whl", hash = "sha256:0e25f6425ad4a700d7f77cd468da9161e63658837d1bc34861a9861a4ef6346d"}, + {file = "cython-3.0.11.tar.gz", hash = "sha256:7146dd2af8682b4ca61331851e6aebce9fe5158e75300343f80c07ca80b1faff"}, +] + [[package]] name = "docutils" version = "0.20.1" @@ -854,6 +929,19 @@ files = [ lint = ["docutils-stubs", "flake8", "mypy"] test = ["pytest"] +[[package]] +name = "superintervals" +version = "0.2.2" +description = "Rapid interval intersections" +optional = false +python-versions = "*" +files = [ + {file = "superintervals-0.2.2.tar.gz", hash = "sha256:0810b16eb810171e8869acd364f5e9e8b931f9922eb413de377b13015f9b69b1"}, +] + +[package.dependencies] +Cython = "*" + [[package]] name = "tomli" version = "2.0.1" @@ -914,4 +1002,4 @@ docs = ["sphinx"] [metadata] lock-version = "2.0" python-versions = "^3.8.0" -content-hash = "a087243f9a79748bb28ecff439c2e124c113a6f1d4fc87bc96b75b9c0c62bf7b" +content-hash = "8377a04c2b14e9d0307bc5b3d9e49bc63ff7c151c7e62fbfad67e96096d970ff" diff --git a/pybedlite/overlap_detector.py b/pybedlite/overlap_detector.py index 66a50f7..2bedb58 100644 --- a/pybedlite/overlap_detector.py +++ b/pybedlite/overlap_detector.py @@ -67,14 +67,13 @@ from typing import List from typing import Optional from typing import Protocol -from typing import Set from typing import Type from typing import TypeVar from typing import Union import attr +from superintervals import IntervalSet -import cgranges as cr from pybedlite.bed_record import BedRecord from pybedlite.bed_record import BedStrand from pybedlite.bed_source import BedSource @@ -269,7 +268,7 @@ class OverlapDetector(Generic[SpanType], Iterable[SpanType]): def __init__(self, intervals: Optional[Iterable[SpanType]] = None) -> None: # A mapping from the contig/chromosome name to the associated interval tree - self._refname_to_tree: Dict[str, cr.cgranges] = {} # type: ignore + self._refname_to_tree: Dict[str, IntervalSet] = {} self._refname_to_indexed: Dict[str, bool] = {} self._refname_to_intervals: Dict[str, List[SpanType]] = {} if intervals is not None: @@ -286,7 +285,7 @@ def add(self, interval: SpanType) -> None: interval: the interval to add to this detector """ if interval.refname not in self._refname_to_tree: - self._refname_to_tree[interval.refname] = cr.cgranges() # type: ignore + self._refname_to_tree[interval.refname] = IntervalSet() self._refname_to_indexed[interval.refname] = False self._refname_to_intervals[interval.refname] = [] @@ -295,9 +294,10 @@ def add(self, interval: SpanType) -> None: interval_idx: int = len(self._refname_to_intervals[interval.refname]) self._refname_to_intervals[interval.refname].append(interval) - # Add the interval to the tree + # Add the interval to the tree. Note that IntervalSet uses closed intervals whereas we are + # using half-open intervals, so add 1 to start tree = self._refname_to_tree[interval.refname] - tree.add(interval.refname, interval.start, interval.end, interval_idx) + tree.add(interval.start + 1, interval.end, interval_idx) # Flag this tree as needing to be indexed after adding a new interval, but defer # indexing @@ -322,18 +322,38 @@ def overlaps_any(self, interval: Span) -> bool: True if and only if the given interval overlaps with any interval in this detector. """ - tree = self._refname_to_tree.get(interval.refname) + tree = self._refname_to_tree.get(interval.refname, None) if tree is None: return False else: if not self._refname_to_indexed[interval.refname]: tree.index() - try: - next(iter(tree.overlap(interval.refname, interval.start, interval.end))) - except StopIteration: - return False - else: - return True + self._refname_to_indexed[interval.refname] = True + # IntervalSet uses closed intervals whereas we are using half-open intervals, so add 1 + # to start + return tree.any_overlaps(interval.start + 1, interval.end) + + def iter_overlaps(self, interval: Span) -> Iterator[SpanType]: + """Yields any intervals in this detector that overlap the given interval + + Args: + interval: the interval to check + + Yields: + Intervals in this detector that overlap the given interval, in insertion order. + """ + tree = self._refname_to_tree.get(interval.refname, None) + if tree is not None: + if not self._refname_to_indexed[interval.refname]: + tree.index() + self._refname_to_indexed[interval.refname] = True + ref_intervals: List[SpanType] = self._refname_to_intervals[interval.refname] + # IntervalSet uses closed intervals whereas we are using half-open intervals, so add 1 + # to start. + # Also IntervalSet yields indices in reverse insertion order, so yield intervals in + # reverse of indices list. + for index in reversed(tree.find_overlaps(interval.start + 1, interval.end)): + yield ref_intervals[index] def get_overlaps(self, interval: Span) -> List[SpanType]: """Returns any intervals in this detector that overlap the given interval. @@ -351,27 +371,15 @@ def get_overlaps(self, interval: Span) -> List[SpanType]: * The interval's strand, positive or negative (assumed to be positive if undefined) * The interval's reference sequence name (lexicographically) """ - tree = self._refname_to_tree.get(interval.refname) - if tree is None: - return [] - else: - if not self._refname_to_indexed[interval.refname]: - tree.index() - ref_intervals: List[SpanType] = self._refname_to_intervals[interval.refname] - # NB: only return unique instances of intervals - intervals: Set[SpanType] = { - ref_intervals[index] - for _, _, index in tree.overlap(interval.refname, interval.start, interval.end) - } - return sorted( - intervals, - key=lambda intv: ( - intv.start, - intv.end, - self._negative(intv), - intv.refname, - ), - ) + return sorted( + set(self.iter_overlaps(interval)), + key=lambda intv: ( + intv.start, + intv.end, + self._negative(intv), + intv.refname, + ), + ) @staticmethod def _negative(interval: Span) -> bool: diff --git a/pyproject.toml b/pyproject.toml index 06402b2..5f8e326 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ packages = [{ include = "pybedlite" }, { include = "cgranges" }] python = "^3.8.0" attrs = "^23.0.0" sphinx = { version = "^7.0.0", optional = true } +superintervals = "0.2.2" [tool.poetry.dev-dependencies] pytest = "^7.0.0"