From 24122f4721b7a3caec055418e801b60075d0cea5 Mon Sep 17 00:00:00 2001 From: tfenne Date: Fri, 17 Jan 2025 13:05:21 -0700 Subject: [PATCH] Add tabix indexing of gzipped VCFs in VcfBuilder --- fgpyo/vcf/builder.py | 6 +++++- tests/fgpyo/vcf/test_builder.py | 20 ++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/fgpyo/vcf/builder.py b/fgpyo/vcf/builder.py index 3acad0a3..1b81560c 100644 --- a/fgpyo/vcf/builder.py +++ b/fgpyo/vcf/builder.py @@ -14,6 +14,7 @@ from typing import Tuple from typing import Union +import pysam from pysam import VariantHeader from pysam import VariantRecord @@ -279,6 +280,9 @@ def to_path(self, path: Optional[Path] = None) -> Path: for variant in self.to_sorted_list(): writer.write(variant) + if str(path.suffix) == ".gz": + pysam.tabix_index(str(path), preset="vcf") + return path @staticmethod @@ -291,7 +295,7 @@ def _to_vcf_path(path: Optional[Path]) -> Path: path: optionally the path to the VCF, or a directory to create a temporary VCF. """ if path is None: - with NamedTemporaryFile(suffix=".vcf", delete=False) as fp: + with NamedTemporaryFile(suffix=".vcf.gz", delete=False) as fp: path = Path(fp.name) assert path.is_file() return path diff --git a/tests/fgpyo/vcf/test_builder.py b/tests/fgpyo/vcf/test_builder.py index 27597f66..9ba4e27d 100644 --- a/tests/fgpyo/vcf/test_builder.py +++ b/tests/fgpyo/vcf/test_builder.py @@ -11,6 +11,7 @@ import pysam import pytest +from pysam.libcbcf import VariantFile from fgpyo.vcf import reader as vcf_reader from fgpyo.vcf.builder import VariantBuilder @@ -251,6 +252,25 @@ def test_zero_sample_vcf_round_trip( _assert_equal(expected_value=builder_record, actual_value=vcf_record) +def test_indexing_gzipped_vcf(temp_path: Path) -> None: + vcf = temp_path / "test.vcf.gz" + builder = VariantBuilder() + _add_headers(builder) + builder.add(contig="chr1", pos=1000, ref="A", alts="G") + builder.add(contig="chr1", pos=2000, ref="A", alts="G") + builder.add(contig="chr1", pos=3000, ref="A", alts="G") + builder.add(contig="chr2", pos=1000, ref="A", alts="G") + builder.to_path(vcf) + + reader: VariantFile + with vcf_reader(vcf) as reader: + assert len(list(reader.fetch(contig="chr1", start=900, end=1100))) == 1 + assert len(list(reader.fetch(contig="chr1", start=900, end=2100))) == 2 + assert len(list(reader.fetch(contig="chr1", start=900, end=3100))) == 3 + assert len(list(reader.fetch(contig="chr2", start=900, end=1100))) == 1 + assert len(list(reader.fetch(contig="chr2", start=5000, end=6000))) == 0 + + def _add_random_genotypes( random_generator: random.Random, record_input: Mapping[str, Any],