Skip to content

Commit

Permalink
Add tabix indexing of gzipped VCFs in VariantBuilder (#214)
Browse files Browse the repository at this point in the history
  • Loading branch information
tfenne authored Jan 21, 2025
1 parent 951ec0d commit c5eb469
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 2 deletions.
13 changes: 11 additions & 2 deletions fgpyo/vcf/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from typing import Tuple
from typing import Union

import pysam
from pysam import VariantHeader
from pysam import VariantRecord

Expand Down Expand Up @@ -267,7 +268,12 @@ def add(
return variant

def to_path(self, path: Optional[Path] = None) -> Path:
"""Returns a path to a VCF for variants added to this builder.
"""
Returns a path to a VCF for variants added to this builder.
If the path given ends in ".gz" then the generated file will be bgzipped and
a tabix index generated for the file with the suffix ".gz.tbi".
Args:
path: optional path to the VCF
"""
Expand All @@ -279,6 +285,9 @@ def to_path(self, path: Optional[Path] = None) -> Path:
for variant in self.to_sorted_list():
writer.write(variant)

if str(path.suffix) == ".gz":
pysam.tabix_index(str(path), preset="vcf", force=True)

return path

@staticmethod
Expand All @@ -291,7 +300,7 @@ def _to_vcf_path(path: Optional[Path]) -> Path:
path: optionally the path to the VCF, or a directory to create a temporary VCF.
"""
if path is None:
with NamedTemporaryFile(suffix=".vcf", delete=False) as fp:
with NamedTemporaryFile(suffix=".vcf.gz", delete=False) as fp:
path = Path(fp.name)
assert path.is_file()
return path
Expand Down
20 changes: 20 additions & 0 deletions tests/fgpyo/vcf/test_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import pysam
import pytest
from pysam.libcbcf import VariantFile

from fgpyo.vcf import reader as vcf_reader
from fgpyo.vcf.builder import VariantBuilder
Expand Down Expand Up @@ -251,6 +252,25 @@ def test_zero_sample_vcf_round_trip(
_assert_equal(expected_value=builder_record, actual_value=vcf_record)


def test_indexing_gzipped_vcf(temp_path: Path) -> None:
vcf = temp_path / "test.vcf.gz"
builder = VariantBuilder()
_add_headers(builder)
builder.add(contig="chr1", pos=1000, ref="A", alts="G")
builder.add(contig="chr1", pos=2000, ref="A", alts="G")
builder.add(contig="chr1", pos=3000, ref="A", alts="G")
builder.add(contig="chr2", pos=1000, ref="A", alts="G")
builder.to_path(vcf)

reader: VariantFile
with vcf_reader(vcf) as reader:
assert len(list(reader.fetch(contig="chr1", start=900, end=1100))) == 1
assert len(list(reader.fetch(contig="chr1", start=900, end=2100))) == 2
assert len(list(reader.fetch(contig="chr1", start=900, end=3100))) == 3
assert len(list(reader.fetch(contig="chr2", start=900, end=1100))) == 1
assert len(list(reader.fetch(contig="chr2", start=5000, end=6000))) == 0


def _add_random_genotypes(
random_generator: random.Random,
record_input: Mapping[str, Any],
Expand Down

0 comments on commit c5eb469

Please sign in to comment.