Skip to content

Commit

Permalink
Filter out overlapping variants
Browse files Browse the repository at this point in the history
In the initial implementation, variants overlapping a variant of interest were included in the output.
This update fixes this bug, using post-processing to filter tabix output by rsID.
  • Loading branch information
standage committed May 21, 2019
1 parent 4ffe129 commit f01287e
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 1 deletion.
12 changes: 11 additions & 1 deletion rsidx/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,16 @@ def trim_rsid(rsid):
return rsidstr


def filter_by_rsid(instream, rsidlist, header=False):
for line in instream:
if line.startswith('#'):
yield line
continue
chrom, coord, rsid, *values = line.split('\t')
if rsid[2:] in rsidlist:
yield line


def search(rsidlist, dbconn, vcffile, header=False):
c = dbconn.cursor()
rsids = ', '.join(map(trim_rsid, rsidlist))
Expand All @@ -39,7 +49,7 @@ def fmt(row):
tabixcmd.append('-h')
tabixcmd.extend(coords)
proc = Popen(tabixcmd, stdout=PIPE, universal_newlines=True)
for line in proc.stdout:
for line in filter_by_rsid(proc.stdout, rsids, header=header):
yield line


Expand Down
Binary file added rsidx/tests/data/overlap.sqlite3
Binary file not shown.
Binary file added rsidx/tests/data/overlap.vcf.gz
Binary file not shown.
Binary file added rsidx/tests/data/overlap.vcf.gz.tbi
Binary file not shown.
15 changes: 15 additions & 0 deletions rsidx/tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,18 @@ def test_search_stdout(capsys):
terminal = capsys.readouterr()
outlines = terminal.out.strip().split('\n')
assert len(outlines) == 5


@pytest.mark.parametrize('doheader,numlines', [
(False, 1),
(True, 57 + 1),
])
def test_search_overlapping_variants(doheader, numlines):
rsidlist = ['rs8051733']
vcffile = data_file('overlap.vcf.gz')
idxfile = data_file('overlap.sqlite3')
conn = sqlite3.connect(idxfile)
outlines = list(rsidx.search.search(rsidlist, conn, vcffile, doheader))
assert len(outlines) == numlines
assert '\trs8051733\t' in outlines[-1]
assert '\rs967556605\t' not in outlines[-1]

0 comments on commit f01287e

Please sign in to comment.