Skip to content

Commit

Permalink
fixed bug with 1-bit encoded genotypes
Browse files Browse the repository at this point in the history
  • Loading branch information
jeremymcrae committed Jan 10, 2024
1 parent 337cbf4 commit 3306147
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 3 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def build_zstd():
description='Package for loading data from bgen files',
long_description=io.open('README.md', encoding='utf-8').read(),
long_description_content_type='text/markdown',
version='1.6.0',
version='1.6.1',
author='Jeremy McRae',
author_email='[email protected]',
license="MIT",
Expand Down
4 changes: 2 additions & 2 deletions src/writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,7 @@ std::uint32_t encode_unphased(std::vector<std::uint8_t> &encoded,
bit_idx += bit_depth;
}
}
return genotype_offset + (bit_idx / 8);
return genotype_offset + (bit_idx / 8) + (std::uint32_t)((bit_idx % 8) > 0);
}

std::uint32_t encode_phased(std::vector<std::uint8_t> &encoded,
Expand Down Expand Up @@ -383,7 +383,7 @@ std::uint32_t encode_phased(std::vector<std::uint8_t> &encoded,
i += (max_probs * max_ploidy) - (n_probs * _ploid);
sample_idx += 1;
}
return genotype_offset + (bit_idx / 8);
return genotype_offset + (bit_idx / 8) + (std::uint32_t)((bit_idx % 8) > 0);
}

std::vector<std::uint8_t> encode_layout2(
Expand Down
45 changes: 45 additions & 0 deletions tests/test_bgenwriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,3 +398,48 @@ def test_multiple_read_writes(self):
as_integers = (probs * max_val).round()
self.assertTrue(probs_close(geno, probs, bit_depth=bit_depth))
self.assertTrue((integer_values == as_integers).all())

def test_all_possible_genotypes(self):
''' check all possible values in the range available to the bit depths
'''
max_samples = 10000000
for bit_depth in range(1, 24):
print(bit_depth)
first_path = self.tmpdir / f'temp_{bit_depth}.v1.bgen'
second_path = self.tmpdir / f'temp2_{bit_depth}.v2.bgen'

max_val = (2 ** bit_depth) - 1
increment = 1
if (max_val / 2) > max_samples:
increment = ((max_val + 1) / 2) / max_samples

integer_values = np.arange(0, max_val + 1, increment)

remainder = max_val - (integer_values[::2] + integer_values[::-2])

integer_values = np.array([integer_values[::2], integer_values[::-2],
remainder], dtype=np.uint32)
integer_values = np.ascontiguousarray(integer_values.T)
geno = integer_values / max_val

# write first round
with BgenWriter(first_path, n_samples=len(geno)) as bfile:
bfile.add_variant('var1', 'rs1', 'chr1', 10, ['A', 'C'], geno,
bit_depth=bit_depth)

# check the first write
with BgenWriter(second_path, n_samples=len(geno)) as out_bfile:
with BgenReader(first_path, delay_parsing=True) as bfile:
probs = next(bfile).probabilities
as_integers = (probs * max_val).round()
self.assertTrue(probs_close(geno, probs, bit_depth=bit_depth))
self.assertTrue((integer_values == as_integers).all())
out_bfile.add_variant(
'var1', 'rs1', 'chr1', 10, ['A', 'C'], probs, bit_depth=bit_depth)

# check the re-written bgen
with BgenReader(second_path, delay_parsing=True) as bfile:
probs = next(bfile).probabilities
as_integers = (probs * max_val).round()
self.assertTrue(probs_close(geno, probs, bit_depth=bit_depth))
self.assertTrue((integer_values == as_integers).all())

0 comments on commit 3306147

Please sign in to comment.