-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(compression): Add Fingerprint compression code
- Loading branch information
Showing
2 changed files
with
201 additions
and
126 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,158 +1,232 @@ | ||
/// Pack N least significant bits from each one value into a bitstream. | ||
pub fn pack<const N: usize>(values: &[u32]) -> Vec<u8> { | ||
let mut buf = vec![]; | ||
let mut writer = BitWriter::new(&mut buf); | ||
writer.buffer.reserve((values.len() * N + 7) / 8); | ||
for val in values { | ||
writer.write_bits::<N>(*val as u8); | ||
} | ||
writer.flush(); | ||
buf | ||
} | ||
use crate::Configuration; | ||
|
||
/// Unpack bitstream of N bit numbers into an array. | ||
pub fn unpack<const N: usize>(bytes: &[u8]) -> Vec<u32> { | ||
let mut buf = vec![]; | ||
let mut reader = BitReader::new(bytes); | ||
while let Some(bits) = reader.read_bits::<N>() { | ||
buf.push(bits as u32); | ||
} | ||
buf | ||
} | ||
/// Number of "normal" bits. | ||
const NORMAL_BITS: u8 = 3; | ||
/// Maximum "normal" value above which a value becomes "exceptional". | ||
const MAX_NORMAL_VALUE: u8 = (1 << NORMAL_BITS) - 1; | ||
|
||
/// Create a bitmask with `n` least significant bits set to `1`. | ||
const fn mask_n_bits(n: usize) -> usize { | ||
(1 << n) - 1 | ||
/// Turns an object (e.g. a `u32`) into an iterator over its bits. | ||
trait IntoBitIterator { | ||
/// Converts the item into an iterator over its bits. | ||
fn into_bit_iter(self) -> impl Iterator<Item = bool>; | ||
} | ||
|
||
struct BitWriter<'b> { | ||
buffer: &'b mut Vec<u8>, | ||
current_byte: u8, | ||
/// Number of bits written into `current_byte`. | ||
written_bits: usize, | ||
impl IntoBitIterator for u32 { | ||
fn into_bit_iter(self) -> impl Iterator<Item = bool> { | ||
(0..Self::BITS).map(move |index| ((self >> index) & 1) == 1) | ||
} | ||
} | ||
|
||
impl<'b> BitWriter<'b> { | ||
fn new(buffer: &'b mut Vec<u8>) -> Self { | ||
Self { | ||
buffer, | ||
current_byte: 0, | ||
written_bits: 0, | ||
} | ||
pub struct FingerprintCompressor<'a>(&'a Configuration); | ||
|
||
impl<'a> FingerprintCompressor<'a> { | ||
/// Compress a sub-fingerprint. | ||
fn compress_subfingerprint(subfingerprint: u32) -> impl Iterator<Item = (u8, Option<u8>)> { | ||
subfingerprint | ||
.into_bit_iter() | ||
.enumerate() | ||
.filter_map(|(bit_index, is_bit_set)| { | ||
is_bit_set.then_some(u8::try_from(bit_index).unwrap()) | ||
}) | ||
.scan(0, |last_bit_index, bit_index| { | ||
let value = bit_index - *last_bit_index; | ||
let result = if value >= MAX_NORMAL_VALUE { | ||
(MAX_NORMAL_VALUE, Some(value - MAX_NORMAL_VALUE)) | ||
} else { | ||
(value, None) | ||
}; | ||
|
||
*last_bit_index = bit_index; | ||
Some(result) | ||
}) | ||
.chain(std::iter::once((0, None))) | ||
} | ||
|
||
#[inline] | ||
fn write_bits<const BITS: usize>(&mut self, val: u8) { | ||
assert!(BITS <= 8); | ||
// Mask out bits we don't need. | ||
let val = val & mask_n_bits(BITS) as u8; | ||
if self.written_bits + BITS < 8 { | ||
// We have space for new bits in the current byte so just add them to it. | ||
self.current_byte <<= BITS; | ||
self.current_byte |= val; | ||
self.written_bits += BITS; | ||
} else if self.written_bits + BITS == 8 { | ||
// We have just enough space for new bits to make a single byte. | ||
self.current_byte <<= BITS; | ||
self.current_byte |= val; | ||
self.buffer.push(self.current_byte); | ||
self.current_byte = 0; | ||
self.written_bits = 0; | ||
} else { | ||
// We will overflow some bits... | ||
let overflowing_bits = (self.written_bits + BITS) - 8; | ||
// ... and create a new whole byte from previously saved bits and some of new bits. | ||
let fitting_bits = BITS - overflowing_bits; | ||
self.current_byte <<= fitting_bits; | ||
self.current_byte |= val >> overflowing_bits; | ||
self.buffer.push(self.current_byte); | ||
// Now we just save the remaining bits. | ||
self.current_byte = val & mask_n_bits(overflowing_bits) as u8; | ||
self.written_bits = overflowing_bits; | ||
} | ||
/// Compress the fingerprint. | ||
pub fn compress(&self, fingerprint: &[u32]) -> Vec<u8> { | ||
let size = fingerprint.len(); | ||
let (normal_bits, exceptional_bits) = fingerprint | ||
.iter() | ||
.scan(0, |last_subfp, current_subfp| { | ||
let value = current_subfp ^ *last_subfp; | ||
*last_subfp = *current_subfp; | ||
Some(value) | ||
}) | ||
.flat_map(Self::compress_subfingerprint) | ||
.fold( | ||
( | ||
Vec::<u8>::with_capacity(size), | ||
Vec::<u8>::with_capacity(size), | ||
), | ||
|(mut normal_bits, mut exceptional_bits), (normal_value, exceptional_value)| { | ||
normal_bits.push(normal_value); | ||
if let Some(exceptional_value) = exceptional_value { | ||
exceptional_bits.push(exceptional_value); | ||
} | ||
(normal_bits, exceptional_bits) | ||
}, | ||
); | ||
|
||
let header_size = 4; | ||
let normal_size = packed_intn_array_len(normal_bits.len(), 3); | ||
let exceptional_size = packed_intn_array_len(exceptional_bits.len(), 5); | ||
let expected_size = header_size + normal_size + exceptional_size; | ||
|
||
#[allow(clippy::cast_possible_truncation)] | ||
let output = [ | ||
self.0.id(), | ||
((size >> 16) & 0xFF) as u8, | ||
((size >> 8) & 0xFF) as u8, | ||
(size & 0xFF) as u8, | ||
]; | ||
|
||
let output = output | ||
.into_iter() | ||
.chain(iter_packed_intn_array::<3>(&normal_bits)) | ||
.chain(iter_packed_intn_array::<5>(&exceptional_bits)) | ||
.collect::<Vec<u8>>(); | ||
debug_assert_eq!(output.len(), expected_size); | ||
output | ||
} | ||
} | ||
|
||
fn flush(&mut self) { | ||
if self.written_bits != 0 { | ||
// Finish the current byte by adding some padding. | ||
self.buffer.push(self.current_byte << (8 - self.written_bits as u32)); | ||
self.written_bits = 0; | ||
self.current_byte = 0; | ||
} | ||
impl<'a> From<&'a Configuration> for FingerprintCompressor<'a> { | ||
fn from(value: &'a Configuration) -> Self { | ||
Self(value) | ||
} | ||
} | ||
|
||
struct BitReader<'b> { | ||
bytes: &'b [u8], | ||
current_byte: u8, | ||
remaining_bits: usize, | ||
/// Calculate the size in bytes of a packed Int<N> array.
///
/// `array_len` is the number of elements and `n` the number of bits used per
/// element. The total bit count is rounded up to whole bytes.
const fn packed_intn_array_len(array_len: usize, n: usize) -> usize {
    // `div_ceil` rounds up without the intermediate `+ 7`, which could overflow.
    (array_len * n).div_ceil(8)
}
|
||
impl<'b> BitReader<'b> { | ||
fn new(bytes: &'b [u8]) -> Self { | ||
Self { | ||
bytes, | ||
current_byte: 0, | ||
remaining_bits: 0, | ||
} | ||
} | ||
|
||
#[inline] | ||
fn read_bits<const BITS: usize>(&mut self) -> Option<u8> { | ||
assert!(BITS > 0 && BITS <= 8); | ||
|
||
if self.remaining_bits >= BITS { | ||
// Just read bits from the current byte. | ||
let bits = (self.current_byte >> (8 - BITS)) & (mask_n_bits(BITS) as u8); | ||
self.current_byte <<= BITS; | ||
self.remaining_bits -= BITS; | ||
Some(bits) | ||
} else { | ||
// Try read next byte. | ||
let [next_byte, rest @ ..] = self.bytes else { | ||
return None; | ||
}; | ||
self.bytes = rest; | ||
|
||
let bits_from_next_byte = BITS - self.remaining_bits; | ||
let remaining_bits_from_next_byte = 8 - bits_from_next_byte; | ||
let bits = (self.current_byte >> (8 - BITS)) | (next_byte >> remaining_bits_from_next_byte); | ||
self.current_byte = next_byte << bits_from_next_byte; | ||
self.remaining_bits = remaining_bits_from_next_byte; | ||
Some(bits) | ||
} | ||
} | ||
/// Iterate bytes as packed Int<N> array.
///
/// Only the `N` least significant bits of every input byte are kept. The input
/// is handled in groups of eight values; each group fills up to `N` output
/// bytes, with earlier values placed in the lower bits. A shorter final group
/// yields only the bytes it actually touches.
fn iter_packed_intn_array<const N: usize>(array: &[u8]) -> impl Iterator<Item = u8> + '_ {
    let mask: u8 = (0xFF << (8 - N)) >> (8 - N);
    array.chunks(8).flat_map(move |chunk| {
        let mut packed = [0u8; N];
        let mut used_bytes = 0;

        for (position, value) in chunk.iter().enumerate() {
            let bits = value & mask;
            let first_bit = position * N;
            let last_bit = first_bit + N - 1;
            let low_byte = first_bit / 8;
            let high_byte = last_bit / 8;
            let shift = first_bit % 8;

            // Bits shifted past the top of the low byte are dropped here ...
            packed[low_byte] |= bits << shift;
            if high_byte != low_byte {
                // ... and are carried into the bottom of the following byte.
                packed[high_byte] |= bits >> ((8 - shift) % 8);
            }

            used_bytes = high_byte + 1;
        }

        IntoIterator::into_iter(packed).take(used_bytes)
    })
}
|
||
#[cfg(test)] | ||
mod tests { | ||
use super::{mask_n_bits, pack, unpack}; | ||
use super::*; | ||
|
||
const ONE_BYTE: [u8; 1] = [0b1011_1010]; | ||
const NINE_BYTES: [u8; 9] = [ | ||
0b1010_1010, | ||
0b0011_0011, | ||
0b1100_1100, | ||
0b1100_0111, | ||
0b0101_0101, | ||
0b1100_1100, | ||
0b1010_1010, | ||
0b0000_0000, | ||
0b1111_1111, | ||
]; | ||
const SIXTYFOUR_BYTES: [u8; 64] = [ | ||
0xA2, 0x87, 0xE3, 0xED, 0xAA, 0xD7, 0xE8, 0x94, 0x53, 0x4E, 0x9B, 0xD5, 0x83, 0x12, 0x05, | ||
0x43, 0x67, 0x7E, 0x0A, 0xAF, 0x2D, 0x85, 0xB4, 0x03, 0xEB, 0x13, 0x8E, 0x47, 0x07, 0xA6, | ||
0x76, 0x5D, 0x43, 0x67, 0x8D, 0x9F, 0xEA, 0xAD, 0x3F, 0x34, 0x86, 0xF4, 0x25, 0xC8, 0xA2, | ||
0xBF, 0xF1, 0x22, 0xB5, 0xA6, 0xB8, 0x4A, 0xED, 0xA2, 0xF5, 0x25, 0xDB, 0x62, 0x70, 0xC2, | ||
0xB7, 0x9C, 0xB1, 0x3C, | ||
]; | ||
|
||
fn packing_n<const N: usize>() { | ||
let values: Vec<_> = (0..1024 * 1024).collect(); | ||
#[test] | ||
fn test_iter_packed_int3_array_single_byte() { | ||
const N: usize = 3; | ||
let packed = iter_packed_intn_array::<N>(&ONE_BYTE).collect::<Vec<u8>>(); | ||
assert_eq!(packed.len(), packed_intn_array_len(ONE_BYTE.len(), N)); | ||
assert_eq!(&packed, &[0b0000_0010]); | ||
} | ||
|
||
let packed = pack::<N>(&values); | ||
let unpacked = unpack::<N>(&packed); | ||
#[test] | ||
fn test_iter_packed_int3_array_some_bytes() { | ||
const N: usize = 3; | ||
let packed = iter_packed_intn_array::<N>(&NINE_BYTES).collect::<Vec<u8>>(); | ||
assert_eq!(packed.len(), packed_intn_array_len(NINE_BYTES.len(), N)); | ||
assert_eq!( | ||
&packed, | ||
&[0b0001_1010, 0b0101_1111, 0b0000_1010, 0b0000_0111] | ||
); | ||
} | ||
|
||
for (a, b) in values.iter().copied().zip(unpacked.iter().copied()) { | ||
assert_eq!(a & mask_n_bits(N) as u32, b); | ||
} | ||
#[test] | ||
fn test_iter_packed_int3_array_many_bytes() { | ||
const N: usize = 3; | ||
let packed = iter_packed_intn_array::<N>(&SIXTYFOUR_BYTES).collect::<Vec<u8>>(); | ||
assert_eq!( | ||
packed.len(), | ||
packed_intn_array_len(SIXTYFOUR_BYTES.len(), N) | ||
); | ||
assert_eq!( | ||
&packed, | ||
&[ | ||
0xFA, 0xAA, 0x83, 0xF3, 0x3A, 0x75, 0xB7, 0xDE, 0x72, 0x9B, 0x7F, 0xBB, 0x7B, 0xAF, | ||
0x9E, 0x66, 0xA1, 0x47, 0x35, 0x54, 0xB5, 0x13, 0x74, 0x86 | ||
], | ||
); | ||
} | ||
|
||
#[test] | ||
fn packing_3() { | ||
packing_n::<3>(); | ||
fn test_iter_packed_int5_array_many_bytes() { | ||
const N: usize = 5; | ||
let packed = iter_packed_intn_array::<N>(&SIXTYFOUR_BYTES).collect::<Vec<u8>>(); | ||
assert_eq!( | ||
packed.len(), | ||
packed_intn_array_len(SIXTYFOUR_BYTES.len(), N) | ||
); | ||
assert_eq!( | ||
&packed, | ||
&[ | ||
0xE2, 0x8C, 0xA6, 0x2E, 0xA2, 0xD3, 0xED, 0x3A, 0x64, 0x19, 0xC7, 0xAB, 0xD7, 0x0A, | ||
0x1D, 0x6B, 0xBA, 0x73, 0x8C, 0xED, 0xE3, 0xB4, 0xAF, 0xDA, 0xA7, 0x86, 0x16, 0x24, | ||
0x7E, 0x14, 0xD5, 0x60, 0xD5, 0x44, 0x2D, 0x5B, 0x40, 0x71, 0x79, 0xE4, | ||
], | ||
); | ||
} | ||
|
||
#[test] | ||
fn packing_5() { | ||
packing_n::<5>(); | ||
fn test_iter_packed_int5_array_single_byte() { | ||
const N: usize = 5; | ||
let packed = iter_packed_intn_array::<N>(&ONE_BYTE).collect::<Vec<u8>>(); | ||
assert_eq!(packed.len(), packed_intn_array_len(ONE_BYTE.len(), N)); | ||
assert_eq!(&packed, &[0b0001_1010]); | ||
} | ||
|
||
#[test] | ||
fn padding() { | ||
let vals = vec![0b11100000u8]; | ||
let unpacked = unpack::<3>(&vals); | ||
assert_eq!(unpacked, &[7, 0]); | ||
fn test_iter_packed_int5_array_some_bytes() { | ||
const N: usize = 5; | ||
let packed = iter_packed_intn_array::<N>(&NINE_BYTES).collect::<Vec<u8>>(); | ||
assert_eq!(packed.len(), packed_intn_array_len(NINE_BYTES.len(), N)); | ||
assert_eq!( | ||
&packed, | ||
&[ | ||
0b0110_1010, | ||
0b1011_0010, | ||
0b0101_0011, | ||
0b1001_1001, | ||
0b0000_0010, | ||
0b0001_1111 | ||
] | ||
); | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters