feat(compression): Add Fingerprint compression code

darksv · Oct 4, 2024 · 17f09d7 · 17f09d7
1 parent a7c4e5e
commit 17f09d7
Show file tree

Hide file tree

Showing 2 changed files with 201 additions and 126 deletions.
diff --git a/chromaprint/src/compression.rs b/chromaprint/src/compression.rs
@@ -1,158 +1,232 @@
-/// Pack N least significant bits from each one value into a bitstream.
-pub fn pack<const N: usize>(values: &[u32]) -> Vec<u8> {
-    let mut buf = vec![];
-    let mut writer = BitWriter::new(&mut buf);
-    writer.buffer.reserve((values.len() * N + 7) / 8);
-    for val in values {
-        writer.write_bits::<N>(*val as u8);
-    }
-    writer.flush();
-    buf
-}
+use crate::Configuration;
 
-/// Unpack bitstream of N bit numbers into an array.
-pub fn unpack<const N: usize>(bytes: &[u8]) -> Vec<u32> {
-    let mut buf = vec![];
-    let mut reader = BitReader::new(bytes);
-    while let Some(bits) = reader.read_bits::<N>() {
-        buf.push(bits as u32);
-    }
-    buf
-}
+/// Number of "normal" bits.
+const NORMAL_BITS: u8 = 3;
+/// Largest value the "normal" bits can hold; values at or above it also emit an "exceptional" entry.
+const MAX_NORMAL_VALUE: u8 = (1 << NORMAL_BITS) - 1;
 
-/// Create a bitmask with `n` least significant bits set to `1`.
-const fn mask_n_bits(n: usize) -> usize {
-    (1 << n) - 1
+/// Turns a value (e.g. a `u32`) into an iterator over its bits.
+trait IntoBitIterator {
+    /// Converts the item into an iterator over its bits.
+    fn into_bit_iter(self) -> impl Iterator<Item = bool>;
 }
 
-struct BitWriter<'b> {
-    buffer: &'b mut Vec<u8>,
-    current_byte: u8,
-    /// Number of bits written into `current_byte`.
-    written_bits: usize,
+impl IntoBitIterator for u32 {
+    fn into_bit_iter(self) -> impl Iterator<Item = bool> {
+        (0..Self::BITS).map(move |index| ((self >> index) & 1) == 1) // index 0 is the lowest bit
+    }
 }
 
-impl<'b> BitWriter<'b> {
-    fn new(buffer: &'b mut Vec<u8>) -> Self {
-        Self {
-            buffer,
-            current_byte: 0,
-            written_bits: 0,
-        }
+pub struct FingerprintCompressor<'a>(&'a Configuration); // config supplies the header's id byte
+
+impl<'a> FingerprintCompressor<'a> {
+    /// Delta-encode the 1-based set-bit positions of `subfingerprint`, ending with a `0` marker.
+    fn compress_subfingerprint(subfingerprint: u32) -> impl Iterator<Item = (u8, Option<u8>)> {
+        subfingerprint
+            .into_bit_iter()
+            .enumerate()
+            // Positions start at 1 so that bit 0 never yields delta 0, the end marker.
+            .filter_map(|(bit_index, is_bit_set)| {
+                is_bit_set.then_some(u8::try_from(bit_index + 1).unwrap())
+            })
+            .scan(0, |last_bit_index, bit_index| {
+                let value = bit_index - *last_bit_index;
+                *last_bit_index = bit_index;
+                // Deltas that do not fit the normal bits store the cap plus the excess separately.
+                Some(if value >= MAX_NORMAL_VALUE {
+                    (MAX_NORMAL_VALUE, Some(value - MAX_NORMAL_VALUE))
+                } else {
+                    (value, None)
+                })
+            })
+            .chain(std::iter::once((0, None)))
     }
 
-    #[inline]
-    fn write_bits<const BITS: usize>(&mut self, val: u8) {
-        assert!(BITS <= 8);
-        // Mask out bits we don't need.
-        let val = val & mask_n_bits(BITS) as u8;
-        if self.written_bits + BITS < 8 {
-            // We have space for new bits in the current byte so just add them to it.
-            self.current_byte <<= BITS;
-            self.current_byte |= val;
-            self.written_bits += BITS;
-        } else if self.written_bits + BITS == 8 {
-            // We have just enough space for new bits to make a single byte.
-            self.current_byte <<= BITS;
-            self.current_byte |= val;
-            self.buffer.push(self.current_byte);
-            self.current_byte = 0;
-            self.written_bits = 0;
-        } else {
-            // We will overflow some bits...
-            let overflowing_bits = (self.written_bits + BITS) - 8;
-            // ... and create a new whole byte from previously saved bits and some of new bits.
-            let fitting_bits = BITS - overflowing_bits;
-            self.current_byte <<= fitting_bits;
-            self.current_byte |= val >> overflowing_bits;
-            self.buffer.push(self.current_byte);
-            // Now we just save the remaining bits.
-            self.current_byte = val & mask_n_bits(overflowing_bits) as u8;
-            self.written_bits = overflowing_bits;
-        }
+    /// Compress `fingerprint` into a 4-byte header followed by packed 3-bit and 5-bit values.
+    pub fn compress(&self, fingerprint: &[u32]) -> Vec<u8> {
+        let size = fingerprint.len();
+        let (normal_bits, exceptional_bits) = fingerprint
+            .iter()
+            .scan(0, |last_subfp, current_subfp| {
+                let value = current_subfp ^ *last_subfp; // XOR with the previous item (0 for the first)
+                *last_subfp = *current_subfp;
+                Some(value)
+            })
+            .flat_map(Self::compress_subfingerprint)
+            .fold(
+                (
+                    Vec::<u8>::with_capacity(size),
+                    Vec::<u8>::with_capacity(size),
+                ),
+                |(mut normal_bits, mut exceptional_bits), (normal_value, exceptional_value)| {
+                    normal_bits.push(normal_value);
+                    if let Some(exceptional_value) = exceptional_value {
+                        exceptional_bits.push(exceptional_value);
+                    }
+                    (normal_bits, exceptional_bits)
+                },
+            );
+
+        let header_size = 4; // id byte + 3 length bytes
+        let normal_size = packed_intn_array_len(normal_bits.len(), 3);
+        let exceptional_size = packed_intn_array_len(exceptional_bits.len(), 5);
+        let expected_size = header_size + normal_size + exceptional_size;
+
+        #[allow(clippy::cast_possible_truncation)] // every cast below is masked to 8 bits first
+        let output = [
+            self.0.id(),
+            ((size >> 16) & 0xFF) as u8, // low 24 bits of the item count, most significant byte first
+            ((size >> 8) & 0xFF) as u8,
+            (size & 0xFF) as u8,
+        ];
+
+        let output = output
+            .into_iter()
+            .chain(iter_packed_intn_array::<3>(&normal_bits))
+            .chain(iter_packed_intn_array::<5>(&exceptional_bits))
+            .collect::<Vec<u8>>();
+        debug_assert_eq!(output.len(), expected_size);
+        output
     }
+}
 
-    fn flush(&mut self) {
-        if self.written_bits != 0 {
-            // Finish the current byte by adding some padding.
-            self.buffer.push(self.current_byte << (8 - self.written_bits as u32));
-            self.written_bits = 0;
-            self.current_byte = 0;
-        }
+impl<'a> From<&'a Configuration> for FingerprintCompressor<'a> {
+    fn from(value: &'a Configuration) -> Self {
+        Self(value) // stores the reference itself
     }
 }
 
-struct BitReader<'b> {
-    bytes: &'b [u8],
-    current_byte: u8,
-    remaining_bits: usize,
+/// Number of bytes needed to pack `array_len` values of `n` bits each (rounded up).
+const fn packed_intn_array_len(array_len: usize, n: usize) -> usize {
+    (array_len * n + 7) / 8 // ceil(array_len * n / 8)
 }
 
-impl<'b> BitReader<'b> {
-    fn new(bytes: &'b [u8]) -> Self {
-        Self {
-            bytes,
-            current_byte: 0,
-            remaining_bits: 0,
-        }
-    }
-
-    #[inline]
-    fn read_bits<const BITS: usize>(&mut self) -> Option<u8> {
-        assert!(BITS > 0 && BITS <= 8);
-
-        if self.remaining_bits >= BITS {
-            // Just read bits from the current byte.
-            let bits = (self.current_byte >> (8 - BITS)) & (mask_n_bits(BITS) as u8);
-            self.current_byte <<= BITS;
-            self.remaining_bits -= BITS;
-            Some(bits)
-        } else {
-            // Try read next byte.
-            let [next_byte, rest @ ..] = self.bytes else {
-                return None;
-            };
-            self.bytes = rest;
-
-            let bits_from_next_byte = BITS - self.remaining_bits;
-            let remaining_bits_from_next_byte = 8 - bits_from_next_byte;
-            let bits = (self.current_byte >> (8 - BITS)) | (next_byte >> remaining_bits_from_next_byte);
-            self.current_byte = next_byte << bits_from_next_byte;
-            self.remaining_bits = remaining_bits_from_next_byte;
-            Some(bits)
-        }
-    }
+/// Pack the low `N` bits of each input byte back to back, filling each output byte from bit 0 up.
+fn iter_packed_intn_array<const N: usize>(array: &[u8]) -> impl Iterator<Item = u8> + '_ {
+    let mask = (0xFF << (8 - N)) >> (8 - N); // keeps only the low N bits
+    array.chunks(8).flat_map(move |slice| { // 8 values of N bits fill exactly N bytes
+        let (size, result) = slice.iter().map(|s| s & mask).enumerate().fold(
+            (0, [0u8; N]),
+            |(_, mut result), (i, bits)| {
+                let rightmost_bit_index = i * N;
+                let leftmost_bit_index = rightmost_bit_index + N - 1;
+
+                let right_byte = rightmost_bit_index / 8;
+                let left_byte = leftmost_bit_index / 8;
+
+                result[right_byte] |= bits << (rightmost_bit_index % 8);
+                if left_byte != right_byte {
+                    result[left_byte] |= bits >> ((8 - (rightmost_bit_index % 8)) % 8); // spill-over
+                }
+
+                (left_byte + 1, result)
+            },
+        );
+        result.into_iter().take(size) // a short final chunk emits only the bytes it touched
+    })
 }
 
 #[cfg(test)]
 mod tests {
-    use super::{mask_n_bits, pack, unpack};
+    use super::*;
+
+    const ONE_BYTE: [u8; 1] = [0b1011_1010];
+    const NINE_BYTES: [u8; 9] = [
+        0b1010_1010,
+        0b0011_0011,
+        0b1100_1100,
+        0b1100_0111,
+        0b0101_0101,
+        0b1100_1100,
+        0b1010_1010,
+        0b0000_0000,
+        0b1111_1111,
+    ];
+    const SIXTYFOUR_BYTES: [u8; 64] = [
+        0xA2, 0x87, 0xE3, 0xED, 0xAA, 0xD7, 0xE8, 0x94, 0x53, 0x4E, 0x9B, 0xD5, 0x83, 0x12, 0x05,
+        0x43, 0x67, 0x7E, 0x0A, 0xAF, 0x2D, 0x85, 0xB4, 0x03, 0xEB, 0x13, 0x8E, 0x47, 0x07, 0xA6,
+        0x76, 0x5D, 0x43, 0x67, 0x8D, 0x9F, 0xEA, 0xAD, 0x3F, 0x34, 0x86, 0xF4, 0x25, 0xC8, 0xA2,
+        0xBF, 0xF1, 0x22, 0xB5, 0xA6, 0xB8, 0x4A, 0xED, 0xA2, 0xF5, 0x25, 0xDB, 0x62, 0x70, 0xC2,
+        0xB7, 0x9C, 0xB1, 0x3C,
+    ];
 
-    fn packing_n<const N: usize>() {
-        let values: Vec<_> = (0..1024 * 1024).collect();
+    #[test]
+    fn test_iter_packed_int3_array_single_byte() {
+        const N: usize = 3;
+        let packed = iter_packed_intn_array::<N>(&ONE_BYTE).collect::<Vec<u8>>();
+        assert_eq!(packed.len(), packed_intn_array_len(ONE_BYTE.len(), N));
+        assert_eq!(&packed, &[0b0000_0010]);
+    }
 
-        let packed = pack::<N>(&values);
-        let unpacked = unpack::<N>(&packed);
+    #[test]
+    fn test_iter_packed_int3_array_some_bytes() {
+        const N: usize = 3;
+        let packed = iter_packed_intn_array::<N>(&NINE_BYTES).collect::<Vec<u8>>();
+        assert_eq!(packed.len(), packed_intn_array_len(NINE_BYTES.len(), N));
+        assert_eq!(
+            &packed,
+            &[0b0001_1010, 0b0101_1111, 0b0000_1010, 0b0000_0111]
+        );
+    }
 
-        for (a, b) in values.iter().copied().zip(unpacked.iter().copied()) {
-            assert_eq!(a & mask_n_bits(N) as u32, b);
-        }
+    #[test]
+    fn test_iter_packed_int3_array_many_bytes() {
+        const N: usize = 3;
+        let packed = iter_packed_intn_array::<N>(&SIXTYFOUR_BYTES).collect::<Vec<u8>>();
+        assert_eq!(
+            packed.len(),
+            packed_intn_array_len(SIXTYFOUR_BYTES.len(), N)
+        );
+        assert_eq!(
+            &packed,
+            &[
+                0xFA, 0xAA, 0x83, 0xF3, 0x3A, 0x75, 0xB7, 0xDE, 0x72, 0x9B, 0x7F, 0xBB, 0x7B, 0xAF,
+                0x9E, 0x66, 0xA1, 0x47, 0x35, 0x54, 0xB5, 0x13, 0x74, 0x86
+            ],
+        );
     }
 
     #[test]
-    fn packing_3() {
-        packing_n::<3>();
+    fn test_iter_packed_int5_array_many_bytes() {
+        const N: usize = 5;
+        let packed = iter_packed_intn_array::<N>(&SIXTYFOUR_BYTES).collect::<Vec<u8>>();
+        assert_eq!(
+            packed.len(),
+            packed_intn_array_len(SIXTYFOUR_BYTES.len(), N)
+        );
+        assert_eq!(
+            &packed,
+            &[
+                0xE2, 0x8C, 0xA6, 0x2E, 0xA2, 0xD3, 0xED, 0x3A, 0x64, 0x19, 0xC7, 0xAB, 0xD7, 0x0A,
+                0x1D, 0x6B, 0xBA, 0x73, 0x8C, 0xED, 0xE3, 0xB4, 0xAF, 0xDA, 0xA7, 0x86, 0x16, 0x24,
+                0x7E, 0x14, 0xD5, 0x60, 0xD5, 0x44, 0x2D, 0x5B, 0x40, 0x71, 0x79, 0xE4,
+            ],
+        );
     }
 
     #[test]
-    fn packing_5() {
-        packing_n::<5>();
+    fn test_iter_packed_int5_array_single_byte() {
+        const N: usize = 5;
+        let packed = iter_packed_intn_array::<N>(&ONE_BYTE).collect::<Vec<u8>>();
+        assert_eq!(packed.len(), packed_intn_array_len(ONE_BYTE.len(), N));
+        assert_eq!(&packed, &[0b0001_1010]);
     }
 
     #[test]
-    fn padding() {
-        let vals = vec![0b11100000u8];
-        let unpacked = unpack::<3>(&vals);
-        assert_eq!(unpacked, &[7, 0]);
+    fn test_iter_packed_int5_array_some_bytes() {
+        const N: usize = 5;
+        let packed = iter_packed_intn_array::<N>(&NINE_BYTES).collect::<Vec<u8>>();
+        assert_eq!(packed.len(), packed_intn_array_len(NINE_BYTES.len(), N));
+        assert_eq!(
+            &packed,
+            &[
+                0b0110_1010,
+                0b1011_0010,
+                0b0101_0011,
+                0b1001_1001,
+                0b0000_0010,
+                0b0001_1111
+            ]
+        );
     }
-}
+}
diff --git a/chromaprint/src/lib.rs b/chromaprint/src/lib.rs
@@ -3,6 +3,7 @@
 pub use audio_processor::ResetError;
 pub use fingerprint_matcher::{match_fingerprints, Segment, MatchError};
 pub use fingerprinter::{Configuration, Fingerprinter};
+pub use compression::FingerprintCompressor;
 
 mod audio_processor;
 mod chroma;