From 9ab77186cc2cbd927d8d034cef4cc6c7ec2ba376 Mon Sep 17 00:00:00 2001 From: Michael Macias Date: Sun, 9 Mar 2025 14:52:53 -0500 Subject: [PATCH] cram: Buffer normalized bases Instead of pushing single bytes to the hasher, slices of valid bases are given instead. When normalizing lower to uppercase, the converted bases are written to a buffer before updating the hasher. --- noodles-cram/src/lib.rs | 52 ++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/noodles-cram/src/lib.rs b/noodles-cram/src/lib.rs index 38c41d052..0ac27224f 100644 --- a/noodles-cram/src/lib.rs +++ b/noodles-cram/src/lib.rs @@ -14,6 +14,8 @@ mod huffman; pub mod io; pub mod record; +use md5::{Digest, Md5}; + pub use self::{file_definition::FileDefinition, record::Record}; #[deprecated(since = "0.78.0", note = "Use `cram::container` instead.")] @@ -37,19 +39,53 @@ pub use self::r#async::io::Reader as AsyncReader; pub use self::r#async::io::Writer as AsyncWriter; const MAGIC_NUMBER: [u8; 4] = *b"CRAM"; +const MD5_OUTPUT_SIZE: usize = 16; // _Sequence Alignment/Map Format Specification_ (2021-06-03) § 1.3.2 "Reference MD5 calculation" -fn calculate_normalized_sequence_digest(sequence: &[u8]) -> [u8; 16] { - use md5::{Digest, Md5}; +fn calculate_normalized_sequence_digest(mut sequence: &[u8]) -> [u8; MD5_OUTPUT_SIZE] { + const MD5_BLOCK_SIZE: usize = 64; + const CHUNK_SIZE: usize = 8 * MD5_BLOCK_SIZE; let mut hasher = Md5::new(); + let mut buf = [0; CHUNK_SIZE]; + + while !sequence.is_empty() { + let n = sequence + .iter() + .position(|b| !b.is_ascii_graphic() || b.is_ascii_lowercase()) + .unwrap_or(sequence.len()); + + hasher.update(&sequence[..n]); + sequence = &sequence[n..]; + + // "All lowercase characters are converted to uppercase." 
+ loop { + let mut n = 0; + + for (src, dst) in sequence + .iter() + .take_while(|b| b.is_ascii_lowercase()) + .zip(&mut buf) + { + *dst = src.to_ascii_uppercase(); + n += 1; + } + + if n == 0 { + break; + } + + hasher.update(&buf[..n]); + sequence = &sequence[n..]; + } - for &b in sequence { // "All characters outside of the inclusive range 33 ('!') to 126 ('~') are stripped out." - if b.is_ascii_graphic() { - // "All lowercase characters are converted to uppercase." - hasher.update([b.to_ascii_uppercase()]); - } + let n = sequence + .iter() + .position(|b| b.is_ascii_graphic()) + .unwrap_or(sequence.len()); + + sequence = &sequence[n..]; } hasher.finalize().into() @@ -70,7 +106,7 @@ mod tests { ); assert_eq!( - calculate_normalized_sequence_digest(b"ACgt"), + calculate_normalized_sequence_digest(b" AC\tgt\n"), [ 0xf1, 0xf8, 0xf4, 0xbf, 0x41, 0x3b, 0x16, 0xad, 0x13, 0x57, 0x22, 0xaa, 0x45, 0x91, 0x04, 0x3e