From 9ab77186cc2cbd927d8d034cef4cc6c7ec2ba376 Mon Sep 17 00:00:00 2001
From: Michael Macias <zaeleus@gmail.com>
Date: Sun, 9 Mar 2025 14:52:53 -0500
Subject: [PATCH] cram: Buffer normalized bases

Instead of pushing single bytes to the hasher, slices of valid bases are
given. When normalizing lowercase to uppercase, the converted bases are
written to a buffer before updating the hasher.
---
 noodles-cram/src/lib.rs | 52 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 44 insertions(+), 8 deletions(-)

diff --git a/noodles-cram/src/lib.rs b/noodles-cram/src/lib.rs
index 38c41d052..0ac27224f 100644
--- a/noodles-cram/src/lib.rs
+++ b/noodles-cram/src/lib.rs
@@ -14,6 +14,8 @@ mod huffman;
 pub mod io;
 pub mod record;
 
+use md5::{Digest, Md5};
+
 pub use self::{file_definition::FileDefinition, record::Record};
 
 #[deprecated(since = "0.78.0", note = "Use `cram::container` instead.")]
@@ -37,19 +39,53 @@ pub use self::r#async::io::Reader as AsyncReader;
 pub use self::r#async::io::Writer as AsyncWriter;
 
 const MAGIC_NUMBER: [u8; 4] = *b"CRAM";
+const MD5_OUTPUT_SIZE: usize = 16;
 
 // _Sequence Alignment/Map Format Specification_ (2021-06-03) § 1.3.2 "Reference MD5 calculation"
-fn calculate_normalized_sequence_digest(sequence: &[u8]) -> [u8; 16] {
-    use md5::{Digest, Md5};
+fn calculate_normalized_sequence_digest(mut sequence: &[u8]) -> [u8; MD5_OUTPUT_SIZE] {
+    const MD5_BLOCK_SIZE: usize = 64;
+    const CHUNK_SIZE: usize = 8 * MD5_BLOCK_SIZE;
 
     let mut hasher = Md5::new();
+    let mut buf = [0; CHUNK_SIZE];
+
+    while !sequence.is_empty() {
+        let n = sequence
+            .iter()
+            .position(|b| !b.is_ascii_graphic() || b.is_ascii_lowercase())
+            .unwrap_or(sequence.len());
+
+        hasher.update(&sequence[..n]);
+        sequence = &sequence[n..];
+
+        // "All lowercase characters are converted to uppercase."
+        loop {
+            let mut n = 0;
+
+            for (src, dst) in sequence
+                .iter()
+                .take_while(|b| b.is_ascii_lowercase())
+                .zip(&mut buf)
+            {
+                *dst = src.to_ascii_uppercase();
+                n += 1;
+            }
+
+            if n == 0 {
+                break;
+            }
+
+            hasher.update(&buf[..n]);
+            sequence = &sequence[n..];
+        }
 
-    for &b in sequence {
         // "All characters outside of the inclusive range 33 ('!') to 126 ('~') are stripped out."
-        if b.is_ascii_graphic() {
-            // "All lowercase characters are converted to uppercase."
-            hasher.update([b.to_ascii_uppercase()]);
-        }
+        let n = sequence
+            .iter()
+            .position(|b| b.is_ascii_graphic())
+            .unwrap_or(sequence.len());
+
+        sequence = &sequence[n..];
     }
 
     hasher.finalize().into()
@@ -70,7 +106,7 @@ mod tests {
         );
 
         assert_eq!(
-            calculate_normalized_sequence_digest(b"ACgt"),
+            calculate_normalized_sequence_digest(b" AC\tgt\n"),
             [
                 0xf1, 0xf8, 0xf4, 0xbf, 0x41, 0x3b, 0x16, 0xad, 0x13, 0x57, 0x22, 0xaa, 0x45, 0x91,
                 0x04, 0x3e