Skip to content

Commit

Permalink
cram: Buffer normalized bases
Browse files Browse the repository at this point in the history
Instead of pushing single bytes to the hasher, slices of valid bases are
passed to it. When normalizing lowercase bases to uppercase, the converted
bases are written to a buffer before updating the hasher.
  • Loading branch information
zaeleus committed Mar 9, 2025
1 parent 7d1745f commit 9ab7718
Showing 1 changed file with 44 additions and 8 deletions.
52 changes: 44 additions & 8 deletions noodles-cram/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ mod huffman;
pub mod io;
pub mod record;

use md5::{Digest, Md5};

pub use self::{file_definition::FileDefinition, record::Record};

#[deprecated(since = "0.78.0", note = "Use `cram::container` instead.")]
Expand All @@ -37,19 +39,53 @@ pub use self::r#async::io::Reader as AsyncReader;
pub use self::r#async::io::Writer as AsyncWriter;

const MAGIC_NUMBER: [u8; 4] = *b"CRAM";
const MD5_OUTPUT_SIZE: usize = 16;

// _Sequence Alignment/Map Format Specification_ (2021-06-03) § 1.3.2 "Reference MD5 calculation"
fn calculate_normalized_sequence_digest(sequence: &[u8]) -> [u8; 16] {
use md5::{Digest, Md5};
fn calculate_normalized_sequence_digest(mut sequence: &[u8]) -> [u8; MD5_OUTPUT_SIZE] {
const MD5_BLOCK_SIZE: usize = 64;
const CHUNK_SIZE: usize = 8 * MD5_BLOCK_SIZE;

let mut hasher = Md5::new();
let mut buf = [0; CHUNK_SIZE];

while !sequence.is_empty() {
let n = sequence
.iter()
.position(|b| !b.is_ascii_graphic() || b.is_ascii_lowercase())
.unwrap_or(sequence.len());

hasher.update(&sequence[..n]);
sequence = &sequence[n..];

// "All lowercase characters are converted to uppercase."
loop {
let mut n = 0;

for (src, dst) in sequence
.iter()
.take_while(|b| b.is_ascii_lowercase())
.zip(&mut buf)
{
*dst = src.to_ascii_uppercase();
n += 1;
}

if n == 0 {
break;
}

hasher.update(&buf[..n]);
sequence = &sequence[n..];
}

for &b in sequence {
// "All characters outside of the inclusive range 33 ('!') to 126 ('~') are stripped out."
if b.is_ascii_graphic() {
// "All lowercase characters are converted to uppercase."
hasher.update([b.to_ascii_uppercase()]);
}
let n = sequence
.iter()
.position(|b| b.is_ascii_graphic())
.unwrap_or(sequence.len());

sequence = &sequence[n..];
}

hasher.finalize().into()
Expand All @@ -70,7 +106,7 @@ mod tests {
);

assert_eq!(
calculate_normalized_sequence_digest(b"ACgt"),
calculate_normalized_sequence_digest(b" AC\tgt\n"),
[
0xf1, 0xf8, 0xf4, 0xbf, 0x41, 0x3b, 0x16, 0xad, 0x13, 0x57, 0x22, 0xaa, 0x45, 0x91,
0x04, 0x3e
Expand Down

0 comments on commit 9ab7718

Please sign in to comment.