Skip to content

Commit

Permalink
Inline memchr2 logic into Mycroft's algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
purplesyringa committed Aug 11, 2024
1 parent 3063d69 commit fce646f
Showing 1 changed file with 31 additions and 37 deletions.
68 changes: 31 additions & 37 deletions src/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -426,53 +426,49 @@ impl<'a> SliceRead<'a> {
}
}

#[inline(always)]
fn skip_to_escape(&mut self, forbid_control_characters: bool) {
let rest = &self.slice[self.index..];
let end = self.index + memchr::memchr2(b'"', b'\\', rest).unwrap_or(rest.len());

if !forbid_control_characters {
self.index = end;
self.index += memchr::memchr2(b'"', b'\\', rest).unwrap_or(rest.len());
return;
}

// We now wish to check if the chunk contains a byte in range 0x00..=0x1F. Ideally, this
// would be integrated this into the memchr2 check above, but memchr does not support this
// at the moment. Therefore, use a variation on Mycroft's algorithm [1] to provide
// performance better than a naive loop. It runs faster than just a single memchr call on
// benchmarks and is faster than both SSE2 and AVX-based code, and it's cross-platform, so
// probably the right fit.
// We wish to find the first byte in range 0x00..=0x1F or " or \. Ideally, we'd use
// something akin to memchr3, but the memchr crate does not support this at the moment.
// Therefore, we use a variation on Mycroft's algorithm [1] to provide performance better
// than a naive loop. It runs faster than equivalent memchr2+naive loop code on benchmarks
// and it's cross-platform, so probably the right fit.
// [1]: https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ
const STEP: usize = mem::size_of::<usize>();

// Moving this to a local variable removes a spill in the hot loop.
let mut index = self.index;

if self.slice.len() >= STEP {
while index < end.min(self.slice.len() - STEP + 1) {
// We can safely overread past end in most cases. This ensures that SWAR code is
// used to handle the tail in the hot path.
const ONE_BYTES: usize = usize::MAX / 255;
let chars = usize::from_ne_bytes(self.slice[index..][..STEP].try_into().unwrap());
let mask = chars.wrapping_sub(ONE_BYTES * 0x20) & !chars & (ONE_BYTES << 7);

if mask != 0 {
index += mask.trailing_zeros() as usize / 8;
break;
}

index += STEP;
}
}

if index < end {
if let Some(offset) = self.slice[index..end].iter().position(|&c| c <= 0x1F) {
self.index = index + offset;
type Chunk = usize;
const STEP: usize = mem::size_of::<Chunk>();
const ONE_BYTES: Chunk = Chunk::MAX / 255; // 0x0101...01

for chunk in rest.chunks_exact(STEP) {
let chars = Chunk::from_ne_bytes(chunk.try_into().unwrap());
let contains_ctrl = chars.wrapping_sub(ONE_BYTES * 0x20) & !chars;
let chars_quote = chars ^ (ONE_BYTES * Chunk::from(b'"'));
let contains_quote = chars_quote.wrapping_sub(ONE_BYTES) & !chars_quote;
let chars_backslash = chars ^ (ONE_BYTES * Chunk::from(b'\\'));
let contains_backslash = chars_backslash.wrapping_sub(ONE_BYTES) & !chars_backslash;
let masked = (contains_ctrl | contains_quote | contains_backslash) & (ONE_BYTES << 7);
if masked != 0 {
// SAFETY: chunk is in-bounds for slice
self.index = unsafe { chunk.as_ptr().offset_from(self.slice.as_ptr()) } as usize
+ masked.trailing_zeros() as usize / 8;
return;
}
}

self.index = end;
self.skip_to_escape_naive();
}

/// Byte-by-byte fallback: moves `self.index` forward over every byte for which
/// `is_escape` is false, stopping at the first escape byte or at the end of
/// `self.slice`.
#[cold]
#[inline(never)]
fn skip_to_escape_naive(&mut self) {
    // `get` yields `None` when the index is already past the end, in which case
    // there is nothing to scan and the index is left untouched.
    if let Some(unread) = self.slice.get(self.index..) {
        let plain_len = unread.iter().take_while(|&&byte| !is_escape(byte)).count();
        self.index += plain_len;
    }
}

/// The big optimization here over IoRead is that if the string contains no
Expand Down Expand Up @@ -823,8 +819,6 @@ pub trait Fused: private::Sealed {}
impl<'a> Fused for SliceRead<'a> {}
impl<'a> Fused for StrRead<'a> {}

/// Returns true for bytes that end a run of plain string content: `"`, `\`,
/// and the control characters `0x00..=0x1F`.
///
/// Not gated on the `std` feature: besides `IoRead`, the slice-based
/// `skip_to_escape_naive` fallback calls this as well.
fn is_escape(ch: u8) -> bool {
    matches!(ch, b'"' | b'\\' | 0x00..=0x1F)
}
Expand Down

0 comments on commit fce646f

Please sign in to comment.