From 3faae037e9b025a656464d61a05988915488e291 Mon Sep 17 00:00:00 2001 From: Alisa Sireneva Date: Mon, 29 Jul 2024 11:54:21 +0300 Subject: [PATCH 1/9] Vectorize string parsing --- src/read.rs | 58 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 9 deletions(-) diff --git a/src/read.rs b/src/read.rs index e03e13f28..7435dc702 100644 --- a/src/read.rs +++ b/src/read.rs @@ -2,6 +2,7 @@ use crate::error::{Error, ErrorCode, Result}; use alloc::vec::Vec; use core::char; use core::cmp; +use core::mem; use core::ops::Deref; use core::str; @@ -425,6 +426,51 @@ impl<'a> SliceRead<'a> { } } + #[inline(always)] + fn skip_to_escape(&mut self, forbid_control_characters: bool) { + let rest = &self.slice[self.index..]; + let end = self.index + memchr::memchr2(b'"', b'\\', rest).unwrap_or(rest.len()); + + if !forbid_control_characters { + self.index = end; + return; + } + + // We now wish to check if the chunk contains a byte in range 0x00..=0x1F. Ideally, this + // would be integrated this into the memchr2 check above, but memchr does not support this + // at the moment. Therefore, use a variation on Mycroft's algorithm [1] to provide + // performance better than a naive loop. It runs faster than just a single memchr call on + // benchmarks and is faster than both SSE2 and AVX-based code, and it's cross-platform, so + // probably the right fit. + // [1]: https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ + + // Pad the chunk to a whole count of units if possible. This ensures that SWAR code is used + // to handle the tail in the hot path. + let block_end = (self.index + (end - self.index).next_multiple_of(mem::size_of::())) + .min(self.slice.len()); + let mut block = &self.slice[self.index..block_end]; + + while let Some((chars, block_remainder)) = block.split_first_chunk() { + const ONE_BYTES: usize = usize::MAX / 255; + let chars = usize::from_ne_bytes(*chars); + let mask = chars.wrapping_sub(ONE_BYTES * 0x20) & !chars & (ONE_BYTES << 7); + + if mask != 0 { + let control_index = block_end - block.len() + mask.trailing_zeros() as usize / 8; + self.index = control_index.min(end); + return; + } + + block = block_remainder; + } + + if let Some(offset) = block.iter().position(|&c| c <= 0x1F) { + self.index = (block_end - block.len() + offset).min(end); + } else { + self.index = end; + } + } + /// The big optimization here over IoRead is that if the string contains no /// backslash escape sequences, the returned &str is a slice of the raw JSON /// data so we avoid copying into the scratch space. @@ -442,9 +488,7 @@ impl<'a> SliceRead<'a> { let mut start = self.index; loop { - while self.index < self.slice.len() && !ESCAPE[self.slice[self.index] as usize] { - self.index += 1; - } + self.skip_to_escape(validate); if self.index == self.slice.len() { return error(self, ErrorCode::EofWhileParsingString); } @@ -470,9 +514,7 @@ impl<'a> SliceRead<'a> { } _ => { self.index += 1; - if validate { - return error(self, ErrorCode::ControlCharacterWhileParsingString); - } + return error(self, ErrorCode::ControlCharacterWhileParsingString); } } } @@ -538,9 +580,7 @@ impl<'a> Read<'a> for SliceRead<'a> { fn ignore_str(&mut self) -> Result<()> { loop { - while self.index < self.slice.len() && !ESCAPE[self.slice[self.index] as usize] { - self.index += 1; - } + self.skip_to_escape(true); if self.index == self.slice.len() { return error(self, ErrorCode::EofWhileParsingString); } From 03ceee9eb1be8a2f792da8ed4c5992cb61219396 Mon Sep 17 00:00:00 2001 From: Alisa Sireneva Date: Mon, 29 Jul 2024 12:17:33 +0300 Subject: [PATCH 2/9] Replace ESCAPE array with is_escape fn This is not backed by benchmarks, but it seems reasonable that we'd be more starved for cache than CPU in IO-bound tasks. It also simplifies code a bit and frees up some memory, which is probably a good thing. --- src/read.rs | 36 +++++++----------------------------- 1 file changed, 7 insertions(+), 29 deletions(-) diff --git a/src/read.rs b/src/read.rs index 7435dc702..a79d64c9b 100644 --- a/src/read.rs +++ b/src/read.rs @@ -222,7 +222,7 @@ where { loop { let ch = tri!(next_or_eof(self)); - if !ESCAPE[ch as usize] { + if !is_escape(ch) { scratch.push(ch); continue; } @@ -343,7 +343,7 @@ where fn ignore_str(&mut self) -> Result<()> { loop { let ch = tri!(next_or_eof(self)); - if !ESCAPE[ch as usize] { + if !is_escape(ch) { continue; } match ch { @@ -819,33 +819,11 @@ pub trait Fused: private::Sealed {} impl<'a> Fused for SliceRead<'a> {} impl<'a> Fused for StrRead<'a> {} -// Lookup table of bytes that must be escaped. A value of true at index i means -// that byte i requires an escape sequence in the input. -static ESCAPE: [bool; 256] = { - const CT: bool = true; // control character \x00..=\x1F - const QU: bool = true; // quote \x22 - const BS: bool = true; // backslash \x5C - const __: bool = false; // allow unescaped - [ - // 1 2 3 4 5 6 7 8 9 A B C D E F - CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, // 0 - CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, // 1 - __, __, QU, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2 - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3 - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4 - __, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5 - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6 - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7 - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8 - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9 - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F - ] -}; +// This is only used in IoRead. SliceRead hardcodes the arguments to memchr. +#[cfg(feature = "std")] +fn is_escape(ch: u8) -> bool { + ch == b'"' || ch == b'\\' || ch < 0x20 +} fn next_or_eof<'de, R>(read: &mut R) -> Result where From 63cb04d74b38ca1915d6996fd1a6291a6edb987f Mon Sep 17 00:00:00 2001 From: Alisa Sireneva Date: Mon, 29 Jul 2024 13:05:39 +0300 Subject: [PATCH 3/9] Bring MSRV down --- src/read.rs | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/src/read.rs b/src/read.rs index a79d64c9b..adb17850c 100644 --- a/src/read.rs +++ b/src/read.rs @@ -443,32 +443,36 @@ impl<'a> SliceRead<'a> { // benchmarks and is faster than both SSE2 and AVX-based code, and it's cross-platform, so // probably the right fit. // [1]: https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ + const STEP: usize = mem::size_of::(); + + // Moving this to a local variable removes a spill in the hot loop. + let mut index = self.index; + + if self.slice.len() >= STEP { + while index < end.min(self.slice.len() - STEP + 1) { + // We can safely overread past end in most cases. This ensures that SWAR code is + // used to handle the tail in the hot path. + const ONE_BYTES: usize = usize::MAX / 255; + let chars = usize::from_ne_bytes(self.slice[index..][..STEP].try_into().unwrap()); + let mask = chars.wrapping_sub(ONE_BYTES * 0x20) & !chars & (ONE_BYTES << 7); + + if mask != 0 { + index += mask.trailing_zeros() as usize / 8; + break; + } - // Pad the chunk to a whole count of units if possible. This ensures that SWAR code is used - // to handle the tail in the hot path. - let block_end = (self.index + (end - self.index).next_multiple_of(mem::size_of::())) - .min(self.slice.len()); - let mut block = &self.slice[self.index..block_end]; - - while let Some((chars, block_remainder)) = block.split_first_chunk() { - const ONE_BYTES: usize = usize::MAX / 255; - let chars = usize::from_ne_bytes(*chars); - let mask = chars.wrapping_sub(ONE_BYTES * 0x20) & !chars & (ONE_BYTES << 7); + index += STEP; + } + } - if mask != 0 { - let control_index = block_end - block.len() + mask.trailing_zeros() as usize / 8; - self.index = control_index.min(end); + if index < end { + if let Some(offset) = self.slice[index..end].iter().position(|&c| c <= 0x1F) { + self.index = index + offset; return; } - - block = block_remainder; } - if let Some(offset) = block.iter().position(|&c| c <= 0x1F) { - self.index = (block_end - block.len() + offset).min(end); - } else { - self.index = end; - } + self.index = end; } /// The big optimization here over IoRead is that if the string contains no From 3063d69fd53f075d06318c0b19a7ddd700fe57a6 Mon Sep 17 00:00:00 2001 From: Alisa Sireneva Date: Mon, 29 Jul 2024 13:23:01 +0300 Subject: [PATCH 4/9] Add better tests --- tests/test.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/test.rs b/tests/test.rs index 71087162b..6923e6e38 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -2497,3 +2497,22 @@ fn hash_positive_and_negative_zero() { assert_eq!(rand.hash_one(k1), rand.hash_one(k2)); } } + +#[test] +fn test_control_character_search() { + // Different space circumstances + for n in 0..16 { + for m in 0..16 { + test_parse_err::(&[( + &format!("\"{}\n{}\"", ".".repeat(n), ".".repeat(m)), + "control character (\\u0000-\\u001F) found while parsing a string at line 2 column 0", + )]); + } + } + + // Multiple occurrences + test_parse_err::(&[( + &"\"\t\n\r\"", + "control character (\\u0000-\\u001F) found while parsing a string at line 1 column 2", + )]); +} From 5496579070cddd5fe14e23b27044936138b3f73c Mon Sep 17 00:00:00 2001 From: Alisa Sireneva Date: Sun, 11 Aug 2024 14:27:09 +0300 Subject: [PATCH 5/9] Inline memchr2 logic into Mycroft's algorithm --- src/read.rs | 68 ++++++++++++++++++++++++----------------------------- 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/src/read.rs b/src/read.rs index adb17850c..caa27d310 100644 --- a/src/read.rs +++ b/src/read.rs @@ -426,53 +426,49 @@ impl<'a> SliceRead<'a> { } } - #[inline(always)] fn skip_to_escape(&mut self, forbid_control_characters: bool) { let rest = &self.slice[self.index..]; - let end = self.index + memchr::memchr2(b'"', b'\\', rest).unwrap_or(rest.len()); if !forbid_control_characters { - self.index = end; + self.index += memchr::memchr2(b'"', b'\\', rest).unwrap_or(rest.len()); return; } - // We now wish to check if the chunk contains a byte in range 0x00..=0x1F. Ideally, this - // would be integrated this into the memchr2 check above, but memchr does not support this - // at the moment. Therefore, use a variation on Mycroft's algorithm [1] to provide - // performance better than a naive loop. It runs faster than just a single memchr call on - // benchmarks and is faster than both SSE2 and AVX-based code, and it's cross-platform, so - // probably the right fit. + // We wish to find the first byte in range 0x00..=0x1F or " or \. Ideally, we'd use + // something akin to memchr3, but the memchr crate does not support this at the moment. + // Therefore, we use a variation on Mycroft's algorithm [1] to provide performance better + // than a naive loop. It runs faster than equivalent two-pass memchr2+SWAR code on + // benchmarks and it's cross-platform, so probably the right fit. // [1]: https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ - const STEP: usize = mem::size_of::(); - - // Moving this to a local variable removes a spill in the hot loop. - let mut index = self.index; - - if self.slice.len() >= STEP { - while index < end.min(self.slice.len() - STEP + 1) { - // We can safely overread past end in most cases. This ensures that SWAR code is - // used to handle the tail in the hot path. - const ONE_BYTES: usize = usize::MAX / 255; - let chars = usize::from_ne_bytes(self.slice[index..][..STEP].try_into().unwrap()); - let mask = chars.wrapping_sub(ONE_BYTES * 0x20) & !chars & (ONE_BYTES << 7); - - if mask != 0 { - index += mask.trailing_zeros() as usize / 8; - break; - } - - index += STEP; - } - } - - if index < end { - if let Some(offset) = self.slice[index..end].iter().position(|&c| c <= 0x1F) { - self.index = index + offset; + type Chunk = usize; + const STEP: usize = mem::size_of::(); + const ONE_BYTES: Chunk = Chunk::MAX / 255; // 0x0101...01 + + for chunk in rest.chunks_exact(STEP) { + let chars = Chunk::from_ne_bytes(chunk.try_into().unwrap()); + let contains_ctrl = chars.wrapping_sub(ONE_BYTES * 0x20) & !chars; + let chars_quote = chars ^ (ONE_BYTES * Chunk::from(b'"')); + let contains_quote = chars_quote.wrapping_sub(ONE_BYTES) & !chars_quote; + let chars_backslash = chars ^ (ONE_BYTES * Chunk::from(b'\\')); + let contains_backslash = chars_backslash.wrapping_sub(ONE_BYTES) & !chars_backslash; + let masked = (contains_ctrl | contains_quote | contains_backslash) & (ONE_BYTES << 7); + if masked != 0 { + // SAFETY: chunk is in-bounds for slice + self.index = unsafe { chunk.as_ptr().offset_from(self.slice.as_ptr()) } as usize + + masked.trailing_zeros() as usize / 8; return; } } - self.index = end; + self.skip_to_escape_slow(); + } + + #[cold] + #[inline(never)] + fn skip_to_escape_slow(&mut self) { + while self.index < self.slice.len() && !is_escape(self.slice[self.index]) { + self.index += 1; + } } /// The big optimization here over IoRead is that if the string contains no @@ -823,8 +819,6 @@ pub trait Fused: private::Sealed {} impl<'a> Fused for SliceRead<'a> {} impl<'a> Fused for StrRead<'a> {} -// This is only used in IoRead. SliceRead hardcodes the arguments to memchr. -#[cfg(feature = "std")] fn is_escape(ch: u8) -> bool { ch == b'"' || ch == b'\\' || ch < 0x20 } From a95d6df9d08611c9a11ac6524903d693921b8eae Mon Sep 17 00:00:00 2001 From: Alisa Sireneva Date: Sun, 11 Aug 2024 15:30:26 +0300 Subject: [PATCH 6/9] Big endian support --- src/read.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/read.rs b/src/read.rs index caa27d310..96784b827 100644 --- a/src/read.rs +++ b/src/read.rs @@ -453,9 +453,14 @@ impl<'a> SliceRead<'a> { let contains_backslash = chars_backslash.wrapping_sub(ONE_BYTES) & !chars_backslash; let masked = (contains_ctrl | contains_quote | contains_backslash) & (ONE_BYTES << 7); if masked != 0 { + let addresswise_first_bit = if cfg!(target_endian = "little") { + masked.trailing_zeros() + } else { + masked.leading_zeros() + }; // SAFETY: chunk is in-bounds for slice self.index = unsafe { chunk.as_ptr().offset_from(self.slice.as_ptr()) } as usize - + masked.trailing_zeros() as usize / 8; + + addresswise_first_bit as usize / 8; return; } } From 1f0dcf791ab1756d7ad07c20889e50bd9a7887fb Mon Sep 17 00:00:00 2001 From: Alisa Sireneva Date: Sun, 11 Aug 2024 15:38:14 +0300 Subject: [PATCH 7/9] Allow clippy::items_after_statements --- src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lib.rs b/src/lib.rs index 3225e26f0..4931f00da 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -340,6 +340,7 @@ clippy::wildcard_imports, // things are often more readable this way clippy::cast_lossless, + clippy::items_after_statements, clippy::module_name_repetitions, clippy::redundant_else, clippy::shadow_unrelated, From 8389d8a11293616ce5a4358651aede271871248d Mon Sep 17 00:00:00 2001 From: Alisa Sireneva Date: Sun, 11 Aug 2024 16:10:35 +0300 Subject: [PATCH 8/9] Don't run the slow algorithm from the beginning --- src/read.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/read.rs b/src/read.rs index 96784b827..9e652d760 100644 --- a/src/read.rs +++ b/src/read.rs @@ -465,6 +465,7 @@ impl<'a> SliceRead<'a> { } } + self.index += rest.len() / STEP * STEP; self.skip_to_escape_slow(); } From e43da5ee0e64819972f08254e8ce799796238791 Mon Sep 17 00:00:00 2001 From: Alisa Sireneva Date: Sun, 11 Aug 2024 19:21:45 +0300 Subject: [PATCH 9/9] Immediately bail-out on empty strings --- src/read.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/read.rs b/src/read.rs index 9e652d760..1494ec6d2 100644 --- a/src/read.rs +++ b/src/read.rs @@ -222,7 +222,7 @@ where { loop { let ch = tri!(next_or_eof(self)); - if !is_escape(ch) { + if !is_escape(ch, true) { scratch.push(ch); continue; } @@ -343,7 +343,7 @@ where fn ignore_str(&mut self) -> Result<()> { loop { let ch = tri!(next_or_eof(self)); - if !is_escape(ch) { + if !is_escape(ch, true) { continue; } match ch { @@ -427,6 +427,14 @@ impl<'a> SliceRead<'a> { } fn skip_to_escape(&mut self, forbid_control_characters: bool) { + // Immediately bail-out on empty strings and consecutive escapes (e.g. \u041b\u0435) + if self.index == self.slice.len() + || is_escape(self.slice[self.index], forbid_control_characters) + { + return; + } + self.index += 1; + let rest = &self.slice[self.index..]; if !forbid_control_characters { @@ -472,7 +480,7 @@ impl<'a> SliceRead<'a> { #[cold] #[inline(never)] fn skip_to_escape_slow(&mut self) { - while self.index < self.slice.len() && !is_escape(self.slice[self.index]) { + while self.index < self.slice.len() && !is_escape(self.slice[self.index], true) { self.index += 1; } } @@ -825,8 +833,8 @@ pub trait Fused: private::Sealed {} impl<'a> Fused for SliceRead<'a> {} impl<'a> Fused for StrRead<'a> {} -fn is_escape(ch: u8) -> bool { - ch == b'"' || ch == b'\\' || ch < 0x20 +fn is_escape(ch: u8, including_control_characters: bool) -> bool { + ch == b'"' || ch == b'\\' || (including_control_characters && ch < 0x20) } fn next_or_eof<'de, R>(read: &mut R) -> Result