From 3faae037e9b025a656464d61a05988915488e291 Mon Sep 17 00:00:00 2001
From: Alisa Sireneva <me@purplesyringa.moe>
Date: Mon, 29 Jul 2024 11:54:21 +0300
Subject: [PATCH 1/9] Vectorize string parsing

---
 src/read.rs | 58 ++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 49 insertions(+), 9 deletions(-)
diff --git a/src/read.rs b/src/read.rs
index e03e13f28..7435dc702 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -2,6 +2,7 @@ use crate::error::{Error, ErrorCode, Result};
 use alloc::vec::Vec;
 use core::char;
 use core::cmp;
+use core::mem;
 use core::ops::Deref;
 use core::str;
 
@@ -425,6 +426,51 @@ impl<'a> SliceRead<'a> {
         }
     }
 
+    #[inline(always)]
+    fn skip_to_escape(&mut self, forbid_control_characters: bool) {
+        let rest = &self.slice[self.index..];
+        let end = self.index + memchr::memchr2(b'"', b'\\', rest).unwrap_or(rest.len());
+
+        if !forbid_control_characters {
+            self.index = end;
+            return;
+        }
+
+        // We now wish to check if the chunk contains a byte in range 0x00..=0x1F. Ideally, this
+        // would be integrated this into the memchr2 check above, but memchr does not support this
+        // at the moment. Therefore, use a variation on Mycroft's algorithm [1] to provide
+        // performance better than a naive loop. It runs faster than just a single memchr call on
+        // benchmarks and is faster than both SSE2 and AVX-based code, and it's cross-platform, so
+        // probably the right fit.
+        // [1]: https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ
+
+        // Pad the chunk to a whole count of units if possible. This ensures that SWAR code is used
+        // to handle the tail in the hot path.
+        let block_end = (self.index + (end - self.index).next_multiple_of(mem::size_of::<usize>()))
+            .min(self.slice.len());
+        let mut block = &self.slice[self.index..block_end];
+
+        while let Some((chars, block_remainder)) = block.split_first_chunk() {
+            const ONE_BYTES: usize = usize::MAX / 255;
+            let chars = usize::from_ne_bytes(*chars);
+            let mask = chars.wrapping_sub(ONE_BYTES * 0x20) & !chars & (ONE_BYTES << 7);
+
+            if mask != 0 {
+                let control_index = block_end - block.len() + mask.trailing_zeros() as usize / 8;
+                self.index = control_index.min(end);
+                return;
+            }
+
+            block = block_remainder;
+        }
+
+        if let Some(offset) = block.iter().position(|&c| c <= 0x1F) {
+            self.index = (block_end - block.len() + offset).min(end);
+        } else {
+            self.index = end;
+        }
+    }
+
     /// The big optimization here over IoRead is that if the string contains no
     /// backslash escape sequences, the returned &str is a slice of the raw JSON
     /// data so we avoid copying into the scratch space.
@@ -442,9 +488,7 @@ impl<'a> SliceRead<'a> {
         let mut start = self.index;
 
         loop {
-            while self.index < self.slice.len() && !ESCAPE[self.slice[self.index] as usize] {
-                self.index += 1;
-            }
+            self.skip_to_escape(validate);
             if self.index == self.slice.len() {
                 return error(self, ErrorCode::EofWhileParsingString);
             }
@@ -470,9 +514,7 @@ impl<'a> SliceRead<'a> {
                 }
                 _ => {
                     self.index += 1;
-                    if validate {
-                        return error(self, ErrorCode::ControlCharacterWhileParsingString);
-                    }
+                    return error(self, ErrorCode::ControlCharacterWhileParsingString);
                 }
             }
         }
@@ -538,9 +580,7 @@ impl<'a> Read<'a> for SliceRead<'a> {
 
     fn ignore_str(&mut self) -> Result<()> {
         loop {
-            while self.index < self.slice.len() && !ESCAPE[self.slice[self.index] as usize] {
-                self.index += 1;
-            }
+            self.skip_to_escape(true);
             if self.index == self.slice.len() {
                 return error(self, ErrorCode::EofWhileParsingString);
             }

From 03ceee9eb1be8a2f792da8ed4c5992cb61219396 Mon Sep 17 00:00:00 2001
From: Alisa Sireneva <me@purplesyringa.moe>
Date: Mon, 29 Jul 2024 12:17:33 +0300
Subject: [PATCH 2/9] Replace ESCAPE array with is_escape fn

This is not backed by benchmarks, but it seems reasonable that we'd be
more starved for cache than CPU in IO-bound tasks. It also simplifies
code a bit and frees up some memory, which is probably a good thing.
---
 src/read.rs | 36 +++++++-----------------------------
 1 file changed, 7 insertions(+), 29 deletions(-)

diff --git a/src/read.rs b/src/read.rs
index 7435dc702..a79d64c9b 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -222,7 +222,7 @@ where
     {
         loop {
             let ch = tri!(next_or_eof(self));
-            if !ESCAPE[ch as usize] {
+            if !is_escape(ch) {
                 scratch.push(ch);
                 continue;
             }
@@ -343,7 +343,7 @@ where
     fn ignore_str(&mut self) -> Result<()> {
         loop {
             let ch = tri!(next_or_eof(self));
-            if !ESCAPE[ch as usize] {
+            if !is_escape(ch) {
                 continue;
             }
             match ch {
@@ -819,33 +819,11 @@ pub trait Fused: private::Sealed {}
 impl<'a> Fused for SliceRead<'a> {}
 impl<'a> Fused for StrRead<'a> {}
 
-// Lookup table of bytes that must be escaped. A value of true at index i means
-// that byte i requires an escape sequence in the input.
-static ESCAPE: [bool; 256] = {
-    const CT: bool = true; // control character \x00..=\x1F
-    const QU: bool = true; // quote \x22
-    const BS: bool = true; // backslash \x5C
-    const __: bool = false; // allow unescaped
-    [
-        //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
-        CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, // 0
-        CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, // 1
-        __, __, QU, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4
-        __, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F
-    ]
-};
+// This is only used in IoRead. SliceRead hardcodes the arguments to memchr.
+#[cfg(feature = "std")]
+fn is_escape(ch: u8) -> bool {
+    ch == b'"' || ch == b'\\' || ch < 0x20
+}
 
 fn next_or_eof<'de, R>(read: &mut R) -> Result<u8>
 where

From 63cb04d74b38ca1915d6996fd1a6291a6edb987f Mon Sep 17 00:00:00 2001
From: Alisa Sireneva <me@purplesyringa.moe>
Date: Mon, 29 Jul 2024 13:05:39 +0300
Subject: [PATCH 3/9] Bring MSRV down

---
 src/read.rs | 44 ++++++++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/src/read.rs b/src/read.rs
index a79d64c9b..adb17850c 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -443,32 +443,36 @@ impl<'a> SliceRead<'a> {
         // benchmarks and is faster than both SSE2 and AVX-based code, and it's cross-platform, so
         // probably the right fit.
         // [1]: https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ
+        const STEP: usize = mem::size_of::<usize>();
+
+        // Moving this to a local variable removes a spill in the hot loop.
+        let mut index = self.index;
+
+        if self.slice.len() >= STEP {
+            while index < end.min(self.slice.len() - STEP + 1) {
+                // We can safely overread past end in most cases. This ensures that SWAR code is
+                // used to handle the tail in the hot path.
+                const ONE_BYTES: usize = usize::MAX / 255;
+                let chars = usize::from_ne_bytes(self.slice[index..][..STEP].try_into().unwrap());
+                let mask = chars.wrapping_sub(ONE_BYTES * 0x20) & !chars & (ONE_BYTES << 7);
+
+                if mask != 0 {
+                    index += mask.trailing_zeros() as usize / 8;
+                    break;
+                }
 
-        // Pad the chunk to a whole count of units if possible. This ensures that SWAR code is used
-        // to handle the tail in the hot path.
-        let block_end = (self.index + (end - self.index).next_multiple_of(mem::size_of::<usize>()))
-            .min(self.slice.len());
-        let mut block = &self.slice[self.index..block_end];
-
-        while let Some((chars, block_remainder)) = block.split_first_chunk() {
-            const ONE_BYTES: usize = usize::MAX / 255;
-            let chars = usize::from_ne_bytes(*chars);
-            let mask = chars.wrapping_sub(ONE_BYTES * 0x20) & !chars & (ONE_BYTES << 7);
+                index += STEP;
+            }
+        }
 
-            if mask != 0 {
-                let control_index = block_end - block.len() + mask.trailing_zeros() as usize / 8;
-                self.index = control_index.min(end);
+        if index < end {
+            if let Some(offset) = self.slice[index..end].iter().position(|&c| c <= 0x1F) {
+                self.index = index + offset;
                 return;
             }
-
-            block = block_remainder;
         }
 
-        if let Some(offset) = block.iter().position(|&c| c <= 0x1F) {
-            self.index = (block_end - block.len() + offset).min(end);
-        } else {
-            self.index = end;
-        }
+        self.index = end;
     }
 
     /// The big optimization here over IoRead is that if the string contains no

From 3063d69fd53f075d06318c0b19a7ddd700fe57a6 Mon Sep 17 00:00:00 2001
From: Alisa Sireneva <me@purplesyringa.moe>
Date: Mon, 29 Jul 2024 13:23:01 +0300
Subject: [PATCH 4/9] Add better tests

---
 tests/test.rs | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tests/test.rs b/tests/test.rs
index 71087162b..6923e6e38 100644
--- a/tests/test.rs
+++ b/tests/test.rs
@@ -2497,3 +2497,22 @@ fn hash_positive_and_negative_zero() {
         assert_eq!(rand.hash_one(k1), rand.hash_one(k2));
     }
 }
+
+#[test]
+fn test_control_character_search() {
+    // Different space circumstances
+    for n in 0..16 {
+        for m in 0..16 {
+            test_parse_err::<String>(&[(
+                &format!("\"{}\n{}\"", ".".repeat(n), ".".repeat(m)),
+                "control character (\\u0000-\\u001F) found while parsing a string at line 2 column 0",
+            )]);
+        }
+    }
+
+    // Multiple occurrences
+    test_parse_err::<String>(&[(
+        &"\"\t\n\r\"",
+        "control character (\\u0000-\\u001F) found while parsing a string at line 1 column 2",
+    )]);
+}

From 5496579070cddd5fe14e23b27044936138b3f73c Mon Sep 17 00:00:00 2001
From: Alisa Sireneva <me@purplesyringa.moe>
Date: Sun, 11 Aug 2024 14:27:09 +0300
Subject: [PATCH 5/9] Inline memchr2 logic into Mycroft's algorithm

---
 src/read.rs | 68 ++++++++++++++++++++++++-----------------------------
 1 file changed, 31 insertions(+), 37 deletions(-)

diff --git a/src/read.rs b/src/read.rs
index adb17850c..caa27d310 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -426,53 +426,49 @@ impl<'a> SliceRead<'a> {
         }
     }
 
-    #[inline(always)]
     fn skip_to_escape(&mut self, forbid_control_characters: bool) {
         let rest = &self.slice[self.index..];
-        let end = self.index + memchr::memchr2(b'"', b'\\', rest).unwrap_or(rest.len());
 
         if !forbid_control_characters {
-            self.index = end;
+            self.index += memchr::memchr2(b'"', b'\\', rest).unwrap_or(rest.len());
             return;
         }
 
-        // We now wish to check if the chunk contains a byte in range 0x00..=0x1F. Ideally, this
-        // would be integrated this into the memchr2 check above, but memchr does not support this
-        // at the moment. Therefore, use a variation on Mycroft's algorithm [1] to provide
-        // performance better than a naive loop. It runs faster than just a single memchr call on
-        // benchmarks and is faster than both SSE2 and AVX-based code, and it's cross-platform, so
-        // probably the right fit.
+        // We wish to find the first byte in range 0x00..=0x1F or " or \. Ideally, we'd use
+        // something akin to memchr3, but the memchr crate does not support this at the moment.
+        // Therefore, we use a variation on Mycroft's algorithm [1] to provide performance better
+        // than a naive loop. It runs faster than equivalent two-pass memchr2+SWAR code on
+        // benchmarks and it's cross-platform, so probably the right fit.
         // [1]: https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ
-        const STEP: usize = mem::size_of::<usize>();
-
-        // Moving this to a local variable removes a spill in the hot loop.
-        let mut index = self.index;
-
-        if self.slice.len() >= STEP {
-            while index < end.min(self.slice.len() - STEP + 1) {
-                // We can safely overread past end in most cases. This ensures that SWAR code is
-                // used to handle the tail in the hot path.
-                const ONE_BYTES: usize = usize::MAX / 255;
-                let chars = usize::from_ne_bytes(self.slice[index..][..STEP].try_into().unwrap());
-                let mask = chars.wrapping_sub(ONE_BYTES * 0x20) & !chars & (ONE_BYTES << 7);
-
-                if mask != 0 {
-                    index += mask.trailing_zeros() as usize / 8;
-                    break;
-                }
-
-                index += STEP;
-            }
-        }
-
-        if index < end {
-            if let Some(offset) = self.slice[index..end].iter().position(|&c| c <= 0x1F) {
-                self.index = index + offset;
+        type Chunk = usize;
+        const STEP: usize = mem::size_of::<Chunk>();
+        const ONE_BYTES: Chunk = Chunk::MAX / 255; // 0x0101...01
+
+        for chunk in rest.chunks_exact(STEP) {
+            let chars = Chunk::from_ne_bytes(chunk.try_into().unwrap());
+            let contains_ctrl = chars.wrapping_sub(ONE_BYTES * 0x20) & !chars;
+            let chars_quote = chars ^ (ONE_BYTES * Chunk::from(b'"'));
+            let contains_quote = chars_quote.wrapping_sub(ONE_BYTES) & !chars_quote;
+            let chars_backslash = chars ^ (ONE_BYTES * Chunk::from(b'\\'));
+            let contains_backslash = chars_backslash.wrapping_sub(ONE_BYTES) & !chars_backslash;
+            let masked = (contains_ctrl | contains_quote | contains_backslash) & (ONE_BYTES << 7);
+            if masked != 0 {
+                // SAFETY: chunk is in-bounds for slice
+                self.index = unsafe { chunk.as_ptr().offset_from(self.slice.as_ptr()) } as usize
+                    + masked.trailing_zeros() as usize / 8;
                 return;
             }
         }
 
-        self.index = end;
+        self.skip_to_escape_slow();
+    }
+
+    #[cold]
+    #[inline(never)]
+    fn skip_to_escape_slow(&mut self) {
+        while self.index < self.slice.len() && !is_escape(self.slice[self.index]) {
+            self.index += 1;
+        }
     }
 
     /// The big optimization here over IoRead is that if the string contains no
@@ -823,8 +819,6 @@ pub trait Fused: private::Sealed {}
 impl<'a> Fused for SliceRead<'a> {}
 impl<'a> Fused for StrRead<'a> {}
 
-// This is only used in IoRead. SliceRead hardcodes the arguments to memchr.
-#[cfg(feature = "std")]
 fn is_escape(ch: u8) -> bool {
     ch == b'"' || ch == b'\\' || ch < 0x20
 }

From a95d6df9d08611c9a11ac6524903d693921b8eae Mon Sep 17 00:00:00 2001
From: Alisa Sireneva <me@purplesyringa.moe>
Date: Sun, 11 Aug 2024 15:30:26 +0300
Subject: [PATCH 6/9] Big endian support

---
 src/read.rs | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/read.rs b/src/read.rs
index caa27d310..96784b827 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -453,9 +453,14 @@ impl<'a> SliceRead<'a> {
             let contains_backslash = chars_backslash.wrapping_sub(ONE_BYTES) & !chars_backslash;
             let masked = (contains_ctrl | contains_quote | contains_backslash) & (ONE_BYTES << 7);
             if masked != 0 {
+                let addresswise_first_bit = if cfg!(target_endian = "little") {
+                    masked.trailing_zeros()
+                } else {
+                    masked.leading_zeros()
+                };
                 // SAFETY: chunk is in-bounds for slice
                 self.index = unsafe { chunk.as_ptr().offset_from(self.slice.as_ptr()) } as usize
-                    + masked.trailing_zeros() as usize / 8;
+                    + addresswise_first_bit as usize / 8;
                 return;
             }
         }

From 1f0dcf791ab1756d7ad07c20889e50bd9a7887fb Mon Sep 17 00:00:00 2001
From: Alisa Sireneva <me@purplesyringa.moe>
Date: Sun, 11 Aug 2024 15:38:14 +0300
Subject: [PATCH 7/9] Allow clippy::items_after_statements

---
 src/lib.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/lib.rs b/src/lib.rs
index 3225e26f0..4931f00da 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -340,6 +340,7 @@
     clippy::wildcard_imports,
     // things are often more readable this way
     clippy::cast_lossless,
+    clippy::items_after_statements,
     clippy::module_name_repetitions,
     clippy::redundant_else,
     clippy::shadow_unrelated,

From 8389d8a11293616ce5a4358651aede271871248d Mon Sep 17 00:00:00 2001
From: Alisa Sireneva <me@purplesyringa.moe>
Date: Sun, 11 Aug 2024 16:10:35 +0300
Subject: [PATCH 8/9] Don't run the slow algorithm from the beginning

---
 src/read.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/read.rs b/src/read.rs
index 96784b827..9e652d760 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -465,6 +465,7 @@ impl<'a> SliceRead<'a> {
             }
         }
 
+        self.index += rest.len() / STEP * STEP;
         self.skip_to_escape_slow();
     }
 

From e43da5ee0e64819972f08254e8ce799796238791 Mon Sep 17 00:00:00 2001
From: Alisa Sireneva <me@purplesyringa.moe>
Date: Sun, 11 Aug 2024 19:21:45 +0300
Subject: [PATCH 9/9] Immediately bail-out on empty strings

---
 src/read.rs | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/read.rs b/src/read.rs
index 9e652d760..1494ec6d2 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -222,7 +222,7 @@ where
     {
         loop {
             let ch = tri!(next_or_eof(self));
-            if !is_escape(ch) {
+            if !is_escape(ch, true) {
                 scratch.push(ch);
                 continue;
             }
@@ -343,7 +343,7 @@ where
     fn ignore_str(&mut self) -> Result<()> {
         loop {
             let ch = tri!(next_or_eof(self));
-            if !is_escape(ch) {
+            if !is_escape(ch, true) {
                 continue;
             }
             match ch {
@@ -427,6 +427,14 @@ impl<'a> SliceRead<'a> {
     }
 
     fn skip_to_escape(&mut self, forbid_control_characters: bool) {
+        // Immediately bail-out on empty strings and consecutive escapes (e.g. \u041b\u0435)
+        if self.index == self.slice.len()
+            || is_escape(self.slice[self.index], forbid_control_characters)
+        {
+            return;
+        }
+        self.index += 1;
+
         let rest = &self.slice[self.index..];
 
         if !forbid_control_characters {
@@ -472,7 +480,7 @@ impl<'a> SliceRead<'a> {
     #[cold]
     #[inline(never)]
     fn skip_to_escape_slow(&mut self) {
-        while self.index < self.slice.len() && !is_escape(self.slice[self.index]) {
+        while self.index < self.slice.len() && !is_escape(self.slice[self.index], true) {
             self.index += 1;
         }
     }
@@ -825,8 +833,8 @@ pub trait Fused: private::Sealed {}
 impl<'a> Fused for SliceRead<'a> {}
 impl<'a> Fused for StrRead<'a> {}
 
-fn is_escape(ch: u8) -> bool {
-    ch == b'"' || ch == b'\\' || ch < 0x20
+fn is_escape(ch: u8, including_control_characters: bool) -> bool {
+    ch == b'"' || ch == b'\\' || (including_control_characters && ch < 0x20)
 }
 
 fn next_or_eof<'de, R>(read: &mut R) -> Result<u8>