From dc7d766a90e8e100f8d0e94a37c81235b2fa3fce Mon Sep 17 00:00:00 2001
From: Jean Boussier <jean.boussier@gmail.com>
Date: Fri, 3 Jan 2025 19:41:08 +0100
Subject: [PATCH] Improve lookup tables for string escaping.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce a simplified table for the most common case, which is
`script_safe: false, ascii_only: false`.

In the `script_safe` table, only `0xE2` now triggers a multi-byte check.

Merge back `convert_ASCII_to_JSON`, as it no longer helps much with
the simplified escape table.

```
== Encoding mixed utf8 (5003001 bytes)
ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
               after    38.000 i/100ms
Calculating -------------------------------------
               after    398.220 (± 3.0%) i/s    (2.51 ms/i) -      2.014k in   5.061659s

Comparison:
              before:      381.8 i/s
               after:      398.2 i/s - same-ish: difference falls within error

== Encoding mostly utf8 (5001001 bytes)
ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
               after    39.000 i/100ms
Calculating -------------------------------------
               after    393.337 (± 2.5%) i/s    (2.54 ms/i) -      1.989k in   5.059397s

Comparison:
              before:      304.3 i/s
               after:      393.3 i/s - 1.29x  faster

== Encoding twitter.json (466906 bytes)
ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
               after   244.000 i/100ms
Calculating -------------------------------------
               after      2.436k (± 0.9%) i/s  (410.43 μs/i) -     12.200k in   5.007702s

Comparison:
              before:     2125.9 i/s
               after:     2436.5 i/s - 1.15x  faster
```
---
 benchmark/encoder.rb               |   4 +-
 ext/json/ext/generator/generator.c | 187 ++++++++++++-----------------
 2 files changed, 80 insertions(+), 111 deletions(-)

diff --git a/benchmark/encoder.rb b/benchmark/encoder.rb
index acc5fa07..b42154f5 100644
--- a/benchmark/encoder.rb
+++ b/benchmark/encoder.rb
@@ -68,12 +68,10 @@ def benchmark_encoding(benchmark_name, ruby_obj, check_expected: true, except: [
 benchmark_encoding "mixed utf8", ([("a" * 5000) + "€" + ("a" * 5000)] * 500)
 benchmark_encoding "mostly utf8", ([("€" * 3333)] * 500)
 
-# On these benchmarks we perform well, we're on par or better.
+# On these benchmarks we perform well, we're on par or a bit better.
 benchmark_encoding "integers", (1_000_000..1_001_000).to_a, except: %i(json_state)
 benchmark_encoding "activitypub.json", JSON.load_file("#{__dir__}/data/activitypub.json")
 benchmark_encoding "citm_catalog.json", JSON.load_file("#{__dir__}/data/citm_catalog.json")
-
-# On twitter.json we're still about 6% slower, this is worth investigating.
 benchmark_encoding "twitter.json", JSON.load_file("#{__dir__}/data/twitter.json")
 
 # This benchmark spent the overwhelming majority of its time in `ruby_dtoa`. We rely on Ruby's implementation
diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c
index d5c8bfd4..a76cf7d8 100644
--- a/ext/json/ext/generator/generator.c
+++ b/ext/json/ext/generator/generator.c
@@ -96,6 +96,73 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...)
     raise_generator_error_str(invalid_object, str);
 }
 
+// 0 - single byte char that doesn't need to be escaped.
+// (x | 8) - char that needs to be escaped.
+static const unsigned char CHAR_LENGTH_MASK = 7;
+
+static const unsigned char escape_table[256] = {
+    // ASCII Control Characters
+     9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+     9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+    // ASCII Characters
+     0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // '"'
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+static const unsigned char ascii_only_escape_table[256] = {
+    // ASCII Control Characters
+     9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+     9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+    // ASCII Characters
+     0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // '"'
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    // Continuation byte
+     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    // First byte of a 2-byte code point
+     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    // First byte of a 3-byte code point
+     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    // First byte of a 4+ byte code point
+     4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
+};
+
+static const unsigned char script_safe_escape_table[256] = {
+    // ASCII Control Characters
+     9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+     9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+    // ASCII Characters
+     0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, // '"' and '/'
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    // Continuation byte
+     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    // First byte of a 2-byte code point
+     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    // First byte of a 3-byte code point
+     3, 3,11, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE2 is the start of \u2028 and \u2029
+    // First byte of a 4+ byte code point
+     4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
+};
+
 /* Converts in_string to a JSON string (without the wrapping '"'
  * characters) in FBuffer out_buffer.
  *
@@ -106,13 +173,13 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...)
  *
  * - If out_ascii_only: non-ASCII characters (>0x7F)
  *
- * - If out_script_safe: forwardslash, line separator (U+2028), and
+ * - If script_safe: forwardslash (/), line separator (U+2028), and
  *   paragraph separator (U+2029)
  *
  * Everything else (should be UTF-8) is just passed through and
  * appended to the result.
  */
-static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256], bool out_script_safe)
+static inline void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
 {
     const char *hexdig = "0123456789abcdef";
     char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
@@ -131,7 +198,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
 
         if (RB_UNLIKELY(ch_len)) {
             switch (ch_len) {
-                case 1: {
+                case 9: {
                     FLUSH_POS(1);
                     switch (ch) {
                         case '"':  fbuffer_append(out_buffer, "\\\"", 2); break;
@@ -153,9 +220,9 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
                     }
                     break;
                 }
-                case 3: {
+                case 11: {
                     unsigned char b2 = ptr[pos + 1];
-                    if (RB_UNLIKELY(out_script_safe && ch == 0xE2 && b2 == 0x80)) {
+                    if (RB_UNLIKELY(b2 == 0x80)) {
                         unsigned char b3 = ptr[pos + 2];
                         if (b3 == 0xA8) {
                             FLUSH_POS(3);
@@ -167,6 +234,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
                             break;
                         }
                     }
+                    ch_len = 3;
                     // fallthrough
                 }
                 default:
@@ -186,104 +254,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
     RB_GC_GUARD(str);
 }
 
-static const char escape_table[256] = {
-    // ASCII Control Characters
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    // ASCII Characters
-    0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, // '"'
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\'
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    // Continuation byte
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    // First byte of a 2-byte code point
-    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
-    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
-    // First byte of a 4-byte code point
-    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-    //First byte of a 4+byte code point
-    4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
-};
-
-static const char script_safe_escape_table[256] = {
-    // ASCII Control Characters
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    // ASCII Characters
-    0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1, // '"' and '/'
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\'
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    // Continuation byte
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    // First byte of a 2-byte code point
-    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
-    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
-    // First byte of a 4-byte code point
-    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-    //First byte of a 4+byte code point
-    4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
-};
-
-static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256])
-{
-    const char *hexdig = "0123456789abcdef";
-    char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
-
-    const char *ptr = RSTRING_PTR(str);
-    unsigned long len = RSTRING_LEN(str);
-
-    unsigned long beg = 0, pos;
-
-    for (pos = 0; pos < len;) {
-        unsigned char ch = ptr[pos];
-        /* JSON encoding */
-        if (escape_table[ch]) {
-            if (pos > beg) {
-                fbuffer_append(out_buffer, &ptr[beg], pos - beg);
-            }
-
-            beg = pos + 1;
-            switch (ch) {
-                case '"':  fbuffer_append(out_buffer, "\\\"", 2); break;
-                case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
-                case '/':  fbuffer_append(out_buffer, "\\/", 2); break;
-                case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
-                case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
-                case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
-                case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
-                case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
-                default:
-                    scratch[2] = '0';
-                    scratch[3] = '0';
-                    scratch[4] = hexdig[(ch >> 4) & 0xf];
-                    scratch[5] = hexdig[ch & 0xf];
-                    fbuffer_append(out_buffer, scratch, 6);
-            }
-        }
-
-        pos++;
-    }
-
-    if (beg < len) {
-        fbuffer_append(out_buffer, &ptr[beg], len - beg);
-    }
-
-    RB_GC_GUARD(str);
-}
-
-static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256], bool out_script_safe)
+static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
 {
     const char *hexdig = "0123456789abcdef";
     char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
@@ -301,7 +272,7 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons
 
         if (RB_UNLIKELY(ch_len)) {
             switch (ch_len) {
-                case 1: {
+                case 9: {
                     FLUSH_POS(1);
                     switch (ch) {
                         case '"':  fbuffer_append(out_buffer, "\\\"", 2); break;
@@ -325,6 +296,8 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons
                 }
                 default: {
                     uint32_t wchar = 0;
+                    ch_len = ch_len & CHAR_LENGTH_MASK;
+
                     switch(ch_len) {
                         case 2:
                             wchar = ptr[pos] & 0x1F;
@@ -935,13 +908,11 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat
 
     switch(rb_enc_str_coderange(obj)) {
         case ENC_CODERANGE_7BIT:
-            convert_ASCII_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table);
-            break;
         case ENC_CODERANGE_VALID:
             if (RB_UNLIKELY(state->ascii_only)) {
-                convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table, state->script_safe);
+                convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
             } else {
-                convert_UTF8_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table, state->script_safe);
+                convert_UTF8_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table);
             }
             break;
         default: