From dc7d766a90e8e100f8d0e94a37c81235b2fa3fce Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Fri, 3 Jan 2025 19:41:08 +0100 Subject: [PATCH] Improve lookup tables for string escaping. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a simplified table for the most common case, which is `script_safe: false, ascii_only: false`. On the `script_safe` table, now only `0xE2` does a multi-byte check. Merge back `convert_ASCII_to_JSON`, as it no longer helps much with the simplified escape table. ``` == Encoding mixed utf8 (5003001 bytes) ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23] Warming up -------------------------------------- after 38.000 i/100ms Calculating ------------------------------------- after 398.220 (± 3.0%) i/s (2.51 ms/i) - 2.014k in 5.061659s Comparison: before: 381.8 i/s after: 398.2 i/s - same-ish: difference falls within error == Encoding mostly utf8 (5001001 bytes) ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23] Warming up -------------------------------------- after 39.000 i/100ms Calculating ------------------------------------- after 393.337 (± 2.5%) i/s (2.54 ms/i) - 1.989k in 5.059397s Comparison: before: 304.3 i/s after: 393.3 i/s - 1.29x faster == Encoding twitter.json (466906 bytes) ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23] Warming up -------------------------------------- after 244.000 i/100ms Calculating ------------------------------------- after 2.436k (± 0.9%) i/s (410.43 μs/i) - 12.200k in 5.007702s Comparison: before: 2125.9 i/s after: 2436.5 i/s - 1.15x faster ``` --- benchmark/encoder.rb | 4 +- ext/json/ext/generator/generator.c | 187 ++++++++++++----------------- 2 files changed, 80 insertions(+), 111 deletions(-) diff --git a/benchmark/encoder.rb b/benchmark/encoder.rb index acc5fa07..b42154f5 100644 --- a/benchmark/encoder.rb +++ b/benchmark/encoder.rb @@ -68,12 +68,10 @@ def 
benchmark_encoding(benchmark_name, ruby_obj, check_expected: true, except: [ benchmark_encoding "mixed utf8", ([("a" * 5000) + "€" + ("a" * 5000)] * 500) benchmark_encoding "mostly utf8", ([("€" * 3333)] * 500) -# On these benchmarks we perform well, we're on par or better. +# On these benchmarks we perform well, we're on par or a bit better. benchmark_encoding "integers", (1_000_000..1_001_000).to_a, except: %i(json_state) benchmark_encoding "activitypub.json", JSON.load_file("#{__dir__}/data/activitypub.json") benchmark_encoding "citm_catalog.json", JSON.load_file("#{__dir__}/data/citm_catalog.json") - -# On twitter.json we're still about 6% slower, this is worth investigating. benchmark_encoding "twitter.json", JSON.load_file("#{__dir__}/data/twitter.json") # This benchmark spent the overwhelming majority of its time in `ruby_dtoa`. We rely on Ruby's implementation diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index d5c8bfd4..a76cf7d8 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -96,6 +96,73 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...) raise_generator_error_str(invalid_object, str); } +// 0 - single byte char that doesn't need to be escaped. +// (x | 8) - char that needs to be escaped. 
+static const unsigned char CHAR_LENGTH_MASK = 7; + +static const unsigned char escape_table[256] = { + // ASCII Control Characters + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + // ASCII Characters + 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // '"' + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\' + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static const unsigned char ascii_only_escape_table[256] = { + // ASCII Control Characters + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + // ASCII Characters + 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // '"' + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\' + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Continuation byte + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + // First byte of a 2-byte code point + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + // First byte of a 3-byte code point + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + //First byte of a 4+ byte code point + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9, +}; + +static const unsigned char script_safe_escape_table[256] = { + // ASCII Control Characters + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + // ASCII Characters + 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, // '"' and '/' + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\' + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Continuation byte + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + // First byte of a 2-byte code point + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + // First byte of a 3-byte code point + 3, 3,11, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE2 is the start of \u2028 and \u2029 + //First byte of a 4+ byte code point + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9, +}; + /* Converts in_string to a JSON string (without the wrapping '"' * characters) in FBuffer out_buffer. * @@ -106,13 +173,13 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...) * * - If out_ascii_only: non-ASCII characters (>0x7F) * - * - If out_script_safe: forwardslash, line separator (U+2028), and + * - If script_safe: forwardslash (/), line separator (U+2028), and * paragraph separator (U+2029) * * Everything else (should be UTF-8) is just passed through and * appended to the result. 
*/ -static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256], bool out_script_safe) +static inline void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256]) { const char *hexdig = "0123456789abcdef"; char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' }; @@ -131,7 +198,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca if (RB_UNLIKELY(ch_len)) { switch (ch_len) { - case 1: { + case 9: { FLUSH_POS(1); switch (ch) { case '"': fbuffer_append(out_buffer, "\\\"", 2); break; @@ -153,9 +220,9 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca } break; } - case 3: { + case 11: { unsigned char b2 = ptr[pos + 1]; - if (RB_UNLIKELY(out_script_safe && ch == 0xE2 && b2 == 0x80)) { + if (RB_UNLIKELY(b2 == 0x80)) { unsigned char b3 = ptr[pos + 2]; if (b3 == 0xA8) { FLUSH_POS(3); @@ -167,6 +234,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca break; } } + ch_len = 3; // fallthrough } default: @@ -186,104 +254,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca RB_GC_GUARD(str); } -static const char escape_table[256] = { - // ASCII Control Characters - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - // ASCII Characters - 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, // '"' - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\' - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - // Continuation byte - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - // First byte of a 2-byte code point - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - // First byte of a 4-byte code point - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, - //First byte of a 4+byte code point - 
4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1, -}; - -static const char script_safe_escape_table[256] = { - // ASCII Control Characters - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - // ASCII Characters - 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1, // '"' and '/' - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\' - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - // Continuation byte - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - // First byte of a 2-byte code point - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - // First byte of a 4-byte code point - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, - //First byte of a 4+byte code point - 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1, -}; - -static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256]) -{ - const char *hexdig = "0123456789abcdef"; - char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' }; - - const char *ptr = RSTRING_PTR(str); - unsigned long len = RSTRING_LEN(str); - - unsigned long beg = 0, pos; - - for (pos = 0; pos < len;) { - unsigned char ch = ptr[pos]; - /* JSON encoding */ - if (escape_table[ch]) { - if (pos > beg) { - fbuffer_append(out_buffer, &ptr[beg], pos - beg); - } - - beg = pos + 1; - switch (ch) { - case '"': fbuffer_append(out_buffer, "\\\"", 2); break; - case '\\': fbuffer_append(out_buffer, "\\\\", 2); break; - case '/': fbuffer_append(out_buffer, "\\/", 2); break; - case '\b': fbuffer_append(out_buffer, "\\b", 2); break; - case '\f': fbuffer_append(out_buffer, "\\f", 2); break; - case '\n': fbuffer_append(out_buffer, "\\n", 2); break; - case '\r': fbuffer_append(out_buffer, "\\r", 2); break; - case '\t': fbuffer_append(out_buffer, "\\t", 2); break; - default: - scratch[2] = '0'; - scratch[3] = '0'; - scratch[4] = hexdig[(ch >> 4) & 0xf]; - scratch[5] = hexdig[ch & 0xf]; - 
fbuffer_append(out_buffer, scratch, 6); - } - } - - pos++; - } - - if (beg < len) { - fbuffer_append(out_buffer, &ptr[beg], len - beg); - } - - RB_GC_GUARD(str); -} - -static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256], bool out_script_safe) +static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256]) { const char *hexdig = "0123456789abcdef"; char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' }; @@ -301,7 +272,7 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons if (RB_UNLIKELY(ch_len)) { switch (ch_len) { - case 1: { + case 9: { FLUSH_POS(1); switch (ch) { case '"': fbuffer_append(out_buffer, "\\\"", 2); break; @@ -325,6 +296,8 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons } default: { uint32_t wchar = 0; + ch_len = ch_len & CHAR_LENGTH_MASK; + switch(ch_len) { case 2: wchar = ptr[pos] & 0x1F; @@ -935,13 +908,11 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat switch(rb_enc_str_coderange(obj)) { case ENC_CODERANGE_7BIT: - convert_ASCII_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table); - break; case ENC_CODERANGE_VALID: if (RB_UNLIKELY(state->ascii_only)) { - convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table, state->script_safe); + convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : ascii_only_escape_table); } else { - convert_UTF8_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table, state->script_safe); + convert_UTF8_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table); } break; default: