Skip to content

Commit

Permalink
Improve lookup tables for string escaping.
Browse files Browse the repository at this point in the history
Introduce a simplified table for the most common case, which is
`script_safe: false, ascii_only: false`.

On the `script_safe` table, now only `0xE2` does a multi-byte check.

Merge back `convert_ASCII_to_JSON`, as it no longer help much with
the simplified escape table.

```
== Encoding mixed utf8 (5003001 bytes)
ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
               after    38.000 i/100ms
Calculating -------------------------------------
               after    398.220 (± 3.0%) i/s    (2.51 ms/i) -      2.014k in   5.061659s

Comparison:
              before:      381.8 i/s
               after:      398.2 i/s - same-ish: difference falls within error

== Encoding mostly utf8 (5001001 bytes)
ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
               after    39.000 i/100ms
Calculating -------------------------------------
               after    393.337 (± 2.5%) i/s    (2.54 ms/i) -      1.989k in   5.059397s

Comparison:
              before:      304.3 i/s
               after:      393.3 i/s - 1.29x  faster

== Encoding twitter.json (466906 bytes)
ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
               after   244.000 i/100ms
Calculating -------------------------------------
               after      2.436k (± 0.9%) i/s  (410.43 μs/i) -     12.200k in   5.007702s

Comparison:
              before:     2125.9 i/s
               after:     2436.5 i/s - 1.15x  faster
```
  • Loading branch information
byroot committed Jan 3, 2025
1 parent f745ec1 commit 67246de
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 111 deletions.
4 changes: 1 addition & 3 deletions benchmark/encoder.rb
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,10 @@ def benchmark_encoding(benchmark_name, ruby_obj, check_expected: true, except: [
benchmark_encoding "mixed utf8", ([("a" * 5000) + "€" + ("a" * 5000)] * 500)
benchmark_encoding "mostly utf8", ([("€" * 3333)] * 500)

# On these benchmarks we perform well, we're on par or better.
# On these benchmarks we perform well, we're on par or a bit better.
benchmark_encoding "integers", (1_000_000..1_001_000).to_a, except: %i(json_state)
benchmark_encoding "activitypub.json", JSON.load_file("#{__dir__}/data/activitypub.json")
benchmark_encoding "citm_catalog.json", JSON.load_file("#{__dir__}/data/citm_catalog.json")

# On twitter.json we're still about 6% slower, this is worth investigating.
benchmark_encoding "twitter.json", JSON.load_file("#{__dir__}/data/twitter.json")

# This benchmark spent the overwhelming majority of its time in `ruby_dtoa`. We rely on Ruby's implementation
Expand Down
188 changes: 80 additions & 108 deletions ext/json/ext/generator/generator.c
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,74 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...)
raise_generator_error_str(invalid_object, str);
}

// 0 - single byte char that don't need to be escaped.
// (x | 8) - char that needs to be escaped.
static const unsigned char CHAR_LENGTH_MASK = 7;
static const unsigned char ESCAPE_MASK = 8;

static const unsigned char escape_table[256] = {
// ASCII Control Characters
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
// ASCII Characters
0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // '"'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};

static const unsigned char ascii_only_escape_table[256] = {
// ASCII Control Characters
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
// ASCII Characters
0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // '"'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// Continuation byte
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// First byte of a 2-byte code point
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
// First byte of a 3-byte code point
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
//First byte of a 4+ byte code point
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
};

static const unsigned char script_safe_escape_table[256] = {
// ASCII Control Characters
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
// ASCII Characters
0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, // '"' and '/'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// Continuation byte
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// First byte of a 2-byte code point
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
// First byte of a 3-byte code point
3, 3,11, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE2 is the start of \u2028 and \u2029
//First byte of a 4+ byte code point
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
};

/* Converts in_string to a JSON string (without the wrapping '"'
* characters) in FBuffer out_buffer.
*
Expand All @@ -106,13 +174,13 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...)
*
* - If out_ascii_only: non-ASCII characters (>0x7F)
*
* - If out_script_safe: forwardslash, line separator (U+2028), and
* - If script_safe: forwardslash (/), line separator (U+2028), and
* paragraph separator (U+2029)
*
* Everything else (should be UTF-8) is just passed through and
* appended to the result.
*/
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256], bool out_script_safe)
static inline void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
{
const char *hexdig = "0123456789abcdef";
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
Expand All @@ -131,7 +199,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca

if (RB_UNLIKELY(ch_len)) {
switch (ch_len) {
case 1: {
case 9: {
FLUSH_POS(1);
switch (ch) {
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
Expand All @@ -153,9 +221,9 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
}
break;
}
case 3: {
case 11: {
unsigned char b2 = ptr[pos + 1];
if (RB_UNLIKELY(out_script_safe && ch == 0xE2 && b2 == 0x80)) {
if (RB_UNLIKELY(b2 == 0x80)) {
unsigned char b3 = ptr[pos + 2];
if (b3 == 0xA8) {
FLUSH_POS(3);
Expand All @@ -167,6 +235,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
break;
}
}
ch_len = 3;
// fallthrough
}
default:
Expand All @@ -186,104 +255,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
RB_GC_GUARD(str);
}

static const char escape_table[256] = {
// ASCII Control Characters
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
// ASCII Characters
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, // '"'
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\'
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
// Continuation byte
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
// First byte of a 2-byte code point
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
// First byte of a 4-byte code point
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
//First byte of a 4+byte code point
4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
};

static const char script_safe_escape_table[256] = {
// ASCII Control Characters
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
// ASCII Characters
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1, // '"' and '/'
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\'
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
// Continuation byte
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
// First byte of a 2-byte code point
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
// First byte of a 4-byte code point
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
//First byte of a 4+byte code point
4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
};

static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256])
{
const char *hexdig = "0123456789abcdef";
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };

const char *ptr = RSTRING_PTR(str);
unsigned long len = RSTRING_LEN(str);

unsigned long beg = 0, pos;

for (pos = 0; pos < len;) {
unsigned char ch = ptr[pos];
/* JSON encoding */
if (escape_table[ch]) {
if (pos > beg) {
fbuffer_append(out_buffer, &ptr[beg], pos - beg);
}

beg = pos + 1;
switch (ch) {
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
default:
scratch[2] = '0';
scratch[3] = '0';
scratch[4] = hexdig[(ch >> 4) & 0xf];
scratch[5] = hexdig[ch & 0xf];
fbuffer_append(out_buffer, scratch, 6);
}
}

pos++;
}

if (beg < len) {
fbuffer_append(out_buffer, &ptr[beg], len - beg);
}

RB_GC_GUARD(str);
}

static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256], bool out_script_safe)
static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
{
const char *hexdig = "0123456789abcdef";
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
Expand All @@ -301,7 +273,7 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons

if (RB_UNLIKELY(ch_len)) {
switch (ch_len) {
case 1: {
case 1 | ESCAPE_MASK: {
FLUSH_POS(1);
switch (ch) {
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
Expand All @@ -325,6 +297,8 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons
}
default: {
uint32_t wchar = 0;
ch_len = ch_len & CHAR_LENGTH_MASK;

switch(ch_len) {
case 2:
wchar = ptr[pos] & 0x1F;
Expand Down Expand Up @@ -935,13 +909,11 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat

switch(rb_enc_str_coderange(obj)) {
case ENC_CODERANGE_7BIT:
convert_ASCII_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table);
break;
case ENC_CODERANGE_VALID:
if (RB_UNLIKELY(state->ascii_only)) {
convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table, state->script_safe);
convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
} else {
convert_UTF8_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table, state->script_safe);
convert_UTF8_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table);
}
break;
default:
Expand Down

0 comments on commit 67246de

Please sign in to comment.