Skip to content

Commit

Permalink
Merge pull request #595 from casperisfine/avoid-utf8-checks
Browse files Browse the repository at this point in the history
JSON.dump: avoid redundant UTF-8 validation
  • Loading branch information
hsbt authored Oct 3, 2024
2 parents 572aa64 + 4b04c46 commit 9513e46
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 50 deletions.
65 changes: 16 additions & 49 deletions ext/json/ext/generator/generator.c
Original file line number Diff line number Diff line change
Expand Up @@ -66,42 +66,6 @@ static const char trailingBytesForUTF8[256] = {
static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
0x03C82080UL, 0xFA082080UL, 0x82082080UL };

/*
* Utility routine to tell whether a sequence of bytes is legal UTF-8.
* This must be called with the length pre-determined by the first byte.
* If not calling this from ConvertUTF8to*, then the length can be set by:
* length = trailingBytesForUTF8[*source]+1;
* and the sequence is illegal right away if there aren't that many bytes
* available.
* If presented with a length > 4, this returns 0. The Unicode
* definition of UTF-8 goes up to 4-byte sequences.
*/
static unsigned char isLegalUTF8(const UTF8 *source, unsigned long length)
{
UTF8 a;
const UTF8 *srcptr = source+length;
switch (length) {
default: return 0;
/* Everything else falls through when "1"... */
case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
case 2: if ((a = (*--srcptr)) > 0xBF) return 0;

switch (*source) {
/* no fall-through in this inner switch */
case 0xE0: if (a < 0xA0) return 0; break;
case 0xED: if (a > 0x9F) return 0; break;
case 0xF0: if (a < 0x90) return 0; break;
case 0xF4: if (a > 0x8F) return 0; break;
default: if (a < 0x80) return 0;
}

case 1: if (*source >= 0x80 && *source < 0xC2) return 0;
}
if (*source > 0xF4) return 0;
return 1;
}

/* Escapes the UTF16 character and stores the result in the buffer buf. */
static void unicode_escape(char *buf, UTF16 character)
{
Expand Down Expand Up @@ -130,17 +94,18 @@ static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string, char scrip
const UTF8 *sourceEnd = source + RSTRING_LEN(string);
char buf[6] = { '\\', 'u' };

while (source < sourceEnd) {
UTF32 ch = 0;
unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
if (source + extraBytesToRead >= sourceEnd) {
rb_raise(rb_path2class("JSON::GeneratorError"),
"partial character in source, but hit end");
}
if (!isLegalUTF8(source, extraBytesToRead+1)) {
int ascii_only = rb_enc_str_asciionly_p(string);

if (!ascii_only) {
if (RB_ENCODING_GET_INLINED(string) != rb_utf8_encindex() || RB_ENC_CODERANGE(string) != RUBY_ENC_CODERANGE_VALID) {
rb_raise(rb_path2class("JSON::GeneratorError"),
"source sequence is illegal/malformed utf-8");
}
}

while (source < sourceEnd) {
UTF32 ch = 0;
unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
/*
* The cases all fall through. See "Note A" below.
*/
Expand Down Expand Up @@ -238,6 +203,13 @@ static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string, char script_safe
char buf[6] = { '\\', 'u' };
int ascii_only = rb_enc_str_asciionly_p(string);

if (!ascii_only) {
if (RB_ENCODING_GET_INLINED(string) != rb_utf8_encindex() || RB_ENC_CODERANGE(string) != RUBY_ENC_CODERANGE_VALID) {
rb_raise(rb_path2class("JSON::GeneratorError"),
"source sequence is illegal/malformed utf-8");
}
}

for (start = 0, end = 0; end < len;) {
p = ptr + end;
c = (unsigned char) *p;
Expand Down Expand Up @@ -309,11 +281,6 @@ static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string, char script_safe
continue;
}
}

if (!isLegalUTF8((UTF8 *) p, clen)) {
rb_raise(rb_path2class("JSON::GeneratorError"),
"source sequence is illegal/malformed utf-8");
}
}
end += clen;
}
Expand Down
1 change: 0 additions & 1 deletion ext/json/ext/generator/generator.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ static const int halfShift = 10; /* used for shifting by 10 bits */
static const UTF32 halfBase = 0x0010000UL;
static const UTF32 halfMask = 0x3FFUL;

static unsigned char isLegalUTF8(const UTF8 *source, unsigned long length);
static void unicode_escape(char *buf, UTF16 character);
static void unicode_escape_to_buffer(FBuffer *buffer, char buf[6], UTF16 character);
static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string, char script_safe);
Expand Down

0 comments on commit 9513e46

Please sign in to comment.