diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 766e199f7..9ecbfbeaa 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -66,42 +66,6 @@ static const char trailingBytesForUTF8[256] = { static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; -/* - * Utility routine to tell whether a sequence of bytes is legal UTF-8. - * This must be called with the length pre-determined by the first byte. - * If not calling this from ConvertUTF8to*, then the length can be set by: - * length = trailingBytesForUTF8[*source]+1; - * and the sequence is illegal right away if there aren't that many bytes - * available. - * If presented with a length > 4, this returns 0. The Unicode - * definition of UTF-8 goes up to 4-byte sequences. - */ -static unsigned char isLegalUTF8(const UTF8 *source, unsigned long length) -{ - UTF8 a; - const UTF8 *srcptr = source+length; - switch (length) { - default: return 0; - /* Everything else falls through when "1"... */ - case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0; - case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0; - case 2: if ((a = (*--srcptr)) > 0xBF) return 0; - - switch (*source) { - /* no fall-through in this inner switch */ - case 0xE0: if (a < 0xA0) return 0; break; - case 0xED: if (a > 0x9F) return 0; break; - case 0xF0: if (a < 0x90) return 0; break; - case 0xF4: if (a > 0x8F) return 0; break; - default: if (a < 0x80) return 0; - } - - case 1: if (*source >= 0x80 && *source < 0xC2) return 0; - } - if (*source > 0xF4) return 0; - return 1; -} - /* Escapes the UTF16 character and stores the result in the buffer buf. */ static void unicode_escape(char *buf, UTF16 character) { @@ -130,17 +94,18 @@ static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string, char scrip const UTF8 *sourceEnd = source + RSTRING_LEN(string); char buf[6] = { '\\', 'u' }; - while (source < sourceEnd) { - UTF32 ch = 0; - unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if (source + extraBytesToRead >= sourceEnd) { - rb_raise(rb_path2class("JSON::GeneratorError"), - "partial character in source, but hit end"); - } - if (!isLegalUTF8(source, extraBytesToRead+1)) { + int ascii_only = rb_enc_str_asciionly_p(string); + + if (!ascii_only) { + if (RB_ENCODING_GET_INLINED(string) != rb_utf8_encindex() || RB_ENC_CODERANGE(string) != RUBY_ENC_CODERANGE_VALID) { rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8"); } + } + + while (source < sourceEnd) { + UTF32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; /* * The cases all fall through. See "Note A" below. */ @@ -238,6 +203,13 @@ static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string, char script_safe char buf[6] = { '\\', 'u' }; int ascii_only = rb_enc_str_asciionly_p(string); + if (!ascii_only) { + if (RB_ENCODING_GET_INLINED(string) != rb_utf8_encindex() || RB_ENC_CODERANGE(string) != RUBY_ENC_CODERANGE_VALID) { + rb_raise(rb_path2class("JSON::GeneratorError"), + "source sequence is illegal/malformed utf-8"); + } + } + for (start = 0, end = 0; end < len;) { p = ptr + end; c = (unsigned char) *p; @@ -309,11 +281,6 @@ static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string, char script_safe continue; } } - - if (!isLegalUTF8((UTF8 *) p, clen)) { - rb_raise(rb_path2class("JSON::GeneratorError"), - "source sequence is illegal/malformed utf-8"); - } } end += clen; } diff --git a/ext/json/ext/generator/generator.h b/ext/json/ext/generator/generator.h index 98b7d833f..8dde7be82 100644 --- a/ext/json/ext/generator/generator.h +++ b/ext/json/ext/generator/generator.h @@ -46,7 +46,6 @@ static const int halfShift = 10; /* used for shifting by 10 bits */ static const UTF32 halfBase = 0x0010000UL; static const UTF32 halfMask = 0x3FFUL; -static unsigned char isLegalUTF8(const UTF8 *source, unsigned long length); static void unicode_escape(char *buf, UTF16 character); static void unicode_escape_to_buffer(FBuffer *buffer, char buf[6], UTF16 character); static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string, char script_safe);