Merge pull request #595 from casperisfine/avoid-utf8-checks

JSON.dump: avoid redundant UTF-8 validation
ruby · Oct 3, 2024 · 9513e46 · 9513e46
2 parents 572aa64 + 4b04c46
commit 9513e46
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 50 deletions.
diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c
@@ -66,42 +66,6 @@ static const char trailingBytesForUTF8[256] = {
 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
     0x03C82080UL, 0xFA082080UL, 0x82082080UL };
 
-/*
- * Utility routine to tell whether a sequence of bytes is legal UTF-8.
- * This must be called with the length pre-determined by the first byte.
- * If not calling this from ConvertUTF8to*, then the length can be set by:
- *  length = trailingBytesForUTF8[*source]+1;
- * and the sequence is illegal right away if there aren't that many bytes
- * available.
- * If presented with a length > 4, this returns 0.  The Unicode
- * definition of UTF-8 goes up to 4-byte sequences.
- */
-static unsigned char isLegalUTF8(const UTF8 *source, unsigned long length)
-{
-    UTF8 a;
-    const UTF8 *srcptr = source+length;
-    switch (length) {
-        default: return 0;
-                 /* Everything else falls through when "1"... */
-        case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
-        case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
-        case 2: if ((a = (*--srcptr)) > 0xBF) return 0;
-
-                    switch (*source) {
-                        /* no fall-through in this inner switch */
-                        case 0xE0: if (a < 0xA0) return 0; break;
-                        case 0xED: if (a > 0x9F) return 0; break;
-                        case 0xF0: if (a < 0x90) return 0; break;
-                        case 0xF4: if (a > 0x8F) return 0; break;
-                        default:   if (a < 0x80) return 0;
-                    }
-
-        case 1: if (*source >= 0x80 && *source < 0xC2) return 0;
-    }
-    if (*source > 0xF4) return 0;
-    return 1;
-}
-
 /* Escapes the UTF16 character and stores the result in the buffer buf. */
 static void unicode_escape(char *buf, UTF16 character)
 {
@@ -130,17 +94,18 @@ static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string, char scrip
     const UTF8 *sourceEnd = source + RSTRING_LEN(string);
     char buf[6] = { '\\', 'u' };
 
-    while (source < sourceEnd) {
-        UTF32 ch = 0;
-        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
-        if (source + extraBytesToRead >= sourceEnd) {
-            rb_raise(rb_path2class("JSON::GeneratorError"),
-                    "partial character in source, but hit end");
-        }
-        if (!isLegalUTF8(source, extraBytesToRead+1)) {
+    int ascii_only = rb_enc_str_asciionly_p(string);
+
+    if (!ascii_only) {
+        if (RB_ENCODING_GET_INLINED(string) != rb_utf8_encindex() || RB_ENC_CODERANGE(string) != RUBY_ENC_CODERANGE_VALID) {
             rb_raise(rb_path2class("JSON::GeneratorError"),
                     "source sequence is illegal/malformed utf-8");
         }
+    }
+
+    while (source < sourceEnd) {
+        UTF32 ch = 0;
+        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
         /*
          * The cases all fall through. See "Note A" below.
          */
@@ -238,6 +203,13 @@ static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string, char script_safe
     char buf[6] = { '\\', 'u' };
     int ascii_only = rb_enc_str_asciionly_p(string);
 
+    if (!ascii_only) {
+        if (RB_ENCODING_GET_INLINED(string) != rb_utf8_encindex() || RB_ENC_CODERANGE(string) != RUBY_ENC_CODERANGE_VALID) {
+            rb_raise(rb_path2class("JSON::GeneratorError"),
+                    "source sequence is illegal/malformed utf-8");
+        }
+    }
+
     for (start = 0, end = 0; end < len;) {
         p = ptr + end;
         c = (unsigned char) *p;
@@ -309,11 +281,6 @@ static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string, char script_safe
                                     continue;
                                 }
                             }
-
-                            if (!isLegalUTF8((UTF8 *) p, clen)) {
-                                rb_raise(rb_path2class("JSON::GeneratorError"),
-                                        "source sequence is illegal/malformed utf-8");
-                            }
                         }
                         end += clen;
                     }

diff --git a/ext/json/ext/generator/generator.h b/ext/json/ext/generator/generator.h
@@ -46,7 +46,6 @@ static const int halfShift  = 10; /* used for shifting by 10 bits */
 static const UTF32 halfBase = 0x0010000UL;
 static const UTF32 halfMask = 0x3FFUL;
 
-static unsigned char isLegalUTF8(const UTF8 *source, unsigned long length);
 static void unicode_escape(char *buf, UTF16 character);
 static void unicode_escape_to_buffer(FBuffer *buffer, char buf[6], UTF16 character);
 static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string, char script_safe);