Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add test for parsing broken strings and use String#encode instead of rb_str_conv_enc() in parser #665

Merged
merged 4 commits into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 17 additions & 18 deletions ext/json/ext/parser/parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,14 +89,14 @@ static void raise_parse_error(const char *format, const char *start)
rb_enc_raise(rb_utf8_encoding(), rb_path2class("JSON::ParserError"), format, ptr);
}

static VALUE mJSON, mExt, cParser, eNestingError;
static VALUE mJSON, mExt, cParser, eNestingError, Encoding_UTF_8;
static VALUE CNaN, CInfinity, CMinusInfinity;

static ID i_json_creatable_p, i_json_create, i_create_id, i_create_additions,
i_chr, i_max_nesting, i_allow_nan, i_symbolize_names,
i_object_class, i_array_class, i_decimal_class,
i_deep_const_get, i_match, i_match_string, i_aset, i_aref,
i_leftshift, i_new, i_try_convert, i_freeze, i_uminus;
i_leftshift, i_new, i_try_convert, i_freeze, i_uminus, i_encode;

static int binary_encindex;
static int utf8_encindex;
Expand Down Expand Up @@ -1797,16 +1797,11 @@ static VALUE convert_encoding(VALUE source)
}

if (encindex == binary_encindex) {
// For historical reason, we silently reinterpret binary strings as UTF-8 if it would work.
VALUE utf8_string = rb_enc_associate_index(rb_str_dup(source), utf8_encindex);
switch (rb_enc_str_coderange(utf8_string)) {
case ENC_CODERANGE_7BIT:
case ENC_CODERANGE_VALID:
return utf8_string;
}
// For historical reason, we silently reinterpret binary strings as UTF-8
return rb_enc_associate_index(rb_str_dup(source), utf8_encindex);
}

return rb_str_conv_enc(source, rb_enc_from_index(encindex), rb_utf8_encoding());
return rb_funcall(source, i_encode, 1, Encoding_UTF_8);
}

/*
Expand Down Expand Up @@ -1958,15 +1953,15 @@ static VALUE cParser_initialize(int argc, VALUE *argv, VALUE self)
}


#line 1962 "parser.c"
#line 1957 "parser.c"
enum {JSON_start = 1};
enum {JSON_first_final = 10};
enum {JSON_error = 0};

enum {JSON_en_main = 1};


#line 870 "parser.rl"
#line 865 "parser.rl"


/*
Expand All @@ -1984,16 +1979,16 @@ static VALUE cParser_parse(VALUE self)
GET_PARSER;


#line 1988 "parser.c"
#line 1983 "parser.c"
{
cs = JSON_start;
}

#line 887 "parser.rl"
#line 882 "parser.rl"
p = json->source;
pe = p + json->len;

#line 1997 "parser.c"
#line 1992 "parser.c"
{
if ( p == pe )
goto _test_eof;
Expand Down Expand Up @@ -2027,7 +2022,7 @@ case 1:
cs = 0;
goto _out;
tr2:
#line 862 "parser.rl"
#line 857 "parser.rl"
{
char *np = JSON_parse_value(json, p, pe, &result, 0);
if (np == NULL) { p--; {p++; cs = 10; goto _out;} } else {p = (( np))-1;}
Expand All @@ -2037,7 +2032,7 @@ cs = 0;
if ( ++p == pe )
goto _test_eof10;
case 10:
#line 2041 "parser.c"
#line 2036 "parser.c"
switch( (*p) ) {
case 13: goto st10;
case 32: goto st10;
Expand Down Expand Up @@ -2126,7 +2121,7 @@ case 9:
_out: {}
}

#line 890 "parser.rl"
#line 885 "parser.rl"

if (cs >= JSON_first_final && p == pe) {
return result;
Expand Down Expand Up @@ -2214,6 +2209,9 @@ void Init_parser(void)
CMinusInfinity = rb_const_get(mJSON, rb_intern("MinusInfinity"));
rb_gc_register_mark_object(CMinusInfinity);

rb_global_variable(&Encoding_UTF_8);
Encoding_UTF_8 = rb_const_get(rb_path2class("Encoding"), rb_intern("UTF_8"));

i_json_creatable_p = rb_intern("json_creatable?");
i_json_create = rb_intern("json_create");
i_create_id = rb_intern("create_id");
Expand All @@ -2235,6 +2233,7 @@ void Init_parser(void)
i_try_convert = rb_intern("try_convert");
i_freeze = rb_intern("freeze");
i_uminus = rb_intern("-@");
i_encode = rb_intern("encode");

binary_encindex = rb_ascii8bit_encindex();
utf8_encindex = rb_utf8_encindex();
Expand Down
19 changes: 9 additions & 10 deletions ext/json/ext/parser/parser.rl
Original file line number Diff line number Diff line change
Expand Up @@ -87,14 +87,14 @@ static void raise_parse_error(const char *format, const char *start)
rb_enc_raise(rb_utf8_encoding(), rb_path2class("JSON::ParserError"), format, ptr);
}

static VALUE mJSON, mExt, cParser, eNestingError;
static VALUE mJSON, mExt, cParser, eNestingError, Encoding_UTF_8;
static VALUE CNaN, CInfinity, CMinusInfinity;

static ID i_json_creatable_p, i_json_create, i_create_id, i_create_additions,
i_chr, i_max_nesting, i_allow_nan, i_symbolize_names,
i_object_class, i_array_class, i_decimal_class,
i_deep_const_get, i_match, i_match_string, i_aset, i_aref,
i_leftshift, i_new, i_try_convert, i_freeze, i_uminus;
i_leftshift, i_new, i_try_convert, i_freeze, i_uminus, i_encode;

static int binary_encindex;
static int utf8_encindex;
Expand Down Expand Up @@ -692,16 +692,11 @@ static VALUE convert_encoding(VALUE source)
}

if (encindex == binary_encindex) {
// For historical reason, we silently reinterpret binary strings as UTF-8 if it would work.
VALUE utf8_string = rb_enc_associate_index(rb_str_dup(source), utf8_encindex);
switch (rb_enc_str_coderange(utf8_string)) {
case ENC_CODERANGE_7BIT:
case ENC_CODERANGE_VALID:
return utf8_string;
}
// For historical reason, we silently reinterpret binary strings as UTF-8
return rb_enc_associate_index(rb_str_dup(source), utf8_encindex);
}

return rb_str_conv_enc(source, rb_enc_from_index(encindex), rb_utf8_encoding());
return rb_funcall(source, i_encode, 1, Encoding_UTF_8);
}

/*
Expand Down Expand Up @@ -974,6 +969,9 @@ void Init_parser(void)
CMinusInfinity = rb_const_get(mJSON, rb_intern("MinusInfinity"));
rb_gc_register_mark_object(CMinusInfinity);

rb_global_variable(&Encoding_UTF_8);
Encoding_UTF_8 = rb_const_get(rb_path2class("Encoding"), rb_intern("UTF_8"));

i_json_creatable_p = rb_intern("json_creatable?");
i_json_create = rb_intern("json_create");
i_create_id = rb_intern("create_id");
Expand All @@ -995,6 +993,7 @@ void Init_parser(void)
i_try_convert = rb_intern("try_convert");
i_freeze = rb_intern("freeze");
i_uminus = rb_intern("-@");
i_encode = rb_intern("encode");

binary_encindex = rb_ascii8bit_encindex();
utf8_encindex = rb_utf8_encindex();
Expand Down
2 changes: 1 addition & 1 deletion lib/json/pure/generator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def utf8_to_json_ascii(string, script_safe = false) # :nodoc:
)/nx) { |c|
c.size == 1 and raise GeneratorError, "invalid utf8 byte: '#{c}'"
s = c.encode(::Encoding::UTF_16BE, ::Encoding::UTF_8).unpack('H*')[0]
s.force_encoding(::Encoding::ASCII_8BIT)
s.force_encoding(::Encoding::BINARY)
s.gsub!(/.{4}/n, '\\\\u\&')
s.force_encoding(::Encoding::UTF_8)
}
Expand Down
4 changes: 2 additions & 2 deletions lib/json/pure/parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -143,9 +143,9 @@ def convert_encoding(source)
raise TypeError,
"#{source.inspect} is not like a string"
end
if source.encoding != ::Encoding::ASCII_8BIT
if source.encoding != ::Encoding::BINARY
source = source.encode(::Encoding::UTF_8)
source.force_encoding(::Encoding::ASCII_8BIT)
source.force_encoding(::Encoding::BINARY)
end
source
end
Expand Down
17 changes: 17 additions & 0 deletions test/json/json_parser_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,23 @@ def test_parse_some_strings
)
end

# The test below is only defined when not running on JRuby (see the linked issue).
if RUBY_ENGINE != "jruby" # https://github.com/ruby/json/issues/138
# Checks how the parser handles a JSON string whose content is the lone byte
# 0x80 under three different input encodings.
def test_parse_broken_string
# Case 1: the literal is passed to parse as written. The byte is expected to
# come back unchanged in a string tagged UTF-8 whose encoding is not valid.
s = parse(%{["\x80"]})[0]
assert_equal("\x80", s)
assert_equal Encoding::UTF_8, s.encoding
assert_equal false, s.valid_encoding?

# Case 2: same bytes, but String#b hands the parser a binary (ASCII-8BIT)
# copy. The expected result is identical to case 1.
s = parse(%{["\x80"]}.b)[0]
assert_equal("\x80", s)
assert_equal Encoding::UTF_8, s.encoding
assert_equal false, s.valid_encoding?

# Case 3: same bytes force-tagged as US-ASCII (dup first so the literal itself
# is not mutated). Here parse is expected to raise instead of passing the
# byte through.
input = %{["\x80"]}.dup.force_encoding(Encoding::US_ASCII)
assert_raise(Encoding::InvalidByteSequenceError) { parse(input) }
end
end

def test_parse_big_integers
json1 = JSON(orig = (1 << 31) - 1)
assert_equal orig, parse(json1)
Expand Down
Loading