From 095e6f2babdae56750289fce8539030d6bdf47ee Mon Sep 17 00:00:00 2001 From: Mikael Brockman Date: Thu, 2 Mar 2023 00:06:27 +0200 Subject: [PATCH] ADDED: #158: Handle surrogate pairs in http/json The JSON string "\ud83d\udc95" has one codepoint, not two. This is because the spec allows extended characters to be encoded as a pair of 16-bit values, called a "surrogate pair". From RFC 4627: > To escape an extended character that is not in the Basic Multilingual > Plane, the character is represented as a twelve-character sequence, > encoding the UTF-16 surrogate pair. So, for example, a string > containing only the G clef character (U+1D11E) may be represented as > "\uD834\uDD1E". This commit fixes the JSON parser to handle such surrogate pairs. --- json.pl | 53 ++++++++++++++++++++++++++++++++++++++++++---------- test_json.pl | 4 ++++ 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/json.pl b/json.pl index b1c9080..fb7dedd 100644 --- a/json.pl +++ b/json.pl @@ -3,7 +3,7 @@ Author: Jan Wielemaker E-mail: J.Wielemaker@vu.nl WWW: http://www.swi-prolog.org - Copyright (c) 2007-2021, University of Amsterdam + Copyright (c) 2007-2023, University of Amsterdam VU University Amsterdam CWI, Amsterdam SWI-Prolog Solutions b.v. @@ -387,17 +387,47 @@ escape(0'r, _, 0'\r) :- !. escape(0't, _, 0'\t) :- !. escape(0'u, Stream, C) :- - !, - get_code(Stream, C1), - get_code(Stream, C2), - get_code(Stream, C3), - get_code(Stream, C4), - code_type(C1, xdigit(D1)), - code_type(C2, xdigit(D2)), - code_type(C3, xdigit(D3)), - code_type(C4, xdigit(D4)), + get_XXXX(Stream, H), + ( hi_surrogate(H) + -> get_surrogate_tail(Stream, H, C) + ; C = H + ). + +get_XXXX(Stream, C) :- + get_xdigit(Stream, D1), + get_xdigit(Stream, D2), + get_xdigit(Stream, D3), + get_xdigit(Stream, D4), C is D1<<12+D2<<8+D3<<4+D4. +get_xdigit(Stream, D) :- + get_code(Stream, C), + code_type(C, xdigit(D)), + !. +get_xdigit(Stream, _) :- + syntax_error(hexdigit_expected, Stream). + +get_surrogate_tail(Stream, Hi, Codepoint) :- + ( get_code(Stream, 0'\\), + get_code(Stream, 0'u), + get_XXXX(Stream, Lo), + surrogate([Hi, Lo], Codepoint) + -> true + ; syntax_error(illegal_surrogate_pair, Stream) + ). + + +hi_surrogate(C) :- + C >= 0xD800, C < 0xDC00. + +lo_surrogate(C) :- + C >= 0xDC00, C < 0xE000. + +surrogate([Hi, Lo], Codepoint) :- + hi_surrogate(Hi), + lo_surrogate(Lo), + Codepoint is (Hi - 0xD800) * 0x400 + (Lo - 0xDC00) + 0x10000. + json_read_constant(0't, Stream, true) :- !, must_see(`rue`, Stream, true). @@ -1091,3 +1121,6 @@ [ 'Illegal comment' ]. json_syntax_error(illegal_string_escape) --> [ 'Illegal escape sequence in string' ]. +json_syntax_error(illegal_surrogate_pair) --> + [ 'Illegal escaped surrogate pair in string' ]. + diff --git a/test_json.pl b/test_json.pl index 2f04efc..10a3d25 100644 --- a/test_json.pl +++ b/test_json.pl @@ -60,6 +60,10 @@ test(string, X == '\u1234') :- atom_json_term('"\\u1234"', X, []). +% surrogate pair (an emoji) +test(string, X == '\U0001F495') :- + atom_json_term('"\\ud83d\\udc95"', X, []). + test(int, X == 42) :- atom_json_term('42', X, []). test(int, X == -42) :-