From 65bae698b6134f3dd41777eddb0d3d3cb6586b94 Mon Sep 17 00:00:00 2001
From: Mikael Brockman
Date: Thu, 2 Mar 2023 00:06:27 +0200
Subject: [PATCH] Fix #158: Handle surrogate pairs in http/json

The JSON string "\ud83d\udc95" has one codepoint, not two.  This is
because the spec allows extended characters to be encoded as a pair of
16-bit values, called a "surrogate pair".

From RFC 4627:

> To escape an extended character that is not in the Basic Multilingual
> Plane, the character is represented as a twelve-character sequence,
> encoding the UTF-16 surrogate pair.  So, for example, a string
> containing only the G clef character (U+1D11E) may be represented as
> "\uD834\uDD1E".

This commit fixes the JSON parser to handle such surrogate pairs.  A
high surrogate that is not followed by a well-formed low-surrogate
escape raises an `illegal_surrogate_pair` syntax error.
---
 json.pl      | 42 ++++++++++++++++++++++++++++++++++++++++++
 test_json.pl |  4 ++++
 2 files changed, 46 insertions(+)

diff --git a/json.pl b/json.pl
index b1c9080..08a1e4b 100644
--- a/json.pl
+++ b/json.pl
@@ -378,6 +378,45 @@
     get_code(Stream, C1),
     json_string_codes(C1, Stream, T).
 
+hi_surrogate(C) :-                      % true if C is a high (leading) surrogate
+    C >= 0xD800, C < 0xDC00.            % i.e. U+D800..U+DBFF
+
+lo_surrogate(C) :-                      % true if C is a low (trailing) surrogate
+    C >= 0xDC00, C < 0xE000.            % i.e. U+DC00..U+DFFF
+
+surrogate([Hi, Lo], Codepoint) :-       % combine a UTF-16 pair into one code point
+    hi_surrogate(Hi),
+    lo_surrogate(Lo),
+    Codepoint is (Hi - 0xD800) * 0x400 + (Lo - 0xDC00) + 0x10000.
+
+get_XXXX(Stream, C) :-                  % read exactly four hex digits as integer C
+    get_code(Stream, C1),
+    get_code(Stream, C2),
+    get_code(Stream, C3),
+    get_code(Stream, C4),
+    code_type(C1, xdigit(D1)),          % fails on a non-hex character
+    code_type(C2, xdigit(D2)),
+    code_type(C3, xdigit(D3)),
+    code_type(C4, xdigit(D4)),
+    C is D1<<12+D2<<8+D3<<4+D4.         % << binds tighter than + in Prolog
+
+get_surrogate_tail(Stream, Hi, Codepoint) :-
+    (   get_code(Stream, 0'\\),         % the whole "\uXXXX" tail is inside the
+        get_code(Stream, 0'u),          % condition, so a missing or malformed
+        get_XXXX(Stream, Lo),           % tail raises a syntax error rather than
+        surrogate([Hi, Lo], Codepoint)  % making the parser fail silently
+    ->  true
+    ;   syntax_error(illegal_surrogate_pair, Stream)
+    ).
+
+escape(0'u, Stream, C) :-               % \uXXXX escape, possibly a surrogate pair
+    !,
+    get_XXXX(Stream, H),
+    (   hi_surrogate(H)                 % high surrogate: a low one must follow
+    ->  get_surrogate_tail(Stream, H, C)
+    ;   C = H                           % any other value is the code point itself
+    ).
+
 escape(0'", _, 0'") :- !.
 escape(0'\\, _, 0'\\) :- !.
 escape(0'/, _, 0'/) :- !.
@@ -1091,3 +1130,6 @@
     [ 'Illegal comment' ].
 json_syntax_error(illegal_string_escape) -->
     [ 'Illegal escape sequence in string' ].
+json_syntax_error(illegal_surrogate_pair) -->      % message for syntax_error(illegal_surrogate_pair, _)
+    [ 'Illegal escaped surrogate pair in string' ].
+
diff --git a/test_json.pl b/test_json.pl
index 2f04efc..10a3d25 100644
--- a/test_json.pl
+++ b/test_json.pl
@@ -60,6 +60,10 @@
 test(string, X == '\u1234') :-
     atom_json_term('"\\u1234"', X, []).
 
+% surrogate pair (an emoji): \ud83d\udc95 must decode to the single code point U+1F495
+test(string, X == '\U0001F495') :-
+    atom_json_term('"\\ud83d\\udc95"', X, []).
+
 test(int, X == 42) :-
     atom_json_term('42', X, []).
 test(int, X == -42) :-