Skip to content

Commit

Permalink
Fix SWI-Prolog#158: Handle surrogate pairs in http/json
Browse files Browse the repository at this point in the history
The JSON string "\ud83d\udc95" has one codepoint, not two.

This is because the spec allows extended characters to be
encoded as a pair of 16-bit values, called a "surrogate pair".

From RFC 4627:

> To escape an extended character that is not in the Basic Multilingual
> Plane, the character is represented as a twelve-character sequence,
> encoding the UTF-16 surrogate pair.  So, for example, a string
> containing only the G clef character (U+1D11E) may be represented as
> "\uD834\uDD1E".

This commit fixes the JSON parser to handle such surrogate pairs.
  • Loading branch information
mbrock committed Mar 1, 2023
1 parent 4a2484e commit 65bae69
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 0 deletions.
42 changes: 42 additions & 0 deletions json.pl
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,45 @@
get_code(Stream, C1),
json_string_codes(C1, Stream, T).

hi_surrogate(C) :-
C >= 0xD800, C < 0xDC00.

lo_surrogate(C) :-
C >= 0xDC00, C < 0xE000.

surrogate([Hi, Lo], Codepoint) :-
hi_surrogate(Hi),
lo_surrogate(Lo),
Codepoint is (Hi - 0xD800) * 0x400 + (Lo - 0xDC00) + 0x10000.

get_XXXX(Stream, C) :-
get_code(Stream, C1),
get_code(Stream, C2),
get_code(Stream, C3),
get_code(Stream, C4),
code_type(C1, xdigit(D1)),
code_type(C2, xdigit(D2)),
code_type(C3, xdigit(D3)),
code_type(C4, xdigit(D4)),
C is D1<<12+D2<<8+D3<<4+D4.

get_surrogate_tail(Stream, Hi, Codepoint) :-
get_code(Stream, 0'\\),
get_code(Stream, 0'u),
get_XXXX(Stream, Lo),
( surrogate([Hi, Lo], Codepoint)
-> true
; syntax_error(illegal_surrogate_pair, Stream)
).

json:escape(0'u, Stream, C) :-
!,
get_XXXX(Stream, H),
( hi_surrogate(H) ->
get_surrogate_tail(Stream, H, C)
; C = H
).

escape(0'", _, 0'") :- !.
escape(0'\\, _, 0'\\) :- !.
escape(0'/, _, 0'/) :- !.
Expand Down Expand Up @@ -1091,3 +1130,6 @@
[ 'Illegal comment' ].
json_syntax_error(illegal_string_escape) -->
[ 'Illegal escape sequence in string' ].
json_syntax_error(illegal_surrogate_pair) -->
[ 'Illegal escaped surrogate pair in string' ].

4 changes: 4 additions & 0 deletions test_json.pl
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@
test(string, X == '\u1234') :-
atom_json_term('"\\u1234"', X, []).

% surrogate pair (an emoji)
test(string, X == '\U0001F495') :-
atom_json_term('"\\ud83d\\udc95"', X, []).

test(int, X == 42) :-
atom_json_term('42', X, []).
test(int, X == -42) :-
Expand Down

0 comments on commit 65bae69

Please sign in to comment.