Skip to content

Commit

Permalink
ADDED: #158: Handle surrogate pairs in http/json
Browse files Browse the repository at this point in the history
The JSON string "\ud83d\udc95" has one codepoint, not two.

This is because the spec allows extended characters to be
encoded as a pair of 16-bit values, called a "surrogate pair".

From RFC 4627:

> To escape an extended character that is not in the Basic Multilingual
> Plane, the character is represented as a twelve-character sequence,
> encoding the UTF-16 surrogate pair.  So, for example, a string
> containing only the G clef character (U+1D11E) may be represented as
> "\uD834\uDD1E".

This commit fixes the JSON parser to handle such surrogate pairs.
  • Loading branch information
mbrock authored and JanWielemaker committed Mar 2, 2023
1 parent 4a2484e commit 095e6f2
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 10 deletions.
53 changes: 43 additions & 10 deletions json.pl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Author: Jan Wielemaker
E-mail: [email protected]
WWW: http://www.swi-prolog.org
Copyright (c) 2007-2021, University of Amsterdam
Copyright (c) 2007-2023, University of Amsterdam
VU University Amsterdam
CWI, Amsterdam
SWI-Prolog Solutions b.v.
Expand Down Expand Up @@ -387,17 +387,47 @@
escape(0'r, _, 0'\r) :- !.
escape(0't, _, 0'\t) :- !.
escape(0'u, Stream, C) :-
!,
get_code(Stream, C1),
get_code(Stream, C2),
get_code(Stream, C3),
get_code(Stream, C4),
code_type(C1, xdigit(D1)),
code_type(C2, xdigit(D2)),
code_type(C3, xdigit(D3)),
code_type(C4, xdigit(D4)),
get_XXXX(Stream, H),
( hi_surrogate(H)
-> get_surrogate_tail(Stream, H, C)
; C = H
).

get_XXXX(Stream, C) :-
get_xdigit(Stream, D1),
get_xdigit(Stream, D2),
get_xdigit(Stream, D3),
get_xdigit(Stream, D4),
C is D1<<12+D2<<8+D3<<4+D4.

get_xdigit(Stream, D) :-
get_code(Stream, C),
code_type(C, xdigit(D)),
!.
get_xdigit(Stream, _) :-
syntax_error(hexdigit_expected, Stream).

get_surrogate_tail(Stream, Hi, Codepoint) :-
( get_code(Stream, 0'\\),
get_code(Stream, 0'u),
get_XXXX(Stream, Lo),
surrogate([Hi, Lo], Codepoint)
-> true
; syntax_error(illegal_surrogate_pair, Stream)
).


hi_surrogate(C) :-
C >= 0xD800, C < 0xDC00.

lo_surrogate(C) :-
C >= 0xDC00, C < 0xE000.

surrogate([Hi, Lo], Codepoint) :-
hi_surrogate(Hi),
lo_surrogate(Lo),
Codepoint is (Hi - 0xD800) * 0x400 + (Lo - 0xDC00) + 0x10000.

json_read_constant(0't, Stream, true) :-
!,
must_see(`rue`, Stream, true).
Expand Down Expand Up @@ -1091,3 +1121,6 @@
[ 'Illegal comment' ].
json_syntax_error(illegal_string_escape) -->
[ 'Illegal escape sequence in string' ].
json_syntax_error(illegal_surrogate_pair) -->
[ 'Illegal escaped surrogate pair in string' ].

4 changes: 4 additions & 0 deletions test_json.pl
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@
test(string, X == '\u1234') :-
atom_json_term('"\\u1234"', X, []).

% surrogate pair (an emoji)
test(string, X == '\U0001F495') :-
atom_json_term('"\\ud83d\\udc95"', X, []).

test(int, X == 42) :-
atom_json_term('42', X, []).
test(int, X == -42) :-
Expand Down

0 comments on commit 095e6f2

Please sign in to comment.