-
Notifications
You must be signed in to change notification settings - Fork 25
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ADDED: #158: Handle surrogate pairs in http/json
The JSON string "\ud83d\udc95" has one codepoint, not two. This is because the spec allows extended characters to be encoded as a pair of 16-bit values, called a "surrogate pair". From RFC 4627:

> To escape an extended character that is not in the Basic Multilingual
> Plane, the character is represented as a twelve-character sequence,
> encoding the UTF-16 surrogate pair. So, for example, a string
> containing only the G clef character (U+1D11E) may be represented as
> "\uD834\uDD1E".

This commit fixes the JSON parser to handle such surrogate pairs.
- Loading branch information
1 parent
4a2484e
commit 095e6f2
Showing
2 changed files
with
47 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,7 +3,7 @@ | |
Author: Jan Wielemaker | ||
E-mail: [email protected] | ||
WWW: http://www.swi-prolog.org | ||
Copyright (c) 2007-2021, University of Amsterdam | ||
Copyright (c) 2007-2023, University of Amsterdam | ||
VU University Amsterdam | ||
CWI, Amsterdam | ||
SWI-Prolog Solutions b.v. | ||
|
@@ -387,17 +387,47 @@ | |
escape(0'r, _, 0'\r) :- !. | ||
escape(0't, _, 0'\t) :- !. | ||
escape(0'u, Stream, C) :- | ||
!, | ||
get_code(Stream, C1), | ||
get_code(Stream, C2), | ||
get_code(Stream, C3), | ||
get_code(Stream, C4), | ||
code_type(C1, xdigit(D1)), | ||
code_type(C2, xdigit(D2)), | ||
code_type(C3, xdigit(D3)), | ||
code_type(C4, xdigit(D4)), | ||
get_XXXX(Stream, H), | ||
( hi_surrogate(H) | ||
-> get_surrogate_tail(Stream, H, C) | ||
; C = H | ||
). | ||
|
||
%!  get_XXXX(+Stream, -C)
%
%   Read four hexadecimal digits from Stream with get_xdigit/2 and
%   unify C with the 16-bit value they spell. D1 is the most
%   significant digit and D4 the least significant one.
get_XXXX(Stream, C) :- | ||
get_xdigit(Stream, D1), | ||
get_xdigit(Stream, D2), | ||
get_xdigit(Stream, D3), | ||
get_xdigit(Stream, D4), | ||
% In Prolog `<<` (priority 400) binds tighter than `+` (priority 500),
% so this evaluates as (D1<<12) + (D2<<8) + (D3<<4) + D4.
C is D1<<12+D2<<8+D3<<4+D4. | ||
|
||
%!  get_xdigit(+Stream, -D)
%
%   Read one character code from Stream and unify D with its value as
%   a hexadecimal digit, as reported by code_type(C, xdigit(D)). If the
%   code is not a hexadecimal digit, the second clause reports
%   `hexdigit_expected` through syntax_error/2.
%
%   NOTE(review): syntax_error/2 is defined outside this hunk; it
%   presumably raises a syntax error exception -- confirm.
get_xdigit(Stream, D) :- | ||
get_code(Stream, C), | ||
code_type(C, xdigit(D)), | ||
% Commit: a valid digit must not fall through to the error clause.
!. | ||
% Reached only when the first clause failed. The offending code has
% already been read from Stream at this point.
get_xdigit(Stream, _) :- | ||
syntax_error(hexdigit_expected, Stream). | ||
|
||
%!  get_surrogate_tail(+Stream, +Hi, -Codepoint)
%
%   Called after a high surrogate Hi has been read from a \uXXXX
%   escape. Reads the second escape of the pair from Stream: a
%   backslash, the letter `u` and four hexadecimal digits (Lo). Hi and
%   Lo are then combined into Codepoint by surrogate/2.
%
%   If the next two codes are not a backslash followed by `u`, or if
%   surrogate/2 rejects the pair, `illegal_surrogate_pair` is reported
%   through syntax_error/2. A non-hexadecimal digit inside the second
%   escape is handled by get_xdigit/2, which calls syntax_error/2
%   with `hexdigit_expected` itself.
%
%   NOTE(review): syntax_error/2 is defined outside this hunk; it
%   presumably raises a syntax error exception -- confirm.
get_surrogate_tail(Stream, Hi, Codepoint) :- | ||
( get_code(Stream, 0'\\), | ||
get_code(Stream, 0'u), | ||
get_XXXX(Stream, Lo), | ||
% Fails unless Hi is a high surrogate and Lo is a low surrogate.
surrogate([Hi, Lo], Codepoint) | ||
-> true | ||
; syntax_error(illegal_surrogate_pair, Stream) | ||
). | ||
|
||
|
||
%!  hi_surrogate(+C)
%
%   True when C lies in the range 0xD800..0xDBFF (inclusive), i.e. C
%   is the first (high) half of a UTF-16 surrogate pair.
hi_surrogate(C) :- | ||
C >= 0xD800, C < 0xDC00. | ||
|
||
%!  lo_surrogate(+C)
%
%   True when C lies in the range 0xDC00..0xDFFF (inclusive), i.e. C
%   is the second (low) half of a UTF-16 surrogate pair.
lo_surrogate(C) :- | ||
C >= 0xDC00, C < 0xE000. | ||
|
||
%!  surrogate(+Pair, -Codepoint)
%
%   Pair is a list [Hi, Lo]. Succeeds when Hi satisfies hi_surrogate/1
%   and Lo satisfies lo_surrogate/1, unifying Codepoint with the code
%   point the pair encodes; fails otherwise.
%
%   Example: [0xD834, 0xDD1E] gives 0x1D11E (the G clef character):
%   (0xD834-0xD800)*0x400 = 0xD000, 0xDD1E-0xDC00 = 0x11E, and
%   0xD000 + 0x11E + 0x10000 = 0x1D11E.
surrogate([Hi, Lo], Codepoint) :- | ||
hi_surrogate(Hi), | ||
lo_surrogate(Lo), | ||
% Each half contributes 10 bits (Hi the upper ones, hence * 0x400);
% the result is offset by 0x10000.
Codepoint is (Hi - 0xD800) * 0x400 + (Lo - 0xDC00) + 0x10000. | ||
|
||
json_read_constant(0't, Stream, true) :- | ||
!, | ||
must_see(`rue`, Stream, true). | ||
|
@@ -1091,3 +1121,6 @@ | |
[ 'Illegal comment' ]. | ||
json_syntax_error(illegal_string_escape) --> | ||
[ 'Illegal escape sequence in string' ]. | ||
json_syntax_error(illegal_surrogate_pair) --> | ||
[ 'Illegal escaped surrogate pair in string' ]. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters