Skip to content

Commit

Permalink
Add support for ES6 Unicode code point escapes
Browse files Browse the repository at this point in the history
  • Loading branch information
mathiasbynens committed Nov 21, 2013
1 parent b3c2416 commit 942b3af
Show file tree
Hide file tree
Showing 4 changed files with 286 additions and 19 deletions.
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "regjsparser",
"version": "0.0.2",
"author": "'Juilan Viereck' <[email protected]>",
"author": "'Julian Viereck' <[email protected]>",
"license": "BSD",
"main": "./parser",
"bin": "bin/parser",
Expand Down
41 changes: 23 additions & 18 deletions parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,12 @@
// CharacterEscape
// CharacterClassEscape
//
// CharacterEscape ::
// CharacterEscape[U] ::

This comment has been minimized.

Copy link
@jviereck

jviereck Nov 21, 2013

Owner

What's the purpose of the [U] annotation?

This comment has been minimized.

Copy link
@mathiasbynens

mathiasbynens Nov 21, 2013

Author Collaborator

It’s what’s being used here: http://people.mozilla.org/~jorendorff/es6-draft.html#sec-patterns

Step 8 on http://people.mozilla.org/~jorendorff/es6-draft.html#sec-runtime-semantics-regexpinitialise-abstract-operation says:

Parse P interpreted as UTF-16 encoded Unicode code points using the grammars in 21.2.1. If F contains u the goal symbol for the parse is Pattern[U]. Otherwise the goal symbol for the parse is Pattern. Throw a SyntaxError exception if P did not conform to the grammar or if all characters of P were not matched by the parse.

This comment has been minimized.

Copy link
@mathiasbynens

This comment has been minimized.

Copy link
@mathiasbynens

mathiasbynens Nov 21, 2013

Author Collaborator

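Okay, so I hadn’t considered that \u{123} is actually a valid ES5.1 regular expression (without the u flag, engines treat \u as an identity escape for the letter u and {123} as a quantifier, so it matches u repeated 123 times). For that reason, Unicode code point escapes are only allowed in regular expressions with the u flag enabled. This patch should be amended accordingly.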

Since this is the first feature that depends on the u flag, let’s do #3 first.

This comment has been minimized.

Copy link
@jviereck

jviereck Nov 22, 2013

Owner

Since this is the first feature that depends on the u flag, let’s do #3 first.

:)

// ControlEscape
// c ControlLetter
// HexEscapeSequence
// UnicodeEscapeSequence
// IdentityEscape
// RegExpUnicodeEscapeSequence[?U] (ES6)
// IdentityEscape[?U]
//
// ControlEscape ::
// one of f n r t v
Expand Down Expand Up @@ -631,7 +631,7 @@ function parse(str) {
} else if (res = matchReg(/^[dDsSwW]/)) {
return createEscapedChar(res[0]);
}
return false;
return false;
}

function parseCharacterEscape() {
Expand All @@ -644,19 +644,22 @@ function parse(str) {

var res;
if (res = matchReg(/^[fnrtv]/)) {
// ControlEscape
// ControlEscape
return createEscapedChar(res[0]);
} else if (res = matchReg(/^c([a-zA-Z])/)) {
// c ControlLetter
// c ControlLetter
return createEscaped('controlLetter', res[1], 1);
} else if (res = matchReg(/^x([0-9a-fA-F]{2})/)) {
// HexEscapeSequence
// HexEscapeSequence
return createEscaped('hex', res[1], 1);
} else if (res = matchReg(/^u([0-9a-fA-F]{4})/)) {
// UnicodeEscapeSequence
} else if (res = matchReg(/^u([0-9a-fA-F]{4})/)) {
// UnicodeEscapeSequence
return createEscaped('unicode', res[1], 1);
} else if (res = matchReg(/^u\{([0-9a-fA-F]{1,6})\}/)) {
// RegExpUnicodeEscapeSequence (ES6 Unicode code point escape)
return createEscaped('codePoint', res[1], 3);
} else {
// IdentityEscape
// IdentityEscape
return parseIdentityEscape();
}
}
Expand Down Expand Up @@ -689,10 +692,10 @@ function parse(str) {
}

if (match(ZWJ)) {
// <ZWJ>
// <ZWJ>
return createEscaped('identifier', ZWJ);
} else if (match(ZWNJ)) {
// <ZWNJ>
// <ZWNJ>
return createEscaped('identifier', ZWNJ);
}

Expand Down Expand Up @@ -739,7 +742,7 @@ function parse(str) {
function parseHelperClassRanges(atom) {
var from = pos, to, res;
if (current('-') && !next(']')) {
// ClassAtom - ClassAtom ClassRanges
// ClassAtom - ClassAtom ClassRanges
skip('-');

res = parseClassAtom();
Expand Down Expand Up @@ -777,12 +780,12 @@ function parse(str) {
}

if (current(']')) {
// ClassAtom
// ClassAtom
return [atom];
}

// ClassAtom NonemptyClassRangesNoDash
// ClassAtom - ClassAtom ClassRanges
// ClassAtom NonemptyClassRangesNoDash
// ClassAtom - ClassAtom ClassRanges
return parseHelperClassRanges(atom);
}

Expand All @@ -801,8 +804,8 @@ function parse(str) {
return res;
}

// ClassAtomNoDash NonemptyClassRangesNoDash
// ClassAtomNoDash - ClassAtom ClassRanges
// ClassAtomNoDash NonemptyClassRangesNoDash
// ClassAtomNoDash - ClassAtom ClassRanges
return parseHelperClassRanges(res);
}

Expand Down Expand Up @@ -857,6 +860,8 @@ function nodeToCharCode(node) {
switch (node.name) {
case 'unicode':
return parseInt(node.value, 16);
case 'codePoint':
return parseInt(node.value, 16);
case 'controlLetter':
return node.value.charCodeAt(0) % 32;
case 'identifier':
Expand Down
12 changes: 12 additions & 0 deletions test/parse_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@
"[\\td-G]",
"[\\u0020]",
"[\\u0061d-G]",
"[\\u{0}-\\u{A}]",
"[\\u{02}-\\u{003}]",
"[\\vd-G]",
"[\\wb-G]",
"[\\x0061d-G]",
Expand Down Expand Up @@ -403,6 +405,16 @@
"\\u044F",
"\\u0451",
"\\undefined",
"\\u{000000}",
"\\u{0}",
"\\u{1}",
"\\u{02}",
"\\u{003}",
"\\u{0004}",
"\\u{00005}",
"\\u{1D306}",
"\\u{01D306}",
"\\u{10FFFF}",
"\\w",
"\\x41",
"\\x42",
Expand Down
Loading

0 comments on commit 942b3af

Please sign in to comment.