From 066c6719d25f1a017d61a7b1285cec1a07e4d9ad Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Thu, 2 Sep 2021 21:32:56 +0900 Subject: [PATCH 01/27] CI: Drop Java 14 and 15 testing, add Java 17 --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e44ead17..168eb0db 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -7,7 +7,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - java: [11, 14, 15-ea, 16-ea] + java: [11, 16, 17-ea] os: [ubuntu-latest, macos-latest, windows-latest] name: Java ${{ matrix.java }} steps: From 372b1e10be9319128f4c8390def735e4a0d5d699 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Fri, 24 Sep 2021 21:22:10 +0900 Subject: [PATCH 02/27] CI: Update to Java 17 (from 17-ea) --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 168eb0db..a764250a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -7,7 +7,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - java: [11, 16, 17-ea] + java: [11, 16, 17] os: [ubuntu-latest, macos-latest, windows-latest] name: Java ${{ matrix.java }} steps: From fbc2cfb9508add16323886eca38aec7e753ae841 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Wed, 29 Sep 2021 14:49:14 +0900 Subject: [PATCH 03/27] CI: Add Java 18-ea; drop 16; pin to 11.0.3; use zulu --- .github/workflows/build.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a764250a..00f9be43 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -7,7 +7,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - java: [11, 16, 17] + java: [17, 11.0.3, 18-ea] os: [ubuntu-latest, macos-latest, windows-latest] name: Java ${{ matrix.java }} steps: @@ -17,6 +17,7 
@@ jobs: - name: Set up java uses: actions/setup-java@v1 with: + distribution: zulu java-version: ${{ matrix.java }} - name: Cache Maven packages uses: actions/cache@v2 From cb4ac1a90312de85ad63ec16da1b1b3f3038f331 Mon Sep 17 00:00:00 2001 From: Henri Sivonen Date: Wed, 29 Sep 2021 14:16:43 +0300 Subject: [PATCH 04/27] Mozilla bug 1724243 - Make text/plain and MediaDocuments use the Standards Mode. r=smaug. Differential Revision: https://phabricator.services.mozilla.com/D123318 --- .../htmlparser/impl/TreeBuilder.java | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/nu/validator/htmlparser/impl/TreeBuilder.java b/src/nu/validator/htmlparser/impl/TreeBuilder.java index f2acce6e..997ae015 100644 --- a/src/nu/validator/htmlparser/impl/TreeBuilder.java +++ b/src/nu/validator/htmlparser/impl/TreeBuilder.java @@ -434,7 +434,7 @@ public abstract class TreeBuilder implements TokenHandler, private boolean quirks = false; - private boolean isSrcdocDocument = false; + private boolean forceNoQuirks = false; // [NOCPP[ @@ -4064,7 +4064,7 @@ private void documentModeInternal(DocumentMode m, String publicIdentifier, String systemIdentifier) throws SAXException { - if (isSrcdocDocument) { + if (forceNoQuirks) { // Srcdoc documents are always rendered in standards mode. quirks = false; if (documentModeHandler != null) { @@ -5807,8 +5807,13 @@ public void setScriptingEnabled(boolean scriptingEnabled) { this.scriptingEnabled = scriptingEnabled; } + public void setForceNoQuirks(boolean forceNoQuirks) { + this.forceNoQuirks = forceNoQuirks; + } + + // Redundant method retained because previously public. 
public void setIsSrcdocDocument(boolean isSrcdocDocument) { - this.isSrcdocDocument = isSrcdocDocument; + this.setForceNoQuirks(isSrcdocDocument); } // [NOCPP[ @@ -6304,13 +6309,13 @@ private void errStrayDoctype() throws SAXException { } private void errAlmostStandardsDoctype() throws SAXException { - if (!isSrcdocDocument) { + if (!forceNoQuirks) { err("Almost standards mode doctype. Expected \u201C\u201D."); } } private void errQuirkyDoctype() throws SAXException { - if (!isSrcdocDocument) { + if (!forceNoQuirks) { err("Quirky doctype. Expected \u201C\u201D."); } } @@ -6347,7 +6352,7 @@ private void errFooBetweenHeadAndBody(@Local String name) throws SAXException { } private void errStartTagWithoutDoctype() throws SAXException { - if (!isSrcdocDocument) { + if (!forceNoQuirks) { err("Start tag seen without seeing a doctype first. Expected \u201C\u201D."); } } @@ -6424,7 +6429,7 @@ private void errStartTagInTableBody(@Local String name) throws SAXException { } private void errEndTagSeenWithoutDoctype() throws SAXException { - if (!isSrcdocDocument) { + if (!forceNoQuirks) { err("End tag seen without seeing a doctype first. Expected \u201C\u201D."); } } From 5bf8aa3bf2076ea9bc5b1ca89f1ee1fa7ba55b97 Mon Sep 17 00:00:00 2001 From: mkull Date: Thu, 7 Oct 2021 12:15:12 +0200 Subject: [PATCH 05/27] increase bufferSpace by at least 25% to remedy an O(n^2) performance problem when parsing e.g. inline images --- src/nu/validator/htmlparser/impl/Tokenizer.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nu/validator/htmlparser/impl/Tokenizer.java b/src/nu/validator/htmlparser/impl/Tokenizer.java index a59055f7..96b5c48e 100755 --- a/src/nu/validator/htmlparser/impl/Tokenizer.java +++ b/src/nu/validator/htmlparser/impl/Tokenizer.java @@ -1463,7 +1463,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { // but not doing that without profiling. 
In C++ with jemalloc, // the corresponding method should do math to round up here // to avoid slop. - char[] newBuf = new char[worstCase]; + char[] newBuf = new char[Math.max(worstCase, (strBuf.length*5)/4)]; System.arraycopy(strBuf, 0, newBuf, 0, strBufLen); strBuf = newBuf; } From 6205b94843e7d96ed5313e81bb1e790f2bf708b7 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Wed, 13 Oct 2021 15:04:51 +0900 Subject: [PATCH 06/27] Use setup-java@v2 --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 00f9be43..07884b51 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,7 +15,7 @@ jobs: with: submodules: true - name: Set up java - uses: actions/setup-java@v1 + uses: actions/setup-java@v2 with: distribution: zulu java-version: ${{ matrix.java }} From 0beae2cb86a1b17ea02c81a263485640dc028ab4 Mon Sep 17 00:00:00 2001 From: Nick Schonning Date: Wed, 13 Oct 2021 02:13:48 -0400 Subject: [PATCH 07/27] chore: setup Dependabot --- .github/dependabot.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..c9b2105b --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,10 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" + - package-ecosystem: "maven" + directory: "/" + schedule: + interval: "weekly" From 84ed5c9d3d00d938b3fa782e4847448633c55fb1 Mon Sep 17 00:00:00 2001 From: Nick Schonning Date: Wed, 13 Oct 2021 02:16:02 -0400 Subject: [PATCH 08/27] chore: restrict push builds to main --- .github/workflows/build.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 07884b51..1b587a76 100644 --- a/.github/workflows/build.yml +++ 
b/.github/workflows/build.yml @@ -1,6 +1,10 @@ name: Build -on: [push, pull_request] +on: + push: + branches: + - main + pull_request: jobs: build: From 190ac350926886a8a8330fa89eee34659b2e0a18 Mon Sep 17 00:00:00 2001 From: Henri Sivonen Date: Fri, 1 Oct 2021 14:44:57 +0300 Subject: [PATCH 09/27] Mozilla bug 1701828 - Meta charset rewrite for Gecko. Differential Revision: https://phabricator.services.mozilla.com/D125808 --- .../validator/htmlparser/impl/Tokenizer.java | 251 ++++++++++++++++++ .../htmlparser/cpptranslate/CppTypes.java | 2 +- .../htmlparser/cpptranslate/CppVisitor.java | 2 +- .../htmlparser/cpptranslate/Main.java | 2 - 4 files changed, 253 insertions(+), 4 deletions(-) diff --git a/src/nu/validator/htmlparser/impl/Tokenizer.java b/src/nu/validator/htmlparser/impl/Tokenizer.java index 96b5c48e..37763644 100755 --- a/src/nu/validator/htmlparser/impl/Tokenizer.java +++ b/src/nu/validator/htmlparser/impl/Tokenizer.java @@ -365,6 +365,8 @@ public class Tokenizer implements Locator, Locator2 { private boolean seenDigits; + private boolean suspendAfterCurrentNonTextToken; + protected int cstart; /** @@ -543,6 +545,7 @@ public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) { this.charRefBufMark = 0; this.value = 0; this.seenDigits = false; + this.suspendAfterCurrentNonTextToken = false; this.cstart = 0; this.strBufLen = 0; this.newAttributesEachTime = newAttributesEachTime; @@ -601,6 +604,7 @@ public Tokenizer(TokenHandler tokenHandler this.charRefBufMark = 0; this.value = 0; this.seenDigits = false; + this.suspendAfterCurrentNonTextToken = false; this.cstart = 0; this.strBufLen = 0; // ∳ is the longest valid char ref and @@ -1086,6 +1090,16 @@ private void appendStrBuf(@NoLength char[] buffer, int offset, int length) throw /** * Emits the current comment token. * + * NOTE: The method may set shouldSuspend, so the caller + * must have this pattern after the state's transition call: + * + *
+     * if (shouldSuspend) {
+     *     break stateloop;
+     * }
+     * continue stateloop;
+     * 
+ * * @param pos * TODO * @@ -1093,6 +1107,7 @@ private void appendStrBuf(@NoLength char[] buffer, int offset, int length) throw */ private void emitComment(int provisionalHyphens, int pos) throws SAXException { + // CPPONLY: RememberGt(pos); // [NOCPP[ if (wantsComments) { // ]NOCPP] @@ -1103,6 +1118,7 @@ private void emitComment(int provisionalHyphens, int pos) // ]NOCPP] clearStrBufAfterUse(); cstart = pos + 1; + suspendIfRequestedAfterCurrentNonTextToken(); } /** @@ -1213,8 +1229,26 @@ private void strBufToElementNameString() { clearStrBufAfterUse(); } + /** + * Emits a tag token. + * + * NOTE: The method may set shouldSuspend, so the caller + * must have this pattern after the state's transition call: + *
+     * if (shouldSuspend) {
+     *     break stateloop;
+     * }
+     * continue stateloop;
+     * 
+ * + * @param selfClosing + * @param pos + * @return + * @throws SAXException + */ private int emitCurrentTagToken(boolean selfClosing, int pos) throws SAXException { + // CPPONLY: RememberGt(pos); cstart = pos + 1; maybeErrSlashInEndTag(selfClosing); stateSave = Tokenizer.DATA; @@ -1252,6 +1286,7 @@ private int emitCurrentTagToken(boolean selfClosing, int pos) * The token handler may have called setStateAndEndTagExpectation * and changed stateSave since the start of this method. */ + suspendIfRequestedAfterCurrentNonTextToken(); return stateSave; } @@ -2582,6 +2617,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * Switch to the data state. */ state = transition(state, Tokenizer.DATA, reconsume, pos); + if (shouldSuspend) { + break stateloop; + } continue stateloop; case '<': appendStrBuf(c); @@ -2729,6 +2767,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * Switch to the data state. */ state = transition(state, Tokenizer.DATA, reconsume, pos); + if (shouldSuspend) { + break stateloop; + } continue stateloop; case '-': /* U+002D HYPHEN-MINUS (-) Parse error. */ @@ -2798,6 +2839,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * Switch to the data state. */ state = transition(state, Tokenizer.DATA, reconsume, pos); + if (shouldSuspend) { + break stateloop; + } continue stateloop; case '-': /* @@ -2945,6 +2989,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { appendStrBuf(c); emitComment(3, pos); state = transition(state, Tokenizer.DATA, reconsume, pos); + if (shouldSuspend) { + break stateloop; + } continue stateloop; case '-': errNestedComment(); @@ -3010,6 +3057,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * Switch to the data state. 
*/ state = transition(state, Tokenizer.DATA, reconsume, pos); + if (shouldSuspend) { + break stateloop; + } continue stateloop; case '<': appendStrBuf(c); @@ -3131,6 +3181,10 @@ private void ensureBufferSpace(int inputLength) throws SAXException { case '>': cstart = pos + 1; state = transition(state, Tokenizer.DATA, reconsume, pos); + suspendIfRequestedAfterCurrentNonTextToken(); + if (shouldSuspend) { + break stateloop; + } continue stateloop; default: tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2); @@ -4130,6 +4184,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { case '>': emitComment(0, pos); state = transition(state, Tokenizer.DATA, reconsume, pos); + if (shouldSuspend) { + break stateloop; + } continue stateloop; case '-': appendStrBuf(c); @@ -4163,6 +4220,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { // ]NOCPP] emitComment(0, pos); state = transition(state, Tokenizer.DATA, reconsume, pos); + if (shouldSuspend) { + break stateloop; + } continue stateloop; case '-': appendSecondHyphenToBogusComment(); @@ -4964,6 +5024,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * Switch to the data state. */ state = transition(state, Tokenizer.DATA, reconsume, pos); + if (shouldSuspend) { + break stateloop; + } continue stateloop; case '\u0000': c = '\uFFFD'; @@ -5037,6 +5100,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * Switch to the data state. */ state = transition(state, Tokenizer.DATA, reconsume, pos); + if (shouldSuspend) { + break stateloop; + } continue stateloop; case '\u0000': c = '\uFFFD'; @@ -5100,6 +5166,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * Switch to the data state. 
*/ state = transition(state, Tokenizer.DATA, reconsume, pos); + if (shouldSuspend) { + break stateloop; + } continue stateloop; case 'p': case 'P': @@ -5246,6 +5315,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * Switch to the data state. */ state = transition(state, Tokenizer.DATA, reconsume, pos); + if (shouldSuspend) { + break stateloop; + } continue stateloop; default: bogusDoctype(); @@ -5331,6 +5403,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * Switch to the data state. */ state = transition(state, Tokenizer.DATA, reconsume, pos); + if (shouldSuspend) { + break stateloop; + } continue stateloop; default: bogusDoctype(); @@ -5385,6 +5460,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * Switch to the data state. */ state = transition(state, Tokenizer.DATA, reconsume, pos); + if (shouldSuspend) { + break stateloop; + } continue stateloop; case '\r': appendStrBufCarriageReturn(); @@ -5449,6 +5527,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * Switch to the data state. */ state = transition(state, Tokenizer.DATA, reconsume, pos); + if (shouldSuspend) { + break stateloop; + } continue stateloop; case '"': /* @@ -5533,6 +5614,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * Switch to the data state. */ state = transition(state, Tokenizer.DATA, reconsume, pos); + if (shouldSuspend) { + break stateloop; + } continue stateloop; case '"': /* @@ -5613,6 +5697,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * Switch to the data state. */ state = transition(state, Tokenizer.DATA, reconsume, pos); + if (shouldSuspend) { + break stateloop; + } continue stateloop; case '\r': appendStrBufCarriageReturn(); @@ -5672,6 +5759,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * Switch to the data state. 
*/ state = transition(state, Tokenizer.DATA, reconsume, pos); + if (shouldSuspend) { + break stateloop; + } continue stateloop; default: /* @@ -5710,6 +5800,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * Switch to the data state. */ state = transition(state, Tokenizer.DATA, reconsume, pos); + if (shouldSuspend) { + break stateloop; + } continue stateloop; case '\r': silentCarriageReturn(); @@ -5840,6 +5933,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * Switch to the data state. */ state = transition(state, Tokenizer.DATA, reconsume, pos); + if (shouldSuspend) { + break stateloop; + } continue stateloop; default: bogusDoctype(); @@ -5925,6 +6021,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * Switch to the data state. */ state = transition(state, Tokenizer.DATA, reconsume, pos); + if (shouldSuspend) { + break stateloop; + } continue stateloop; default: bogusDoctype(); @@ -5975,6 +6074,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * Switch to the data state. */ state = transition(state, Tokenizer.DATA, reconsume, pos); + if (shouldSuspend) { + break stateloop; + } continue stateloop; case '\r': appendStrBufCarriageReturn(); @@ -6033,6 +6135,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * Switch to the data state. */ state = transition(state, Tokenizer.DATA, reconsume, pos); + if (shouldSuspend) { + break stateloop; + } continue stateloop; case '\r': appendStrBufCarriageReturn(); @@ -6085,6 +6190,10 @@ private void ensureBufferSpace(int inputLength) throws SAXException { case '>': state = transition(state, Tokenizer.DATA, reconsume, pos); + suspendIfRequestedAfterCurrentNonTextToken(); + if (shouldSuspend) { + break stateloop; + } continue stateloop; default: state = transition(state, @@ -6826,7 +6935,23 @@ public void eof() throws SAXException { return; } + /** + * Emits a doctype token. 
+ * + * NOTE: The method may set shouldSuspend, so the caller + * must have this pattern after the state's transition call: + * <pre>
+     * if (shouldSuspend) {
+     *     break stateloop;
+     * }
+     * continue stateloop;
+     * </pre>
+ * + * @param pos + * @throws SAXException + */ private void emitDoctypeToken(int pos) throws SAXException { + // CPPONLY: RememberGt(pos); cstart = pos + 1; tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier, forceQuirks); @@ -6838,6 +6963,130 @@ private void emitDoctypeToken(int pos) throws SAXException { publicIdentifier = null; Portability.releaseString(systemIdentifier); systemIdentifier = null; + suspendIfRequestedAfterCurrentNonTextToken(); + } + + private void suspendIfRequestedAfterCurrentNonTextToken() { + if (suspendAfterCurrentNonTextToken) { + suspendAfterCurrentNonTextToken = false; + shouldSuspend = true; + } + } + + // Making this private until the full Java implementation is done. + /** + * Request suspension after the current token if the tokenizer is currently + * in a non-text state (i.e. it's known that the next token will be a + * non-text token). + * + * Must not be called when tokenizeBuffer() is on the call + * stack. + */ + @SuppressWarnings("unused") private void suspendAfterCurrentTokenIfNotInText() { + switch (stateSave) { + case DATA: + case RCDATA: + case SCRIPT_DATA: + case RAWTEXT: + case SCRIPT_DATA_ESCAPED: + case PLAINTEXT: + case NON_DATA_END_TAG_NAME: // We haven't yet committed to the next + // token being a non-text token, though + // it could be. 
+ case SCRIPT_DATA_LESS_THAN_SIGN: + case SCRIPT_DATA_ESCAPE_START: + case SCRIPT_DATA_ESCAPE_START_DASH: + case SCRIPT_DATA_ESCAPED_DASH: + case SCRIPT_DATA_ESCAPED_DASH_DASH: + case RAWTEXT_RCDATA_LESS_THAN_SIGN: + case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: + case SCRIPT_DATA_DOUBLE_ESCAPE_START: + case SCRIPT_DATA_DOUBLE_ESCAPED: + case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: + case SCRIPT_DATA_DOUBLE_ESCAPED_DASH: + case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: + case SCRIPT_DATA_DOUBLE_ESCAPE_END: + return; + case TAG_NAME: + case BEFORE_ATTRIBUTE_NAME: + case ATTRIBUTE_NAME: + case AFTER_ATTRIBUTE_NAME: + case BEFORE_ATTRIBUTE_VALUE: + case AFTER_ATTRIBUTE_VALUE_QUOTED: + case BOGUS_COMMENT: + case MARKUP_DECLARATION_OPEN: + case DOCTYPE: + case BEFORE_DOCTYPE_NAME: + case DOCTYPE_NAME: + case AFTER_DOCTYPE_NAME: + case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: + case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: + case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: + case AFTER_DOCTYPE_PUBLIC_IDENTIFIER: + case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: + case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: + case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: + case AFTER_DOCTYPE_SYSTEM_IDENTIFIER: + case BOGUS_DOCTYPE: + case COMMENT_START: + case COMMENT_START_DASH: + case COMMENT: + case COMMENT_END_DASH: + case COMMENT_END: + case COMMENT_END_BANG: + case TAG_OPEN: + case CLOSE_TAG_OPEN: + case MARKUP_DECLARATION_HYPHEN: + case MARKUP_DECLARATION_OCTYPE: + case DOCTYPE_UBLIC: + case DOCTYPE_YSTEM: + case AFTER_DOCTYPE_PUBLIC_KEYWORD: + case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: + case AFTER_DOCTYPE_SYSTEM_KEYWORD: + case SELF_CLOSING_START_TAG: + case ATTRIBUTE_VALUE_DOUBLE_QUOTED: + case ATTRIBUTE_VALUE_SINGLE_QUOTED: + case ATTRIBUTE_VALUE_UNQUOTED: + case BOGUS_COMMENT_HYPHEN: + case COMMENT_LESSTHAN: + case COMMENT_LESSTHAN_BANG: + case COMMENT_LESSTHAN_BANG_DASH: + case COMMENT_LESSTHAN_BANG_DASH_DASH: + case CDATA_START: + case CDATA_SECTION: + case CDATA_RSQB: + case 
CDATA_RSQB_RSQB: + case PROCESSING_INSTRUCTION: + case PROCESSING_INSTRUCTION_QUESTION_MARK: + break; + case CONSUME_CHARACTER_REFERENCE: + case CONSUME_NCR: + case CHARACTER_REFERENCE_TAIL: + case HEX_NCR_LOOP: + case DECIMAL_NRC_LOOP: + case HANDLE_NCR_VALUE: + case HANDLE_NCR_VALUE_RECONSUME: + case CHARACTER_REFERENCE_HILO_LOOKUP: + if (returnStateSave == DATA || returnStateSave == RCDATA) { + return; + } + break; + default: + assert false : "Incomplete switch"; + return; + } + suspendAfterCurrentNonTextToken = true; + } + + // Making this private until the full Java implementation is done. + /** + * Queries if we are about to suspend after the current non-text token due to a request + * from suspendAfterCurrentTokenIfNotInText(). + * @return true iff suspendAfterCurrentTokenIfNotInText() was + * called in a non-text position and the then-current token has not been emitted yet. + */ + @SuppressWarnings("unused") private boolean suspensionAfterCurrentNonTextTokenPending() { + return suspendAfterCurrentNonTextToken; } @Inline protected char checkChar(@NoLength char[] buf, int pos) @@ -6966,6 +7215,7 @@ public void resetToDataState() { charRefBufMark = 0; value = 0; seenDigits = false; + suspendAfterCurrentNonTextToken = false; endTag = false; shouldSuspend = false; initDoctypeFields(); @@ -7009,6 +7259,7 @@ public void loadState(Tokenizer other) throws SAXException { seenDigits = other.seenDigits; endTag = other.endTag; shouldSuspend = false; + suspendAfterCurrentNonTextToken = false; doctypeName = other.doctypeName; Portability.releaseString(systemIdentifier); diff --git a/translator-src/nu/validator/htmlparser/cpptranslate/CppTypes.java b/translator-src/nu/validator/htmlparser/cpptranslate/CppTypes.java index 0724c00e..a63f3547 100644 --- a/translator-src/nu/validator/htmlparser/cpptranslate/CppTypes.java +++ b/translator-src/nu/validator/htmlparser/cpptranslate/CppTypes.java @@ -115,7 +115,7 @@ public class CppTypes { private static final String[] 
FORWARD_DECLARATIONS = { "nsHtml5StreamParser" }; private static final String[] CLASSES_THAT_NEED_SUPPLEMENT = { - "MetaScanner", "Tokenizer", "TreeBuilder", "UTF16Buffer", }; + "Tokenizer", "TreeBuilder", "UTF16Buffer", }; private static final String[] STATE_LOOP_POLICIES = { "nsHtml5ViewSourcePolicy", "nsHtml5SilentPolicy" }; diff --git a/translator-src/nu/validator/htmlparser/cpptranslate/CppVisitor.java b/translator-src/nu/validator/htmlparser/cpptranslate/CppVisitor.java index e832e3bb..9f559dce 100755 --- a/translator-src/nu/validator/htmlparser/cpptranslate/CppVisitor.java +++ b/translator-src/nu/validator/htmlparser/cpptranslate/CppVisitor.java @@ -125,7 +125,7 @@ public class CppVisitor extends AnnotationHelperVisitor { private static final String[] CLASS_NAMES = { "AttributeName", - "ElementName", "HtmlAttributes", "LocatorImpl", "MetaScanner", + "ElementName", "HtmlAttributes", "LocatorImpl", "NamedCharacters", "NamedCharactersAccel", "Portability", "StackNode", "Tokenizer", "TreeBuilder", "UTF16Buffer" }; diff --git a/translator-src/nu/validator/htmlparser/cpptranslate/Main.java b/translator-src/nu/validator/htmlparser/cpptranslate/Main.java index 741b7419..898f87f1 100644 --- a/translator-src/nu/validator/htmlparser/cpptranslate/Main.java +++ b/translator-src/nu/validator/htmlparser/cpptranslate/Main.java @@ -56,7 +56,6 @@ public class Main { "ElementName", "Tokenizer", "TreeBuilder", - "MetaScanner", "StackNode", "UTF16Buffer", "StateSnapshot", @@ -68,7 +67,6 @@ public class Main { "ElementName", "Tokenizer", "TreeBuilder", - "MetaScanner", "StackNode", "UTF16Buffer", "StateSnapshot", From 5fc01b154167669d3f5a848977e997e7e1d73b79 Mon Sep 17 00:00:00 2001 From: Henri Sivonen Date: Mon, 3 Jan 2022 15:02:32 +0200 Subject: [PATCH 10/27] =?UTF-8?q?Mozilla=20bug=201701828=20addendum=20-=20?= =?UTF-8?q?More=20comments=20for=20meta=20charset=20rewrite=E2=80=A6=20(#6?= =?UTF-8?q?4)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit * Mozilla bug 1701828 addendum - More comments for meta charset rewrite for Gecko. Differential Revision: https://phabricator.services.mozilla.com/D125808 Co-authored-by: Michael[tm] Smith --- .../validator/htmlparser/impl/Tokenizer.java | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/nu/validator/htmlparser/impl/Tokenizer.java b/src/nu/validator/htmlparser/impl/Tokenizer.java index 37763644..bdfb4afb 100755 --- a/src/nu/validator/htmlparser/impl/Tokenizer.java +++ b/src/nu/validator/htmlparser/impl/Tokenizer.java @@ -3181,6 +3181,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException { case '>': cstart = pos + 1; state = transition(state, Tokenizer.DATA, reconsume, pos); + // Since a CDATA section starts with a less-than sign, it + // participates in the suspension-after-current-token + // behavior. (The suspension can be requested when the + // less-than sign has been seen but we don't yet know the + // resulting token type.) Therefore, we need to deal with + // a potential request here. suspendIfRequestedAfterCurrentNonTextToken(); if (shouldSuspend) { break stateloop; @@ -6190,6 +6196,19 @@ private void ensureBufferSpace(int inputLength) throws SAXException { case '>': state = transition(state, Tokenizer.DATA, reconsume, pos); + // Processing instruction syntax goes through these + // states only in Gecko's XML View Source--not in HTML + // parsing in Java or in Gecko. + // Since XML View Source doesn't use the + // suspension-after-current-token facility, its extension + // to processing-instruction states is strictly unnecessary + // at the moment. However, if these states ever were to be + // used together with the suspension-after-current-token + // facility, these states would need to participate, since + // suspension could be requested when only less-than has been + // seen and we don't yet know if we end up here. 
Handling + // the currently-unnecessary case in order to avoid leaving + // a trap for future modification. suspendIfRequestedAfterCurrentNonTextToken(); if (shouldSuspend) { break stateloop; @@ -6966,6 +6985,11 @@ private void emitDoctypeToken(int pos) throws SAXException { suspendIfRequestedAfterCurrentNonTextToken(); } + /** + * If a previous call to suspendAfterCurrentTokenIfNotInText() + * happened in a non-text context, this method turns that deferred suspension + * request into an immediately-pending suspension request. + */ private void suspendIfRequestedAfterCurrentNonTextToken() { if (suspendAfterCurrentNonTextToken) { suspendAfterCurrentNonTextToken = false; From dad0086c50c76340ab613a5bea96a8a85e553e1d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 7 Mar 2022 01:30:52 +0000 Subject: [PATCH 11/27] Bump actions/checkout from 2 to 3 Bumps [actions/checkout](https://github.com/actions/checkout) from 2 to 3. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v2...v3) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1b587a76..3b59394c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,7 +15,7 @@ jobs: os: [ubuntu-latest, macos-latest, windows-latest] name: Java ${{ matrix.java }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true - name: Set up java From 80a5fc5d3446f53925ff069d94507cd0ea8b352e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Mar 2022 01:23:33 +0000 Subject: [PATCH 12/27] Bump actions/cache from 2 to 3 Bumps [actions/cache](https://github.com/actions/cache) from 2 to 3. - [Release notes](https://github.com/actions/cache/releases) - [Commits](https://github.com/actions/cache/compare/v2...v3) --- updated-dependencies: - dependency-name: actions/cache dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 3b59394c..fbae6988 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -24,7 +24,7 @@ jobs: distribution: zulu java-version: ${{ matrix.java }} - name: Cache Maven packages - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ~/.m2 key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} From 07c3df85b7df6d5dc9ee9d6591c5c19db6082ce2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 11 Apr 2022 01:30:30 +0000 Subject: [PATCH 13/27] Bump actions/setup-java from 2 to 3 Bumps [actions/setup-java](https://github.com/actions/setup-java) from 2 to 3. 
- [Release notes](https://github.com/actions/setup-java/releases) - [Commits](https://github.com/actions/setup-java/compare/v2...v3) --- updated-dependencies: - dependency-name: actions/setup-java dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index fbae6988..34ff3458 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -19,7 +19,7 @@ jobs: with: submodules: true - name: Set up java - uses: actions/setup-java@v2 + uses: actions/setup-java@v3 with: distribution: zulu java-version: ${{ matrix.java }} From 83602ed6cd8c554073ed52380e465f32ba831b66 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 12 Apr 2022 08:44:34 +0900 Subject: [PATCH 14/27] Bump icu4j from 4.0.1 to 71.1 (#68) --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 1b233355..6b665c69 100644 --- a/pom.xml +++ b/pom.xml @@ -144,7 +144,7 @@ com.ibm.icu icu4j - 4.0.1 + 71.1 compile true From 296f4a580292b54b4f568d6b4fe0c8a8d06e60c3 Mon Sep 17 00:00:00 2001 From: Henri Sivonen Date: Wed, 4 May 2022 15:21:45 +0300 Subject: [PATCH 15/27] Mozilla bug 1747388. 
r=smaug Differential Revision: https://phabricator.services.mozilla.com/D145571 --- .../validator/htmlparser/impl/Tokenizer.java | 376 ++++++++++-------- .../htmlparser/test/TokenizerTester.java | 8 +- 2 files changed, 217 insertions(+), 167 deletions(-) diff --git a/src/nu/validator/htmlparser/impl/Tokenizer.java b/src/nu/validator/htmlparser/impl/Tokenizer.java index bdfb4afb..495b8db7 100755 --- a/src/nu/validator/htmlparser/impl/Tokenizer.java +++ b/src/nu/validator/htmlparser/impl/Tokenizer.java @@ -1610,8 +1610,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { flushChars(buf, pos); state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos); - break dataloop; // FALL THROUGH continue - // stateloop; + // `break` optimizes; `continue stateloop;` would be valid + break dataloop; case '\u0000': maybeEmitReplacementCharacter(buf, pos); continue; @@ -1667,8 +1667,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * (Don't emit the token yet; further details will * be filled in before it is emitted.) */ + // `break` optimizes; `continue stateloop;` would be valid break tagopenloop; - // continue stateloop; } else if (c >= 'a' && c <= 'z') { /* * U+0061 LATIN SMALL LETTER A through to U+007A @@ -1688,8 +1688,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * (Don't emit the token yet; further details will * be filled in before it is emitted.) 
*/ + // `break` optimizes; `continue stateloop;` would be valid break tagopenloop; - // continue stateloop; } switch (c) { case '!': @@ -1788,8 +1788,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { */ strBufToElementNameString(); state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break tagnameloop; - // continue stateloop; case '/': /* * U+002F SOLIDUS (/) Switch to the self-closing @@ -1939,8 +1939,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * Switch to the attribute name state. */ state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break beforeattributenameloop; - // continue stateloop; } } // CPPONLY: MOZ_FALLTHROUGH; @@ -1989,8 +1989,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { */ attributeNameComplete(); state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break attributenameloop; - // continue stateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit the current @@ -2079,8 +2079,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { // CPPONLY: attributeLine = line; clearStrBufBeforeUse(); state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break beforeattributevalueloop; - // continue stateloop; case '&': /* * U+0026 AMPERSAND (&) Switch to the attribute @@ -2176,8 +2176,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { addAttributeWithValue(); state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break attributevaluedoublequotedloop; - // continue stateloop; case '&': /* * U+0026 AMPERSAND (&) Switch 
to the character @@ -2247,8 +2247,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * start tag state. */ state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break afterattributevaluequotedloop; - // continue stateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit the current @@ -2311,6 +2311,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); continue stateloop; } + // no fallthrough, reordering opportunity case ATTRIBUTE_VALUE_UNQUOTED: for (;;) { if (reconsume) { @@ -2402,7 +2403,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { continue; } } - case AFTER_ATTRIBUTE_NAME: + // no fallthrough, reordering opportunity + case AFTER_ATTRIBUTE_NAME: for (;;) { if (++pos == endPos) { break stateloop; @@ -2501,6 +2503,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { continue stateloop; } } + // no fallthrough, reordering opportunity case MARKUP_DECLARATION_OPEN: markupdeclarationopenloop: for (;;) { if (++pos == endPos) { @@ -2537,8 +2540,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { clearStrBufBeforeUse(); appendStrBuf(c); state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break markupdeclarationopenloop; - // continue stateloop; case 'd': case 'D': clearStrBufBeforeUse(); @@ -2574,8 +2577,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { case '-': clearStrBufAfterOneHyphen(); state = transition(state, Tokenizer.COMMENT_START, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break markupdeclarationhyphenloop; - // continue stateloop; default: errBogusComment(); reconsume = true; @@ -2646,8 +2649,8 @@ private void ensureBufferSpace(int 
inputLength) throws SAXException { * Switch to the comment state. */ state = transition(state, Tokenizer.COMMENT, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break commentstartloop; - // continue stateloop; } } // CPPONLY: MOZ_FALLTHROUGH; @@ -2668,8 +2671,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { */ appendStrBuf(c); state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break commentloop; - // continue stateloop; case '<': appendStrBuf(c); state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos); @@ -2714,8 +2717,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { */ appendStrBuf(c); state = transition(state, Tokenizer.COMMENT_END, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break commentenddashloop; - // continue stateloop; case '<': appendStrBuf(c); state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos); @@ -2798,7 +2801,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { case '!': appendStrBuf(c); state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos); - continue stateloop; + // `break` optimizes; `continue stateloop;` would be valid + break commentendloop; case '\u0000': c = '\uFFFD'; // CPPONLY: MOZ_FALLTHROUGH; @@ -2817,6 +2821,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { continue stateloop; } } + // CPPONLY: MOZ_FALLTHROUGH; case COMMENT_END_BANG: for (;;) { if (++pos == endPos) { @@ -2882,8 +2887,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { continue stateloop; } } + // no fallthrough, reordering opportunity case COMMENT_LESSTHAN: - for (;;) { + commentlessthanloop: for (;;) { if (++pos == endPos) { break stateloop; } @@ -2892,7 +2898,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { case '!': appendStrBuf(c); 
state = transition(state, Tokenizer.COMMENT_LESSTHAN_BANG, reconsume, pos); - continue stateloop; + // `break` optimizes; `continue stateloop;` would be valid + break commentlessthanloop; case '<': appendStrBuf(c); continue; @@ -2902,10 +2909,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException { continue stateloop; case '\r': appendStrBufCarriageReturn(); + state = transition(state, Tokenizer.COMMENT, reconsume, pos); break stateloop; case '\n': appendStrBufLineFeed(); - continue; + state = transition(state, Tokenizer.COMMENT, reconsume, pos); + continue stateloop; case '\u0000': c = '\uFFFD'; // CPPONLY: MOZ_FALLTHROUGH; @@ -2917,7 +2926,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { } // CPPONLY: MOZ_FALLTHROUGH; case COMMENT_LESSTHAN_BANG: - for (;;) { + commentlessthanbangloop: for (;;) { if (++pos == endPos) { break stateloop; } @@ -2926,17 +2935,20 @@ private void ensureBufferSpace(int inputLength) throws SAXException { case '-': appendStrBuf(c); state = transition(state, Tokenizer.COMMENT_LESSTHAN_BANG_DASH, reconsume, pos); - continue stateloop; + // `break` optimizes; `continue stateloop;` would be valid + break commentlessthanbangloop; case '<': appendStrBuf(c); state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos); continue stateloop; case '\r': appendStrBufCarriageReturn(); + state = transition(state, Tokenizer.COMMENT, reconsume, pos); break stateloop; case '\n': appendStrBufLineFeed(); - continue; + state = transition(state, Tokenizer.COMMENT, reconsume, pos); + continue stateloop; case '\u0000': c = '\uFFFD'; // CPPONLY: MOZ_FALLTHROUGH; @@ -2948,88 +2960,106 @@ private void ensureBufferSpace(int inputLength) throws SAXException { } // CPPONLY: MOZ_FALLTHROUGH; case COMMENT_LESSTHAN_BANG_DASH: - for (;;) { - if (++pos == endPos) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + switch (c) { + case '-': + appendStrBuf(c); + state = transition(state, + 
Tokenizer.COMMENT_LESSTHAN_BANG_DASH_DASH, + reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid + break; + case '<': + appendStrBuf(c); + state = transition(state, + Tokenizer.COMMENT_LESSTHAN, reconsume, pos); + continue stateloop; + case '\r': + appendStrBufCarriageReturn(); + state = transition(state, Tokenizer.COMMENT, + reconsume, pos); break stateloop; - } - c = checkChar(buf, pos); - switch (c) { - case '-': - appendStrBuf(c); - state = transition(state, Tokenizer.COMMENT_LESSTHAN_BANG_DASH_DASH, reconsume, pos); - continue stateloop; - case '<': - appendStrBuf(c); - state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos); - continue stateloop; - case '\r': - appendStrBufCarriageReturn(); - break stateloop; - case '\n': - appendStrBufLineFeed(); - continue; - case '\u0000': - c = '\uFFFD'; - // CPPONLY: MOZ_FALLTHROUGH; - default: - appendStrBuf(c); - state = transition(state, Tokenizer.COMMENT, reconsume, pos); - continue stateloop; - } + case '\n': + appendStrBufLineFeed(); + state = transition(state, Tokenizer.COMMENT, + reconsume, pos); + continue stateloop; + case '\u0000': + c = '\uFFFD'; + // CPPONLY: MOZ_FALLTHROUGH; + default: + appendStrBuf(c); + state = transition(state, Tokenizer.COMMENT, + reconsume, pos); + continue stateloop; } // CPPONLY: MOZ_FALLTHROUGH; case COMMENT_LESSTHAN_BANG_DASH_DASH: - for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - switch (c) { - case '>': - appendStrBuf(c); - emitComment(3, pos); - state = transition(state, Tokenizer.DATA, reconsume, pos); - if (shouldSuspend) { - break stateloop; - } - continue stateloop; - case '-': - errNestedComment(); - adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens); - reportedConsecutiveHyphens = true; - state = transition(state, Tokenizer.COMMENT_END, reconsume, pos); - continue stateloop; - case '\r': - errNestedComment(); - adjustDoubleHyphenAndAppendToStrBufAndErr(c, 
reportedConsecutiveHyphens); - reportedConsecutiveHyphens = true; - state = transition(state, Tokenizer.COMMENT, reconsume, pos); + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + switch (c) { + case '>': + appendStrBuf(c); + emitComment(3, pos); + state = transition(state, Tokenizer.DATA, reconsume, + pos); + if (shouldSuspend) { break stateloop; - case '\n': - errNestedComment(); - adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens); - reportedConsecutiveHyphens = true; - state = transition(state, Tokenizer.COMMENT, reconsume, pos); - continue; - case '\u0000': - c = '\uFFFD'; - // CPPONLY: MOZ_FALLTHROUGH; - case '!': - errNestedComment(); - adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens); - reportedConsecutiveHyphens = true; - state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos); - continue stateloop; - default: - errNestedComment(); - adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens); - reportedConsecutiveHyphens = true; - state = transition(state, Tokenizer.COMMENT_END, reconsume, pos); - continue stateloop; - } + } + continue stateloop; + case '-': + errNestedComment(); + adjustDoubleHyphenAndAppendToStrBufAndErr(c, + reportedConsecutiveHyphens); + reportedConsecutiveHyphens = true; + state = transition(state, Tokenizer.COMMENT_END, + reconsume, pos); + continue stateloop; + case '\r': + c = '\n'; + silentCarriageReturn(); + errNestedComment(); + adjustDoubleHyphenAndAppendToStrBufAndErr(c, + reportedConsecutiveHyphens); + reportedConsecutiveHyphens = true; + state = transition(state, Tokenizer.COMMENT, + reconsume, pos); + break stateloop; + case '\n': + silentLineFeed(); + errNestedComment(); + adjustDoubleHyphenAndAppendToStrBufAndErr(c, + reportedConsecutiveHyphens); + reportedConsecutiveHyphens = true; + state = transition(state, Tokenizer.COMMENT, + reconsume, pos); + continue stateloop; + case '!': + errNestedComment(); + 
adjustDoubleHyphenAndAppendToStrBufAndErr(c, + reportedConsecutiveHyphens); + reportedConsecutiveHyphens = true; + state = transition(state, + Tokenizer.COMMENT_END_BANG, reconsume, pos); + continue stateloop; + case '\u0000': + c = '\uFFFD'; + // CPPONLY: MOZ_FALLTHROUGH; + default: + errNestedComment(); + adjustDoubleHyphenAndAppendToStrBufAndErr(c, + reportedConsecutiveHyphens); + reportedConsecutiveHyphens = true; + state = transition(state, Tokenizer.COMMENT, + reconsume, pos); + continue stateloop; } - // CPPONLY: MOZ_FALLTHROUGH; - // XXX reorder point + // no fallthrough, reordering opportunity case COMMENT_START_DASH: if (++pos == endPos) { break stateloop; @@ -3089,6 +3119,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { state = transition(state, Tokenizer.COMMENT, reconsume, pos); continue stateloop; } + // no fallthrough, reordering opportunity case CDATA_START: for (;;) { if (++pos == endPos) { @@ -3111,7 +3142,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { cstart = pos; // start coalescing reconsume = true; state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); - break; // FALL THROUGH continue stateloop; + // `break` optimizes; `continue stateloop;` would be valid + break; } } // CPPONLY: MOZ_FALLTHROUGH; @@ -3129,7 +3161,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { case ']': flushChars(buf, pos); state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos); - break cdatasectionloop; // FALL THROUGH + // `break` optimizes; `continue stateloop;` would be valid + break cdatasectionloop; case '\u0000': maybeEmitReplacementCharacter(buf, pos); continue; @@ -3145,23 +3178,23 @@ private void ensureBufferSpace(int inputLength) throws SAXException { } // CPPONLY: MOZ_FALLTHROUGH; case CDATA_RSQB: - cdatarsqb: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - switch (c) { - case ']': - state = transition(state, 
Tokenizer.CDATA_RSQB_RSQB, reconsume, pos); - break cdatarsqb; - default: - tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, - 1); - cstart = pos; - reconsume = true; - state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); - continue stateloop; - } + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + switch (c) { + case ']': + state = transition(state, Tokenizer.CDATA_RSQB_RSQB, + reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid + break; + default: + tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1); + cstart = pos; + reconsume = true; + state = transition(state, Tokenizer.CDATA_SECTION, + reconsume, pos); + continue stateloop; } // CPPONLY: MOZ_FALLTHROUGH; case CDATA_RSQB_RSQB: @@ -3200,6 +3233,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { continue stateloop; } } + // no fallthrough, reordering opportunity case ATTRIBUTE_VALUE_SINGLE_QUOTED: attributevaluesinglequotedloop: for (;;) { if (reconsume) { @@ -3235,8 +3269,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { setAdditionalAndRememberAmpersandLocation('\''); returnState = state; state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break attributevaluesinglequotedloop; - // continue stateloop; case '\r': appendStrBufCarriageReturn(); break stateloop; @@ -3335,7 +3369,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { // Didn't fail yet appendCharRefBuf(c); state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos); - // FALL THROUGH continue stateloop; + // `break` optimizes; `continue stateloop;` would be valid + break; } // CPPONLY: MOZ_FALLTHROUGH; case CHARACTER_REFERENCE_HILO_LOOKUP: @@ -3411,7 +3446,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { candidate = -1; charRefBufMark = 0; state = transition(state, 
Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos); - // FALL THROUGH continue stateloop; + // fallthrough optimizes; `continue stateloop;` would also be valid } // CPPONLY: MOZ_FALLTHROUGH; case CHARACTER_REFERENCE_TAIL: @@ -3639,7 +3674,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { */ reconsume = true; state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos); - // FALL THROUGH continue stateloop; + // `break` optimizes; `continue stateloop;` would be valid + break; } // CPPONLY: MOZ_FALLTHROUGH; case DECIMAL_NRC_LOOP: @@ -3671,7 +3707,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { cstart = pos + 1; } state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); - // FALL THROUGH continue stateloop; + // `break` optimizes; `continue stateloop;` would be valid break decimalloop; } else { errNoDigitsInNCR(); @@ -3711,7 +3747,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { } reconsume = true; state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); - // FALL THROUGH continue stateloop; + // `break` optimizes; `continue stateloop;` would be valid break decimalloop; } } @@ -3725,6 +3761,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { handleNcrValue(returnState); state = transition(state, returnState, reconsume, pos); continue stateloop; + // no fallthrough, reordering opportunity case HEX_NCR_LOOP: for (;;) { if (++pos == endPos) { @@ -3809,6 +3846,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { } } } + // no fallthrough, reordering opportunity case PLAINTEXT: plaintextloop: for (;;) { if (reconsume) { @@ -3838,6 +3876,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { continue; } } + // no fallthrough, reordering opportunity case CLOSE_TAG_OPEN: if (++pos == endPos) { break stateloop; @@ -3919,6 +3958,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException 
{ continue stateloop; } } + // no fallthrough, reordering opportunity case RCDATA: rcdataloop: for (;;) { if (reconsume) { @@ -3969,6 +4009,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { continue; } } + // no fallthrough, reordering opportunity case RAWTEXT: rawtextloop: for (;;) { if (reconsume) { @@ -3989,8 +4030,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { returnState = state; state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break rawtextloop; - // FALL THRU continue stateloop; case '\u0000': emitReplacementCharacter(buf, pos); continue; @@ -4025,8 +4066,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { index = 0; clearStrBufBeforeUse(); state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break rawtextrcdatalessthansignloop; - // FALL THRU continue stateloop; default: /* * Otherwise, emit a U+003C LESS-THAN SIGN @@ -4157,6 +4198,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { } } } + // no fallthrough, reordering opportunity // BEGIN HOTSPOT WORKAROUND case BOGUS_COMMENT: boguscommentloop: for (;;) { @@ -4197,6 +4239,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { case '-': appendStrBuf(c); state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break boguscommentloop; case '\r': appendStrBufCarriageReturn(); @@ -4250,6 +4293,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { continue stateloop; } } + // no fallthrough, reordering opportunity case SCRIPT_DATA: scriptdataloop: for (;;) { if (reconsume) { @@ -4269,8 +4313,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { flushChars(buf, pos); returnState = state; state = 
transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos); - break scriptdataloop; // FALL THRU continue - // stateloop; + // `break` optimizes; `continue stateloop;` would be valid + break scriptdataloop; case '\u0000': emitReplacementCharacter(buf, pos); continue; @@ -4311,9 +4355,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { tokenHandler.characters(Tokenizer.LT_GT, 0, 1); cstart = pos; state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos); - break scriptdatalessthansignloop; // FALL THRU - // continue - // stateloop; + // `break` optimizes; `continue stateloop;` would be valid + break scriptdatalessthansignloop; default: /* * Otherwise, emit a U+003C LESS-THAN SIGN @@ -4348,9 +4391,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * script data escape start dash state. */ state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos); - break scriptdataescapestartloop; // FALL THRU - // continue - // stateloop; + // `break` optimizes; `continue stateloop;` would be valid + break scriptdataescapestartloop; default: /* * Anything else Reconsume the current input @@ -4379,8 +4421,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * script data escaped dash dash state. */ state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break scriptdataescapestartdashloop; - // continue stateloop; default: /* * Anything else Reconsume the current input @@ -4443,8 +4485,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * script data escaped state. 
*/ state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break scriptdataescapeddashdashloop; - // continue stateloop; } } // CPPONLY: MOZ_FALLTHROUGH; @@ -4469,9 +4511,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * script data escaped dash state. */ state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos); - break scriptdataescapedloop; // FALL THRU - // continue - // stateloop; + // `break` optimizes; `continue stateloop;` would be valid + break scriptdataescapedloop; case '<': /* * U+003C LESS-THAN SIGN (<) Switch to the @@ -4524,8 +4565,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { */ flushChars(buf, pos); state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break scriptdataescapeddashloop; - // continue stateloop; case '\u0000': emitReplacementCharacter(buf, pos); state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); @@ -4589,8 +4630,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * escape start state. */ state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break scriptdataescapedlessthanloop; - // continue stateloop; default: /* * Anything else Emit a U+003C LESS-THAN SIGN @@ -4649,8 +4690,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * script data double escaped state. 
*/ state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break scriptdatadoubleescapestartloop; - // continue stateloop; default: /* * Anything else Reconsume the current input @@ -4683,9 +4724,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * script data double escaped dash state. */ state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos); - break scriptdatadoubleescapedloop; // FALL THRU - // continue - // stateloop; + // `break` optimizes; `continue stateloop;` would be valid + break scriptdatadoubleescapedloop; case '<': /* * U+003C LESS-THAN SIGN (<) Emit a U+003C @@ -4731,8 +4771,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * script data double escaped dash dash state. */ state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break scriptdatadoubleescapeddashloop; - // continue stateloop; case '<': /* * U+003C LESS-THAN SIGN (<) Emit a U+003C @@ -4789,6 +4829,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * state. 
*/ state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break scriptdatadoubleescapeddashdashloop; case '>': /* @@ -4839,6 +4880,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { */ index = 0; state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break scriptdatadoubleescapedlessthanloop; default: /* @@ -4905,6 +4947,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { continue stateloop; } } + // no fallthrough, reordering opportunity case MARKUP_DECLARATION_OCTYPE: markupdeclarationdoctypeloop: for (;;) { if (++pos == endPos) { @@ -4929,8 +4972,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { } else { reconsume = true; state = transition(state, Tokenizer.DOCTYPE, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break markupdeclarationdoctypeloop; - // continue stateloop; } } // CPPONLY: MOZ_FALLTHROUGH; @@ -4965,8 +5008,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * Switch to the before DOCTYPE name state. */ state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break doctypeloop; - // continue stateloop; default: /* * Anything else Parse error. @@ -4978,8 +5021,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { */ reconsume = true; state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break doctypeloop; - // continue stateloop; } } // CPPONLY: MOZ_FALLTHROUGH; @@ -5060,8 +5103,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * Switch to the DOCTYPE name state. 
*/ state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break beforedoctypenameloop; - // continue stateloop; } } // CPPONLY: MOZ_FALLTHROUGH; @@ -5093,8 +5136,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { */ strBufToDoctypeName(); state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break doctypenameloop; - // continue stateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit the current @@ -5180,8 +5223,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { case 'P': index = 0; state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break afterdoctypenameloop; - // continue stateloop; case 's': case 'S': index = 0; @@ -5235,8 +5278,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { } else { reconsume = true; state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break doctypeublicloop; - // continue stateloop; } } // CPPONLY: MOZ_FALLTHROUGH; @@ -5271,8 +5314,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * identifier state. */ state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break afterdoctypepublickeywordloop; - // FALL THROUGH continue stateloop case '"': /* * U+0022 QUOTATION MARK (") Parse Error. @@ -5378,8 +5421,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * (double-quoted) state. 
*/ state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break beforedoctypepublicidentifierloop; - // continue stateloop; case '\'': /* * U+0027 APOSTROPHE (') Set the DOCTYPE token's @@ -5445,8 +5488,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { */ publicIdentifier = strBufToString(); state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break doctypepublicidentifierdoublequotedloop; - // continue stateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Parse error. @@ -5521,8 +5564,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * system identifiers state. */ state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break afterdoctypepublicidentifierloop; - // continue stateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit the current @@ -5636,8 +5679,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * (double-quoted) state. */ state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break betweendoctypepublicandsystemidentifiersloop; - // continue stateloop; case '\'': /* * U+0027 APOSTROPHE (') Set the DOCTYPE token's @@ -5683,7 +5726,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { */ systemIdentifier = strBufToString(); state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); - continue stateloop; + // `break` optimizes; `continue stateloop;` would be valid + break doctypesystemidentifierdoublequotedloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Parse error. 
@@ -5730,6 +5774,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { continue; } } + // CPPONLY: MOZ_FALLTHROUGH; case AFTER_DOCTYPE_SYSTEM_IDENTIFIER: afterdoctypesystemidentifierloop: for (;;) { if (++pos == endPos) { @@ -5777,8 +5822,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { */ bogusDoctypeWithoutQuirks(); state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break afterdoctypesystemidentifierloop; - // continue stateloop; } } // CPPONLY: MOZ_FALLTHROUGH; @@ -5824,6 +5869,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { continue; } } + // no fallthrough, reordering opportunity case DOCTYPE_YSTEM: doctypeystemloop: for (;;) { if (++pos == endPos) { @@ -5853,8 +5899,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { } else { reconsume = true; state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break doctypeystemloop; - // continue stateloop; } } // CPPONLY: MOZ_FALLTHROUGH; @@ -5889,8 +5935,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * identifier state. */ state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break afterdoctypesystemkeywordloop; - // FALL THROUGH continue stateloop case '"': /* * U+0022 QUOTATION MARK (") Parse Error. @@ -6009,8 +6055,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * (single-quoted) state. */ state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break beforedoctypesystemidentifierloop; - // continue stateloop; case '>': /* U+003E GREATER-THAN SIGN (>) Parse error. 
*/ errExpectedSystemId(); @@ -6107,6 +6153,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { continue; } } + // no fallthrough, reordering opportunity case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: for (;;) { if (++pos == endPos) { @@ -6168,6 +6215,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { continue; } } + // no fallthrough, reordering opportunity case PROCESSING_INSTRUCTION: processinginstructionloop: for (;;) { if (++pos == endPos) { @@ -6180,8 +6228,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException { state, Tokenizer.PROCESSING_INSTRUCTION_QUESTION_MARK, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid break processinginstructionloop; - // continue stateloop; default: continue; } diff --git a/test-src/nu/validator/htmlparser/test/TokenizerTester.java b/test-src/nu/validator/htmlparser/test/TokenizerTester.java index 7048dca6..42bd86f3 100644 --- a/test-src/nu/validator/htmlparser/test/TokenizerTester.java +++ b/test-src/nu/validator/htmlparser/test/TokenizerTester.java @@ -13,7 +13,7 @@ * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER @@ -52,6 +52,7 @@ import com.sdicons.json.model.JSONString; import com.sdicons.json.model.JSONValue; import com.sdicons.json.parser.JSONParser; +import com.sdicons.json.parser.ParserException; public class TokenizerTester { @@ -102,7 +103,7 @@ private static boolean jsonDeepEquals(JSONValue one, JSONValue other) { private final Writer writer; public TokenizerTester(InputStream stream) throws TokenStreamException, - RecognitionException, UnsupportedEncodingException { + RecognitionException, UnsupportedEncodingException, ParserException { tokenHandler = new JSONArrayTokenHandler(); driver = new Driver(new ErrorReportingTokenizer(tokenHandler)); driver.setCommentPolicy(XmlViolationPolicy.ALLOW); @@ -221,9 +222,10 @@ private void runTestInner(String inputString, JSONArray expectedTokens, * @throws TokenStreamException * @throws IOException * @throws SAXException + * @throws ParserException */ public static void main(String[] args) throws TokenStreamException, - RecognitionException, SAXException, IOException { + RecognitionException, SAXException, IOException, ParserException { for (int i = 0; i < args.length; i++) { byte[] fileBytes = Files.readAllBytes(Paths.get(args[i])); String fileContent = new String(fileBytes, StandardCharsets.UTF_8); From c3a57da5b3cc511548bf8d14eb32db1f3706726d Mon Sep 17 00:00:00 2001 From: Henri Sivonen Date: Wed, 22 Jun 2022 15:40:30 +0300 Subject: [PATCH 16/27] Revert ParserException import that should not have been added in the previous commit --- .../nu/validator/htmlparser/test/TokenizerTester.java | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/test-src/nu/validator/htmlparser/test/TokenizerTester.java b/test-src/nu/validator/htmlparser/test/TokenizerTester.java index 
42bd86f3..7048dca6 100644 --- a/test-src/nu/validator/htmlparser/test/TokenizerTester.java +++ b/test-src/nu/validator/htmlparser/test/TokenizerTester.java @@ -13,7 +13,7 @@ * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER @@ -52,7 +52,6 @@ import com.sdicons.json.model.JSONString; import com.sdicons.json.model.JSONValue; import com.sdicons.json.parser.JSONParser; -import com.sdicons.json.parser.ParserException; public class TokenizerTester { @@ -103,7 +102,7 @@ private static boolean jsonDeepEquals(JSONValue one, JSONValue other) { private final Writer writer; public TokenizerTester(InputStream stream) throws TokenStreamException, - RecognitionException, UnsupportedEncodingException, ParserException { + RecognitionException, UnsupportedEncodingException { tokenHandler = new JSONArrayTokenHandler(); driver = new Driver(new ErrorReportingTokenizer(tokenHandler)); driver.setCommentPolicy(XmlViolationPolicy.ALLOW); @@ -222,10 +221,9 @@ private void runTestInner(String inputString, JSONArray expectedTokens, * @throws TokenStreamException * @throws IOException * @throws SAXException - * @throws ParserException */ public static void main(String[] args) throws TokenStreamException, - RecognitionException, SAXException, IOException, ParserException { + RecognitionException, SAXException, IOException { for (int i = 0; i < args.length; i++) { byte[] fileBytes = Files.readAllBytes(Paths.get(args[i])); String fileContent = new String(fileBytes, StandardCharsets.UTF_8); From 
a57c00b83e6d4c8cfe0a1d32c46f705189fb6800 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emilio=20Cobos=20=C3=81lvarez?= Date: Thu, 23 Jun 2022 12:26:10 +0200 Subject: [PATCH 17/27] Mozilla bug 1372276 - Remove menuitem. (#70) See Mozilla bug 1775477. Co-authored-by: Henri Sivonen --- .../htmlparser/impl/ElementName.java | 152 +++++++++--------- .../htmlparser/impl/TreeBuilder.java | 8 +- .../htmlparser/cpptranslate/CppVisitor.java | 7 - 3 files changed, 74 insertions(+), 93 deletions(-) diff --git a/src/nu/validator/htmlparser/impl/ElementName.java b/src/nu/validator/htmlparser/impl/ElementName.java index 0f80b223..5748c3fb 100644 --- a/src/nu/validator/htmlparser/impl/ElementName.java +++ b/src/nu/validator/htmlparser/impl/ElementName.java @@ -557,8 +557,6 @@ public void destructor() { // return "ANNOTATION_XML"; // case TreeBuilder.FOREIGNOBJECT_OR_DESC: // return "FOREIGNOBJECT_OR_DESC"; -// case TreeBuilder.MENUITEM: -// return "MENUITEM"; // } // return null; // } @@ -1143,10 +1141,6 @@ public void destructor() { // CPPONLY: NS_NewHTMLFormElement, // CPPONLY: NS_NewSVGUnknownElement, TreeBuilder.FORM | SPECIAL); -public static final ElementName MENUITEM = new ElementName("menuitem", "menuitem", -// CPPONLY: NS_NewHTMLMenuItemElement, -// CPPONLY: NS_NewSVGUnknownElement, -TreeBuilder.MENUITEM); public static final ElementName PARAM = new ElementName("param", "param", // CPPONLY: NS_NewHTMLSharedElement, // CPPONLY: NS_NewSVGUnknownElement, @@ -1484,7 +1478,7 @@ public void destructor() { // CPPONLY: NS_NewSVGUnknownElement, TreeBuilder.TBODY_OR_THEAD_OR_TFOOT | SPECIAL | FOSTER_PARENTING | OPTIONAL_END_TAG); private final static @NoLength ElementName[] ELEMENT_NAMES = { -FIGCAPTION, +MN, CITE, FRAMESET, H1, @@ -1495,7 +1489,7 @@ public void destructor() { BGSOUND, SOURCE, HTML, -RP, +OPTGROUP, NOFRAMES, MTEXT, VIEW, @@ -1507,8 +1501,8 @@ public void destructor() { GLYPHREF, LI, ACRONYM, -SECTION, -HR, +TSPAN, +FEFUNCR, CANVAS, BASEFONT, FEDISTANTLIGHT, @@ -1530,11 
+1524,11 @@ public void destructor() { PATH, MALIGNMARK, SMALL, -PARAM, -OPTION, -VIDEO, -BR, -FOOTER, +ANIMATEMOTION, +POLYGON, +COLGROUP, +ABBR, +FEGAUSSIANBLUR, TR, DETAILS, DT, @@ -1578,15 +1572,15 @@ public void destructor() { LABEL, ALTGLYPHITEM, FORM, -BUTTON, -KEYGEN, -PATTERN, -AUDIO, -FEDISPLACEMENTMAP, -SAMP, -ANIMATECOLOR, -FECOMPONENTTRANSFER, -HEADER, +CAPTION, +MAIN, +SPAN, +MO, +HGROUP, +STOP, +CENTER, +FILTER, +MARKER, NOBR, ADDRESS, DEFS, @@ -1672,28 +1666,27 @@ public void destructor() { SYMBOL, ANIMATETRANSFORM, EM, -MENUITEM, -ANIMATEMOTION, -CAPTION, -MN, -MAIN, -POLYGON, -SPAN, -TSPAN, -MO, -COLGROUP, -HGROUP, -OPTGROUP, -STOP, -ABBR, -CENTER, -FEFUNCR, -FILTER, -FEGAUSSIANBLUR, -MARKER, +PARAM, +BUTTON, +FIGCAPTION, +KEYGEN, +OPTION, +PATTERN, +SECTION, +AUDIO, +VIDEO, +FEDISPLACEMENTMAP, +RP, +SAMP, +BR, +ANIMATECOLOR, +HR, +FECOMPONENTTRANSFER, +FOOTER, +HEADER, }; private final static int[] ELEMENT_HASHES = { -1900845386, +1902641154, 1748359220, 2001349720, 876609538, @@ -1704,7 +1697,7 @@ public void destructor() { 1730965751, 1756474198, 1868312196, -1938817026, +1939219752, 1988763672, 2005324101, 2060065124, @@ -1716,8 +1709,8 @@ public void destructor() { 1766992520, 1818230786, 1881613047, -1907661127, -1967128578, +1907959605, +1967760215, 1982935782, 1999397992, 2001392798, @@ -1739,11 +1732,11 @@ public void destructor() { 1805502724, 1854228698, 1874053333, -1889085973, -1905563974, -1925844629, -1963982850, -1967795958, +1898223949, +1906087319, +1932928296, +1965115924, +1968053806, 1973420034, 1983633431, 1998585858, @@ -1787,15 +1780,15 @@ public void destructor() { 1870268949, 1881288348, 1884120164, -1898753862, -1903302038, -1906135367, -1914900309, -1934172497, -1941178676, -1965334268, -1967788867, -1968836118, +1899272519, +1904412884, +1907435316, +1919418370, +1935549734, +1941221172, +1966223078, +1967795910, +1971461414, 1971938532, 1982173479, 1983533124, @@ -1881,24 +1874,23 @@ public void destructor() { 
1874102998, 1881498736, 1881669634, -1887579800, -1898223949, -1899272519, -1902641154, -1904412884, -1906087319, -1907435316, -1907959605, -1919418370, -1932928296, -1935549734, -1939219752, -1941221172, -1965115924, -1966223078, -1967760215, -1967795910, -1968053806, -1971461414, +1889085973, +1898753862, +1900845386, +1903302038, +1905563974, +1906135367, +1907661127, +1914900309, +1925844629, +1934172497, +1938817026, +1941178676, +1963982850, +1965334268, +1967128578, +1967788867, +1967795958, +1968836118, }; } diff --git a/src/nu/validator/htmlparser/impl/TreeBuilder.java b/src/nu/validator/htmlparser/impl/TreeBuilder.java index 997ae015..1f437bf7 100644 --- a/src/nu/validator/htmlparser/impl/TreeBuilder.java +++ b/src/nu/validator/htmlparser/impl/TreeBuilder.java @@ -196,11 +196,9 @@ public abstract class TreeBuilder implements TokenHandler, final static int KEYGEN = 65; - final static int MENUITEM = 66; + final static int TEMPLATE = 66; - final static int TEMPLATE = 67; - - final static int IMG = 68; + final static int IMG = 67; // start insertion modes @@ -2121,7 +2119,6 @@ public final void startTag(ElementName elementName, reconstructTheActiveFormattingElements(); // FALL THROUGH to PARAM_OR_SOURCE_OR_TRACK // CPPONLY: MOZ_FALLTHROUGH; - // CPPONLY: case MENUITEM: case PARAM_OR_SOURCE_OR_TRACK: appendVoidElementToCurrentMayFoster( elementName, @@ -3570,7 +3567,6 @@ public final void endTag(ElementName elementName) throws SAXException { break; case AREA_OR_WBR: case KEYGEN: // XXX?? 
- // CPPONLY: case MENUITEM: case PARAM_OR_SOURCE_OR_TRACK: case EMBED: case IMG: diff --git a/translator-src/nu/validator/htmlparser/cpptranslate/CppVisitor.java b/translator-src/nu/validator/htmlparser/cpptranslate/CppVisitor.java index 9f559dce..cb0a7173 100755 --- a/translator-src/nu/validator/htmlparser/cpptranslate/CppVisitor.java +++ b/translator-src/nu/validator/htmlparser/cpptranslate/CppVisitor.java @@ -1961,16 +1961,9 @@ public void visit(SwitchStmt n, LocalSymbolTable arg) { public void visit(SwitchEntryStmt n, LocalSymbolTable arg) { if (n.getLabel() != null) { - boolean isMenuitem = n.getLabel().toString().equals("MENUITEM"); - if (isMenuitem) { - printer.printWithoutIndent("#ifdef ENABLE_VOID_MENUITEM\n"); - } printer.print("case "); n.getLabel().accept(this, arg); printer.print(":"); - if (isMenuitem) { - printer.printWithoutIndent("\n#endif"); - } } else { printer.print("default:"); } From 2bb195842921fe7c3b88efe30a9e4d6e50e693ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emilio=20Cobos=20=C3=81lvarez?= Date: Thu, 23 Jun 2022 12:32:47 +0200 Subject: [PATCH 18/27] Mozilla bug 1775477 - Fix parser-created keygen element interface. (#71) This basically undoes the manual change from #70. 
--- src/nu/validator/htmlparser/impl/ElementName.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nu/validator/htmlparser/impl/ElementName.java b/src/nu/validator/htmlparser/impl/ElementName.java index 5748c3fb..9ee2ad01 100644 --- a/src/nu/validator/htmlparser/impl/ElementName.java +++ b/src/nu/validator/htmlparser/impl/ElementName.java @@ -1166,7 +1166,7 @@ public void destructor() { // CPPONLY: NS_NewSVGUnknownElement, TreeBuilder.MI_MO_MN_MS_MTEXT | SCOPING_AS_MATHML); public static final ElementName KEYGEN = new ElementName("keygen", "keygen", -// CPPONLY: NS_NewHTMLElement, +// CPPONLY: NS_NewHTMLUnknownElement, // CPPONLY: NS_NewSVGUnknownElement, TreeBuilder.KEYGEN | SPECIAL); public static final ElementName MAIN = new ElementName("main", "main", From 4c23b9e8bb22a0ade08c461cae4f4dab951f562e Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Tue, 27 Sep 2022 12:34:27 +0900 Subject: [PATCH 19/27] =?UTF-8?q?Warn=20about=20self-closing=20tag=20synta?= =?UTF-8?q?x=20if=20profile=20is=20=E2=80=9Chtml-strict=E2=80=9D=20(#73)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../htmlparser/impl/ErrorReportingTokenizer.java | 12 ++++++++++++ src/nu/validator/htmlparser/impl/Tokenizer.java | 4 ++++ 2 files changed, 16 insertions(+) diff --git a/src/nu/validator/htmlparser/impl/ErrorReportingTokenizer.java b/src/nu/validator/htmlparser/impl/ErrorReportingTokenizer.java index 77dc9090..fcd2128e 100644 --- a/src/nu/validator/htmlparser/impl/ErrorReportingTokenizer.java +++ b/src/nu/validator/htmlparser/impl/ErrorReportingTokenizer.java @@ -710,6 +710,18 @@ private boolean isAstralPrivateUse(int c) { note("xhtml1", "Unquoted attribute value."); } + @Override + protected void noteSelfClosingTag() throws SAXException { + note("html-strict", + "Self-closing tag syntax in text/html documents is widely" + + " discouraged; it’s unnecessary and interacts badly" + + " with other HTML features (e.g., 
unquoted attribute" + + " values). If you’re using a tool that injects" + + " self-closing tag syntax into all void elements," + + " without any option to prevent it from doing so," + + " then consider switching to a different tool."); + } + /** * Sets the transitionHandler. * diff --git a/src/nu/validator/htmlparser/impl/Tokenizer.java b/src/nu/validator/htmlparser/impl/Tokenizer.java index 495b8db7..775dd7c9 100755 --- a/src/nu/validator/htmlparser/impl/Tokenizer.java +++ b/src/nu/validator/htmlparser/impl/Tokenizer.java @@ -2292,6 +2292,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * flag of the current tag token. Emit the current * tag token. */ + noteSelfClosingTag(); state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos); if (shouldSuspend) { break stateloop; @@ -7594,6 +7595,9 @@ protected void noteAttributeWithoutValue() throws SAXException { protected void noteUnquotedAttributeValue() throws SAXException { } + protected void noteSelfClosingTag() throws SAXException { + } + /** * Sets the encodingDeclarationHandler. * From c358af4badbecdd8fb1fb19dc2c3aa748cc36937 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Tue, 27 Sep 2022 20:08:05 +0900 Subject: [PATCH 20/27] Move the self-closing-tag warning to the TreeBuilder code (#74) --- .../impl/ErrorReportingTokenizer.java | 21 ++++---- .../validator/htmlparser/impl/Tokenizer.java | 15 ++++-- .../htmlparser/impl/TreeBuilder.java | 53 +++++++++++++++++++ 3 files changed, 73 insertions(+), 16 deletions(-) diff --git a/src/nu/validator/htmlparser/impl/ErrorReportingTokenizer.java b/src/nu/validator/htmlparser/impl/ErrorReportingTokenizer.java index fcd2128e..6c8e7617 100644 --- a/src/nu/validator/htmlparser/impl/ErrorReportingTokenizer.java +++ b/src/nu/validator/htmlparser/impl/ErrorReportingTokenizer.java @@ -134,6 +134,15 @@ public void setErrorProfile(HashMap errorProfileMap) { this.errorProfileMap = errorProfileMap; } + /** + * Gets the errorProfile. 
+ * + * @param errorProfile + */ + @Override public HashMap getErrorProfile() { + return errorProfileMap; + } + /** * Reports on an event based on profile selected. * @@ -710,18 +719,6 @@ private boolean isAstralPrivateUse(int c) { note("xhtml1", "Unquoted attribute value."); } - @Override - protected void noteSelfClosingTag() throws SAXException { - note("html-strict", - "Self-closing tag syntax in text/html documents is widely" - + " discouraged; it’s unnecessary and interacts badly" - + " with other HTML features (e.g., unquoted attribute" - + " values). If you’re using a tool that injects" - + " self-closing tag syntax into all void elements," - + " without any option to prevent it from doing so," - + " then consider switching to a different tool."); - } - /** * Sets the transitionHandler. * diff --git a/src/nu/validator/htmlparser/impl/Tokenizer.java b/src/nu/validator/htmlparser/impl/Tokenizer.java index 775dd7c9..6e1a0ab0 100755 --- a/src/nu/validator/htmlparser/impl/Tokenizer.java +++ b/src/nu/validator/htmlparser/impl/Tokenizer.java @@ -35,6 +35,8 @@ package nu.validator.htmlparser.impl; +import java.util.HashMap; + import org.xml.sax.ErrorHandler; import org.xml.sax.Locator; import org.xml.sax.ext.Locator2; @@ -686,6 +688,15 @@ public ErrorHandler getErrorHandler() { return this.errorHandler; } + /** + * Gets the errorProfile. + * + * @param errorProfile + */ + public HashMap getErrorProfile() { + return null; + } + /** * Sets the commentPolicy. * @@ -2292,7 +2303,6 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * flag of the current tag token. Emit the current * tag token. 
*/ - noteSelfClosingTag(); state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos); if (shouldSuspend) { break stateloop; @@ -7595,9 +7605,6 @@ protected void noteAttributeWithoutValue() throws SAXException { protected void noteUnquotedAttributeValue() throws SAXException { } - protected void noteSelfClosingTag() throws SAXException { - } - /** * Sets the encodingDeclarationHandler. * diff --git a/src/nu/validator/htmlparser/impl/TreeBuilder.java b/src/nu/validator/htmlparser/impl/TreeBuilder.java index 1f437bf7..967b53f6 100644 --- a/src/nu/validator/htmlparser/impl/TreeBuilder.java +++ b/src/nu/validator/htmlparser/impl/TreeBuilder.java @@ -1458,6 +1458,8 @@ public final void startTag(ElementName elementName, flushCharacters(); // [NOCPP[ + boolean wasSelfClosing = selfClosing; + boolean voidElement = false; if (errorHandler != null) { // ID uniqueness @IdType String id = attributes.getId(); @@ -1580,6 +1582,9 @@ public final void startTag(ElementName elementName, elementName, attributes); selfClosing = false; + // [NOCPP[ + voidElement = true; + // ]NOCPP] attributes = null; // CPP break starttagloop; case TITLE: @@ -1592,6 +1597,9 @@ public final void startTag(ElementName elementName, elementName, attributes); selfClosing = false; + // [NOCPP[ + voidElement = true; + // ]NOCPP] attributes = null; // CPP break starttagloop; case SCRIPT: @@ -1778,6 +1786,9 @@ public final void startTag(ElementName elementName, attributes, formPointer); selfClosing = false; + // [NOCPP[ + voidElement = true; + // ]NOCPP] attributes = null; // CPP break starttagloop; case FORM: @@ -2124,6 +2135,9 @@ public final void startTag(ElementName elementName, elementName, attributes); selfClosing = false; + // [NOCPP[ + voidElement = true; + // ]NOCPP] attributes = null; // CPP break starttagloop; case HR: @@ -2132,6 +2146,9 @@ public final void startTag(ElementName elementName, elementName, attributes); selfClosing = false; + // [NOCPP[ + voidElement = true; + // ]NOCPP] 
attributes = null; // CPP break starttagloop; case IMAGE: @@ -2145,6 +2162,9 @@ public final void startTag(ElementName elementName, elementName, attributes, formPointer); selfClosing = false; + // [NOCPP[ + voidElement = true; + // ]NOCPP] attributes = null; // CPP break starttagloop; case TEXTAREA: @@ -2324,6 +2344,9 @@ public final void startTag(ElementName elementName, elementName, attributes); selfClosing = false; + // [NOCPP[ + voidElement = true; + // ]NOCPP] attributes = null; // CPP break starttagloop; case META: @@ -2391,6 +2414,9 @@ public final void startTag(ElementName elementName, elementName, attributes); selfClosing = false; + // [NOCPP[ + voidElement = true; + // ]NOCPP] attributes = null; // CPP break starttagloop; case META: @@ -2399,6 +2425,9 @@ public final void startTag(ElementName elementName, elementName, attributes); selfClosing = false; + // [NOCPP[ + voidElement = true; + // ]NOCPP] attributes = null; // CPP break starttagloop; case STYLE: @@ -2438,6 +2467,9 @@ public final void startTag(ElementName elementName, elementName, attributes); selfClosing = false; + // [NOCPP[ + voidElement = true; + // ]NOCPP] attributes = null; // CPP break starttagloop; case TEMPLATE: @@ -2572,6 +2604,9 @@ public final void startTag(ElementName elementName, elementName, attributes); selfClosing = false; + // [NOCPP[ + voidElement = true; + // ]NOCPP] attributes = null; // CPP break starttagloop; default: @@ -2745,6 +2780,9 @@ public final void startTag(ElementName elementName, elementName, attributes); selfClosing = false; + // [NOCPP[ + voidElement = true; + // ]NOCPP] pop(); // head attributes = null; // CPP break starttagloop; @@ -2756,6 +2794,9 @@ public final void startTag(ElementName elementName, elementName, attributes); selfClosing = false; + // [NOCPP[ + voidElement = true; + // ]NOCPP] pop(); // head attributes = null; // CPP break starttagloop; @@ -2844,6 +2885,18 @@ public final void startTag(ElementName elementName, } if (selfClosing) { 
errSelfClosing(); + // [NOCPP[ + } else if (wasSelfClosing && voidElement + && tokenizer.getErrorProfile() != null + && tokenizer.getErrorProfile().get("html-strict") != null) { + warn("Self-closing tag syntax in text/html documents is widely" + + " discouraged; it’s unnecessary and interacts badly" + + " with other HTML features (e.g., unquoted attribute" + + " values). If you’re using a tool that injects" + + " self-closing tag syntax into all void elements," + + " without any option to prevent it from doing so," + + " then consider switching to a different tool."); + // ]NOCPP] } // CPPONLY: if (mBuilder == null && attributes != HtmlAttributes.EMPTY_ATTRIBUTES) { // CPPONLY: Portability.delete(attributes); From f79f0ca404c26116e05f9e51540ab8a02ff0d7ba Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Wed, 5 Oct 2022 21:01:13 +0900 Subject: [PATCH 21/27] Streamline the warning about self-closing-tag syntax (#75) --- src/nu/validator/htmlparser/impl/TreeBuilder.java | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/nu/validator/htmlparser/impl/TreeBuilder.java b/src/nu/validator/htmlparser/impl/TreeBuilder.java index 967b53f6..0f5c41da 100644 --- a/src/nu/validator/htmlparser/impl/TreeBuilder.java +++ b/src/nu/validator/htmlparser/impl/TreeBuilder.java @@ -2889,13 +2889,8 @@ public final void startTag(ElementName elementName, } else if (wasSelfClosing && voidElement && tokenizer.getErrorProfile() != null && tokenizer.getErrorProfile().get("html-strict") != null) { - warn("Self-closing tag syntax in text/html documents is widely" - + " discouraged; it’s unnecessary and interacts badly" - + " with other HTML features (e.g., unquoted attribute" - + " values). 
If you’re using a tool that injects" - + " self-closing tag syntax into all void elements," - + " without any option to prevent it from doing so," - + " then consider switching to a different tool."); + warn("Trailing slash on void elements has no effect and interacts" + + " badly with unquoted attribute values."); // ]NOCPP] } // CPPONLY: if (mBuilder == null && attributes != HtmlAttributes.EMPTY_ATTRIBUTES) { From 5f0eb3a017e99c47f8e763c1468bc2b31d83e71a Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Fri, 14 Oct 2022 15:24:19 +0900 Subject: [PATCH 22/27] Skip template contents when checking ID uniqueness (#77) --- src/nu/validator/htmlparser/impl/TreeBuilder.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nu/validator/htmlparser/impl/TreeBuilder.java b/src/nu/validator/htmlparser/impl/TreeBuilder.java index 0f5c41da..b6438326 100644 --- a/src/nu/validator/htmlparser/impl/TreeBuilder.java +++ b/src/nu/validator/htmlparser/impl/TreeBuilder.java @@ -1463,7 +1463,7 @@ public final void startTag(ElementName elementName, if (errorHandler != null) { // ID uniqueness @IdType String id = attributes.getId(); - if (id != null) { + if (id != null && !isTemplateContents()) { LocatorImpl oldLoc = idLocations.get(id); if (oldLoc != null) { err("Duplicate ID \u201C" + id + "\u201D."); From 7bf3c236cf9489fc27e38322d33286dc0d4383e6 Mon Sep 17 00:00:00 2001 From: Henri Sivonen Date: Thu, 16 Mar 2023 16:44:17 +0000 Subject: [PATCH 23/27] Mozilla bug 1552008 - Track column number in the HTML. 
r=smaug,nchevobbe Differential Revision: https://phabricator.services.mozilla.com/D170579 --- src/nu/validator/htmlparser/impl/Tokenizer.java | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/nu/validator/htmlparser/impl/Tokenizer.java b/src/nu/validator/htmlparser/impl/Tokenizer.java index 6e1a0ab0..9ff1d280 100755 --- a/src/nu/validator/htmlparser/impl/Tokenizer.java +++ b/src/nu/validator/htmlparser/impl/Tokenizer.java @@ -1401,6 +1401,9 @@ protected void startErrorReporting() throws SAXException { public void start() throws SAXException { initializeWithoutStarting(); tokenHandler.startTokenization(this); + // CPPONLY: line = 0; + // CPPONLY: col = 1; + // CPPONLY: nextCharOnNewLine = true; // [NOCPP[ startErrorReporting(); // ]NOCPP] @@ -6340,6 +6343,8 @@ private void initDoctypeFields() { appendStrBuf('\n'); } + // [NOCPP[ + @Inline protected void silentCarriageReturn() { ++line; lastCR = true; @@ -6349,6 +6354,8 @@ private void initDoctypeFields() { ++line; } + // ]NOCPP] + private void emitCarriageReturn(@NoLength char[] buf, int pos) throws SAXException { silentCarriageReturn(); @@ -7172,11 +7179,15 @@ private void suspendIfRequestedAfterCurrentNonTextToken() { return suspendAfterCurrentNonTextToken; } + // [NOCPP[ + @Inline protected char checkChar(@NoLength char[] buf, int pos) throws SAXException { return buf[pos]; } + // ]NOCPP] + public boolean internalEncodingDeclaration(String internalCharset) throws SAXException { if (encodingDeclarationHandler != null) { From fccac59de2fa7e1275aabea3fb3adb3ab2437c27 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Wed, 31 May 2023 18:16:14 +0900 Subject: [PATCH 24/27] WIP: Make the test harness recognize error-reporting failures --- pom.xml | 1 - .../test/JSONArrayTokenHandler.java | 8 ++++ .../htmlparser/test/TokenizerTester.java | 38 +++++++++++++++---- 3 files changed, 38 insertions(+), 9 deletions(-) diff --git a/pom.xml b/pom.xml index 6b665c69..9e65c20f 100644 --- a/pom.xml +++ 
b/pom.xml @@ -132,7 +132,6 @@ maven-surefire-plugin Html5libTest - true ${project.build.testSourceDirectory}/test/resources diff --git a/test-src/nu/validator/htmlparser/test/JSONArrayTokenHandler.java b/test-src/nu/validator/htmlparser/test/JSONArrayTokenHandler.java index 298f406a..2dc529d2 100644 --- a/test-src/nu/validator/htmlparser/test/JSONArrayTokenHandler.java +++ b/test-src/nu/validator/htmlparser/test/JSONArrayTokenHandler.java @@ -60,6 +60,8 @@ public class JSONArrayTokenHandler implements TokenHandler, ErrorHandler { private JSONArray array = null; + private boolean hasError = false; + private int contentModelFlag; private String contentModelElement; @@ -118,6 +120,7 @@ public void eof() throws SAXException { public void startTokenization(Tokenizer self) throws SAXException { array = new JSONArray(); + hasError = false; if (contentModelElement != null) { self.setStateAndEndTagExpectation(contentModelFlag, contentModelElement); } @@ -149,6 +152,7 @@ public boolean wantsComments() throws SAXException { public void error(SAXParseException exception) throws SAXException { // flushCharacters(); // array.getValue().add(PARSE_ERROR); + hasError = true; } public void fatalError(SAXParseException exception) throws SAXException { @@ -167,6 +171,10 @@ public JSONArray getArray() { return array; } + public boolean hasError() { + return hasError; + } + public void endTokenization() throws SAXException { } diff --git a/test-src/nu/validator/htmlparser/test/TokenizerTester.java b/test-src/nu/validator/htmlparser/test/TokenizerTester.java index 7048dca6..8cf80e8e 100644 --- a/test-src/nu/validator/htmlparser/test/TokenizerTester.java +++ b/test-src/nu/validator/htmlparser/test/TokenizerTester.java @@ -135,7 +135,14 @@ void runTests() throws SAXException, IOException { private void runTest(JSONObject test) throws SAXException, IOException { String inputString = ((JSONString) test.get("input")).getValue(); + String expectedError = null; + boolean errorReported = false; 
JSONArray expectedTokens = (JSONArray) test.get("output"); + JSONArray errorsArray = (JSONArray) test.getValue().get("errors"); + if (errorsArray != null) { + JSONObject errorsObject = (JSONObject) errorsArray.get(0); + expectedError = ((JSONString) errorsObject.get("code")).getValue(); + } String description = ((JSONString) test.get("description")).getValue(); JSONString lastStartTagJSON = ((JSONString) test.get("lastStartTag")); String lastStartTag = lastStartTagJSON == null ? null @@ -143,33 +150,33 @@ private void runTest(JSONObject test) throws SAXException, IOException { JSONArray contentModelFlags = (JSONArray) test.get("initialStates"); if (contentModelFlags == null) { runTestInner(inputString, expectedTokens, description, - Tokenizer.DATA, null); + expectedError, Tokenizer.DATA, null); } else { for (JSONValue value : contentModelFlags.getValue()) { if (PCDATA.equals(value)) { lastStartTag = lastStartTag == null ? "xmp" : lastStartTag; runTestInner(inputString, expectedTokens, description, - Tokenizer.DATA, lastStartTag); + expectedError, Tokenizer.DATA, lastStartTag); } else if (RAWTEXT.equals(value)) { lastStartTag = lastStartTag == null ? "xmp" : lastStartTag; runTestInner(inputString, expectedTokens, description, - Tokenizer.RAWTEXT, lastStartTag); + expectedError, Tokenizer.RAWTEXT, lastStartTag); } else if (RCDATA.equals(value)) { lastStartTag = lastStartTag == null ? "xmp" : lastStartTag; runTestInner(inputString, expectedTokens, description, - Tokenizer.RCDATA, lastStartTag); + expectedError, Tokenizer.RCDATA, lastStartTag); } else if (CDATA.equals(value)) { lastStartTag = lastStartTag == null ? "xmp" : lastStartTag; runTestInner(inputString, expectedTokens, description, - Tokenizer.CDATA_SECTION, lastStartTag); + expectedError, Tokenizer.CDATA_SECTION, lastStartTag); } else if (PLAINTEXT.equals(value)) { lastStartTag = lastStartTag == null ? 
"plaintext" : lastStartTag; runTestInner(inputString, expectedTokens, description, - Tokenizer.PLAINTEXT, lastStartTag); + expectedError, Tokenizer.PLAINTEXT, lastStartTag); } else if (SCRIPT_DATA.equals(value)) { lastStartTag = lastStartTag == null ? "script" : lastStartTag; runTestInner(inputString, expectedTokens, description, - Tokenizer.SCRIPT_DATA, lastStartTag); + expectedError, Tokenizer.SCRIPT_DATA, lastStartTag); } else { throw new RuntimeException("Broken test data."); } @@ -185,13 +192,28 @@ private void runTest(JSONObject test) throws SAXException, IOException { * @throws IOException */ private void runTestInner(String inputString, JSONArray expectedTokens, - String description, int contentModelFlag, + String description, String expectedError, int contentModelFlag, String contentModelElement) throws SAXException, IOException { tokenHandler.setContentModelFlag(contentModelFlag, contentModelElement); InputSource is = new InputSource(new StringReader(inputString)); try { driver.tokenize(is); JSONArray actualTokens = tokenHandler.getArray(); + if (expectedError != null + && !expectedError.equals("eof-in-cdata") + && !expectedError.equals("surrogate-in-input-stream") + && !expectedError.startsWith("eof-in-script-html-comment") + && !expectedError.startsWith("incorrectly-closed-comment") + && !tokenHandler.hasError()) { + exitStatus = 1; + writer.write("Failure\n"); + writer.write(description); + writer.write("\nInput:\n"); + writer.write(inputString); + writer.write("\nError expected:\n"); + writer.write(expectedError); + writer.write("\n"); + } if (!jsonDeepEquals(actualTokens, expectedTokens)) { exitStatus = 1; writer.write("Failure\n"); From cf8af4057386af7cc6fa1f68ef38596a49caac81 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Mon, 14 Dec 2015 18:36:20 +0900 Subject: [PATCH 25/27] Conform ampersand-error reporting to HTML spec --- .../validator/htmlparser/impl/Tokenizer.java | 62 +++++++++++-------- 1 file changed, 37 insertions(+), 25 
deletions(-) diff --git a/src/nu/validator/htmlparser/impl/Tokenizer.java b/src/nu/validator/htmlparser/impl/Tokenizer.java index 9ff1d280..214245b6 100755 --- a/src/nu/validator/htmlparser/impl/Tokenizer.java +++ b/src/nu/validator/htmlparser/impl/Tokenizer.java @@ -231,6 +231,8 @@ public class Tokenizer implements Locator, Locator2 { public static final int COMMENT_LESSTHAN_BANG_DASH_DASH = 79; + public static final int AMBIGUOUS_AMPERSAND = 75; + /** * Magic value for UTF-16 operations. */ @@ -3339,6 +3341,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { case '<': case '&': case '\u0000': + case ';': emitOrAppendCharRefBuf(returnState); if ((returnState & DATA_AND_RCDATA_MASK) == 0) { cstart = pos; @@ -3367,17 +3370,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException { firstCharKey = c - 'A'; } else { // No match - /* - * If no match can be made, then this is a parse - * error. - */ - errNoNamedCharacterMatch(); emitOrAppendCharRefBuf(returnState); if ((returnState & DATA_AND_RCDATA_MASK) == 0) { cstart = pos; } reconsume = true; - state = transition(state, returnState, reconsume, pos); + state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos); continue stateloop; } // Didn't fail yet @@ -3439,17 +3437,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException { } } if (hilo == 0) { - /* - * If no match can be made, then this is a parse - * error. - */ - errNoNamedCharacterMatch(); emitOrAppendCharRefBuf(returnState); if ((returnState & DATA_AND_RCDATA_MASK) == 0) { cstart = pos; } reconsume = true; - state = transition(state, returnState, reconsume, pos); + state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos); continue stateloop; } // Didn't fail yet @@ -3532,16 +3525,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException { if (candidate == -1) { // reconsume deals with CR, LF or nul - /* - * If no match can be made, then this is a parse error. 
- */ - errNoNamedCharacterMatch(); emitOrAppendCharRefBuf(returnState); if ((returnState & DATA_AND_RCDATA_MASK) == 0) { cstart = pos; } reconsume = true; - state = transition(state, returnState, reconsume, pos); + state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos); continue stateloop; } else { // c can't be CR, LF or nul if we got here @@ -3579,10 +3568,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * after the U+0026 AMPERSAND (&) must be * unconsumed, and nothing is returned. */ - errNoNamedCharacterMatch(); appendCharRefBufToStrBuf(); reconsume = true; - state = transition(state, returnState, reconsume, pos); + state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos); continue stateloop; } } @@ -3645,6 +3633,36 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * I'm ∉ I tell you. */ } + // XXX reorder point + case AMBIGUOUS_AMPERSAND: + /* + * Unlike the definition in the spec, we don't consume the + * next input character right away when entering this state; + * that's because our current implementation differs from + * the spec in that we've already consumed the relevant + * character *before* entering this state. 
+ */ + ampersandloop: for (;;) { + if (c == ';') { + errNoNamedCharacterMatch(); + } else if ((c >= '0' && c <= '9') + || (c >= 'A' && c <= 'Z') + || (c >= 'a' && c <= 'z')) { + if (++pos == endPos) { + break stateloop; + } + appendCharRefBuf(c); + emitOrAppendCharRefBuf(returnState); + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { + cstart = pos; + } + c = checkChar(buf, pos); + continue; + } + reconsume = true; + state = transition(state, returnState, reconsume, pos); + continue stateloop; + } case CONSUME_NCR: if (++pos == endPos) { break stateloop; @@ -6831,7 +6849,6 @@ public void eof() throws SAXException { state = returnState; continue; case CHARACTER_REFERENCE_HILO_LOOKUP: - errNoNamedCharacterMatch(); emitOrAppendCharRefBuf(returnState); state = returnState; continue; @@ -6885,10 +6902,6 @@ public void eof() throws SAXException { } if (candidate == -1) { - /* - * If no match can be made, then this is a parse error. - */ - errNoNamedCharacterMatch(); emitOrAppendCharRefBuf(returnState); state = returnState; continue eofloop; @@ -6926,7 +6939,6 @@ public void eof() throws SAXException { * after the U+0026 AMPERSAND (&) must be * unconsumed, and nothing is returned. 
*/ - errNoNamedCharacterMatch(); appendCharRefBufToStrBuf(); state = returnState; continue eofloop; From b07aacccb40712b4054185ce01be18ff4d9dccc6 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Fri, 7 Aug 2020 04:21:37 +0900 Subject: [PATCH 26/27] Ensure ampersand handling across buffer boundaries --- src/nu/validator/htmlparser/impl/Tokenizer.java | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/nu/validator/htmlparser/impl/Tokenizer.java b/src/nu/validator/htmlparser/impl/Tokenizer.java index 214245b6..6548d08e 100755 --- a/src/nu/validator/htmlparser/impl/Tokenizer.java +++ b/src/nu/validator/htmlparser/impl/Tokenizer.java @@ -3648,18 +3648,17 @@ private void ensureBufferSpace(int inputLength) throws SAXException { } else if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { - if (++pos == endPos) { - break stateloop; - } appendCharRefBuf(c); emitOrAppendCharRefBuf(returnState); if ((returnState & DATA_AND_RCDATA_MASK) == 0) { - cstart = pos; + cstart = pos + 1; + } + if (++pos == endPos) { + break stateloop; } c = checkChar(buf, pos); continue; } - reconsume = true; state = transition(state, returnState, reconsume, pos); continue stateloop; } From 8c718d3348ea63e4de61e9713d5b7a25ee121c5a Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Sat, 8 Aug 2020 06:10:36 +0900 Subject: [PATCH 27/27] Only reconsume from ampersand state if not in attr --- src/nu/validator/htmlparser/impl/Tokenizer.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/nu/validator/htmlparser/impl/Tokenizer.java b/src/nu/validator/htmlparser/impl/Tokenizer.java index 6548d08e..d5b65f45 100755 --- a/src/nu/validator/htmlparser/impl/Tokenizer.java +++ b/src/nu/validator/htmlparser/impl/Tokenizer.java @@ -3659,6 +3659,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { c = checkChar(buf, pos); continue; } + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { + reconsume = true; + } state = 
transition(state, returnState, reconsume, pos); continue stateloop; }