From 5ae26d8c0b966b9b36ea3c02c0c3e31c1bcae613 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Mon, 14 Dec 2015 18:36:20 +0900 Subject: [PATCH] Conform ampersand-error reporting to HTML spec --- .../validator/htmlparser/impl/Tokenizer.java | 61 +++++++++++-------- 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/src/nu/validator/htmlparser/impl/Tokenizer.java b/src/nu/validator/htmlparser/impl/Tokenizer.java index 7096e704..95a35b11 100755 --- a/src/nu/validator/htmlparser/impl/Tokenizer.java +++ b/src/nu/validator/htmlparser/impl/Tokenizer.java @@ -3233,6 +3233,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { case '<': case '&': case '\u0000': + case ';': emitOrAppendCharRefBuf(returnState); if ((returnState & DATA_AND_RCDATA_MASK) == 0) { cstart = pos; @@ -3261,17 +3262,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException { firstCharKey = c - 'A'; } else { // No match - /* - * If no match can be made, then this is a parse - * error. - */ - errNoNamedCharacterMatch(); emitOrAppendCharRefBuf(returnState); if ((returnState & DATA_AND_RCDATA_MASK) == 0) { cstart = pos; } reconsume = true; - state = transition(state, returnState, reconsume, pos); + state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos); continue stateloop; } // Didn't fail yet @@ -3332,17 +3328,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException { } } if (hilo == 0) { - /* - * If no match can be made, then this is a parse - * error. 
- */ - errNoNamedCharacterMatch(); emitOrAppendCharRefBuf(returnState); if ((returnState & DATA_AND_RCDATA_MASK) == 0) { cstart = pos; } reconsume = true; - state = transition(state, returnState, reconsume, pos); + state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos); continue stateloop; } // Didn't fail yet @@ -3425,16 +3416,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException { if (candidate == -1) { // reconsume deals with CR, LF or nul - /* - * If no match can be made, then this is a parse error. - */ - errNoNamedCharacterMatch(); emitOrAppendCharRefBuf(returnState); if ((returnState & DATA_AND_RCDATA_MASK) == 0) { cstart = pos; } reconsume = true; - state = transition(state, returnState, reconsume, pos); + state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos); continue stateloop; } else { // c can't be CR, LF or nul if we got here @@ -3472,10 +3459,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * after the U+0026 AMPERSAND (&) must be * unconsumed, and nothing is returned. */ - errNoNamedCharacterMatch(); appendCharRefBufToStrBuf(); reconsume = true; - state = transition(state, returnState, reconsume, pos); + state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos); continue stateloop; } } @@ -3538,6 +3524,37 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * I'm ∉ I tell you. */ } + // XXX reorder point + case AMBIGUOUS_AMPERSAND: + /* + * Unlike the definition in the spec, we don't consume the + * next input character right away when entering this state; + * that's because our current implementation differs from + * the spec in that we've already consumed the relevant + * character *before* entering this state. + * Also, our implementation of this state has no looping. + * So we never stay in this state; instead, we always + * transition out from it back to returnState. 
+ */ + state = returnState; + if (c == ';') { + errNoNamedCharacterMatch(); + continue stateloop; + } else if ((c >= '0' && c <= '9') + || (c >= 'A' && c <= 'Z') + || (c >= 'a' && c <= 'z')) { + appendCharRefBuf(c); + emitOrAppendCharRefBuf(returnState); + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { + cstart = pos + 1; + } + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + continue stateloop; + } + continue stateloop; case CONSUME_NCR: if (++pos == endPos) { break stateloop; @@ -6632,7 +6649,6 @@ public void eof() throws SAXException { state = returnState; continue; case CHARACTER_REFERENCE_HILO_LOOKUP: - errNoNamedCharacterMatch(); emitOrAppendCharRefBuf(returnState); state = returnState; continue; @@ -6686,10 +6702,6 @@ public void eof() throws SAXException { } if (candidate == -1) { - /* - * If no match can be made, then this is a parse error. - */ - errNoNamedCharacterMatch(); emitOrAppendCharRefBuf(returnState); state = returnState; continue eofloop; @@ -6727,7 +6739,6 @@ public void eof() throws SAXException { * after the U+0026 AMPERSAND (&) must be * unconsumed, and nothing is returned. */ - errNoNamedCharacterMatch(); appendCharRefBufToStrBuf(); state = returnState; continue eofloop;