Skip to content

Commit 5ae26d8

Browse files
Conform ampersand-error reporting to HTML spec
1 parent f30815d commit 5ae26d8

File tree

1 file changed

+36
-25
lines changed

1 file changed

+36
-25
lines changed

src/nu/validator/htmlparser/impl/Tokenizer.java

+36-25
Original file line numberDiff line numberDiff line change
@@ -3233,6 +3233,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
32333233
case '<':
32343234
case '&':
32353235
case '\u0000':
3236+
case ';':
32363237
emitOrAppendCharRefBuf(returnState);
32373238
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
32383239
cstart = pos;
@@ -3261,17 +3262,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
32613262
firstCharKey = c - 'A';
32623263
} else {
32633264
// No match
3264-
/*
3265-
* If no match can be made, then this is a parse
3266-
* error.
3267-
*/
3268-
errNoNamedCharacterMatch();
32693265
emitOrAppendCharRefBuf(returnState);
32703266
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
32713267
cstart = pos;
32723268
}
32733269
reconsume = true;
3274-
state = transition(state, returnState, reconsume, pos);
3270+
state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos);
32753271
continue stateloop;
32763272
}
32773273
// Didn't fail yet
@@ -3332,17 +3328,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
33323328
}
33333329
}
33343330
if (hilo == 0) {
3335-
/*
3336-
* If no match can be made, then this is a parse
3337-
* error.
3338-
*/
3339-
errNoNamedCharacterMatch();
33403331
emitOrAppendCharRefBuf(returnState);
33413332
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
33423333
cstart = pos;
33433334
}
33443335
reconsume = true;
3345-
state = transition(state, returnState, reconsume, pos);
3336+
state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos);
33463337
continue stateloop;
33473338
}
33483339
// Didn't fail yet
@@ -3425,16 +3416,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
34253416

34263417
if (candidate == -1) {
34273418
// reconsume deals with CR, LF or nul
3428-
/*
3429-
* If no match can be made, then this is a parse error.
3430-
*/
3431-
errNoNamedCharacterMatch();
34323419
emitOrAppendCharRefBuf(returnState);
34333420
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
34343421
cstart = pos;
34353422
}
34363423
reconsume = true;
3437-
state = transition(state, returnState, reconsume, pos);
3424+
state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos);
34383425
continue stateloop;
34393426
} else {
34403427
// c can't be CR, LF or nul if we got here
@@ -3472,10 +3459,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
34723459
* after the U+0026 AMPERSAND (&) must be
34733460
* unconsumed, and nothing is returned.
34743461
*/
3475-
errNoNamedCharacterMatch();
34763462
appendCharRefBufToStrBuf();
34773463
reconsume = true;
3478-
state = transition(state, returnState, reconsume, pos);
3464+
state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos);
34793465
continue stateloop;
34803466
}
34813467
}
@@ -3538,6 +3524,37 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
35383524
* I'm ∉ I tell you.
35393525
*/
35403526
}
3527+
// XXX reorder point
3528+
case AMBIGUOUS_AMPERSAND:
3529+
/*
3530+
* Unlike the definition in the spec, we don't consume the
3531+
* next input character right away when entering this state;
3532+
* that's because our current implementation differs from
3533+
* the spec in that we've already consumed the relevant
3534+
* character *before* entering this state.
3535+
* Also, our implementation of this state has no looping.
3536+
* So we never stay in this state; instead, we always
3537+
* transition out from it back to returnState.
3538+
*/
3539+
state = returnState;
3540+
if (c == ';') {
3541+
errNoNamedCharacterMatch();
3542+
continue stateloop;
3543+
} else if ((c >= '0' && c <= '9')
3544+
|| (c >= 'A' && c <= 'Z')
3545+
|| (c >= 'a' && c <= 'z')) {
3546+
appendCharRefBuf(c);
3547+
emitOrAppendCharRefBuf(returnState);
3548+
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3549+
cstart = pos + 1;
3550+
}
3551+
if (++pos == endPos) {
3552+
break stateloop;
3553+
}
3554+
c = checkChar(buf, pos);
3555+
continue stateloop;
3556+
}
3557+
continue stateloop;
35413558
case CONSUME_NCR:
35423559
if (++pos == endPos) {
35433560
break stateloop;
@@ -6632,7 +6649,6 @@ public void eof() throws SAXException {
66326649
state = returnState;
66336650
continue;
66346651
case CHARACTER_REFERENCE_HILO_LOOKUP:
6635-
errNoNamedCharacterMatch();
66366652
emitOrAppendCharRefBuf(returnState);
66376653
state = returnState;
66386654
continue;
@@ -6686,10 +6702,6 @@ public void eof() throws SAXException {
66866702
}
66876703

66886704
if (candidate == -1) {
6689-
/*
6690-
* If no match can be made, then this is a parse error.
6691-
*/
6692-
errNoNamedCharacterMatch();
66936705
emitOrAppendCharRefBuf(returnState);
66946706
state = returnState;
66956707
continue eofloop;
@@ -6727,7 +6739,6 @@ public void eof() throws SAXException {
67276739
* after the U+0026 AMPERSAND (&) must be
67286740
* unconsumed, and nothing is returned.
67296741
*/
6730-
errNoNamedCharacterMatch();
67316742
appendCharRefBufToStrBuf();
67326743
state = returnState;
67336744
continue eofloop;

0 commit comments

Comments
 (0)