Conform encoding-label matching to Encoding spec

sideshowbarker · sideshowbarker · commit beb060b9f684 · 2020-09-13T10:02:33.000+09:00
This change makes the parser’s encoding-name matching conform to the current Encoding spec at https://encoding.spec.whatwg.org/#concept-encoding-get, which requires that only leading and trailing whitespace be removed from a string before checking whether it matches any valid encoding name. Without this change, the parser instead implements https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching, which requires deleting “all characters except a-z, A-Z, and 0-9” from a string before checking whether it matches any valid encoding name. That difference makes us fail two html5-tests cases. Relates to #47
diff --git a/src/nu/validator/htmlparser/io/Encoding.java b/src/nu/validator/htmlparser/io/Encoding.java
@@ -417,9 +417,7 @@ public static String toNameKey(String str) {
             if (c >= 'A' && c <= 'Z') {
                 c += 0x20;
             }
-            if (!((c >= '\t' && c <= '\r') || (c >= '\u0020' && c <= '\u002F')
-                    || (c >= '\u003A' && c <= '\u0040')
-                    || (c >= '\u005B' && c <= '\u0060') || (c >= '\u007B' && c <= '\u007E'))) {
+            if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r')) {
                 buf[j] = c;
                 j++;
             }

Original file line number	Diff line number	Diff line change
`@@ -417,9 +417,7 @@ public static String toNameKey(String str) {`
`417`	`417`	`if (c >= 'A' && c <= 'Z') {`
`418`	`418`	`c += 0x20;`
`419`	`419`	`}`
`420`		`- if (!((c >= '\t' && c <= '\r') \|\| (c >= '\u0020' && c <= '\u002F')`
`421`		`- \|\| (c >= '\u003A' && c <= '\u0040')`
`422`		`- \|\| (c >= '\u005B' && c <= '\u0060') \|\| (c >= '\u007B' && c <= '\u007E'))) {`
	`420`	`+ if (!(c == ' ' \|\| c == '\t' \|\| c == '\n' \|\| c == '\f' \|\| c == '\r')) {`
`423`	`421`	`buf[j] = c;`
`424`	`422`	`j++;`
`425`	`423`	`}`