Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion sonar-project.properties
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ sonar.links.ci=https://github.com/apache/nutch/actions
sonar.sources=src/java,src/plugin
sonar.tests=src/test,src/plugin
sonar.test.inclusions=**/src/test/**/*.java,**/Test*.java,**/*IT.java
sonar.exclusions=**/build.xml,**/build-ivy.xml,**/build-plugin.xml,**/ivy.xml,**/plugin.xml
# Exclude build/config files and plugin resource directories (no Java code in conf, data, sample)
sonar.exclusions=**/build.xml,**/build-ivy.xml,**/build-plugin.xml,**/ivy.xml,**/plugin.xml,**/src/plugin/**/conf/**,**/src/plugin/**/data/**,**/src/plugin/**/sample/**
sonar.source.encoding=UTF-8
sonar.java.source=17

Expand Down
23 changes: 20 additions & 3 deletions src/java/org/apache/nutch/parse/ParseOutputFormat.java
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map.Entry;

Expand All @@ -73,7 +74,23 @@ public class ParseOutputFormat extends OutputFormat<Text, Parse> {
NUMBER_FORMAT.setMinimumIntegerDigits(5);
NUMBER_FORMAT.setGroupingUsed(false);
}


/**
 * Splits the value of the <code>db.parsemeta.to.crawldb</code> property into
 * individual metadata keys. The value is cut at every comma with a literal
 * split (no regular expression with quantifiers is involved), each piece is
 * trimmed, and pieces that end up empty are dropped.
 *
 * @param value
 *          raw, comma-separated property value; may be null or empty
 * @return the trimmed, non-empty keys in their original order; an empty
 *         array (never null) if there are none
 */
static String[] getParseMetaToCrawlDBKeys(String value) {
  List<String> keys = new ArrayList<>();
  if (value != null) {
    // limit -1 keeps trailing empty tokens; they are discarded below anyway
    for (String token : value.split(",", -1)) {
      String key = token.trim();
      if (!key.isEmpty()) {
        keys.add(key);
      }
    }
  }
  return keys.toArray(new String[0]);
}

private static class SimpleEntry implements Entry<Text, CrawlDatum> {
private Text key;
private CrawlDatum value;
Expand Down Expand Up @@ -177,8 +194,8 @@ public RecordWriter<Text, Parse> getRecordWriter(TaskAttemptContext context)
Path data = new Path(new Path(out, ParseData.DIR_NAME), name);
Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name);

final String[] parseMDtoCrawlDB = conf.get("db.parsemeta.to.crawldb", "")
.split(" *, *");
final String[] parseMDtoCrawlDB = getParseMetaToCrawlDBKeys(
conf.get("db.parsemeta.to.crawldb", ""));

// textOut Options
final MapFile.Writer textOut;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.net.URL;
import java.net.MalformedURLException;
import java.nio.charset.StandardCharsets;
Expand Down Expand Up @@ -64,15 +62,10 @@ public class HtmlParser implements Parser {
// NUTCH-2042 (cf. TIKA-357): increased to 8 kB
private static final int CHUNK_SIZE = 8192;

// NUTCH-1006 Meta equiv with single quotes not accepted
private static Pattern metaPattern = Pattern.compile(
"<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>",
Pattern.CASE_INSENSITIVE);
private static Pattern charsetPattern = Pattern.compile(
"charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);
private static Pattern charsetPatternHTML5 = Pattern.compile(
"<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>",
Pattern.CASE_INSENSITIVE);
private static final String META_TAG_START = "<meta";
private static final String CHARSET_EQ = "charset=";
private static final String HTTP_EQUIV = "http-equiv";
private static final String CONTENT_TYPE = "content-type";

private String parserImpl;

Expand All @@ -93,6 +86,82 @@ public class HtmlParser implements Parser {
* <code>byte[]</code> representation of an html file
*/

/**
 * Extracts the charset name that follows the first <code>charset=</code>
 * marker found in <code>s</code> at or after <code>fromIndex</code>, e.g.
 * "utf-8" from <code>charset=utf-8</code>, <code>charset = "utf-8"</code> is
 * not accepted (no whitespace is allowed before the equals sign), but
 * <code>charset= 'utf-8'</code> is.
 *
 * <p>
 * The marker is matched case-insensitively, because callers pass tag text in
 * its original case and HTML attribute names are not case-sensitive.
 * Whitespace after the equals sign and one optional opening quote are
 * skipped. The value must start with an ASCII letter and continues over
 * ASCII letters, digits, '_' and '-'. Only linear scans are used, no
 * backtracking regular expression.
 *
 * @param s
 *          text to scan, typically the content of a single meta tag
 * @param fromIndex
 *          index to start searching from; negative values are treated as 0
 * @return the charset name in its original case, or null if there is no
 *         marker or it is not followed by a well-formed value
 */
private static String extractCharsetValue(String s, int fromIndex) {
  final int markerLength = CHARSET_EQ.length();
  final int lastCandidate = s.length() - markerLength;

  // Case-insensitive search for the marker; regionMatches compares char by
  // char and does not depend on the default locale.
  int markerIndex = -1;
  for (int i = Math.max(fromIndex, 0); i <= lastCandidate; i++) {
    if (s.regionMatches(true, i, CHARSET_EQ, 0, markerLength)) {
      markerIndex = i;
      break;
    }
  }
  if (markerIndex < 0) {
    return null;
  }

  int start = markerIndex + markerLength;
  while (start < s.length() && Character.isWhitespace(s.charAt(start))) {
    start++;
  }
  // optional opening quote
  if (start < s.length()
      && (s.charAt(start) == '"' || s.charAt(start) == '\'')) {
    start++;
  }
  if (start >= s.length()) {
    return null;
  }

  // The first character must be a letter, whether or not the value is quoted.
  char first = s.charAt(start);
  boolean firstIsLetter = (first >= 'a' && first <= 'z')
      || (first >= 'A' && first <= 'Z');
  if (!firstIsLetter) {
    return null;
  }

  // Any character outside [a-zA-Z0-9_-] (quote, ';', '>', whitespace, ...)
  // ends the value.
  int end = start + 1;
  while (end < s.length()) {
    char c = s.charAt(end);
    boolean allowed = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
        || (c >= '0' && c <= '9') || c == '_' || c == '-';
    if (!allowed) {
      break;
    }
    end++;
  }
  return s.substring(start, end);
}

/**
 * Finds the charset declared by the first meta tag of an HTML string that
 * carries one. Both the HTML4 form
 * (<code>&lt;meta http-equiv="Content-Type" content="...; charset=..."&gt;</code>)
 * and the HTML5 form (<code>&lt;meta charset="..."&gt;</code>) place a
 * <code>charset=</code> marker inside the tag, so every meta tag is handed to
 * {@link #extractCharsetValue(String, int)} and the first non-null result in
 * document order wins.
 *
 * <p>
 * The tag name is matched case-insensitively directly on <code>str</code>
 * instead of on a lower-cased copy: lower-casing depends on the default
 * locale and can change the length of a string, which would make offsets
 * found in the copy invalid for the original. Only linear scans are used (no
 * backtracking regular expression). Package-private for unit testing.
 *
 * @param str
 *          HTML text to scan; may be null
 * @return the declared charset name, or null if none is found
 */
static String extractCharsetFromMeta(String str) {
  if (str == null) {
    return null;
  }
  final int nameLength = META_TAG_START.length();
  int pos = 0;
  // "<" so that at least one character can follow the tag name
  while (pos + nameLength < str.length()) {
    if (!str.regionMatches(true, pos, META_TAG_START, 0, nameLength)
        // require whitespace after the name so that longer tag names sharing
        // the prefix (e.g. "<metadata") are not mistaken for a meta tag
        || !Character.isWhitespace(str.charAt(pos + nameLength))) {
      pos++;
      continue;
    }
    int tagEnd = str.indexOf('>', pos);
    if (tagEnd < 0) {
      // unterminated tag: nothing more to inspect
      break;
    }
    String charset = extractCharsetValue(str.substring(pos, tagEnd), 0);
    if (charset != null) {
      return charset;
    }
    pos = tagEnd + 1;
  }
  return null;
}

private static String sniffCharacterEncoding(byte[] content) {
int length = content.length < CHUNK_SIZE ? content.length : CHUNK_SIZE;

Expand All @@ -102,20 +171,7 @@ private static String sniffCharacterEncoding(byte[] content) {
// {U+0041, U+0082, U+00B7}.
String str = new String(content, 0, length, StandardCharsets.US_ASCII);

Matcher metaMatcher = metaPattern.matcher(str);
String encoding = null;
if (metaMatcher.find()) {
Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
if (charsetMatcher.find())
encoding = charsetMatcher.group(1);
}
if (encoding == null) {
// check for HTML5 meta charset
metaMatcher = charsetPatternHTML5.matcher(str);
if (metaMatcher.find()) {
encoding = metaMatcher.group(1);
}
}
String encoding = extractCharsetFromMeta(str);
if (encoding == null) {
// check for BOM
if (content.length >= 3 && content[0] == (byte) 0xEF
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@

import static org.junit.jupiter.api.Assertions.*;

public class TestHtmlParser {
class TestHtmlParser {

private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
Expand Down Expand Up @@ -105,7 +105,7 @@ protected Parse parse(byte[] contentBytes) {
}

@Test
public void testEncodingDetection() {
void testEncodingDetection() {
for (String[] testPage : encodingTestPages) {
String name = testPage[0];
Charset charset = Charset.forName(testPage[1]);
Expand All @@ -131,7 +131,7 @@ public void testEncodingDetection() {
}

@Test
public void testResolveBaseUrl() {
void testResolveBaseUrl() {
byte[] contentBytes = resolveBaseUrlTestContent
.getBytes(StandardCharsets.UTF_8);
// parse using http://example.com/ as "fetch" URL
Expand All @@ -143,4 +143,30 @@ public void testResolveBaseUrl() {
outlinks[0].getToUrl());
}

/**
 * Verifies {@code HtmlParser.extractCharsetFromMeta} for input without any
 * charset declaration, for HTML4 http-equiv declarations, for HTML5 meta
 * charset declarations, and for a document containing both forms.
 */
@Test
void testExtractCharsetFromMeta() {
  // input that declares no charset at all
  String[] withoutCharset = { "", "<html><head></head></html>" };
  for (String html : withoutCharset) {
    assertNull(HtmlParser.extractCharsetFromMeta(html));
  }

  // { expected charset, HTML snippet }
  String[][] declarations = {
      { "utf-8",
          "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />" },
      { "utf-8",
          "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />" },
      { "ISO-8859-1",
          "<meta http-equiv=Content-Type content=\"text/html; charset=ISO-8859-1\">" },
      { "utf-8", "<meta charset=\"utf-8\">" },
      { "utf-8", "<meta charset='utf-8'>" },
      { "utf-8", "<meta charset=utf-8>" },
      // the content-type meta comes first and therefore wins
      { "windows-1252",
          "<meta http-equiv=\"Content-Type\" content=\"charset=windows-1252\">"
              + "<meta charset=\"utf-8\">" } };
  for (String[] declaration : declarations) {
    assertEquals(declaration[0],
        HtmlParser.extractCharsetFromMeta(declaration[1]));
  }
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,6 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilter;
Expand Down Expand Up @@ -188,13 +186,72 @@ public ParseResult getParse(Content c) {
return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd));
}

private static final Pattern STRING_PATTERN = Pattern.compile(
"(\\\\*(?:\"|\'))([^\\s\"\']+?)(?:\\1)",
Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
// A simple pattern. This allows also invalid URL characters.
private static final Pattern URI_PATTERN = Pattern.compile(
"(^|\\s*?)/?\\S+?[/\\.]\\S+($|\\s*)",
Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
/**
 * Extracts the content of quoted strings (single or double quotes) from
 * JavaScript text. Inside a string a backslash escapes the next character:
 * the backslash is dropped and the following character is taken literally.
 * Each string's content is trimmed; strings that are empty after trimming
 * are skipped. A quote that is never closed is ignored and scanning resumes
 * with the character after it.
 *
 * <p>
 * The scan is linear in the input length. If a quote of one type turns out
 * to be unterminated, every later opening quote of the same type is
 * unterminated as well (from that point on both scans would be in the same
 * state), so it is skipped without rescanning the rest of the input. Without
 * this, input such as a quote followed by many escaped quotes would cost
 * quadratic time. Package-private for unit testing.
 *
 * @param plainText
 *          JavaScript source text; may be null
 * @return contents of all terminated, non-blank quoted strings in order of
 *         appearance (never null)
 */
static List<String> extractQuotedStrings(String plainText) {
  List<String> result = new ArrayList<>();
  if (plainText == null) {
    return result;
  }
  final int length = plainText.length();
  boolean doubleUnterminated = false;
  boolean singleUnterminated = false;
  StringBuilder content = new StringBuilder();

  int i = 0;
  while (i < length) {
    char quote = plainText.charAt(i);
    boolean isDouble = quote == '"';
    if (!isDouble && quote != '\'') {
      i++;
      continue;
    }
    // a previous scan already proved this quote type cannot be closed
    if (isDouble ? doubleUnterminated : singleUnterminated) {
      i++;
      continue;
    }

    content.setLength(0);
    boolean closed = false;
    int j = i + 1;
    while (j < length) {
      char c = plainText.charAt(j);
      if (c == '\\') {
        // keep the escaped character, drop the backslash; a trailing
        // backslash at the very end of the input contributes nothing
        if (j + 1 < length) {
          content.append(plainText.charAt(j + 1));
        }
        j += 2;
      } else if (c == quote) {
        closed = true;
        break;
      } else {
        content.append(c);
        j++;
      }
    }

    if (closed) {
      String s = content.toString().trim();
      if (!s.isEmpty()) {
        result.add(s);
      }
      i = j + 1;
    } else {
      if (isDouble) {
        doubleUnterminated = true;
      } else {
        singleUnterminated = true;
      }
      i++;
    }
  }
  return result;
}

/**
 * Checks whether a string looks like a URI or path: after trimming it must
 * be non-empty, contain at least one '.' or '/', and contain no whitespace
 * of any kind (space, tab, line break, ...). Line breaks are rejected too,
 * so that text spanning several lines is not taken for a link. The check is
 * a single linear pass, no regular expression is involved. Package-private
 * for unit testing.
 *
 * @param s
 *          candidate string; may be null
 * @return true if the trimmed string contains '.' or '/' and no whitespace
 */
static boolean looksLikeUri(String s) {
  if (s == null) {
    return false;
  }
  String candidate = s.trim();
  boolean hasSeparator = false;
  for (int i = 0; i < candidate.length(); i++) {
    char c = candidate.charAt(i);
    if (Character.isWhitespace(c)) {
      return false;
    }
    if (c == '.' || c == '/') {
      hasSeparator = true;
    }
  }
  // an empty candidate never sets the flag and is therefore rejected
  return hasSeparator;
}

// Alternative pattern, which limits valid url characters.
// private static final String URI_PATTERN =
Expand All @@ -216,14 +273,10 @@ private Outlink[] getJSLinks(String plainText, String anchor, String base) {

try {

Matcher matcher = STRING_PATTERN.matcher(plainText);

String url;
List<String> quotedStrings = extractQuotedStrings(plainText);

while (matcher.find()) {
url = matcher.group(2);
Matcher matcherUri = URI_PATTERN.matcher(url);
if (!matcherUri.matches()) {
for (String url : quotedStrings) {
if (!looksLikeUri(url)) {
continue;
}
if (url.startsWith("www.")) {
Expand Down
Loading