Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
2dc07d9
Address deprecation in Nutch codebase
lewismc Sep 24, 2025
4d0431d
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Oct 20, 2025
d97f408
Merge branch 'master' into NUTCH-3130
lewismc Feb 22, 2026
8997e64
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Feb 22, 2026
01239fc
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Feb 22, 2026
5d5920a
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Feb 23, 2026
3a49221
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Feb 23, 2026
1ba758b
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Feb 23, 2026
608f76f
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Feb 23, 2026
e4a6faf
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Feb 23, 2026
cc5d047
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Feb 23, 2026
5656cc4
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Feb 23, 2026
7043fa6
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Feb 23, 2026
6f013c7
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Feb 23, 2026
c0ebcca
Merge branch 'master' into NUTCH-3130
lewismc Feb 23, 2026
a1854bb
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Feb 23, 2026
0d37158
Merge branch 'master' into NUTCH-3130
lewismc Feb 24, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 44 additions & 65 deletions .github/workflows/master-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,9 @@ on:
branches: [master]

# Java Version Strategy:
# - BUILD: Requires Java 17+ (JUnit 6 dependency)
# - RUNTIME: Supports Java 11+ (javac.version=11 produces Java 11 bytecode)
#
# The 'build' job verifies bytecode compilation for both Java 11 and 17 targets.
# The 'runtime-java11' job verifies the built artifacts actually run on Java 11.
# The 'tests' job runs on JDK 17 (required by JUnit 6) with the default
# javac.version=11 bytecode target for backward compatibility.
# - BUILD and RUNTIME: Java 17 only
# - The 'build' job compiles with javac.version=17.
# - The 'tests' job runs on JDK 17.

jobs:
javadoc:
Expand Down Expand Up @@ -85,16 +81,9 @@ jobs:
if: ${{ env.UNKNOWN_LICENSES != '0 Unknown Licenses' }}
run: exit 1

# Build verification with Java bytecode target matrix
# Verifies bytecode compatibility for both Java 11 and Java 17 targets
# Build verification (Java 17 only)
build:
strategy:
fail-fast: false
matrix:
javac-version: ['11', '17']
os: [ubuntu-latest]
runs-on: ${{ matrix.os }}
name: build (javac.version=${{ matrix.javac-version }})
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v5
- name: Set up JDK 17
Expand All @@ -109,16 +98,13 @@ jobs:
key: ${{ runner.os }}-ivy-${{ hashFiles('ivy/ivy.xml', 'src/plugin/**/ivy.xml') }}
restore-keys: |
${{ runner.os }}-ivy-
- name: Build with javac.version=${{ matrix.javac-version }}
run: ant clean runtime -Djavac.version=${{ matrix.javac-version }} -buildfile build.xml
- name: Build with Java 17
run: ant clean runtime -Djavac.version=17 -buildfile build.xml
- name: Verify bytecode version
run: |
# Extract and verify the bytecode version of compiled classes
# Java 11 = major version 55, Java 17 = major version 61
EXPECTED_VERSION=${{ matrix.javac-version == '11' && '55' || '61' }}
echo "Expected major version: $EXPECTED_VERSION (Java ${{ matrix.javac-version }})"

# Find a real class file (exclude package-info.class which may have different version)
# Java 17 = major version 61
EXPECTED_VERSION=61
echo "Expected major version: $EXPECTED_VERSION (Java 17)"
cd build/classes
CLASS_FILE=$(find . -name "*.class" ! -name "package-info.class" | head -1)
if [ -n "$CLASS_FILE" ]; then
Expand All @@ -135,44 +121,7 @@ jobs:
exit 1
fi

# Verify runtime compatibility on Java 11
# This ensures the built artifacts can actually run on Java 11
runtime-java11:
needs: build
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v5
- name: Set up JDK 17 for building
uses: actions/setup-java@v5
with:
java-version: '17'
distribution: 'temurin'
- name: Cache Ivy dependencies
uses: actions/cache@v4
with:
path: ~/.ivy2/cache
key: ${{ runner.os }}-ivy-${{ hashFiles('ivy/ivy.xml', 'src/plugin/**/ivy.xml') }}
restore-keys: |
${{ runner.os }}-ivy-
- name: Build with Java 11 target
run: ant clean runtime -Djavac.version=11 -buildfile build.xml
- name: Set up JDK 11 for runtime verification
uses: actions/setup-java@v5
with:
java-version: '11'
distribution: 'temurin'
- name: Verify runtime on Java 11
run: |
echo "Verifying Nutch can run on Java 11..."
java -version
cd runtime/local
# Actually load Java classes by running showproperties
# This invokes org.apache.nutch.tools.ShowProperties and verifies the JAR loads
bin/nutch showproperties | head -20
echo "Java 11 runtime verification complete"

# Tests run on JDK 17 (required by JUnit 6) with default javac.version=11
# Java 11 runtime compatibility is verified by the runtime-java11 job
# Tests run on JDK 17
tests:
strategy:
fail-fast: false
Expand Down Expand Up @@ -212,16 +161,19 @@ jobs:
- '.github/workflows/*'
# run if the build configuration or both 'core' and 'plugins' files were changed
- name: test all
  id: build_all
  if: ${{ steps.filter.outputs.buildconf == 'true' || ( steps.filter.outputs.core == 'true' && steps.filter.outputs.plugins == 'true' ) }}
  # Request bash explicitly: GitHub Actions then runs the script with
  # `-eo pipefail`, so a failing ant run fails the step even though its
  # output is piped through tee (the default shell omits pipefail).
  shell: bash
  # build.log is read by the later "Check for deprecation warnings" step.
  run: ant clean test -buildfile build.xml | tee build.log
# run only if 'core' files were changed
- name: test core
  id: build_core
  if: ${{ steps.filter.outputs.core == 'true' && steps.filter.outputs.plugins == 'false' && steps.filter.outputs.buildconf == 'false' }}
  # Request bash explicitly: GitHub Actions then runs the script with
  # `-eo pipefail`, so a failing ant run fails the step even though its
  # output is piped through tee (the default shell omits pipefail).
  shell: bash
  # build.log is read by the later "Check for deprecation warnings" step.
  run: ant clean test-core -buildfile build.xml | tee build.log
# run only if 'plugins' files were changed
- name: test plugins
  id: build_plugins
  if: ${{ steps.filter.outputs.plugins == 'true' && steps.filter.outputs.core == 'false' && steps.filter.outputs.buildconf == 'false' }}
  # Request bash explicitly: GitHub Actions then runs the script with
  # `-eo pipefail`, so a failing ant run fails the step even though its
  # output is piped through tee (the default shell omits pipefail).
  shell: bash
  # build.log is read by the later "Check for deprecation warnings" step.
  run: ant clean test-plugins -buildfile build.xml | tee build.log
# run indexer integration tests when indexer plugin files change (Docker required, ubuntu-latest only)
- name: test indexer integration
if: ${{ steps.filter.outputs.indexer_plugins == 'true' && matrix.os == 'ubuntu-latest' }}
Expand All @@ -237,6 +189,32 @@ jobs:
else
echo "has_results=false" >> $GITHUB_OUTPUT
fi
# check for deprecation warnings in build output
# Scenario 1: Approved deprecations are allowlisted and do not fail the build.
# Scenario 2: Any other deprecation in the Nutch codebase fails the build.
- name: Check for deprecation warnings
  if: always()
  run: |
    # build.log is written by the "test all" / "test core" / "test plugins"
    # steps; when none of them ran there is nothing to inspect.
    [ -f build.log ] || { echo "⚠️ build.log not found, skipping deprecation check."; exit 0; }

    # Collect every javac deprecation warning line (grep exits non-zero on
    # "no match", hence the `|| true`).
    found=$(grep -iE "warning: \[deprecation\]" build.log || true)
    if [ -n "$found" ]; then
      # Allowlist of source files whose deprecation warnings are accepted:
      # deprecated Nutch classes (e.g. SpellCheckedMetadata), test mocks that
      # implement deprecated Hadoop API, and classes using the commons-cli
      # Option.Builder.build() / HelpFormatter API. commons-cli is pinned in
      # ivy/ivy.xml to stay aligned with Hadoop; review this list when that
      # pin changes.
      APPROVED_PATTERN="SpellCheckedMetadata\.java|CrawlDBTestUtil\.java|CrawlDbUpdateUtil\.java|WebGraph\.java|ScoreUpdater\.java|NodeReader\.java|NodeDumper\.java|LinkRank\.java|LinkDumper\.java|ResolveUrls\.java|NutchServer\.java|CrawlCompletionStats\.java|FileDumper\.java|CommonCrawlDataDumper\.java|MimeTypeIndexingFilter\.java"
      # Drop every warning line that mentions an allowlisted file.
      rejected=$(echo "$found" | grep -v -E "$APPROVED_PATTERN" || true)
      if [ -z "$rejected" ]; then
        echo "✅ Deprecation warnings only in approved deprecated code."
        exit 0
      fi
      # Anything left over is a deprecation outside the allowlist.
      echo "❌ Unapproved Java deprecation warnings detected! Failing the build."
      echo "$rejected"
      exit 1
    fi
    echo "✅ No Java deprecation warnings found."
    exit 0
- name: Upload Test Report
uses: actions/upload-artifact@v4
if: always() && matrix.os == 'ubuntu-latest' && steps.check_tests.outputs.has_results == 'true'
Expand All @@ -246,6 +224,7 @@ jobs:
./build/test/TEST-*.xml
./build/**/test/TEST-*.xml
retention-days: 1
overwrite: true
- name: Upload Coverage Data
uses: actions/upload-artifact@v4
if: always() && matrix.os == 'ubuntu-latest'
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Apache Nutch README

[![master pull request ci](https://github.com/apache/nutch/actions/workflows/master-build.yml/badge.svg)](https://github.com/apache/nutch/actions/workflows/master-build.yml)
[![Quality Gate Status](https://sonarcloud.io/api/project_badges/measure?project=apache_nutch&metric=alert_status)](https://sonarcloud.io/summary/new_code?id=apache_nutch)
[![Smoke Tests](https://ci-builds.apache.org/job/Nutch/job/Nutch-Smoke-Test-Single-Node-Hadoop-Cluster/badge/icon?style=plastic&subject=Smoke%20Tests)](https://ci-builds.apache.org/job/Nutch/job/Nutch-Smoke-Test-Single-Node-Hadoop-Cluster/)

<img src="https://nutch.apache.org/assets/img/nutch_logo_tm.png" align="right" width="300" />

Expand Down
23 changes: 8 additions & 15 deletions build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,12 @@
</path>

<presetdef name="javac">
<javac includeantruntime="false" />
<javac includeantruntime="false"
encoding="${build.encoding}"
debug="${javac.debug}"
optimize="${javac.optimize}"
release="${javac.version}"
deprecation="${javac.deprecation}"/>
</presetdef>

<target name="dependencytree" depends="resolve-default" description="Show dependency tree">
Expand Down Expand Up @@ -141,15 +146,9 @@

<target name="compile-core" depends="init, resolve-default" description="--> compile core Java files only">
<javac
encoding="${build.encoding}"
srcdir="${src.dir}"
includes="org/apache/nutch/**/*.java"
destdir="${build.classes}"
debug="${javac.debug}"
optimize="${javac.optimize}"
target="${javac.version}"
source="${javac.version}"
deprecation="${javac.deprecation}">
destdir="${build.classes}">
<compilerarg value="-Xlint:-path"/>
<classpath refid="classpath"/>
</javac>
Expand Down Expand Up @@ -470,15 +469,9 @@
<!-- ================================================================== -->
<target name="compile-core-test" depends="init, compile-core, resolve-test" description="--> compile test code">
<javac
encoding="${build.encoding}"
srcdir="${test.src.dir}"
includes="org/apache/nutch/**/*.java"
destdir="${test.build.classes}"
debug="${javac.debug}"
optimize="${javac.optimize}"
target="${javac.version}"
source="${javac.version}"
deprecation="${javac.deprecation}">
destdir="${test.build.classes}">
<compilerarg value="-Xlint:-path"/>
<classpath refid="test.classpath"/>
</javac>
Expand Down
7 changes: 2 additions & 5 deletions default.properties
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,8 @@ javac.optimize=on
javac.deprecation=on

# Java bytecode target version for compiled classes.
# Set to 11 for backward-compatible runtime (works on Java 11+).
# Note: Building and running tests requires Java 17+ (JUnit 6 requirement),
# but the compiled artifacts will run on Java 11+.
# Override with: ant -Djavac.version=17 to target Java 17 bytecode.
javac.version=11
# Project requires Java 17+ for build and runtime
javac.version=17

runtime.dir=./runtime
runtime.deploy=${runtime.dir}/deploy
Expand Down
3 changes: 3 additions & 0 deletions ivy/ivy.xml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@
<dependency org="org.apache.commons" name="commons-jexl3" rev="3.6.0" conf="*->default" />
<dependency org="com.tdunning" name="t-digest" rev="3.3" />

<!-- Aligned with Hadoop 3.4.2; use Option.builder()...build() and new HelpFormatter(). Review when upgrading Hadoop. -->
<dependency org="commons-cli" name="commons-cli" rev="1.9.0" conf="*->default" force="true" />

<!-- Hadoop Dependencies -->
<dependency org="org.apache.hadoop" name="hadoop-common" rev="3.4.2" conf="*->default">
<exclude org="ch.qos.reload4j" name="*"/>
Expand Down
5 changes: 3 additions & 2 deletions sonar-project.properties
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,11 @@ sonar.links.scm=https://github.com/apache/nutch
sonar.links.issue=https://issues.apache.org/jira/projects/NUTCH/issues
sonar.links.ci=https://github.com/apache/nutch/actions

sonar.sources=src/java,src/plugin
sonar.sources=src/java,src/plugin,src/bin,docker,conf
sonar.tests=src/test,src/plugin
sonar.test.inclusions=**/src/test/**/*.java,**/Test*.java,**/*IT.java
sonar.exclusions=**/build.xml,**/build-ivy.xml,**/build-plugin.xml,**/ivy.xml,**/plugin.xml
sonar.exclusions=**/build.xml,**/build-ivy.xml,**/build-plugin.xml,**/ivy.xml,\
**/sample/**,**/data/**,src/testresources/**,src/java/overview.html
sonar.source.encoding=UTF-8
sonar.java.source=17

Expand Down
78 changes: 27 additions & 51 deletions src/java/org/apache/nutch/crawl/CrawlDbReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,47 +16,27 @@
*/
package org.apache.nutch.crawl;

import java.io.Closeable;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.fasterxml.jackson.core.JsonGenerationException;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.json.JsonWriteFeature;
import com.fasterxml.jackson.core.util.MinimalPrettyPrinter;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectWriter;
import com.fasterxml.jackson.databind.SerializerProvider;
import com.fasterxml.jackson.databind.module.SimpleModule;
import com.tdunning.math.stats.MergingDigest;
import com.tdunning.math.stats.TDigest;
import org.apache.commons.jexl3.JexlScript;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.*;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
Expand All @@ -67,26 +47,22 @@
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.AbstractChecker;
import org.apache.nutch.util.JexlUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.SegmentReaderUtil;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.core.JsonGenerationException;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.util.MinimalPrettyPrinter;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectWriter;
import com.fasterxml.jackson.databind.SerializerProvider;
import com.fasterxml.jackson.databind.module.SimpleModule;
import com.tdunning.math.stats.MergingDigest;
import com.tdunning.math.stats.TDigest;
import java.io.Closeable;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* Read utility for the CrawlDB.
Expand Down Expand Up @@ -263,7 +239,7 @@ protected static class LineRecordWriter
public LineRecordWriter(DataOutputStream out) {
this.out = out;
jsonMapper.getFactory()
.configure(JsonGenerator.Feature.ESCAPE_NON_ASCII, true);
.configure(JsonWriteFeature.ESCAPE_NON_ASCII.mappedFeature(), true);
SimpleModule module = new SimpleModule();
module.addSerializer(Writable.class, new WritableSerializer());
jsonMapper.registerModule(module);
Expand Down
Loading