diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml
index 76d98db4eb..31b06236a4 100644
--- a/.github/workflows/master-build.yml
+++ b/.github/workflows/master-build.yml
@@ -85,6 +85,29 @@ jobs:
if: ${{ env.UNKNOWN_LICENSES != '0 Unknown Licenses' }}
run: exit 1
+  openapi-lint:  # lints openapi.yaml; tool steps run only if spec/ruleset changed
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v5
+      - uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36  # pinned SHA (v3.0.2)
+        id: filter
+        with:
+          filters: |
+            openapi:
+              - 'openapi.yaml'
+              - '.spectral.yaml'
+      - name: Set up Node.js
+        if: steps.filter.outputs.openapi == 'true'
+        uses: actions/setup-node@v4
+        with:
+          node-version: '22'  # quoted string; Node 18 is end-of-life, 22 is the active LTS
+      - name: Install IBM OpenAPI Validator
+        if: steps.filter.outputs.openapi == 'true'
+        run: npm install --no-save ibm-openapi-validator@1  # stay on major 1 for reproducible lint results
+      - name: Validate OpenAPI specification
+        if: steps.filter.outputs.openapi == 'true'
+        run: ./node_modules/.bin/lint-openapi openapi.yaml  # uses the locally installed CLI
+
# Build verification with Java bytecode target matrix
# Verifies bytecode compatibility for both Java 11 and Java 17 targets
build:
diff --git a/.spectral.yaml b/.spectral.yaml
new file mode 100644
index 0000000000..6e42544a12
--- /dev/null
+++ b/.spectral.yaml
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Custom Spectral ruleset for the Apache Nutch REST API.
+# Extends the IBM Cloud Validation Ruleset and disables rules that conflict
+# with the existing Nutch Java wire format (camelCase properties, UPPER_CASE
+# enums, camelCase parameters, bare-array responses, map-typed responses,
+# and consecutive path parameter segments).
+
+extends: "@ibm-cloud/openapi-ruleset"
+rules:
+  ibm-property-casing-convention: "off"  # Nutch wire format uses camelCase properties
+  ibm-enum-casing-convention: "off"  # Nutch wire format uses UPPER_CASE enums
+  ibm-parameter-casing-convention: "off"  # Nutch wire format uses camelCase parameters
+  ibm-no-array-responses: "off"  # some responses are bare arrays
+  ibm-accept-and-return-models: "off"  # some responses are map-typed
+  ibm-no-consecutive-path-parameter-segments: "off"  # some paths chain two path parameters
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 93985f2289..68fa8996f4 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -18,8 +18,8 @@
# BUILD_MODE can be either
# 0 == Nutch master branch source install with 'crawl' and 'nutch' scripts on PATH
-# 1 == Same as mode 0 with addition of Nutch REST Server
-# 2 == Same as mode 1 with addition of Nutch WebApp
+# 1 == (DEPRECATED) Same as mode 0 with addition of Nutch REST Server
+# 2 == (DEPRECATED) Same as mode 1 with addition of Nutch WebApp
ARG BUILD_MODE=0
FROM alpine:3.19 AS base
@@ -67,6 +67,7 @@ RUN echo "Nutch master branch source install with 'crawl' and 'nutch' scripts on
FROM base AS branch-version-1
+RUN echo "WARNING: BUILD_MODE=1 (server) is deprecated and will be removed in a future version of Nutch. The Nutch REST server is superseded by the OpenAPI specification (openapi.yaml)."
RUN echo "Nutch master branch source install with 'crawl' and 'nutch' scripts on PATH and Nutch REST Server on $SERVER_HOST:$SERVER_PORT"
ARG SERVER_PORT=8081
ARG SERVER_HOST=0.0.0.0
@@ -86,6 +87,7 @@ ENTRYPOINT [ "supervisord", "--nodaemon", "--configuration", "/etc/supervisord.c
FROM base AS branch-version-2
+RUN echo "WARNING: BUILD_MODE=2 (server + webapp) is deprecated and will be removed in a future version of Nutch. The Nutch REST server and webapp are superseded by the OpenAPI specification (openapi.yaml)."
RUN echo "Nutch master branch source install with 'crawl' and 'nutch' scripts on PATH, Nutch REST Server on $SERVER_HOST:$SERVER_PORT and WebApp on this container port $WEBAPP_PORT"
ARG SERVER_PORT=8081
ARG SERVER_HOST=0.0.0.0
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index b6b8f67a9d..55c63ca4bc 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -90,15 +90,20 @@
+
+
+
+
-
diff --git a/openapi.yaml b/openapi.yaml
new file mode 100644
index 0000000000..92065aebeb
--- /dev/null
+++ b/openapi.yaml
@@ -0,0 +1,1327 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+openapi: "3.1.2"
+
+info:
+ title: Apache Nutch REST API
+ description: >-
+ REST API for managing Apache Nutch crawl jobs, configurations, seed lists,
+ database queries, and data readers. The server is powered by Apache CXF
+ with JAX-RS.
+ version: "1.0.0"
+ license:
+ name: Apache 2.0
+ identifier: Apache-2.0
+ contact:
+ name: Apache Nutch
+ url: https://nutch.apache.org
+
+servers:
+ - url: "{protocol}://localhost:{port}"
+ description: Nutch REST server
+ variables:
+ protocol:
+ default: http
+ enum:
+ - http
+ - https
+ description: The protocol used to access the Nutch server.
+ port:
+ default: "8081"
+ description: >-
+ The port the Nutch server listens on. Configurable via the --port
+ command-line argument.
+
+security:
+ - basicAuth: []
+
+tags:
+ - name: Admin
+ description: Server administration operations
+ - name: Configuration
+ description: Manage Nutch configurations
+ - name: Job
+ description: Manage crawl jobs
+ - name: Database
+ description: Query the CrawlDB and FetchDB
+ - name: Seed
+ description: Manage seed URL lists
+ - name: Reader
+ description: Read sequence files and webgraph data
+ - name: Services
+ description: Auxiliary service operations such as CommonCrawl data dumps
+
+paths:
+ # ---------------------------------------------------------------------------
+ # Admin
+ # ---------------------------------------------------------------------------
+ /admin/:
+ get:
+ tags:
+ - Admin
+ summary: Get server status
+ description: >-
+ Returns the current status of the Nutch server including start date,
+ known configurations, all jobs, and currently running jobs.
+ operationId: getServerStatus
+ responses:
+ "200":
+ description: Server status retrieved successfully.
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/NutchServerInfo"
+ "401":
+ $ref: "#/components/responses/Unauthorized"
+ "500":
+ $ref: "#/components/responses/InternalServerError"
+
+ /admin/stop:
+ get:
+ tags:
+ - Admin
+ summary: Stop the Nutch server
+ description: >-
+ Initiates a graceful shutdown of the Nutch server. If jobs are still
+ running and force is not set, the server will refuse to stop.
+ operationId: stopServer
+ parameters:
+ - name: force
+ in: query
+ required: false
+ description: >-
+ If true, kills any running jobs before stopping the server.
+ schema:
+ type: boolean
+ default: false
+ responses:
+ "200":
+ description: Shutdown status message.
+ content:
+ application/json:
+ schema:
+ type: string
+ examples:
+ stopping:
+ value: "Stopping in server on port 8081"
+ busy:
+ value: "Jobs still running -- Cannot stop server now"
+ "401":
+ $ref: "#/components/responses/Unauthorized"
+ "500":
+ $ref: "#/components/responses/InternalServerError"
+
+ # ---------------------------------------------------------------------------
+ # Configuration
+ # ---------------------------------------------------------------------------
+ /config/:
+ get:
+ tags:
+ - Configuration
+ summary: List all configuration IDs
+ description: Returns the set of all known configuration identifiers.
+ operationId: getConfigs
+ responses:
+ "200":
+ description: A JSON array of configuration ID strings.
+ content:
+ application/json:
+ schema:
+ type: array
+ items:
+ type: string
+ uniqueItems: true
+ example:
+ - default
+ - my-custom-config
+ "401":
+ $ref: "#/components/responses/Unauthorized"
+ "500":
+ $ref: "#/components/responses/InternalServerError"
+
+ /config/{configId}:
+ get:
+ tags:
+ - Configuration
+ summary: Get configuration properties
+ description: Returns all key-value properties for the specified configuration.
+ operationId: getConfig
+ parameters:
+ - $ref: "#/components/parameters/configId"
+ responses:
+ "200":
+ description: A JSON object of configuration property key-value pairs.
+ content:
+ application/json:
+ schema:
+ type: object
+ additionalProperties:
+ type: string
+ example:
+ http.agent.name: "NutchBot"
+ http.robots.agents: "NutchBot,*"
+ "401":
+ $ref: "#/components/responses/Unauthorized"
+ "404":
+ $ref: "#/components/responses/NotFound"
+ "500":
+ $ref: "#/components/responses/InternalServerError"
+ delete:
+ tags:
+ - Configuration
+ summary: Delete a configuration
+ description: >-
+ Removes the specified configuration from the list of known
+ configurations.
+ operationId: deleteConfig
+ parameters:
+ - $ref: "#/components/parameters/configId"
+ responses:
+ "204":
+ description: Configuration deleted successfully.
+ "401":
+ $ref: "#/components/responses/Unauthorized"
+ "404":
+ $ref: "#/components/responses/NotFound"
+ "500":
+ $ref: "#/components/responses/InternalServerError"
+
+ /config/{configId}/{propertyId}:
+ get:
+ tags:
+ - Configuration
+ summary: Get a single configuration property
+ description: >-
+ Returns the value of a specific property within the given
+ configuration.
+ operationId: getProperty
+ parameters:
+ - $ref: "#/components/parameters/configId"
+ - $ref: "#/components/parameters/propertyId"
+ responses:
+ "200":
+ description: The property value as plain text.
+ content:
+ text/plain:
+ schema:
+ type: string
+ example: "NutchBot"
+ "401":
+ $ref: "#/components/responses/Unauthorized"
+ "404":
+ $ref: "#/components/responses/NotFound"
+ "500":
+ $ref: "#/components/responses/InternalServerError"
+ put:
+ tags:
+ - Configuration
+ summary: Update a configuration property
+ description: >-
+ Adds or updates the value of a property in the specified
+ configuration.
+ operationId: updateProperty
+ parameters:
+ - $ref: "#/components/parameters/configId"
+ - $ref: "#/components/parameters/propertyId"
+ requestBody:
+ required: true
+ description: The new property value as plain text.
+ content:
+ text/plain:
+ schema:
+ type: string
+ example: "MyNewBot"
+ responses:
+ "200":
+ description: Property updated successfully.
+ "400":
+ $ref: "#/components/responses/BadRequest"
+ "401":
+ $ref: "#/components/responses/Unauthorized"
+ "500":
+ $ref: "#/components/responses/InternalServerError"
+
+ /config/create:
+ post:
+ tags:
+ - Configuration
+ summary: Create a new configuration
+ description: >-
+ Creates a new Nutch configuration with the specified parameters.
+ Returns the configuration ID on success.
+ operationId: createConfig
+ requestBody:
+ required: true
+ description: The configuration to create.
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/NutchConfig"
+ responses:
+ "200":
+ description: Configuration created. Returns the configuration ID.
+ content:
+ text/plain:
+ schema:
+ type: string
+ example: "my-custom-config"
+ "400":
+ $ref: "#/components/responses/BadRequest"
+ "401":
+ $ref: "#/components/responses/Unauthorized"
+ "500":
+ $ref: "#/components/responses/InternalServerError"
+
+ # ---------------------------------------------------------------------------
+ # Job
+ # ---------------------------------------------------------------------------
+ /job/:
+ get:
+ tags:
+ - Job
+ summary: List all jobs
+ description: >-
+ Returns job history for all jobs or filtered by crawl ID, regardless
+ of job state.
+ operationId: getJobs
+ parameters:
+ - name: crawlId
+ in: query
+ required: false
+ description: Optional crawl ID to filter jobs by.
+ schema:
+ type: string
+ responses:
+ "200":
+ description: A JSON array of job information objects.
+ content:
+ application/json:
+ schema:
+ type: array
+ items:
+ $ref: "#/components/schemas/JobInfo"
+ "401":
+ $ref: "#/components/responses/Unauthorized"
+ "500":
+ $ref: "#/components/responses/InternalServerError"
+
+ /job/{id}:
+ get:
+ tags:
+ - Job
+ summary: Get job info
+ description: Returns detailed information for a specific job.
+ operationId: getJobInfo
+ parameters:
+ - $ref: "#/components/parameters/jobId"
+ - name: crawlId
+ in: query
+ required: false
+ description: The crawl ID associated with the job.
+ schema:
+ type: string
+ responses:
+ "200":
+ description: Job details.
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/JobInfo"
+ "401":
+ $ref: "#/components/responses/Unauthorized"
+ "404":
+ $ref: "#/components/responses/NotFound"
+ "500":
+ $ref: "#/components/responses/InternalServerError"
+
+ /job/{id}/stop:
+ get:
+ tags:
+ - Job
+ summary: Stop a running job
+ description: Attempts to gracefully stop a running job.
+ operationId: stopJob
+ parameters:
+ - $ref: "#/components/parameters/jobId"
+ - name: crawlId
+ in: query
+ required: false
+ description: The crawl ID associated with the job.
+ schema:
+ type: string
+ responses:
+ "200":
+ description: Whether the job was successfully stopped.
+ content:
+ application/json:
+ schema:
+ type: boolean
+ example: true
+ "401":
+ $ref: "#/components/responses/Unauthorized"
+ "404":
+ $ref: "#/components/responses/NotFound"
+ "500":
+ $ref: "#/components/responses/InternalServerError"
+
+ /job/{id}/abort:
+ get:
+ tags:
+ - Job
+ summary: Abort a job
+ description: >-
+ Forcefully aborts a job. Unlike stop, this kills the job immediately.
+ operationId: abortJob
+ parameters:
+ - $ref: "#/components/parameters/jobId"
+ - name: crawlId
+ in: query
+ required: false
+ description: The crawl ID associated with the job.
+ schema:
+ type: string
+ responses:
+ "200":
+ description: Whether the job was successfully aborted.
+ content:
+ application/json:
+ schema:
+ type: boolean
+ example: true
+ "401":
+ $ref: "#/components/responses/Unauthorized"
+ "404":
+ $ref: "#/components/responses/NotFound"
+ "500":
+ $ref: "#/components/responses/InternalServerError"
+
+ /job/create:
+ post:
+ tags:
+ - Job
+ summary: Create a new job
+ description: >-
+ Creates and enqueues a new Nutch job (e.g., inject, generate, fetch,
+ parse, updatedb, index).
+ operationId: createJob
+ requestBody:
+ required: true
+ description: The job configuration specifying type, crawl ID, and arguments.
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/JobConfig"
+ responses:
+ "200":
+ description: Job created. Returns the job information.
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/JobInfo"
+ "400":
+ $ref: "#/components/responses/BadRequest"
+ "401":
+ $ref: "#/components/responses/Unauthorized"
+ "500":
+ $ref: "#/components/responses/InternalServerError"
+
+ # ---------------------------------------------------------------------------
+ # Database
+ # ---------------------------------------------------------------------------
+ /db/crawldb:
+ post:
+ tags:
+ - Database
+ summary: Query the CrawlDB
+ description: >-
+ Executes a query against the Nutch CrawlDB. The type field in the
+ request body determines the operation: stats, dump, topN, or url.
+ The stats and url types return JSON; dump and topN return binary
+ octet-stream data.
+ operationId: readCrawlDb
+ requestBody:
+ required: true
+ description: The database query parameters.
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/DbQuery"
+ responses:
+ "200":
+ description: >-
+ Query results. Content type varies by query type: application/json
+ for stats and url queries; application/octet-stream for dump and
+ topN queries.
+ content:
+ application/json:
+ schema:
+ type: object
+ description: >-
+ CrawlDB query result (returned for stats and url query
+ types).
+ application/octet-stream:
+ schema:
+ type: string
+ format: binary
+ description: >-
+ Binary data stream (returned for dump and topN query types).
+ "400":
+ $ref: "#/components/responses/BadRequest"
+ "401":
+ $ref: "#/components/responses/Unauthorized"
+ "500":
+ $ref: "#/components/responses/InternalServerError"
+
+ /db/fetchdb:
+ get:
+ tags:
+ - Database
+ summary: Get FetchDB node information
+ description: >-
+ Returns fetch node database entries for the specified index range.
+ Both from and to default to 0; if to is 0 or exceeds the total
+ number of entries, all entries from the starting index are returned.
+ operationId: fetchDb
+ parameters:
+ - name: from
+ in: query
+ required: false
+ description: Starting index (inclusive). Defaults to 0.
+ schema:
+ type: integer
+ format: int32
+ minimum: 0
+ maximum: 2147483647
+ default: 0
+ - name: to
+ in: query
+ required: false
+ description: Ending index (inclusive). Defaults to 0 (returns all).
+ schema:
+ type: integer
+ format: int32
+ minimum: 0
+ maximum: 2147483647
+ default: 0
+ responses:
+ "200":
+ description: A JSON array of fetch node information objects.
+ content:
+ application/json:
+ schema:
+ type: array
+ items:
+ $ref: "#/components/schemas/FetchNodeDbInfo"
+ "401":
+ $ref: "#/components/responses/Unauthorized"
+ "500":
+ $ref: "#/components/responses/InternalServerError"
+
+ # ---------------------------------------------------------------------------
+ # Seed
+ # ---------------------------------------------------------------------------
+ /seed/:
+ get:
+ tags:
+ - Seed
+ summary: List all seed lists
+ description: Returns a map of all created seed files keyed by name.
+ operationId: getSeedLists
+ responses:
+ "200":
+ description: A JSON object mapping seed list names to SeedList objects.
+ content:
+ application/json:
+ schema:
+ type: object
+ additionalProperties:
+ $ref: "#/components/schemas/SeedList"
+ "401":
+ $ref: "#/components/responses/Unauthorized"
+ "500":
+ $ref: "#/components/responses/InternalServerError"
+
+ /seed/create:
+ post:
+ tags:
+ - Seed
+ summary: Create a seed list file
+ description: >-
+ Creates a seed list file from the provided URLs and writes it to
+ HDFS. Returns the path to the created seed file directory.
+ operationId: createSeedFile
+ requestBody:
+ required: true
+ description: The seed list containing URLs to write.
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/SeedList"
+ responses:
+ "200":
+ description: Path to the created seed file directory.
+ content:
+ text/plain:
+ schema:
+ type: string
+ example: "seedFiles/seed-1700000000000"
+ "400":
+ $ref: "#/components/responses/BadRequest"
+ "401":
+ $ref: "#/components/responses/Unauthorized"
+ "500":
+ $ref: "#/components/responses/InternalServerError"
+
+ # ---------------------------------------------------------------------------
+ # Reader
+ # ---------------------------------------------------------------------------
+ /reader/sequence/read:
+ post:
+ tags:
+ - Reader
+ summary: Read a sequence file
+ description: >-
+ Reads key-value pairs from a Hadoop sequence file. Supports reading
+ all rows, a limited number of rows, a row range, or counting the
+ total number of rows.
+ operationId: seqRead
+ parameters:
+ - $ref: "#/components/parameters/nrows"
+ - $ref: "#/components/parameters/start"
+ - $ref: "#/components/parameters/end"
+ - $ref: "#/components/parameters/count"
+ requestBody:
+ required: true
+ description: Reader configuration specifying the file path.
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ReaderConfig"
+ responses:
+ "200":
+ description: >-
+ Sequence file data. Returns application/json when reading rows,
+ or text/plain when count=true.
+ content:
+ application/json:
+ schema:
+ type: array
+ items:
+ type: object
+ text/plain:
+ schema:
+ type: integer
+ format: int32
+ minimum: 0
+ maximum: 2147483647
+ description: Number of rows in the sequence file.
+ "400":
+ $ref: "#/components/responses/BadRequest"
+ "401":
+ $ref: "#/components/responses/Unauthorized"
+ "500":
+ $ref: "#/components/responses/InternalServerError"
+
+ /reader/link:
+ get:
+ tags:
+ - Reader
+ summary: Get link reader schema
+ description: >-
+ Returns the schema describing the fields in link reader responses.
+ operationId: getLinkSchema
+ responses:
+ "200":
+ description: Link reader response schema.
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/LinkSchema"
+ "401":
+ $ref: "#/components/responses/Unauthorized"
+ "500":
+ $ref: "#/components/responses/InternalServerError"
+
+ /reader/link/read:
+ post:
+ tags:
+ - Reader
+ summary: Read link objects
+ description: >-
+ Reads link data (LinkDatum) from the Nutch webgraph. Supports
+ reading all rows, a limited number of rows, a row range, or
+ counting the total number of rows.
+ operationId: linkRead
+ parameters:
+ - $ref: "#/components/parameters/nrows"
+ - $ref: "#/components/parameters/start"
+ - $ref: "#/components/parameters/end"
+ - $ref: "#/components/parameters/count"
+ requestBody:
+ required: true
+ description: Reader configuration specifying the file path.
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ReaderConfig"
+ responses:
+ "200":
+ description: >-
+ Link data. Returns application/json when reading rows, or
+ text/plain when count=true.
+ content:
+ application/json:
+ schema:
+ type: array
+ items:
+ type: object
+ text/plain:
+ schema:
+ type: integer
+ format: int32
+ minimum: 0
+ maximum: 2147483647
+ description: Number of link entries.
+ "400":
+ $ref: "#/components/responses/BadRequest"
+ "401":
+ $ref: "#/components/responses/Unauthorized"
+ "500":
+ $ref: "#/components/responses/InternalServerError"
+
+ /reader/node:
+ get:
+ tags:
+ - Reader
+ summary: Get node reader schema
+ description: >-
+ Returns the schema describing the fields in node reader responses.
+ operationId: getNodeSchema
+ responses:
+ "200":
+ description: Node reader response schema.
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/NodeSchema"
+ "401":
+ $ref: "#/components/responses/Unauthorized"
+ "500":
+ $ref: "#/components/responses/InternalServerError"
+
+ /reader/node/read:
+ post:
+ tags:
+ - Reader
+ summary: Read node objects
+ description: >-
+ Reads Node objects from the Nutch webgraph. Supports reading all
+ rows, a limited number of rows, a row range, or counting the total
+ number of rows.
+ operationId: nodeRead
+ parameters:
+ - $ref: "#/components/parameters/nrows"
+ - $ref: "#/components/parameters/start"
+ - $ref: "#/components/parameters/end"
+ - $ref: "#/components/parameters/count"
+ requestBody:
+ required: true
+ description: Reader configuration specifying the file path.
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ReaderConfig"
+ responses:
+ "200":
+ description: >-
+ Node data. Returns application/json when reading rows, or
+ text/plain when count=true.
+ content:
+ application/json:
+ schema:
+ type: array
+ items:
+ type: object
+ text/plain:
+ schema:
+ type: integer
+ format: int32
+ minimum: 0
+ maximum: 2147483647
+ description: Number of node entries.
+ "400":
+ $ref: "#/components/responses/BadRequest"
+ "401":
+ $ref: "#/components/responses/Unauthorized"
+ "500":
+ $ref: "#/components/responses/InternalServerError"
+
+ # ---------------------------------------------------------------------------
+ # Services
+ # ---------------------------------------------------------------------------
+ /services/commoncrawldump/{crawlId}:
+ get:
+ tags:
+ - Services
+ summary: List CommonCrawl dump paths
+ description: >-
+ Lists the dump file paths for a given crawl ID.
+ operationId: listDumpPaths
+ parameters:
+ - name: crawlId
+ in: path
+ required: true
+ description: The crawl ID whose dump paths to list.
+ schema:
+ type: string
+ responses:
+ "200":
+ description: Service information containing the list of dump paths.
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ServiceInfo"
+ "401":
+ $ref: "#/components/responses/Unauthorized"
+ "500":
+ $ref: "#/components/responses/InternalServerError"
+
+ /services/commoncrawldump:
+ post:
+ tags:
+ - Services
+ summary: Create a CommonCrawl data dump
+ description: >-
+ Executes a CommonCrawl data dump job for the specified crawl and
+ returns the output directory path.
+ operationId: commoncrawlDump
+ requestBody:
+ required: true
+ description: Service configuration specifying crawl ID and arguments.
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ServiceConfig"
+ responses:
+ "200":
+ description: The output directory path for the dump.
+ content:
+ text/plain:
+ schema:
+ type: string
+ example: "myCrawl/dump/commoncrawl-20260213120000"
+ "401":
+ $ref: "#/components/responses/Unauthorized"
+ "500":
+ $ref: "#/components/responses/InternalServerError"
+
+# =============================================================================
+# Components
+# =============================================================================
+components:
+
+ # ---------------------------------------------------------------------------
+ # Security Schemes
+ # ---------------------------------------------------------------------------
+ securitySchemes:
+ basicAuth:
+ type: http
+ scheme: basic
+ description: HTTP Basic Authentication.
+
+ # ---------------------------------------------------------------------------
+ # Reusable Parameters
+ # ---------------------------------------------------------------------------
+ parameters:
+ configId:
+ name: configId
+ in: path
+ required: true
+ description: The unique identifier for the configuration.
+ schema:
+ type: string
+ propertyId:
+ name: propertyId
+ in: path
+ required: true
+ description: The name (key) of the configuration property.
+ schema:
+ type: string
+ jobId:
+ name: id
+ in: path
+ required: true
+ description: The unique identifier for the job.
+ schema:
+ type: string
+ nrows:
+ name: nrows
+ in: query
+ required: false
+ description: >-
+ Number of rows to read. If not specified (or -1), all rows are
+ returned.
+ schema:
+ type: integer
+ format: int32
+ minimum: -1
+ maximum: 2147483647
+ default: -1
+ start:
+ name: start
+ in: query
+ required: false
+ description: Starting line number for a range read.
+ schema:
+ type: integer
+ format: int32
+ minimum: -1
+ maximum: 2147483647
+ default: -1
+ end:
+ name: end
+ in: query
+ required: false
+ description: Ending line number for a range read.
+ schema:
+ type: integer
+ format: int32
+ minimum: 0
+ maximum: 2147483647
+ count:
+ name: count
+ in: query
+ required: false
+ description: >-
+ If true, returns the number of lines instead of the data itself.
+ When set, the response content type is text/plain.
+ schema:
+ type: boolean
+ default: false
+
+ # ---------------------------------------------------------------------------
+ # Schemas
+ # ---------------------------------------------------------------------------
+ schemas:
+ # -- Request Models -------------------------------------------------------
+    NutchConfig:  # request body of POST /config/create
+      type: object
+      description: Configuration for creating a new Nutch configuration.
+      properties:
+        configId:
+          type: string
+          description: The identifier for this configuration.
+        force:
+          type: boolean
+          description: >-
+            If true, overwrites an existing configuration with the same ID.
+          default: false
+        params:
+          type: object
+          additionalProperties:
+            type: string
+          description: Key-value pairs of Nutch configuration properties.
+      examples:  # OAS 3.1: JSON Schema "examples" array replaces deprecated "example"
+        - configId: "my-config"
+          force: false
+          params:
+            http.agent.name: "MyBot"
+            http.robots.agents: "MyBot,*"
+
+    JobConfig:  # request body of POST /job/create
+      type: object
+      description: Configuration for creating a new crawl job.
+      required:
+        - type
+      properties:
+        crawlId:
+          type: string
+          description: The crawl identifier.
+        type:
+          $ref: "#/components/schemas/JobType"
+        confId:
+          type: string
+          description: >-
+            The configuration ID to use for this job. Defaults to "default"
+            if not specified.
+        jobClassName:
+          type: string
+          description: >-
+            Fully qualified class name when type is CLASS.
+        args:
+          type: object
+          additionalProperties: true
+          description: Additional arguments for the job.
+      examples:  # OAS 3.1: JSON Schema "examples" array replaces deprecated "example"
+        - crawlId: "crawl-01"
+          type: "INJECT"
+          confId: "default"
+          args:
+            seedDir: "seedFiles/seed-1700000000000"
+
+    DbQuery:  # request body of POST /db/crawldb
+      type: object
+      description: Parameters for a CrawlDB query.
+      required:
+        - crawlId
+        - type
+      properties:
+        confId:
+          type: string
+          description: >-
+            Configuration ID. Falls back to "default" if not provided.
+        type:
+          type: string
+          description: The type of CrawlDB query to execute.
+          enum:
+            - stats
+            - dump
+            - topN
+            - url
+        args:
+          type: object
+          additionalProperties:
+            type: string
+          description: Additional arguments for the query.
+        crawlId:
+          type: string
+          description: The crawl identifier.
+      examples:  # OAS 3.1: JSON Schema "examples" array replaces deprecated "example"
+        - confId: "default"
+          type: "stats"
+          crawlId: "crawl-01"
+          args: {}
+
+    ReaderConfig:  # request body of the POST /reader/{sequence,link,node}/read operations
+      type: object
+      description: Configuration specifying a file path for reader operations.
+      required:
+        - path
+      properties:
+        path:
+          type: string
+          description: >-
+            The path to the sequence file, link data, or node data to read.
+      examples:  # OAS 3.1: JSON Schema "examples" array replaces deprecated "example"
+        - path: "crawl-01/crawldb/current/part-00000/data"
+
+    SeedList:  # body of POST /seed/create; also the map value type returned by GET /seed/
+      type: object
+      description: A named list of seed URLs.
+      required:
+        - seedUrls
+      properties:
+        id:
+          type: integer
+          format: int64
+          minimum: 0
+          maximum: 9007199254740991
+          description: The seed list identifier.
+          readOnly: true
+        name:
+          type: string
+          description: A human-readable name for this seed list.
+        seedFilePath:
+          type: string
+          description: >-
+            The HDFS path where the seed file is stored. Populated after
+            creation.
+          readOnly: true
+        seedUrls:
+          type: array
+          items:
+            $ref: "#/components/schemas/SeedUrl"
+          description: The collection of seed URLs in this list.
+      examples:  # OAS 3.1: JSON Schema "examples" array replaces deprecated "example"
+        - name: "my-seeds"
+          seedUrls:
+            - url: "https://example.com"
+            - url: "https://nutch.apache.org"
+
+    SeedUrl:  # item type of SeedList.seedUrls
+      type: object
+      description: A single seed URL entry.
+      properties:
+        id:
+          type: integer
+          format: int64
+          minimum: 0
+          maximum: 9007199254740991
+          description: The seed URL identifier.
+          readOnly: true
+        url:
+          type: string
+          description: The seed URL.
+      examples:  # OAS 3.1: JSON Schema "examples" array replaces deprecated "example"
+        - url: "https://example.com"
+
+    ServiceConfig:  # request body of POST /services/commoncrawldump
+      type: object
+      description: >-
+        Configuration for service operations such as CommonCrawl data dumps.
+      required:
+        - crawlId
+      properties:
+        crawlId:
+          type: string
+          description: The crawl identifier.
+        confId:
+          type: string
+          description: The configuration ID.
+        args:
+          type: object
+          additionalProperties: true
+          description: Additional arguments for the service operation.
+      examples:  # OAS 3.1: JSON Schema "examples" array replaces deprecated "example"
+        - crawlId: "crawl-01"
+          confId: "default"
+          args: {}
+
+ # -- Response Models ------------------------------------------------------
+ NutchServerInfo:
+ type: object
+ description: Status information about the running Nutch server.
+ required:
+ - configuration
+ - jobs
+ - runningJobs
+ properties:
+ startDate:
+ type: string
+ format: date-time
+ description: The date and time the server was started.
+ configuration:
+ type: array
+ items:
+ type: string
+ uniqueItems: true
+ description: Set of known configuration IDs.
+ jobs:
+ type: array
+ items:
+ $ref: "#/components/schemas/JobInfo"
+ description: All jobs (any state).
+ runningJobs:
+ type: array
+ items:
+ $ref: "#/components/schemas/JobInfo"
+ description: Currently running jobs.
+
+ JobInfo:
+ type: object
+ description: Information about a crawl job.
+ required:
+ - type
+ - state
+ properties:
+ id:
+ type: string
+ description: The unique job identifier.
+ type:
+ $ref: "#/components/schemas/JobType"
+ confId:
+ type: string
+ description: The configuration ID used for this job.
+ args:
+ type: object
+ additionalProperties: true
+ description: Arguments passed to the job.
+ result:
+ type: object
+ additionalProperties: true
+ description: Result data returned after job completion.
+ state:
+ $ref: "#/components/schemas/State"
+ msg:
+ type: string
+ description: A human-readable status or error message.
+ crawlId:
+ type: string
+ description: The crawl identifier associated with this job.
+
+ FetchNodeDbInfo:
+ type: object
+ description: Information about a fetched node in the FetchNodeDb.
+ required:
+ - children
+ properties:
+ url:
+ type: string
+ description: The URL of the fetched node.
+ status:
+ type: integer
+ format: int32
+ minimum: 0
+ maximum: 2147483647
+ description: The HTTP status code of the fetch.
+ numOfOutlinks:
+ type: integer
+ format: int32
+ minimum: 0
+ maximum: 2147483647
+ description: The number of outgoing links discovered.
+ children:
+ type: array
+ items:
+ $ref: "#/components/schemas/ChildNode"
+ description: The outgoing links from this node.
+
+ ChildNode:
+ type: object
+ description: A child (outlink) of a fetched node.
+ properties:
+ childUrl:
+ type: string
+ description: The URL of the child node.
+ anchorText:
+ type: string
+ description: The anchor text of the link.
+
+ ServiceInfo:
+ type: object
+ description: Information returned by service operations.
+ required:
+ - dumpPaths
+ properties:
+ dumpPaths:
+ type: array
+ items:
+ type: string
+ description: List of file paths for the dump output.
+
+ # -- Schema Objects (Reader) ----------------------------------------------
+ LinkSchema:
+ type: object
+ description: Schema describing the fields in a link reader response.
+ properties:
+ key_url:
+ type: string
+ example: "string"
+ timestamp:
+ type: string
+ example: "int"
+ score:
+ type: string
+ example: "float"
+ anchor:
+ type: string
+ example: "string"
+ linktype:
+ type: string
+ example: "string"
+ url:
+ type: string
+ example: "string"
+
+ NodeSchema:
+ type: object
+ description: Schema describing the fields in a node reader response.
+ properties:
+ key_url:
+ type: string
+ example: "string"
+ num_inlinks:
+ type: string
+ example: "int"
+ num_outlinks:
+ type: string
+ example: "int"
+ inlink_score:
+ type: string
+ example: "float"
+ outlink_score:
+ type: string
+ example: "float"
+ metadata:
+ type: string
+ example: "string"
+
+ # -- Enums ----------------------------------------------------------------
+ JobType:
+ type: string
+ description: The type of Nutch crawl job.
+ enum:
+ - INJECT
+ - GENERATE
+ - FETCH
+ - PARSE
+ - UPDATEDB
+ - INDEX
+ - READDB
+ - CLASS
+ - INVERTLINKS
+ - DEDUP
+
+ State:
+ type: string
+ description: The current state of a job.
+ enum:
+ - IDLE
+ - RUNNING
+ - FINISHED
+ - FAILED
+ - KILLED
+ - STOPPING
+ - KILLING
+ - ANY
+
+ # ---------------------------------------------------------------------------
+ # Reusable Responses
+ # ---------------------------------------------------------------------------
+ responses:
+ BadRequest:
+ description: >-
+ Bad request. The request body is missing, malformed, or contains
+ invalid parameters.
+ content:
+ text/plain:
+ schema:
+ type: string
+ example: "Nutch configuration cannot be empty!"
+
+ Unauthorized:
+ description: >-
+ Unauthorized. Basic authentication credentials are missing or
+ invalid.
+ content:
+ application/json:
+ schema:
+ type: object
+ properties:
+ message:
+ type: string
+ example:
+ message: "Authentication required."
+
+ NotFound:
+ description: The requested resource was not found.
+ content:
+ application/json:
+ schema:
+ type: object
+ properties:
+ message:
+ type: string
+ example:
+ message: "Resource not found."
+
+ InternalServerError:
+ description: An unexpected server error occurred.
+ content:
+ text/plain:
+ schema:
+ type: string
+ example: "Internal server error."
diff --git a/src/bin/nutch b/src/bin/nutch
index d4b873cd08..233a4594a5 100755
--- a/src/bin/nutch
+++ b/src/bin/nutch
@@ -119,8 +119,8 @@ if [ $# = 0 ]; then
echo " commoncrawldump exports crawled data from segments into common crawl data format encoded as CBOR"
echo " warc exports crawled data from segments at the WARC format"
echo ""
- echo " (Nutch Server)"
- echo " startserver runs the Nutch Server on localhost:8081"
+ echo " (Nutch Server - DEPRECATED)"
+ echo " startserver (deprecated) runs the Nutch Server on localhost:8081"
echo ""
echo " (or)"
echo " CLASSNAME run the main of the class named CLASSNAME"
@@ -317,6 +317,8 @@ elif [ "$COMMAND" = "junit" ] ; then
fi
CLASS=org.junit.runner.JUnitCore
elif [ "$COMMAND" = "startserver" ] ; then
+ echo "WARNING: The 'startserver' command is deprecated and will be removed in a future version of Nutch." >&2
+ echo "The Nutch REST server is superseded by the OpenAPI specification (openapi.yaml)." >&2
CLASS=org.apache.nutch.service.NutchServer
elif [ "$COMMAND" = "warc" ] ; then
CLASS=org.apache.nutch.tools.warc.WARCExporter
diff --git a/src/java/org/apache/nutch/service/ConfManager.java b/src/java/org/apache/nutch/service/ConfManager.java
index fb4ec8758a..637cfc09a9 100644
--- a/src/java/org/apache/nutch/service/ConfManager.java
+++ b/src/java/org/apache/nutch/service/ConfManager.java
@@ -22,6 +22,12 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.service.model.request.NutchConfig;
+/**
+ * @deprecated This interface and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
public interface ConfManager {
public Configuration get(String confId);
diff --git a/src/java/org/apache/nutch/service/JobManager.java b/src/java/org/apache/nutch/service/JobManager.java
index 35a56e1e52..8b7a9d17ad 100644
--- a/src/java/org/apache/nutch/service/JobManager.java
+++ b/src/java/org/apache/nutch/service/JobManager.java
@@ -21,6 +21,12 @@
import org.apache.nutch.service.model.response.JobInfo;
import org.apache.nutch.service.model.response.JobInfo.State;
+/**
+ * @deprecated This interface and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
public interface JobManager {
public static enum JobType{
diff --git a/src/java/org/apache/nutch/service/NutchReader.java b/src/java/org/apache/nutch/service/NutchReader.java
index 8d77254244..05ad29232a 100644
--- a/src/java/org/apache/nutch/service/NutchReader.java
+++ b/src/java/org/apache/nutch/service/NutchReader.java
@@ -25,6 +25,12 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+/**
+ * @deprecated This interface and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
public interface NutchReader {
static final Logger LOG = LoggerFactory
diff --git a/src/java/org/apache/nutch/service/NutchServer.java b/src/java/org/apache/nutch/service/NutchServer.java
index 9468670317..95066ae76f 100644
--- a/src/java/org/apache/nutch/service/NutchServer.java
+++ b/src/java/org/apache/nutch/service/NutchServer.java
@@ -57,6 +57,12 @@
import com.google.common.collect.Queues;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
public class NutchServer {
private static final Logger LOG = LoggerFactory
diff --git a/src/java/org/apache/nutch/service/SeedManager.java b/src/java/org/apache/nutch/service/SeedManager.java
index 11ddedb62c..e4c07c3ab0 100644
--- a/src/java/org/apache/nutch/service/SeedManager.java
+++ b/src/java/org/apache/nutch/service/SeedManager.java
@@ -20,6 +20,12 @@
import org.apache.nutch.service.model.request.SeedList;
+/**
+ * @deprecated This interface and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
public interface SeedManager {
public SeedList getSeedList(String seedName);
diff --git a/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java b/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java
index 7db24d3195..e2548b4aba 100644
--- a/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java
+++ b/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java
@@ -33,6 +33,12 @@
import com.google.common.collect.Maps;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
public class ConfManagerImpl implements ConfManager {
diff --git a/src/java/org/apache/nutch/service/impl/JobFactory.java b/src/java/org/apache/nutch/service/impl/JobFactory.java
index 60bbb253da..32caf1b5c4 100644
--- a/src/java/org/apache/nutch/service/impl/JobFactory.java
+++ b/src/java/org/apache/nutch/service/impl/JobFactory.java
@@ -33,6 +33,12 @@
import com.google.common.collect.Maps;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
public class JobFactory {
private static Map> typeToClass;
diff --git a/src/java/org/apache/nutch/service/impl/JobManagerImpl.java b/src/java/org/apache/nutch/service/impl/JobManagerImpl.java
index 3bcb7dde93..7a03094853 100644
--- a/src/java/org/apache/nutch/service/impl/JobManagerImpl.java
+++ b/src/java/org/apache/nutch/service/impl/JobManagerImpl.java
@@ -27,6 +27,12 @@
import org.apache.nutch.service.model.response.JobInfo.State;
import org.apache.nutch.util.NutchTool;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
public class JobManagerImpl implements JobManager {
private JobFactory jobFactory;
diff --git a/src/java/org/apache/nutch/service/impl/JobWorker.java b/src/java/org/apache/nutch/service/impl/JobWorker.java
index d3343ae714..cbe40402a2 100644
--- a/src/java/org/apache/nutch/service/impl/JobWorker.java
+++ b/src/java/org/apache/nutch/service/impl/JobWorker.java
@@ -29,6 +29,12 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
public class JobWorker implements Runnable{
private JobInfo jobInfo;
diff --git a/src/java/org/apache/nutch/service/impl/LinkReader.java b/src/java/org/apache/nutch/service/impl/LinkReader.java
index 59d84509a6..2ea073857c 100644
--- a/src/java/org/apache/nutch/service/impl/LinkReader.java
+++ b/src/java/org/apache/nutch/service/impl/LinkReader.java
@@ -33,6 +33,12 @@
import org.apache.nutch.scoring.webgraph.LinkDatum;
import org.apache.nutch.service.NutchReader;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
public class LinkReader implements NutchReader {
@Override
diff --git a/src/java/org/apache/nutch/service/impl/NodeReader.java b/src/java/org/apache/nutch/service/impl/NodeReader.java
index efa94f2329..6d6a067fb2 100644
--- a/src/java/org/apache/nutch/service/impl/NodeReader.java
+++ b/src/java/org/apache/nutch/service/impl/NodeReader.java
@@ -33,6 +33,12 @@
import org.apache.nutch.scoring.webgraph.Node;
import org.apache.nutch.service.NutchReader;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
public class NodeReader implements NutchReader {
@Override
diff --git a/src/java/org/apache/nutch/service/impl/NutchServerPoolExecutor.java b/src/java/org/apache/nutch/service/impl/NutchServerPoolExecutor.java
index 1d1e8175be..a0f006fbb9 100644
--- a/src/java/org/apache/nutch/service/impl/NutchServerPoolExecutor.java
+++ b/src/java/org/apache/nutch/service/impl/NutchServerPoolExecutor.java
@@ -30,6 +30,12 @@
import com.google.common.collect.Lists;
import com.google.common.collect.Queues;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
public class NutchServerPoolExecutor extends ThreadPoolExecutor{
private Queue workersHistory;
diff --git a/src/java/org/apache/nutch/service/impl/SeedManagerImpl.java b/src/java/org/apache/nutch/service/impl/SeedManagerImpl.java
index a28de943c2..b8926ed5b9 100644
--- a/src/java/org/apache/nutch/service/impl/SeedManagerImpl.java
+++ b/src/java/org/apache/nutch/service/impl/SeedManagerImpl.java
@@ -22,6 +22,12 @@
import org.apache.nutch.service.SeedManager;
import org.apache.nutch.service.model.request.SeedList;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
public class SeedManagerImpl implements SeedManager {
private static Map seeds;
diff --git a/src/java/org/apache/nutch/service/impl/SequenceReader.java b/src/java/org/apache/nutch/service/impl/SequenceReader.java
index 26b3d55d4d..ded6f3e438 100644
--- a/src/java/org/apache/nutch/service/impl/SequenceReader.java
+++ b/src/java/org/apache/nutch/service/impl/SequenceReader.java
@@ -36,7 +36,11 @@
* ways to read the file.
* @author Sujen Shah
*
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
*/
+@Deprecated
public class SequenceReader implements NutchReader {
@Override
diff --git a/src/java/org/apache/nutch/service/impl/ServiceWorker.java b/src/java/org/apache/nutch/service/impl/ServiceWorker.java
index f86acadde5..4c42f852f5 100644
--- a/src/java/org/apache/nutch/service/impl/ServiceWorker.java
+++ b/src/java/org/apache/nutch/service/impl/ServiceWorker.java
@@ -24,6 +24,12 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
public class ServiceWorker implements Runnable {
private ServiceConfig serviceConfig;
diff --git a/src/java/org/apache/nutch/service/model/request/DbQuery.java b/src/java/org/apache/nutch/service/model/request/DbQuery.java
index 4b707df209..c5d8b04e1b 100644
--- a/src/java/org/apache/nutch/service/model/request/DbQuery.java
+++ b/src/java/org/apache/nutch/service/model/request/DbQuery.java
@@ -19,6 +19,12 @@
import java.util.HashMap;
import java.util.Map;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
public class DbQuery {
private String confId;
diff --git a/src/java/org/apache/nutch/service/model/request/JobConfig.java b/src/java/org/apache/nutch/service/model/request/JobConfig.java
index ab80517526..8e5038dbdd 100644
--- a/src/java/org/apache/nutch/service/model/request/JobConfig.java
+++ b/src/java/org/apache/nutch/service/model/request/JobConfig.java
@@ -23,7 +23,11 @@
/**
* Job-specific configuration.
*
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
*/
+@Deprecated
public class JobConfig {
private String crawlId;
private JobType type;
diff --git a/src/java/org/apache/nutch/service/model/request/NutchConfig.java b/src/java/org/apache/nutch/service/model/request/NutchConfig.java
index 7049463a84..4056619f8b 100644
--- a/src/java/org/apache/nutch/service/model/request/NutchConfig.java
+++ b/src/java/org/apache/nutch/service/model/request/NutchConfig.java
@@ -20,6 +20,12 @@
import java.util.Collections;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
public class NutchConfig {
private String configId;
private boolean force = false;
diff --git a/src/java/org/apache/nutch/service/model/request/ReaderConfig.java b/src/java/org/apache/nutch/service/model/request/ReaderConfig.java
index 3e44f879e0..96e1ee8267 100644
--- a/src/java/org/apache/nutch/service/model/request/ReaderConfig.java
+++ b/src/java/org/apache/nutch/service/model/request/ReaderConfig.java
@@ -16,6 +16,12 @@
*/
package org.apache.nutch.service.model.request;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
public class ReaderConfig {
private String path;
diff --git a/src/java/org/apache/nutch/service/model/request/SeedList.java b/src/java/org/apache/nutch/service/model/request/SeedList.java
index 5bd3c4fa65..7d7843adb0 100644
--- a/src/java/org/apache/nutch/service/model/request/SeedList.java
+++ b/src/java/org/apache/nutch/service/model/request/SeedList.java
@@ -24,6 +24,12 @@
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonManagedReference;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
public class SeedList implements Serializable {
private Long id;
diff --git a/src/java/org/apache/nutch/service/model/request/SeedUrl.java b/src/java/org/apache/nutch/service/model/request/SeedUrl.java
index f05e4d0259..84192ab162 100644
--- a/src/java/org/apache/nutch/service/model/request/SeedUrl.java
+++ b/src/java/org/apache/nutch/service/model/request/SeedUrl.java
@@ -21,6 +21,12 @@
import com.fasterxml.jackson.annotation.JsonBackReference;
import com.fasterxml.jackson.annotation.JsonIgnore;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
public class SeedUrl implements Serializable {
private Long id;
diff --git a/src/java/org/apache/nutch/service/model/request/ServiceConfig.java b/src/java/org/apache/nutch/service/model/request/ServiceConfig.java
index 85d6a3ea6b..60d0f8992b 100644
--- a/src/java/org/apache/nutch/service/model/request/ServiceConfig.java
+++ b/src/java/org/apache/nutch/service/model/request/ServiceConfig.java
@@ -18,6 +18,12 @@
import java.util.Map;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
public class ServiceConfig {
private String crawlId;
diff --git a/src/java/org/apache/nutch/service/model/response/FetchNodeDbInfo.java b/src/java/org/apache/nutch/service/model/response/FetchNodeDbInfo.java
index 21887ad659..2cc4fafb24 100644
--- a/src/java/org/apache/nutch/service/model/response/FetchNodeDbInfo.java
+++ b/src/java/org/apache/nutch/service/model/response/FetchNodeDbInfo.java
@@ -21,6 +21,12 @@
import org.apache.nutch.parse.Outlink;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
public class FetchNodeDbInfo {
private String url;
diff --git a/src/java/org/apache/nutch/service/model/response/JobInfo.java b/src/java/org/apache/nutch/service/model/response/JobInfo.java
index 807c2d5df4..c787db33c7 100644
--- a/src/java/org/apache/nutch/service/model/response/JobInfo.java
+++ b/src/java/org/apache/nutch/service/model/response/JobInfo.java
@@ -23,7 +23,12 @@
/**
* This is the response object containing Job information
+ *
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
*/
+@Deprecated
public class JobInfo {
public static enum State {
diff --git a/src/java/org/apache/nutch/service/model/response/NutchServerInfo.java b/src/java/org/apache/nutch/service/model/response/NutchServerInfo.java
index f8867e6ce6..c94aeb40eb 100644
--- a/src/java/org/apache/nutch/service/model/response/NutchServerInfo.java
+++ b/src/java/org/apache/nutch/service/model/response/NutchServerInfo.java
@@ -20,6 +20,12 @@
import java.util.Date;
import java.util.Set;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
public class NutchServerInfo {
private Date startDate;
diff --git a/src/java/org/apache/nutch/service/model/response/ServiceInfo.java b/src/java/org/apache/nutch/service/model/response/ServiceInfo.java
index 456f8c53ae..a49336e534 100644
--- a/src/java/org/apache/nutch/service/model/response/ServiceInfo.java
+++ b/src/java/org/apache/nutch/service/model/response/ServiceInfo.java
@@ -18,6 +18,12 @@
import java.util.List;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
public class ServiceInfo {
private List dumpPaths;
diff --git a/src/java/org/apache/nutch/service/resources/AbstractResource.java b/src/java/org/apache/nutch/service/resources/AbstractResource.java
index b277a75395..66bf04a5d5 100644
--- a/src/java/org/apache/nutch/service/resources/AbstractResource.java
+++ b/src/java/org/apache/nutch/service/resources/AbstractResource.java
@@ -26,6 +26,12 @@
import org.apache.nutch.service.JobManager;
import org.apache.nutch.service.NutchServer;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
@Produces(MediaType.APPLICATION_JSON)
public abstract class AbstractResource {
diff --git a/src/java/org/apache/nutch/service/resources/AdminResource.java b/src/java/org/apache/nutch/service/resources/AdminResource.java
index 03832628a2..e0ef315f81 100644
--- a/src/java/org/apache/nutch/service/resources/AdminResource.java
+++ b/src/java/org/apache/nutch/service/resources/AdminResource.java
@@ -28,6 +28,12 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
@Path(value="/admin")
public class AdminResource extends AbstractResource{
diff --git a/src/java/org/apache/nutch/service/resources/ConfigResource.java b/src/java/org/apache/nutch/service/resources/ConfigResource.java
index 38e14dcf3a..e3fb11e8e1 100644
--- a/src/java/org/apache/nutch/service/resources/ConfigResource.java
+++ b/src/java/org/apache/nutch/service/resources/ConfigResource.java
@@ -33,6 +33,12 @@
import com.fasterxml.jackson.jaxrs.annotation.JacksonFeatures;
import com.fasterxml.jackson.databind.SerializationFeature;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
@Path("/config")
public class ConfigResource extends AbstractResource{
diff --git a/src/java/org/apache/nutch/service/resources/DbResource.java b/src/java/org/apache/nutch/service/resources/DbResource.java
index dc7049a227..b6f9e9d79b 100644
--- a/src/java/org/apache/nutch/service/resources/DbResource.java
+++ b/src/java/org/apache/nutch/service/resources/DbResource.java
@@ -38,6 +38,12 @@
import org.apache.nutch.service.model.request.DbQuery;
import org.apache.nutch.service.model.response.FetchNodeDbInfo;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
@Path(value = "/db")
public class DbResource extends AbstractResource {
diff --git a/src/java/org/apache/nutch/service/resources/JobResource.java b/src/java/org/apache/nutch/service/resources/JobResource.java
index 0641d2160d..163920a5de 100644
--- a/src/java/org/apache/nutch/service/resources/JobResource.java
+++ b/src/java/org/apache/nutch/service/resources/JobResource.java
@@ -32,6 +32,12 @@
import org.apache.nutch.service.model.response.JobInfo;
import org.apache.nutch.service.model.response.JobInfo.State;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
@Path(value = "/job")
public class JobResource extends AbstractResource {
diff --git a/src/java/org/apache/nutch/service/resources/ReaderResouce.java b/src/java/org/apache/nutch/service/resources/ReaderResouce.java
index f2f52e9c2a..e98c47c955 100644
--- a/src/java/org/apache/nutch/service/resources/ReaderResouce.java
+++ b/src/java/org/apache/nutch/service/resources/ReaderResouce.java
@@ -40,7 +40,11 @@
* nodes and links from the Nutch webgraph.
* @author Sujen Shah
*
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
*/
+@Deprecated
@Path("/reader")
public class ReaderResouce {
diff --git a/src/java/org/apache/nutch/service/resources/SeedResource.java b/src/java/org/apache/nutch/service/resources/SeedResource.java
index a1a555141e..0115731fe5 100644
--- a/src/java/org/apache/nutch/service/resources/SeedResource.java
+++ b/src/java/org/apache/nutch/service/resources/SeedResource.java
@@ -39,6 +39,12 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+/**
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
+ */
+@Deprecated
@Path("/seed")
public class SeedResource extends AbstractResource {
private static final Logger LOG = LoggerFactory
diff --git a/src/java/org/apache/nutch/service/resources/ServicesResource.java b/src/java/org/apache/nutch/service/resources/ServicesResource.java
index c129652c33..49c952f3c2 100644
--- a/src/java/org/apache/nutch/service/resources/ServicesResource.java
+++ b/src/java/org/apache/nutch/service/resources/ServicesResource.java
@@ -38,7 +38,12 @@
/**
* The services resource defines an endpoint to enable the user to carry out
* Nutch jobs like dump, commoncrawldump, etc.
+ *
+ * @deprecated This class and the Nutch REST service will be removed in a
+ * future version of Nutch. Use the OpenAPI specification
+ * (openapi.yaml) as the authoritative API contract instead.
*/
+@Deprecated
@Path("/services")
public class ServicesResource {