Apache Nutch · Schema

JobConfig

Configuration for creating a new crawl job.

Web Crawler · Indexing · Search · Apache · Java · Hadoop · Open Source

Properties

Name	Type	Description
crawlId	string	The crawl identifier.
type	string	The type of Nutch crawl job.
confId	string	The configuration ID to use for this job. Defaults to "default" if not specified.
jobClassName	string	Fully qualified class name when type is CLASS.
args	object	Additional arguments for the job.

View JSON Schema on GitHub

JSON Schema

{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://raw.githubusercontent.com/api-evangelist/apache-nutch/refs/heads/main/json-schema/apache-nutch-job-config-schema.json",
  "title": "JobConfig",
  "description": "Configuration for creating a new crawl job.",
  "type": "object",
  "properties": {
    "crawlId": {
      "type": "string",
      "description": "The crawl identifier."
    },
    "type": {
      "type": "string",
      "description": "The type of Nutch crawl job.",
      "enum": [
        "INJECT",
        "GENERATE",
        "FETCH",
        "PARSE",
        "UPDATEDB",
        "INDEX",
        "READDB",
        "CLASS",
        "INVERTLINKS",
        "DEDUP"
      ]
    },
    "confId": {
      "type": "string",
      "description": "The configuration ID to use for this job. Defaults to \"default\" if not specified.",
      "default": "default"
    },
    "jobClassName": {
      "type": "string",
      "description": "Fully qualified class name when type is CLASS."
    },
    "args": {
      "type": "object",
      "additionalProperties": true,
      "description": "Additional arguments for the job."
    }
  },
  "required": [
    "type"
  ],
  "examples": [
    {
      "crawlId": "crawl-01",
      "type": "INJECT",
      "confId": "default",
      "args": {
        "seedDir": "seedFiles/seed-1700000000000"
      }
    }
  ]
}