Apache Nutch · Schema

JobConfig

Configuration for creating a new crawl job.

Web Crawler · Indexing · Search · Apache · Java · Hadoop · Open Source

Properties

Name Type Description
crawlId string The crawl identifier.
type string The type of Nutch crawl job.
confId string The configuration ID to use for this job. Defaults to "default" if not specified.
jobClassName string Fully qualified class name when type is CLASS.
args object Additional arguments for the job.
View JSON Schema on GitHub

JSON Schema

apache-nutch-job-config-schema.json Raw ↑
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://raw.githubusercontent.com/api-evangelist/apache-nutch/refs/heads/main/json-schema/apache-nutch-job-config-schema.json",
  "title": "JobConfig",
  "description": "Configuration for creating a new crawl job.",
  "type": "object",
  "properties": {
    "crawlId": {
      "type": "string",
      "description": "The crawl identifier."
    },
    "type": {
      "type": "string",
      "description": "The type of Nutch crawl job.",
      "enum": [
        "INJECT",
        "GENERATE",
        "FETCH",
        "PARSE",
        "UPDATEDB",
        "INDEX",
        "READDB",
        "CLASS",
        "INVERTLINKS",
        "DEDUP"
      ]
    },
    "confId": {
      "type": "string",
      "description": "The configuration ID to use for this job. Defaults to \"default\" if not specified.",
      "default": "default"
    },
    "jobClassName": {
      "type": "string",
      "description": "Fully qualified class name when type is CLASS."
    },
    "args": {
      "type": "object",
      "additionalProperties": true,
      "description": "Additional arguments for the job."
    }
  },
  "required": [
    "type"
  ],
  "examples": [
    {
      "crawlId": "crawl-01",
      "type": "INJECT",
      "confId": "default",
      "args": {
        "seedDir": "seedFiles/seed-1700000000000"
      }
    }
  ]
}