Apache Nutch · Schema
JobConfig
Configuration for creating a new crawl job.
Web Crawler · Indexing · Search · Apache · Java · Hadoop · Open Source
Properties
| Name | Type | Description |
|---|---|---|
| crawlId | string | The crawl identifier. |
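| type | string | Required. The type of Nutch crawl job. One of: INJECT, GENERATE, FETCH, PARSE, UPDATEDB, INDEX, READDB, CLASS, INVERTLINKS, DEDUP. |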
| confId | string | The configuration ID to use for this job. Defaults to "default" if not specified. |
| jobClassName | string | Fully qualified class name when type is CLASS. |
| args | object | Additional arguments for the job. |
JSON Schema
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://raw.githubusercontent.com/api-evangelist/apache-nutch/refs/heads/main/json-schema/apache-nutch-job-config-schema.json",
  "title": "JobConfig",
  "description": "Configuration for creating a new crawl job.",
  "type": "object",
  "properties": {
    "crawlId": {
      "type": "string",
      "description": "The crawl identifier."
    },
    "type": {
      "type": "string",
      "description": "The type of Nutch crawl job.",
      "enum": [
        "INJECT",
        "GENERATE",
        "FETCH",
        "PARSE",
        "UPDATEDB",
        "INDEX",
        "READDB",
        "CLASS",
        "INVERTLINKS",
        "DEDUP"
      ]
    },
    "confId": {
      "type": "string",
      "description": "The configuration ID to use for this job. Defaults to \"default\" if not specified.",
      "default": "default"
    },
    "jobClassName": {
      "type": "string",
      "description": "Fully qualified class name when type is CLASS."
    },
    "args": {
      "type": "object",
      "additionalProperties": true,
      "description": "Additional arguments for the job."
    }
  },
  "required": [
    "type"
  ],
  "examples": [
    {
      "crawlId": "crawl-01",
      "type": "INJECT",
      "confId": "default",
      "args": {
        "seedDir": "seedFiles/seed-1700000000000"
      }
    }
  ]
}