Newscatcher · Schema

Newscatcher Article

Schema for a news article returned by the Newscatcher News API v3.

News, Search, NLP, Sentiment Analysis, Entity Extraction, Clustering, Media Intelligence, Financial Intelligence, AI, Enterprise

Properties

Name	Type	Description
id	string	The unique identifier for the article.
title	string	The title of the article.
author	string	The primary author of the article.
authors	array | string	A list of authors of the article.
published_date	string	The date the article was published (ISO 8601).
published_date_precision	string	The precision of the published date (e.g., full, timezone unknown).
updated_date	string | null	The date the article was last updated.
parse_date	string | null	The date the article was parsed by Newscatcher.
link	string	The URL link to the article.
domain_url	string	The domain URL of the article's source.
full_domain_url	string	The full domain URL of the article's source.
name_source	string	The name of the source where the article was published.
is_headline	boolean	Indicates if the article is a headline.
paid_content	boolean	Indicates whether the article is behind a paywall.
parent_url	string	The categorical/section URL of the article.
country	string	The country where the article was published (ISO 3166-1 alpha-2).
language	string	The language in which the article is written (ISO 639-1).
rank	integer	The rank of the article's source by global traffic.
score	number	The relevance score of the article.
description	string	A brief description or excerpt of the article.
content	string	The full text content of the article.
word_count	integer	The word count of the article.
is_opinion	boolean	Indicates if the article is an opinion piece.
media	string	The media/image URL associated with the article.
rights	string	The copyright/rights information for the article.
twitter_account	string | null	The Twitter account associated with the article's author or source.
all_links	array | string	A list of all URLs mentioned in the article.
all_domain_links	array | string	A list of all domain URLs mentioned in the article.
nlp	object	Natural Language Processing enrichment data for the article.
robots_compliant	boolean	True if the article content can be safely accessed per the publisher's robots.txt rules.
custom_tags	object	Custom taxonomy tags associated with the article.

View JSON Schema on GitHub

JSON Schema

{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://raw.githubusercontent.com/api-evangelist/newscatcher/main/json-schema/newscatcher-article-schema.json",
  "title": "Newscatcher Article",
  "description": "Schema for a news article returned by the Newscatcher News API v3.",
  "type": "object",
  "required": ["title", "link", "domain_url", "full_domain_url", "parent_url", "rank", "id", "score"],
  "properties": {
    "id": {
      "type": "string",
      "description": "The unique identifier for the article."
    },
    "title": {
      "type": "string",
      "description": "The title of the article."
    },
    "author": {
      "type": "string",
      "description": "The primary author of the article."
    },
    "authors": {
      "description": "A list of authors of the article.",
      "anyOf": [
        {"type": "array", "items": {"type": "string"}},
        {"type": "string"}
      ]
    },
    "published_date": {
      "type": "string",
      "description": "The date the article was published (ISO 8601)."
    },
    "published_date_precision": {
      "type": "string",
      "description": "The precision of the published date (e.g., full, timezone unknown)."
    },
    "updated_date": {
      "type": ["string", "null"],
      "description": "The date the article was last updated."
    },
    "parse_date": {
      "type": ["string", "null"],
      "description": "The date the article was parsed by Newscatcher."
    },
    "link": {
      "type": "string",
      "format": "uri",
      "description": "The URL link to the article."
    },
    "domain_url": {
      "type": "string",
      "description": "The domain URL of the article's source."
    },
    "full_domain_url": {
      "type": "string",
      "description": "The full domain URL of the article's source."
    },
    "name_source": {
      "type": "string",
      "description": "The name of the source where the article was published."
    },
    "is_headline": {
      "type": "boolean",
      "description": "Indicates if the article is a headline."
    },
    "paid_content": {
      "type": "boolean",
      "description": "Indicates whether the article is behind a paywall."
    },
    "parent_url": {
      "type": "string",
      "description": "The categorical/section URL of the article."
    },
    "country": {
      "type": "string",
      "description": "The country where the article was published (ISO 3166-1 alpha-2)."
    },
    "language": {
      "type": "string",
      "description": "The language in which the article is written (ISO 639-1)."
    },
    "rank": {
      "type": "integer",
      "description": "The rank of the article's source by global traffic."
    },
    "score": {
      "type": "number",
      "description": "The relevance score of the article."
    },
    "description": {
      "type": "string",
      "description": "A brief description or excerpt of the article."
    },
    "content": {
      "type": "string",
      "description": "The full text content of the article."
    },
    "word_count": {
      "type": "integer",
      "description": "The word count of the article.",
      "default": 0
    },
    "is_opinion": {
      "type": "boolean",
      "description": "Indicates if the article is an opinion piece."
    },
    "media": {
      "type": "string",
      "description": "The media/image URL associated with the article."
    },
    "rights": {
      "type": "string",
      "description": "The copyright/rights information for the article."
    },
    "twitter_account": {
      "type": ["string", "null"],
      "description": "The Twitter account associated with the article's author or source."
    },
    "all_links": {
      "description": "A list of all URLs mentioned in the article.",
      "anyOf": [
        {"type": "array", "items": {"type": "string"}},
        {"type": "string"}
      ],
      "default": []
    },
    "all_domain_links": {
      "description": "A list of all domain URLs mentioned in the article.",
      "anyOf": [
        {"type": "array", "items": {"type": "string"}},
        {"type": "string"}
      ],
      "default": []
    },
    "nlp": {
      "$ref": "#/$defs/NlpData"
    },
    "robots_compliant": {
      "type": "boolean",
      "description": "True if the article content can be safely accessed per the publisher's robots.txt rules."
    },
    "custom_tags": {
      "type": "object",
      "description": "Custom taxonomy tags associated with the article.",
      "additionalProperties": {
        "type": "array",
        "items": {"type": "string"}
      }
    }
  },
  "$defs": {
    "NlpData": {
      "type": "object",
      "description": "Natural Language Processing enrichment data for the article.",
      "properties": {
        "theme": {
          "type": "string",
          "description": "The themes or categories identified in the article."
        },
        "summary": {
          "type": "string",
          "description": "A brief AI-generated summary of the article."
        },
        "translation_summary": {
          "type": "string",
          "description": "A brief AI-generated summary of the article's English translation."
        },
        "sentiment": {
          "$ref": "#/$defs/SentimentScores"
        },
        "ner_PER": {
          "type": "array",
          "description": "Named entities: persons recognized in the article.",
          "items": {"type": "string"}
        },
        "ner_ORG": {
          "type": "array",
          "description": "Named entities: organizations recognized in the article.",
          "items": {"type": "string"}
        },
        "ner_MISC": {
          "type": "array",
          "description": "Named entities: miscellaneous entities recognized in the article.",
          "items": {"type": "string"}
        },
        "ner_LOC": {
          "type": "array",
          "description": "Named entities: locations recognized in the article.",
          "items": {"type": "string"}
        },
        "iptc_tags": {
          "type": "array",
          "description": "IPTC Media Topic tags for content classification.",
          "items": {"type": "string"}
        },
        "iab_tags": {
          "type": "array",
          "description": "IAB content category tags.",
          "items": {"type": "string"}
        },
        "embedding": {
          "type": "array",
          "description": "Pre-computed vector embedding for the article.",
          "items": {"type": "number"}
        }
      }
    },
    "SentimentScores": {
      "type": "object",
      "description": "Sentiment scores for the article's title and content.",
      "properties": {
        "title": {
          "type": "number",
          "format": "float",
          "description": "Sentiment score for the article title (-1.0 negative to 1.0 positive).",
          "minimum": -1.0,
          "maximum": 1.0
        },
        "content": {
          "type": "number",
          "format": "float",
          "description": "Sentiment score for the article content (-1.0 negative to 1.0 positive).",
          "minimum": -1.0,
          "maximum": 1.0
        }
      }
    }
  }
}