spaCy/spacy/matcher/_schemas.py
adrianeboyd 8fe7bdd0fa Improve token pattern checking without validation (#4105)
* Fix typo in rule-based matching docs

* Improve token pattern checking without validation

Add more detailed token pattern checks without full JSON pattern validation and
provide more detailed error messages.

Addresses #4070 (also related: #4063, #4100).

* Check whether top-level attributes in patterns and attr for PhraseMatcher are
  in token pattern schema

* Check whether attribute value types are supported in general (as opposed to
  per attribute with full validation)

* Report various internal error types (OverflowError, AttributeError, KeyError)
  as ValueError with standard error messages

* Check for tagger/parser in PhraseMatcher pipeline for attributes TAG, POS,
  LEMMA, and DEP

* Add error messages with relevant details on how to use validate=True or nlp()
  instead of nlp.make_doc()

* Support attr=TEXT for PhraseMatcher

* Add NORM to schema

* Expand tests for pattern validation, Matcher, PhraseMatcher, and EntityRuler

* Remove unnecessary .keys()

* Rephrase error messages

* Add another type check to Matcher

Add another type check to Matcher for more understandable error messages
in some rare cases.

* Support phrase_matcher_attr=TEXT for EntityRuler

* Don't use spacy.errors in examples and bin scripts

* Fix error code

* Auto-format

Also try to get Azure Pipelines to finally start a build :(

* Update errors.py


Co-authored-by: Ines Montani <ines@ines.io>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
2019-08-21 14:00:37 +02:00

177 lines
6.3 KiB
Python

# coding: utf8
from __future__ import unicode_literals
# JSON schema (draft-06) for a token pattern: an array of objects, one object
# per token, where each object may only use the attribute keys listed under
# "properties" (unknown keys are rejected via "additionalProperties": False).
TOKEN_PATTERN_SCHEMA = {
    "$schema": "http://json-schema.org/draft-06/schema",
    # Reusable value schemas, referenced from the attributes below via "$ref".
    "definitions": {
        # A plain string, or an object using only REGEX / IN / NOT_IN.
        "string_value": {
            "anyOf": [
                {"type": "string"},
                {
                    "type": "object",
                    "properties": {
                        "REGEX": {"type": "string"},
                        "IN": {"type": "array", "items": {"type": "string"}},
                        "NOT_IN": {"type": "array", "items": {"type": "string"}},
                    },
                    "additionalProperties": False,
                },
            ]
        },
        # A plain integer, or an object using REGEX / IN / NOT_IN and the
        # integer comparison operators.
        "integer_value": {
            "anyOf": [
                {"type": "integer"},
                {
                    "type": "object",
                    "properties": {
                        "REGEX": {"type": "string"},
                        "IN": {"type": "array", "items": {"type": "integer"}},
                        "NOT_IN": {"type": "array", "items": {"type": "integer"}},
                        "==": {"type": "integer"},
                        ">=": {"type": "integer"},
                        "<=": {"type": "integer"},
                        ">": {"type": "integer"},
                        "<": {"type": "integer"},
                    },
                    "additionalProperties": False,
                },
            ]
        },
        "boolean_value": {"type": "boolean"},
        # Values for custom extension attributes: any JSON scalar/array/null,
        # or an object using the same predicate keys as above.
        "underscore_value": {
            "anyOf": [
                {"type": ["string", "integer", "number", "array", "boolean", "null"]},
                {
                    "type": "object",
                    "properties": {
                        "REGEX": {"type": "string"},
                        "IN": {
                            "type": "array",
                            "items": {"type": ["string", "integer"]},
                        },
                        "NOT_IN": {
                            "type": "array",
                            "items": {"type": ["string", "integer"]},
                        },
                        "==": {"type": "integer"},
                        ">=": {"type": "integer"},
                        "<=": {"type": "integer"},
                        ">": {"type": "integer"},
                        "<": {"type": "integer"},
                    },
                    "additionalProperties": False,
                },
            ]
        },
    },
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            # --- String-valued attributes ---
            "ORTH": {
                "title": "Verbatim token text",
                "$ref": "#/definitions/string_value",
            },
            "TEXT": {
                "title": "Verbatim token text (spaCy v2.1+)",
                "$ref": "#/definitions/string_value",
            },
            "LOWER": {
                "title": "Lowercase form of token text",
                "$ref": "#/definitions/string_value",
            },
            "POS": {
                "title": "Coarse-grained part-of-speech tag",
                "$ref": "#/definitions/string_value",
            },
            "TAG": {
                "title": "Fine-grained part-of-speech tag",
                "$ref": "#/definitions/string_value",
            },
            "DEP": {"title": "Dependency label", "$ref": "#/definitions/string_value"},
            "LEMMA": {
                "title": "Lemma (base form)",
                "$ref": "#/definitions/string_value",
            },
            "SHAPE": {
                "title": "Abstract token shape",
                "$ref": "#/definitions/string_value",
            },
            "ENT_TYPE": {
                "title": "Entity label of single token",
                "$ref": "#/definitions/string_value",
            },
            "NORM": {
                "title": "Normalized form of the token text",
                "$ref": "#/definitions/string_value",
            },
            # --- Integer-valued attributes ---
            "LENGTH": {
                "title": "Token character length",
                "$ref": "#/definitions/integer_value",
            },
            # --- Boolean flags ---
            "IS_ALPHA": {
                # Token.is_alpha is documented as alphabetic (str.isalpha()),
                # not alphanumeric.
                "title": "Token consists of alphabetic characters",
                "$ref": "#/definitions/boolean_value",
            },
            "IS_ASCII": {
                "title": "Token consists of ASCII characters",
                "$ref": "#/definitions/boolean_value",
            },
            "IS_DIGIT": {
                "title": "Token consists of digits",
                "$ref": "#/definitions/boolean_value",
            },
            "IS_LOWER": {
                "title": "Token is lowercase",
                "$ref": "#/definitions/boolean_value",
            },
            "IS_UPPER": {
                "title": "Token is uppercase",
                "$ref": "#/definitions/boolean_value",
            },
            "IS_TITLE": {
                "title": "Token is titlecase",
                "$ref": "#/definitions/boolean_value",
            },
            "IS_PUNCT": {
                "title": "Token is punctuation",
                "$ref": "#/definitions/boolean_value",
            },
            "IS_SPACE": {
                "title": "Token is whitespace",
                "$ref": "#/definitions/boolean_value",
            },
            "IS_STOP": {
                "title": "Token is stop word",
                "$ref": "#/definitions/boolean_value",
            },
            "LIKE_NUM": {
                "title": "Token resembles a number",
                "$ref": "#/definitions/boolean_value",
            },
            "LIKE_URL": {
                "title": "Token resembles a URL",
                "$ref": "#/definitions/boolean_value",
            },
            "LIKE_EMAIL": {
                "title": "Token resembles an email address",
                "$ref": "#/definitions/boolean_value",
            },
            # --- Custom extension attributes: any key name is accepted, each
            # value must match "underscore_value" ---
            "_": {
                "title": "Custom extension token attributes (token._.)",
                "type": "object",
                "patternProperties": {
                    "^.*$": {"$ref": "#/definitions/underscore_value"}
                },
            },
            # --- Operator: one of the four listed symbols ---
            "OP": {
                "title": "Operators / quantifiers",
                "type": "string",
                "enum": ["+", "*", "?", "!"],
            },
        },
        "additionalProperties": False,
    },
}