From 33a2682d60c753469d78cf68b6065a284e774f40 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 25 Dec 2019 12:39:49 +0100 Subject: [PATCH] Add better schemas and validation using Pydantic (#4831) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Remove unicode declarations * Remove Python 3.5 and 2.7 from CI * Don't require pathlib * Replace compat helpers * Remove OrderedDict * Use f-strings * Set Cython compiler language level * Fix typo * Re-add OrderedDict for Table * Update setup.cfg * Revert CONTRIBUTING.md * Add better schemas and validation using Pydantic * Revert lookups.md * Remove unused import * Update spacy/schemas.py Co-Authored-By: Sebastián Ramírez * Various small fixes * Fix docstring Co-authored-by: Sebastián Ramírez --- requirements.txt | 3 +- setup.cfg | 1 + spacy/cli/_schemas.py | 217 ------------------ spacy/errors.py | 3 - spacy/matcher/_schemas.py | 197 ---------------- spacy/matcher/dependencymatcher.pyx | 5 +- spacy/matcher/matcher.pxd | 2 +- spacy/matcher/matcher.pyx | 18 +- spacy/matcher/phrasematcher.pyx | 4 +- spacy/schemas.py | 188 +++++++++++++++ spacy/tests/doc/test_to_json.py | 9 - spacy/tests/matcher/test_matcher_api.py | 2 +- spacy/tests/matcher/test_matcher_logic.py | 12 +- .../tests/matcher/test_pattern_validation.py | 30 +-- spacy/tests/test_json_schemas.py | 47 ---- spacy/util.py | 42 ---- 16 files changed, 217 insertions(+), 563 deletions(-) delete mode 100644 spacy/cli/_schemas.py delete mode 100644 spacy/matcher/_schemas.py create mode 100644 spacy/schemas.py delete mode 100644 spacy/tests/test_json_schemas.py diff --git a/requirements.txt b/requirements.txt index 188459c67..79a05b2bd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,8 +12,7 @@ numpy>=1.15.0 requests>=2.13.0,<3.0.0 plac>=0.9.6,<1.2.0 tqdm>=4.38.0,<5.0.0 -# Optional dependencies -jsonschema>=2.6.0,<3.1.0 +pydantic>=1.0.0,<2.0.0 # Development dependencies cython>=0.25 pytest>=4.6.5 diff --git a/setup.cfg b/setup.cfg index 28259c989..755f522e7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,6 +51,7 @@ install_requires = numpy>=1.15.0 plac>=0.9.6,<1.2.0 requests>=2.13.0,<3.0.0 + pydantic>=1.0.0,<2.0.0 [options.extras_require] lookups = diff --git a/spacy/cli/_schemas.py b/spacy/cli/_schemas.py deleted file mode 100644 index 42e5e04dd..000000000 --- a/spacy/cli/_schemas.py +++ /dev/null @@ -1,217 +0,0 @@ - -# NB: This schema describes the new format of the training data, see #2928 -TRAINING_SCHEMA = { - "$schema": "http://json-schema.org/draft-06/schema", - "title": "Training data for spaCy models", - "type": "array", - "items": { - "type": "object", - "properties": { - "text": { - "title": "The text of the training example", - "type": "string", - "minLength": 1, - }, - "ents": { - "title": "Named entity spans in the text", - "type": "array", - "items": { - "type": "object", - "properties": { - "start": { - "title": "Start character offset of the span", - "type": "integer", - "minimum": 0, - }, - "end": { - "title": "End character offset of the span", - "type": "integer", - "minimum": 0, - }, - "label": { - "title": "Entity label", - "type": "string", - "minLength": 1, - "pattern": "^[A-Z0-9]*$", - }, - }, - "required": ["start", "end", "label"], - }, - }, - "sents": { - "title": "Sentence spans in the text", - "type": "array", - "items": { - "type": "object", - "properties": { - "start": { - "title": "Start character offset of the span", - "type": "integer", - "minimum": 0, - }, - "end": { - "title": "End character offset of the 
span", - "type": "integer", - "minimum": 0, - }, - }, - "required": ["start", "end"], - }, - }, - "cats": { - "title": "Text categories for the text classifier", - "type": "object", - "patternProperties": { - "*": { - "title": "A text category", - "oneOf": [ - {"type": "boolean"}, - {"type": "number", "minimum": 0}, - ], - } - }, - "propertyNames": {"pattern": "^[A-Z0-9]*$", "minLength": 1}, - }, - "tokens": { - "title": "The tokens in the text", - "type": "array", - "items": { - "type": "object", - "minProperties": 1, - "properties": { - "id": { - "title": "Token ID, usually token index", - "type": "integer", - "minimum": 0, - }, - "start": { - "title": "Start character offset of the token", - "type": "integer", - "minimum": 0, - }, - "end": { - "title": "End character offset of the token", - "type": "integer", - "minimum": 0, - }, - "pos": { - "title": "Coarse-grained part-of-speech tag", - "type": "string", - "minLength": 1, - }, - "tag": { - "title": "Fine-grained part-of-speech tag", - "type": "string", - "minLength": 1, - }, - "dep": { - "title": "Dependency label", - "type": "string", - "minLength": 1, - }, - "head": { - "title": "Index of the token's head", - "type": "integer", - "minimum": 0, - }, - }, - "required": ["start", "end"], - }, - }, - "_": {"title": "Custom user space", "type": "object"}, - }, - "required": ["text"], - }, -} - -META_SCHEMA = { - "$schema": "http://json-schema.org/draft-06/schema", - "type": "object", - "properties": { - "lang": { - "title": "Two-letter language code, e.g. 'en'", - "type": "string", - "minLength": 2, - "maxLength": 2, - "pattern": "^[a-z]*$", - }, - "name": { - "title": "Model name", - "type": "string", - "minLength": 1, - "pattern": "^[a-z_]*$", - }, - "version": { - "title": "Model version", - "type": "string", - "minLength": 1, - "pattern": "^[0-9a-z.-]*$", - }, - "spacy_version": { - "title": "Compatible spaCy version identifier", - "type": "string", - "minLength": 1, - "pattern": "^[0-9a-z.-><=]*$", - }, - "parent_package": { - "title": "Name of parent spaCy package, e.g. 
spacy or spacy-nightly", - "type": "string", - "minLength": 1, - "default": "spacy", - }, - "pipeline": { - "title": "Names of pipeline components", - "type": "array", - "items": {"type": "string", "minLength": 1}, - }, - "description": {"title": "Model description", "type": "string"}, - "license": {"title": "Model license", "type": "string"}, - "author": {"title": "Model author name", "type": "string"}, - "email": {"title": "Model author email", "type": "string", "format": "email"}, - "url": {"title": "Model author URL", "type": "string", "format": "uri"}, - "sources": { - "title": "Training data sources", - "type": "array", - "items": {"type": "string"}, - }, - "vectors": { - "title": "Included word vectors", - "type": "object", - "properties": { - "keys": { - "title": "Number of unique keys", - "type": "integer", - "minimum": 0, - }, - "vectors": { - "title": "Number of unique vectors", - "type": "integer", - "minimum": 0, - }, - "width": { - "title": "Number of dimensions", - "type": "integer", - "minimum": 0, - }, - }, - }, - "accuracy": { - "title": "Accuracy numbers", - "type": "object", - "patternProperties": {"*": {"type": "number", "minimum": 0.0}}, - }, - "speed": { - "title": "Speed evaluation numbers", - "type": "object", - "patternProperties": { - "*": { - "oneOf": [ - {"type": "number", "minimum": 0.0}, - {"type": "integer", "minimum": 0}, - ] - } - }, - }, - }, - "required": ["lang", "name", "version"], -} diff --git a/spacy/errors.py b/spacy/errors.py index 81747b33b..3aa4bedea 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -105,7 +105,6 @@ class Warnings(object): "smaller JSON files instead.") - @add_codes class Errors(object): E001 = ("No component '{name}' found in pipeline. Available names: {opts}") @@ -419,8 +418,6 @@ class Errors(object): E134 = ("Entity '{entity}' is not defined in the Knowledge Base.") E135 = ("If you meant to replace a built-in component, use `create_pipe`: " "`nlp.replace_pipe('{name}', nlp.create_pipe('{name}'))`") - E136 = ("This additional feature requires the jsonschema library to be " - "installed:\npip install jsonschema") E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure " "to provide a valid JSON object as input with either the `text` " "or `tokens` key. 
For more info, see the docs:\n" diff --git a/spacy/matcher/_schemas.py b/spacy/matcher/_schemas.py deleted file mode 100644 index ce6379c45..000000000 --- a/spacy/matcher/_schemas.py +++ /dev/null @@ -1,197 +0,0 @@ - -TOKEN_PATTERN_SCHEMA = { - "$schema": "http://json-schema.org/draft-06/schema", - "definitions": { - "string_value": { - "anyOf": [ - {"type": "string"}, - { - "type": "object", - "properties": { - "REGEX": {"type": "string"}, - "IN": {"type": "array", "items": {"type": "string"}}, - "NOT_IN": {"type": "array", "items": {"type": "string"}}, - }, - "additionalProperties": False, - }, - ] - }, - "integer_value": { - "anyOf": [ - {"type": "integer"}, - { - "type": "object", - "properties": { - "REGEX": {"type": "string"}, - "IN": {"type": "array", "items": {"type": "integer"}}, - "NOT_IN": {"type": "array", "items": {"type": "integer"}}, - "==": {"type": "integer"}, - ">=": {"type": "integer"}, - "<=": {"type": "integer"}, - ">": {"type": "integer"}, - "<": {"type": "integer"}, - }, - "additionalProperties": False, - }, - ] - }, - "boolean_value": {"type": "boolean"}, - "underscore_value": { - "anyOf": [ - {"type": ["string", "integer", "number", "array", "boolean", "null"]}, - { - "type": "object", - "properties": { - "REGEX": {"type": "string"}, - "IN": { - "type": "array", - "items": {"type": ["string", "integer"]}, - }, - "NOT_IN": { - "type": "array", - "items": {"type": ["string", "integer"]}, - }, - "==": {"type": "integer"}, - ">=": {"type": "integer"}, - "<=": {"type": "integer"}, - ">": {"type": "integer"}, - "<": {"type": "integer"}, - }, - "additionalProperties": False, - }, - ] - }, - }, - "type": "array", - "items": { - "type": "object", - "properties": { - "ORTH": { - "title": "Verbatim token text", - "$ref": "#/definitions/string_value", - }, - "TEXT": { - "title": "Verbatim token text (spaCy v2.1+)", - "$ref": "#/definitions/string_value", - }, - "LOWER": { - "title": "Lowercase form of token text", - "$ref": "#/definitions/string_value", - }, - "POS": { - "title": "Coarse-grained part-of-speech tag", - "$ref": "#/definitions/string_value", - }, - "TAG": { - "title": "Fine-grained part-of-speech tag", - "$ref": "#/definitions/string_value", - }, - "DEP": {"title": "Dependency label", "$ref": "#/definitions/string_value"}, - "LEMMA": { - "title": "Lemma (base form)", - "$ref": "#/definitions/string_value", - }, - "SHAPE": { - "title": "Abstract token shape", - "$ref": "#/definitions/string_value", - }, - "ENT_TYPE": { - "title": "Entity label of single token", - "$ref": "#/definitions/string_value", - }, - "NORM": { - "title": "Normalized form of the token text", - "$ref": "#/definitions/string_value", - }, - "LENGTH": { - "title": "Token character length", - "$ref": "#/definitions/integer_value", - }, - "IS_ALPHA": { - "title": "Token consists of alphabetic characters", - "$ref": "#/definitions/boolean_value", - }, - "IS_ASCII": { - "title": "Token consists of ASCII characters", - "$ref": "#/definitions/boolean_value", - }, - "IS_DIGIT": { - "title": "Token consists of digits", - "$ref": "#/definitions/boolean_value", - }, - "IS_LOWER": { - "title": "Token is lowercase", - "$ref": "#/definitions/boolean_value", - }, - "IS_UPPER": { - "title": "Token is uppercase", - "$ref": "#/definitions/boolean_value", - }, - "IS_TITLE": { - "title": "Token is titlecase", - "$ref": "#/definitions/boolean_value", - }, - "IS_PUNCT": { - "title": "Token is punctuation", - "$ref": "#/definitions/boolean_value", - }, - "IS_SPACE": { - "title": "Token is whitespace", - "$ref": 
"#/definitions/boolean_value", - }, - "IS_BRACKET": { - "title": "Token is a bracket", - "$ref": "#/definitions/boolean_value", - }, - "IS_QUOTE": { - "title": "Token is a quotation mark", - "$ref": "#/definitions/boolean_value", - }, - "IS_LEFT_PUNCT": { - "title": "Token is a left punctuation mark", - "$ref": "#/definitions/boolean_value", - }, - "IS_RIGHT_PUNCT": { - "title": "Token is a right punctuation mark", - "$ref": "#/definitions/boolean_value", - }, - "IS_CURRENCY": { - "title": "Token is a currency symbol", - "$ref": "#/definitions/boolean_value", - }, - "IS_STOP": { - "title": "Token is stop word", - "$ref": "#/definitions/boolean_value", - }, - "IS_SENT_START": { - "title": "Token is the first in a sentence", - "$ref": "#/definitions/boolean_value", - }, - "LIKE_NUM": { - "title": "Token resembles a number", - "$ref": "#/definitions/boolean_value", - }, - "LIKE_URL": { - "title": "Token resembles a URL", - "$ref": "#/definitions/boolean_value", - }, - "LIKE_EMAIL": { - "title": "Token resembles an email address", - "$ref": "#/definitions/boolean_value", - }, - "_": { - "title": "Custom extension token attributes (token._.)", - "type": "object", - "patternProperties": { - "^.*$": {"$ref": "#/definitions/underscore_value"} - }, - }, - "OP": { - "title": "Operators / quantifiers", - "type": "string", - "enum": ["+", "*", "?", "!"], - }, - }, - "additionalProperties": False, - }, -} diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 46cff0d0c..f94c66cb0 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -39,7 +39,8 @@ cdef class DependencyMatcher: RETURNS (DependencyMatcher): The newly constructed object. """ size = 20 - self.token_matcher = Matcher(vocab) + # TODO: make matcher work with validation + self.token_matcher = Matcher(vocab, validate=False) self._keys_to_token = {} self._patterns = {} self._root = {} @@ -129,7 +130,7 @@ cdef class DependencyMatcher: # TODO: Better ways to hash edges in pattern? 
for j in range(len(_patterns[i])): k = self._normalize_key(unicode(key) + DELIMITER + unicode(i) + DELIMITER + unicode(j)) - self.token_matcher.add(k, None, _patterns[i][j]) + self.token_matcher.add(k, [_patterns[i][j]]) _keys_to_token[k] = j _keys_to_token_list.append(_keys_to_token) self._keys_to_token.setdefault(key, []) diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd index dd04153bf..689734079 100644 --- a/spacy/matcher/matcher.pxd +++ b/spacy/matcher/matcher.pxd @@ -63,7 +63,7 @@ cdef class Matcher: cdef Pool mem cdef vector[TokenPatternC*] patterns cdef readonly Vocab vocab - cdef public object validator + cdef public object validate cdef public object _patterns cdef public object _callbacks cdef public object _extensions diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 2908ab0c2..4258fdb6a 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -15,8 +15,7 @@ from ..tokens.doc cimport Doc, get_token_attr from ..tokens.token cimport Token from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA -from ._schemas import TOKEN_PATTERN_SCHEMA -from ..util import get_json_validator, validate_json +from ..schemas import validate_token_pattern from ..errors import Errors, MatchPatternError, Warnings, deprecation_warning from ..strings import get_string_id from ..attrs import IDS @@ -32,7 +31,7 @@ cdef class Matcher: USAGE: https://spacy.io/usage/rule-based-matching """ - def __init__(self, vocab, validate=False): + def __init__(self, vocab, validate=True): """Create the Matcher. vocab (Vocab): The vocabulary object, which must be shared with the @@ -46,10 +45,7 @@ cdef class Matcher: self._seen_attrs = set() self.vocab = vocab self.mem = Pool() - if validate: - self.validator = get_json_validator(TOKEN_PATTERN_SCHEMA) - else: - self.validator = None + self.validate = validate def __reduce__(self): data = (self.vocab, self._patterns, self._callbacks) @@ -119,8 +115,8 @@ cdef class Matcher: raise ValueError(Errors.E012.format(key=key)) if not isinstance(pattern, list): raise ValueError(Errors.E178.format(pat=pattern, key=key)) - if self.validator: - errors[i] = validate_json(pattern, self.validator) + if self.validate: + errors[i] = validate_token_pattern(pattern) if any(err for err in errors.values()): raise MatchPatternError(key, errors) key = self._normalize_key(key) @@ -668,8 +664,6 @@ def _get_attr_values(spec, string_store): continue if attr == "TEXT": attr = "ORTH" - if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]: - raise ValueError(Errors.E152.format(attr=attr)) attr = IDS.get(attr) if isinstance(value, basestring): value = string_store.add(value) @@ -684,7 +678,7 @@ def _get_attr_values(spec, string_store): if attr is not None: attr_values.append((attr, value)) else: - # should be caught above using TOKEN_PATTERN_SCHEMA + # should be caught in validation raise ValueError(Errors.E152.format(attr=attr)) return attr_values diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 20f45b9e4..961a318f6 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -9,7 +9,7 @@ from ..structs cimport TokenC from ..tokens.token cimport Token from ..typedefs cimport attr_t -from ._schemas import TOKEN_PATTERN_SCHEMA +from ..schemas import TokenPattern from ..errors import Errors, Warnings, deprecation_warning, user_warning @@ -54,7 +54,7 @@ cdef class PhraseMatcher: attr = attr.upper() if attr == "TEXT": attr = "ORTH" - if attr not in 
TOKEN_PATTERN_SCHEMA["items"]["properties"]: + if attr.lower() not in TokenPattern().dict(): raise ValueError(Errors.E152.format(attr=attr)) self.attr = self.vocab.strings[attr] diff --git a/spacy/schemas.py b/spacy/schemas.py new file mode 100644 index 000000000..4a5054125 --- /dev/null +++ b/spacy/schemas.py @@ -0,0 +1,188 @@ +from typing import Dict, List, Union, Optional +from enum import Enum +from pydantic import BaseModel, Field, ValidationError, validator +from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool +from collections import defaultdict + +from .attrs import NAMES + + +def validate(schema, obj): + """Validate data against a given pydantic schema. + + obj (dict): JSON-serializable data to validate. + schema (pydantic.BaseModel): The schema to validate against. + RETURNS (list): A list of error messages, if available. + """ + try: + schema(**obj) + return [] + except ValidationError as e: + errors = e.errors() + data = defaultdict(list) + for error in errors: + err_loc = " -> ".join([str(p) for p in error.get("loc", [])]) + data[err_loc].append(error.get("msg")) + return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()] + + +# Matcher token patterns + + +def validate_token_pattern(obj): + # Try to convert non-string keys (e.g. {ORTH: "foo"} -> {"ORTH": "foo"}) + get_key = lambda k: NAMES[k] if isinstance(k, int) and k < len(NAMES) else k + if isinstance(obj, list): + converted = [] + for pattern in obj: + if isinstance(pattern, dict): + pattern = {get_key(k): v for k, v in pattern.items()} + converted.append(pattern) + obj = converted + return validate(TokenPatternSchema, {"pattern": obj}) + + +class TokenPatternString(BaseModel): + REGEX: Optional[StrictStr] + IN: Optional[List[StrictStr]] + NOT_IN: Optional[List[StrictStr]] + + class Config: + extra = "forbid" + + @validator("*", pre=True, whole=True) + def raise_for_none(cls, v): + if v is None: + raise ValueError("None / null is not allowed") + return v + + +class TokenPatternNumber(BaseModel): + REGEX: Optional[StrictStr] = None + IN: Optional[List[StrictInt]] = None + NOT_IN: Optional[List[StrictInt]] = None + EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==") + GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=") + LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=") + GT: Union[StrictInt, StrictFloat] = Field(None, alias=">") + LT: Union[StrictInt, StrictFloat] = Field(None, alias="<") + + class Config: + extra = "forbid" + + @validator("*", pre=True, whole=True) + def raise_for_none(cls, v): + if v is None: + raise ValueError("None / null is not allowed") + return v + + +class TokenPatternOperator(str, Enum): + plus: StrictStr = "+" + start: StrictStr = "*" + question: StrictStr = "?" + exclamation: StrictStr = "!" 
+ + +StringValue = Union[TokenPatternString, StrictStr] +NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat] +UnderscoreValue = Union[ + TokenPatternString, TokenPatternNumber, str, int, float, list, bool, +] + + +class TokenPattern(BaseModel): + orth: Optional[StringValue] = None + text: Optional[StringValue] = None + lower: Optional[StringValue] = None + pos: Optional[StringValue] = None + tag: Optional[StringValue] = None + dep: Optional[StringValue] = None + lemma: Optional[StringValue] = None + shape: Optional[StringValue] = None + ent_type: Optional[StringValue] = None + norm: Optional[StringValue] = None + length: Optional[NumberValue] = None + is_alpha: Optional[StrictBool] = None + is_ascii: Optional[StrictBool] = None + is_digit: Optional[StrictBool] = None + is_lower: Optional[StrictBool] = None + is_upper: Optional[StrictBool] = None + is_title: Optional[StrictBool] = None + is_punct: Optional[StrictBool] = None + is_space: Optional[StrictBool] = None + is_bracket: Optional[StrictBool] = None + is_quote: Optional[StrictBool] = None + is_left_punct: Optional[StrictBool] = None + is_right_punct: Optional[StrictBool] = None + is_currency: Optional[StrictBool] = None + is_stop: Optional[StrictBool] = None + is_sent_start: Optional[StrictBool] = None + like_num: Optional[StrictBool] = None + like_url: Optional[StrictBool] = None + like_email: Optional[StrictBool] = None + op: Optional[TokenPatternOperator] = None + underscore: Optional[Dict[StrictStr, UnderscoreValue]] = Field(None, alias="_") + + class Config: + extra = "forbid" + allow_population_by_field_name = True + alias_generator = lambda value: value.upper() + + @validator("*", pre=True) + def raise_for_none(cls, v): + if v is None: + raise ValueError("None / null is not allowed") + return v + + +class TokenPatternSchema(BaseModel): + pattern: List[TokenPattern] = Field(..., minItems=1) + + class Config: + extra = "forbid" + + +# Model meta + + +class ModelMetaSchema(BaseModel): + # fmt: off + lang: StrictStr = Field(..., title="Two-letter language code, e.g. 'en'") + name: StrictStr = Field(..., title="Model name") + version: StrictStr = Field(..., title="Model version") + spacy_version: Optional[StrictStr] = Field(None, title="Compatible spaCy version identifier") + parent_package: Optional[StrictStr] = Field("spacy", title="Name of parent spaCy package, e.g. 
spacy or spacy-nightly") + pipeline: Optional[List[StrictStr]] = Field([], title="Names of pipeline components") + description: Optional[StrictStr] = Field(None, title="Model description") + license: Optional[StrictStr] = Field(None, title="Model license") + author: Optional[StrictStr] = Field(None, title="Model author name") + email: Optional[StrictStr] = Field(None, title="Model author email") + url: Optional[StrictStr] = Field(None, title="Model author URL") + sources: Optional[Union[List[StrictStr], Dict[str, str]]] = Field(None, title="Training data sources") + vectors: Optional[Dict[str, int]] = Field(None, title="Included word vectors") + accuracy: Optional[Dict[str, Union[float, int]]] = Field(None, title="Accuracy numbers") + speed: Optional[Dict[str, Union[float, int]]] = Field(None, title="Speed evaluation numbers") + # fmt: on + + +# Training data object in "simple training style" + + +class SimpleTrainingSchema(BaseModel): + # TODO: write + + class Config: + title = "Schema for training data dict in passed to nlp.update" + extra = "forbid" + + +# JSON training format + + +class TrainingSchema(BaseModel): + # TODO: write + + class Config: + title = "Schema for training data in spaCy's JSON format" + extra = "forbid" diff --git a/spacy/tests/doc/test_to_json.py b/spacy/tests/doc/test_to_json.py index 18243c306..da3bc7dbb 100644 --- a/spacy/tests/doc/test_to_json.py +++ b/spacy/tests/doc/test_to_json.py @@ -1,6 +1,4 @@ import pytest -from spacy.cli._schemas import TRAINING_SCHEMA -from spacy.util import get_json_validator, validate_json from spacy.tokens import Doc from ..util import get_doc @@ -55,10 +53,3 @@ def test_doc_to_json_underscore_error_serialize(doc): Doc.set_extension("json_test4", method=lambda doc: doc.text) with pytest.raises(ValueError): doc.to_json(underscore=["json_test4"]) - - -def test_doc_to_json_valid_training(doc): - json_doc = doc.to_json() - validator = get_json_validator(TRAINING_SCHEMA) - errors = validate_json([json_doc], validator) - assert not errors diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index adeef834d..3900f1e68 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -179,7 +179,7 @@ def test_matcher_match_one_plus(matcher): doc = Doc(control.vocab, words=["Philippe", "Philippe"]) m = control(doc) assert len(m) == 2 - pattern = [{"ORTH": "Philippe", "OP": "1"}, {"ORTH": "Philippe", "OP": "+"}] + pattern = [{"ORTH": "Philippe"}, {"ORTH": "Philippe", "OP": "+"}] matcher.add("KleenePhilippe", [pattern]) m = matcher(doc) assert len(m) == 1 diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py index a6a82f2e2..a2b2cd83f 100644 --- a/spacy/tests/matcher/test_matcher_logic.py +++ b/spacy/tests/matcher/test_matcher_logic.py @@ -6,18 +6,18 @@ from spacy.matcher import Matcher from spacy.tokens import Doc, Span -pattern1 = [{"ORTH": "A", "OP": "1"}, {"ORTH": "A", "OP": "*"}] -pattern2 = [{"ORTH": "A", "OP": "*"}, {"ORTH": "A", "OP": "1"}] -pattern3 = [{"ORTH": "A", "OP": "1"}, {"ORTH": "A", "OP": "1"}] +pattern1 = [{"ORTH": "A"}, {"ORTH": "A", "OP": "*"}] +pattern2 = [{"ORTH": "A"}, {"ORTH": "A"}] +pattern3 = [{"ORTH": "A"}, {"ORTH": "A"}] pattern4 = [ - {"ORTH": "B", "OP": "1"}, + {"ORTH": "B"}, {"ORTH": "A", "OP": "*"}, - {"ORTH": "B", "OP": "1"}, + {"ORTH": "B"}, ] pattern5 = [ {"ORTH": "B", "OP": "*"}, {"ORTH": "A", "OP": "*"}, - {"ORTH": "B", "OP": "1"}, + {"ORTH": "B"}, ] re_pattern1 = "AA*" diff --git 
a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py index c879cc0fe..ade724d05 100644 --- a/spacy/tests/matcher/test_pattern_validation.py +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -1,8 +1,7 @@ import pytest from spacy.matcher import Matcher -from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA from spacy.errors import MatchPatternError -from spacy.util import get_json_validator, validate_json +from spacy.schemas import validate_token_pattern # (pattern, num errors with validation, num errors identified with minimal # checks) @@ -15,12 +14,12 @@ TEST_PATTERNS = [ ('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1), ([1, 2, 3], 3, 1), # Bad patterns flagged outside of Matcher - ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 1, 0), + ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0) # Bad patterns not flagged with minimal checks ([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2, 0), - ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 2, 0), - ([{"LENGTH": {"VALUE": 5}}], 1, 0), - ([{"TEXT": {"VALUE": "foo"}}], 1, 0), + ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 4, 0), # prev: (2, 0) + ([{"LENGTH": {"VALUE": 5}}], 2, 0), # prev: (1, 0) + ([{"TEXT": {"VALUE": "foo"}}], 2, 0), # prev: (1, 0) ([{"IS_DIGIT": -1}], 1, 0), ([{"ORTH": -1}], 1, 0), # Good patterns @@ -31,15 +30,9 @@ TEST_PATTERNS = [ ([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0, 0), ([{"NORM": "a"}, {"POS": {"IN": ["NOUN"]}}], 0, 0), ([{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}], 0, 0), + ([{"orth": "foo"}], 0, 0), # prev: xfail ] -XFAIL_TEST_PATTERNS = [([{"orth": "foo"}], 0, 0)] - - -@pytest.fixture -def validator(): - return get_json_validator(TOKEN_PATTERN_SCHEMA) - @pytest.mark.parametrize( "pattern", [[{"XX": "y"}, {"LENGTH": "2"}, {"TEXT": {"IN": 5}}]] @@ -51,15 +44,8 @@ def test_matcher_pattern_validation(en_vocab, pattern): @pytest.mark.parametrize("pattern,n_errors,_", TEST_PATTERNS) -def test_pattern_validation(validator, pattern, n_errors, _): - errors = validate_json(pattern, validator) - assert len(errors) == n_errors - - -@pytest.mark.xfail -@pytest.mark.parametrize("pattern,n_errors,_", XFAIL_TEST_PATTERNS) -def test_xfail_pattern_validation(validator, pattern, n_errors, _): - errors = validate_json(pattern, validator) +def test_pattern_validation(pattern, n_errors, _): + errors = validate_token_pattern(pattern) assert len(errors) == n_errors diff --git a/spacy/tests/test_json_schemas.py b/spacy/tests/test_json_schemas.py deleted file mode 100644 index 1330d3a65..000000000 --- a/spacy/tests/test_json_schemas.py +++ /dev/null @@ -1,47 +0,0 @@ -from spacy.util import get_json_validator, validate_json, validate_schema -from spacy.cli._schemas import META_SCHEMA, TRAINING_SCHEMA -from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA -import pytest - - -@pytest.fixture(scope="session") -def training_schema_validator(): - return get_json_validator(TRAINING_SCHEMA) - - -def test_validate_schema(): - validate_schema({"type": "object"}) - with pytest.raises(Exception): - validate_schema({"type": lambda x: x}) - - -@pytest.mark.parametrize("schema", [TRAINING_SCHEMA, META_SCHEMA, TOKEN_PATTERN_SCHEMA]) -def test_schemas(schema): - validate_schema(schema) - - -@pytest.mark.parametrize( - "data", - [ - {"text": "Hello world"}, - {"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "TEST"}]}, - ], -) -def test_json_schema_training_valid(data, training_schema_validator): - errors 
= validate_json([data], training_schema_validator) - assert not errors - - -@pytest.mark.parametrize( - "data,n_errors", - [ - ({"spans": []}, 1), - ({"text": "Hello", "ents": [{"start": "0", "end": "5", "label": "TEST"}]}, 2), - ({"text": "Hello", "ents": [{"start": 0, "end": 5}]}, 1), - ({"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "test"}]}, 1), - ({"text": "spaCy", "tokens": [{"pos": "PROPN"}]}, 2), - ], -) -def test_json_schema_training_invalid(data, n_errors, training_schema_validator): - errors = validate_json([data], training_schema_validator) - assert len(errors) == n_errors diff --git a/spacy/util.py b/spacy/util.py index 4e6c10e2b..57bbee69f 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -13,11 +13,6 @@ import srsly import catalogue import sys -try: - import jsonschema -except ImportError: - jsonschema = None - try: import cupy.random except ImportError: @@ -705,43 +700,6 @@ def fix_random_seed(seed=0): cupy.random.seed(seed) -def get_json_validator(schema): - # We're using a helper function here to make it easier to change the - # validator that's used (e.g. different draft implementation), without - # having to change it all across the codebase. - # TODO: replace with (stable) Draft6Validator, if available - if jsonschema is None: - raise ValueError(Errors.E136) - return jsonschema.Draft4Validator(schema) - - -def validate_schema(schema): - """Validate a given schema. This just checks if the schema itself is valid.""" - validator = get_json_validator(schema) - validator.check_schema(schema) - - -def validate_json(data, validator): - """Validate data against a given JSON schema (see https://json-schema.org). - - data: JSON-serializable data to validate. - validator (jsonschema.DraftXValidator): The validator. - RETURNS (list): A list of error messages, if available. - """ - errors = [] - for err in sorted(validator.iter_errors(data), key=lambda e: e.path): - if err.path: - err_path = "[{}]".format(" -> ".join([str(p) for p in err.path])) - else: - err_path = "" - msg = err.message + " " + err_path - if err.context: # Error has suberrors, e.g. if schema uses anyOf - suberrs = [f" - {suberr.message}" for suberr in err.context] - msg += f":\n{''.join(suberrs)}" - errors.append(msg) - return errors - - def get_serialization_exclude(serializers, exclude, kwargs): """Helper function to validate serialization args and manage transition from keyword arguments (pre v2.1) to exclude argument.
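
Usage sketch (illustrative, not part of the patch): a minimal example of the new pydantic-based token pattern validation introduced above, assuming this patch is applied and pydantic>=1.0.0,<2.0.0 is installed. The example patterns and the exact wording of the printed error messages are hypothetical.

from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.errors import MatchPatternError
from spacy.schemas import validate_token_pattern

# Standalone validation: returns a list of "[location] message" strings,
# or an empty list if the pattern conforms to TokenPatternSchema.
good = [{"ORTH": "foo"}, {"LOWER": {"IN": ["bar", "baz"]}, "OP": "+"}]
bad = [{"LENGTH": {"VALUE": 5}}]  # "VALUE" is not a valid number predicate
print(validate_token_pattern(good))  # []
print(validate_token_pattern(bad))   # non-empty list of error messages

# The Matcher now validates on add() by default (validate=True) and raises
# MatchPatternError if any of the supplied patterns is invalid. Passing
# validate=False restores the previous behaviour of minimal attribute
# checks only.
nlp = English()
matcher = Matcher(nlp.vocab)
matcher.add("GOOD", [good])
try:
    matcher.add("BAD", [bad])
except MatchPatternError as err:
    print(err)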
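
A second sketch along the same lines, showing the generic validate() helper together with the new ModelMetaSchema. The meta dict is made up for illustration; per the schema, only lang, name and version are required.

from spacy.schemas import ModelMetaSchema, validate

meta = {"lang": "en", "name": "core_web_sm", "version": "2.2.0"}
print(validate(ModelMetaSchema, meta))  # [] -> meta is valid

# Missing required fields are reported per location, e.g. for "name"
# and "version" here.
print(validate(ModelMetaSchema, {"lang": "en"}))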