Store JSON schemas in Python and tidy up (#3235)

This commit is contained in:
Ines Montani 2019-02-07 09:44:31 +01:00 committed by Matthew Honnibal
parent 1ea4df459d
commit 338d659bd0
9 changed files with 263 additions and 344 deletions

View File

@ -1,5 +1,4 @@
recursive-include include *.h recursive-include include *.h
recursive-include spacy/cli/schemas *.json
include LICENSE include LICENSE
include README.md include README.md
include bin/spacy include bin/spacy

220
spacy/cli/_schemas.py Normal file
View File

@ -0,0 +1,220 @@
# coding: utf-8
from __future__ import unicode_literals
# NB: This schema describes the new format of the training data, see #2928
# JSON schema for a list of training examples. Every example is an object
# that must contain a non-empty "text"; span-like annotations ("ents",
# "sents", "tokens") are expressed as character offsets into that text.
TRAINING_SCHEMA = {
    "$schema": "http://json-schema.org/draft-06/schema",
    "title": "Training data for spaCy models",
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "text": {
                "title": "The text of the training example",
                "type": "string",
                "minLength": 1,
            },
            "ents": {
                "title": "Named entity spans in the text",
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "start": {
                            "title": "Start character offset of the span",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "end": {
                            "title": "End character offset of the span",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "label": {
                            "title": "Entity label",
                            "type": "string",
                            "minLength": 1,
                            "pattern": "^[A-Z0-9]*$",
                        },
                    },
                    "required": ["start", "end", "label"],
                },
            },
            "sents": {
                "title": "Sentence spans in the text",
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "start": {
                            "title": "Start character offset of the span",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "end": {
                            "title": "End character offset of the span",
                            "type": "integer",
                            "minimum": 0,
                        },
                    },
                    "required": ["start", "end"],
                },
            },
            "cats": {
                "title": "Text categories for the text classifier",
                "type": "object",
                "patternProperties": {
                    # Keys of "patternProperties" are regular expressions.
                    # A bare "*" is not a valid regex (nothing to repeat),
                    # so ".*" is used to match every category name.
                    ".*": {
                        "title": "A text category",
                        "oneOf": [
                            {"type": "boolean"},
                            {"type": "number", "minimum": 0},
                        ],
                    }
                },
                # NOTE(review): "propertyNames" is a draft-06 keyword; the
                # validation helpers currently use a draft-04 validator, so
                # this constraint is presumably not enforced yet -- confirm.
                "propertyNames": {"pattern": "^[A-Z0-9]*$", "minLength": 1},
            },
            "tokens": {
                "title": "The tokens in the text",
                "type": "array",
                "items": {
                    "type": "object",
                    "minProperties": 1,
                    "properties": {
                        "id": {
                            "title": "Token ID, usually token index",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "start": {
                            "title": "Start character offset of the token",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "end": {
                            "title": "End character offset of the token",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "pos": {
                            "title": "Coarse-grained part-of-speech tag",
                            "type": "string",
                            "minLength": 1,
                        },
                        "tag": {
                            "title": "Fine-grained part-of-speech tag",
                            "type": "string",
                            "minLength": 1,
                        },
                        "dep": {
                            "title": "Dependency label",
                            "type": "string",
                            "minLength": 1,
                        },
                        "head": {
                            "title": "Index of the token's head",
                            "type": "integer",
                            "minimum": 0,
                        },
                    },
                    "required": ["start", "end"],
                },
            },
            "_": {"title": "Custom user space", "type": "object"},
        },
        "required": ["text"],
    },
}
# JSON schema for a model's meta data object. Only "lang", "name" and
# "version" are required; every other property is optional.
META_SCHEMA = {
    "$schema": "http://json-schema.org/draft-06/schema",
    "type": "object",
    "properties": {
        "lang": {
            "title": "Two-letter language code, e.g. 'en'",
            "type": "string",
            "minLength": 2,
            "maxLength": 2,
            "pattern": "^[a-z]*$",
        },
        "name": {
            "title": "Model name",
            "type": "string",
            "minLength": 1,
            "pattern": "^[a-z_]*$",
        },
        "version": {
            "title": "Model version",
            "type": "string",
            "minLength": 1,
            "pattern": "^[0-9a-z.-]*$",
        },
        "spacy_version": {
            "title": "Compatible spaCy version identifier",
            "type": "string",
            "minLength": 1,
            # The hyphen is placed last so it is a literal character. Written
            # as ".->" inside the class it formed a range from "." to ">",
            # which rejected "-" and accepted "/", ":" and ";".
            "pattern": "^[0-9a-z.><=-]*$",
        },
        "parent_package": {
            "title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
            "type": "string",
            "minLength": 1,
            "default": "spacy",
        },
        "pipeline": {
            "title": "Names of pipeline components",
            "type": "array",
            "items": {"type": "string", "minLength": 1},
        },
        "description": {"title": "Model description", "type": "string"},
        "license": {"title": "Model license", "type": "string"},
        "author": {"title": "Model author name", "type": "string"},
        "email": {"title": "Model author email", "type": "string", "format": "email"},
        "url": {"title": "Model author URL", "type": "string", "format": "uri"},
        "sources": {
            "title": "Training data sources",
            "type": "array",
            "items": {"type": "string"},
        },
        "vectors": {
            "title": "Included word vectors",
            "type": "object",
            "properties": {
                "keys": {
                    "title": "Number of unique keys",
                    "type": "integer",
                    "minimum": 0,
                },
                "vectors": {
                    "title": "Number of unique vectors",
                    "type": "integer",
                    "minimum": 0,
                },
                "width": {
                    "title": "Number of dimensions",
                    "type": "integer",
                    "minimum": 0,
                },
            },
        },
        "accuracy": {
            "title": "Accuracy numbers",
            "type": "object",
            # Keys of "patternProperties" are regular expressions; a bare "*"
            # is not a valid regex, so ".*" is used to match every key.
            "patternProperties": {".*": {"type": "number", "minimum": 0.0}},
        },
        "speed": {
            "title": "Speed evaluation numbers",
            "type": "object",
            "patternProperties": {
                ".*": {
                    # "anyOf" rather than "oneOf": an integer also satisfies
                    # the "number" branch, and "oneOf" requires exactly one
                    # match, so integer values would always be rejected.
                    "anyOf": [
                        {"type": "number", "minimum": 0.0},
                        {"type": "integer", "minimum": 0},
                    ]
                }
            },
        },
    },
    "required": ["lang", "name", "version"],
}

View File

@ -11,8 +11,6 @@ from wasabi import Printer, MESSAGES
from ..gold import GoldCorpus, read_json_object from ..gold import GoldCorpus, read_json_object
from ..util import load_model, get_lang_class from ..util import load_model, get_lang_class
# from .schemas import get_schema, validate_json
# Minimum number of expected occurrences of label in data to train new label # Minimum number of expected occurrences of label in data to train new label
NEW_LABEL_THRESHOLD = 50 NEW_LABEL_THRESHOLD = 50
@ -76,7 +74,6 @@ def debug_data(
# Validate data format using the JSON schema # Validate data format using the JSON schema
# TODO: update once the new format is ready # TODO: update once the new format is ready
# schema = get_schema("training")
train_data_errors = [] # TODO: validate_json(train_data, schema) train_data_errors = [] # TODO: validate_json(train_data, schema)
dev_data_errors = [] # TODO: validate_json(dev_data, schema) dev_data_errors = [] # TODO: validate_json(dev_data, schema)
if not train_data_errors: if not train_data_errors:

View File

@ -1,51 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
from pathlib import Path
from jsonschema import Draft4Validator
import srsly
from ...errors import Errors
SCHEMAS = {}
def get_schema(name):
    """Return the JSON schema registered under a given name.

    The schema is read from "<name>.json" next to this module, checked with
    the draft-04 validator and cached in SCHEMAS, so later calls with the
    same name return the cached object without touching the disk again.

    EXAMPLE:
        >>> schema = get_schema('training')

    name (unicode): The name of the schema.
    RETURNS (dict): The JSON schema.
    """
    # Serve repeated requests straight from the module-level cache.
    if name in SCHEMAS:
        return SCHEMAS[name]
    json_file = Path(__file__).parent / "{}.json".format(name)
    if not json_file.exists():
        raise ValueError(Errors.E104.format(name=name))
    loaded = srsly.read_json(json_file)
    # TODO: replace with (stable) Draft6Validator, if available
    Draft4Validator(loaded).check_schema(loaded)
    # Only cache the schema once it has passed the check above.
    SCHEMAS[name] = loaded
    return loaded
def validate_json(data, schema):
    """Collect the validation errors for data checked against a JSON schema
    (see https://json-schema.org).

    data: JSON-serializable data to validate.
    schema (dict): The JSON schema.
    RETURNS (list): A list of error messages, if available. Each message is
        the validator's message followed by a space and, when the error has
        a path, that path rendered as "[a -> b -> c]".
    """
    problems = list(Draft4Validator(schema).iter_errors(data))
    # Order the errors by their location in the data.
    problems.sort(key=lambda problem: problem.path)
    messages = []
    for problem in problems:
        location = ""
        if problem.path:
            location = "[{}]".format(" -> ".join(str(part) for part in problem.path))
        messages.append(problem.message + " " + location)
    return messages

View File

@ -1,128 +0,0 @@
{
"$schema": "http://json-schema.org/draft-06/schema",
"type": "object",
"properties": {
"lang": {
"title": "Two-letter language code, e.g. 'en'",
"type": "string",
"minLength": 2,
"maxLength": 2,
"pattern": "^[a-z]*$"
},
"name": {
"title": "Model name",
"type": "string",
"minLength": 1,
"pattern": "^[a-z_]*$"
},
"version": {
"title": "Model version",
"type": "string",
"minLength": 1,
"pattern": "^[0-9a-z.-]*$"
},
"spacy_version": {
"title": "Compatible spaCy version identifier",
"type": "string",
"minLength": 1,
"pattern": "^[0-9a-z.-><=]*$"
},
"parent_package": {
"title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
"type": "string",
"minLength": 1,
"default": "spacy"
},
"pipeline": {
"title": "Names of pipeline components",
"type": "array",
"items": {
"type": "string",
"minLength": 1
}
},
"description": {
"title": "Model description",
"type": "string"
},
"license": {
"title": "Model license",
"type": "string"
},
"author": {
"title": "Model author name",
"type": "string"
},
"email": {
"title": "Model author email",
"type": "string",
"format": "email"
},
"url": {
"title": "Model author URL",
"type": "string",
"format": "uri"
},
"sources": {
"title": "Training data sources",
"type": "array",
"items": {
"type": "string"
}
},
"vectors": {
"title": "Included word vectors",
"type": "object",
"properties": {
"keys": {
"title": "Number of unique keys",
"type": "integer",
"minimum": 0
},
"vectors": {
"title": "Number of unique vectors",
"type": "integer",
"minimum": 0
},
"width": {
"title": "Number of dimensions",
"type": "integer",
"minimum": 0
}
}
},
"accuracy": {
"title": "Accuracy numbers",
"type": "object",
"patternProperties": {
"*": {
"type": "number",
"minimum": 0.0
}
}
},
"speed": {
"title": "Speed evaluation numbers",
"type": "object",
"patternProperties": {
"*": {
"oneOf": [
{
"type": "number",
"minimum": 0.0
},
{
"type": "integer",
"minimum": 0
}
]
}
}
}
},
"required": [
"lang",
"name",
"version"
]
}

View File

@ -1,146 +0,0 @@
{
"$schema": "http://json-schema.org/draft-06/schema",
"title": "Training data for spaCy models",
"type": "array",
"items": {
"type": "object",
"properties": {
"text": {
"title": "The text of the training example",
"type": "string",
"minLength": 1
},
"ents": {
"title": "Named entity spans in the text",
"type": "array",
"items": {
"type": "object",
"properties": {
"start": {
"title": "Start character offset of the span",
"type": "integer",
"minimum": 0
},
"end": {
"title": "End character offset of the span",
"type": "integer",
"minimum": 0
},
"label": {
"title": "Entity label",
"type": "string",
"minLength": 1,
"pattern": "^[A-Z0-9]*$"
}
},
"required": [
"start",
"end",
"label"
]
}
},
"sents": {
"title": "Sentence spans in the text",
"type": "array",
"items": {
"type": "object",
"properties": {
"start": {
"title": "Start character offset of the span",
"type": "integer",
"minimum": 0
},
"end": {
"title": "End character offset of the span",
"type": "integer",
"minimum": 0
}
},
"required": [
"start",
"end"
]
}
},
"cats": {
"title": "Text categories for the text classifier",
"type": "object",
"patternProperties": {
"*": {
"title": "A text category",
"oneOf": [
{
"type": "boolean"
},
{
"type": "number",
"minimum": 0
}
]
}
},
"propertyNames": {
"pattern": "^[A-Z0-9]*$",
"minLength": 1
}
},
"tokens": {
"title": "The tokens in the text",
"type": "array",
"items": {
"type": "object",
"minProperties": 1,
"properties": {
"id": {
"title": "Token ID, usually token index",
"type": "integer",
"minimum": 0
},
"start": {
"title": "Start character offset of the token",
"type": "integer",
"minimum": 0
},
"end": {
"title": "End character offset of the token",
"type": "integer",
"minimum": 0
},
"pos": {
"title": "Coarse-grained part-of-speech tag",
"type": "string",
"minLength": 1
},
"tag": {
"title": "Fine-grained part-of-speech tag",
"type": "string",
"minLength": 1
},
"dep": {
"title": "Dependency label",
"type": "string",
"minLength": 1
},
"head": {
"title": "Index of the token's head",
"type": "integer",
"minimum": 0
}
},
"required": [
"start",
"end"
]
}
},
"_": {
"title": "Custom user space",
"type": "object"
}
},
"required": [
"text"
]
}
}

View File

@ -2,7 +2,8 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest import pytest
from spacy.cli.schemas import get_schema, validate_json from spacy.cli._schemas import TRAINING_SCHEMA
from spacy.util import validate_json
from spacy.tokens import Doc from spacy.tokens import Doc
from ..util import get_doc from ..util import get_doc
@ -61,5 +62,5 @@ def test_doc_to_json_underscore_error_serialize(doc):
def test_doc_to_json_valid_training(doc): def test_doc_to_json_valid_training(doc):
json_doc = doc.to_json() json_doc = doc.to_json()
errors = validate_json([json_doc], get_schema("training")) errors = validate_json([json_doc], TRAINING_SCHEMA)
assert not errors assert not errors

View File

@ -1,20 +1,20 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
from spacy.cli.schemas import validate_json, get_schema from spacy.util import validate_json, validate_schema
from spacy.cli._schemas import META_SCHEMA, TRAINING_SCHEMA
import pytest import pytest
@pytest.fixture(scope="session") def test_validate_schema():
def training_schema(): validate_schema({"type": "object"})
return get_schema("training") with pytest.raises(Exception):
validate_schema({"type": lambda x: x})
def test_json_schema_get(): @pytest.mark.parametrize("schema", [TRAINING_SCHEMA, META_SCHEMA])
schema = get_schema("training") def test_schemas(schema):
assert schema validate_schema(schema)
with pytest.raises(ValueError):
schema = get_schema("xxx")
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -24,8 +24,8 @@ def test_json_schema_get():
{"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "TEST"}]}, {"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "TEST"}]},
], ],
) )
def test_json_schema_training_valid(data, training_schema): def test_json_schema_training_valid(data):
errors = validate_json([data], training_schema) errors = validate_json([data], TRAINING_SCHEMA)
assert not errors assert not errors
@ -39,6 +39,6 @@ def test_json_schema_training_valid(data, training_schema):
({"text": "spaCy", "tokens": [{"pos": "PROPN"}]}, 2), ({"text": "spaCy", "tokens": [{"pos": "PROPN"}]}, 2),
], ],
) )
def test_json_schema_training_invalid(data, n_errors, training_schema): def test_json_schema_training_invalid(data, n_errors):
errors = validate_json([data], training_schema) errors = validate_json([data], TRAINING_SCHEMA)
assert len(errors) == n_errors assert len(errors) == n_errors

View File

@ -14,6 +14,8 @@ import functools
import itertools import itertools
import numpy.random import numpy.random
import srsly import srsly
from jsonschema import Draft4Validator
try: try:
import cupy.random import cupy.random
@ -626,6 +628,31 @@ def fix_random_seed(seed=0):
cupy.random.seed(seed) cupy.random.seed(seed)
def validate_schema(schema):
    """Check that a JSON schema is itself well-formed.

    Any error raised by the validator's schema check propagates to the
    caller; nothing is returned when the schema passes.

    schema (dict): The JSON schema to check.
    """
    # TODO: replace with (stable) Draft6Validator, if available
    Draft4Validator(schema).check_schema(schema)
def validate_json(data, schema):
    """Collect the validation errors for data checked against a JSON schema
    (see https://json-schema.org).

    data: JSON-serializable data to validate.
    schema (dict): The JSON schema.
    RETURNS (list): A list of error messages, if available. Each message is
        the validator's message followed by a space and, when the error has
        a path, that path rendered as "[a -> b -> c]".
    """
    # TODO: replace with (stable) Draft6Validator, if available
    problems = list(Draft4Validator(schema).iter_errors(data))
    # Order the errors by their location in the data.
    problems.sort(key=lambda problem: problem.path)
    messages = []
    for problem in problems:
        location = ""
        if problem.path:
            location = "[{}]".format(" -> ".join(str(part) for part in problem.path))
        messages.append(problem.message + " " + location)
    return messages
class SimpleFrozenDict(dict): class SimpleFrozenDict(dict):
"""Simplified implementation of a frozen dict, mainly used as default """Simplified implementation of a frozen dict, mainly used as default
function or method argument (for arguments that should default to empty function or method argument (for arguments that should default to empty