diff --git a/MANIFEST.in b/MANIFEST.in index 23af157eb..7bd999649 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,4 @@ recursive-include include *.h -recursive-include spacy/cli/schemas *.json include LICENSE include README.md include bin/spacy diff --git a/spacy/cli/_schemas.py b/spacy/cli/_schemas.py new file mode 100644 index 000000000..3fb2c8979 --- /dev/null +++ b/spacy/cli/_schemas.py @@ -0,0 +1,220 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +# NB: This schema describes the new format of the training data, see #2928 +TRAINING_SCHEMA = { + "$schema": "http://json-schema.org/draft-06/schema", + "title": "Training data for spaCy models", + "type": "array", + "items": { + "type": "object", + "properties": { + "text": { + "title": "The text of the training example", + "type": "string", + "minLength": 1, + }, + "ents": { + "title": "Named entity spans in the text", + "type": "array", + "items": { + "type": "object", + "properties": { + "start": { + "title": "Start character offset of the span", + "type": "integer", + "minimum": 0, + }, + "end": { + "title": "End character offset of the span", + "type": "integer", + "minimum": 0, + }, + "label": { + "title": "Entity label", + "type": "string", + "minLength": 1, + "pattern": "^[A-Z0-9]*$", + }, + }, + "required": ["start", "end", "label"], + }, + }, + "sents": { + "title": "Sentence spans in the text", + "type": "array", + "items": { + "type": "object", + "properties": { + "start": { + "title": "Start character offset of the span", + "type": "integer", + "minimum": 0, + }, + "end": { + "title": "End character offset of the span", + "type": "integer", + "minimum": 0, + }, + }, + "required": ["start", "end"], + }, + }, + "cats": { + "title": "Text categories for the text classifier", + "type": "object", + "patternProperties": { + "*": { + "title": "A text category", + "oneOf": [ + {"type": "boolean"}, + {"type": "number", "minimum": 0}, + ], + } + }, + "propertyNames": {"pattern": "^[A-Z0-9]*$", "minLength": 1}, + }, + "tokens": { + "title": "The tokens in the text", + "type": "array", + "items": { + "type": "object", + "minProperties": 1, + "properties": { + "id": { + "title": "Token ID, usually token index", + "type": "integer", + "minimum": 0, + }, + "start": { + "title": "Start character offset of the token", + "type": "integer", + "minimum": 0, + }, + "end": { + "title": "End character offset of the token", + "type": "integer", + "minimum": 0, + }, + "pos": { + "title": "Coarse-grained part-of-speech tag", + "type": "string", + "minLength": 1, + }, + "tag": { + "title": "Fine-grained part-of-speech tag", + "type": "string", + "minLength": 1, + }, + "dep": { + "title": "Dependency label", + "type": "string", + "minLength": 1, + }, + "head": { + "title": "Index of the token's head", + "type": "integer", + "minimum": 0, + }, + }, + "required": ["start", "end"], + }, + }, + "_": {"title": "Custom user space", "type": "object"}, + }, + "required": ["text"], + }, +} + +META_SCHEMA = { + "$schema": "http://json-schema.org/draft-06/schema", + "type": "object", + "properties": { + "lang": { + "title": "Two-letter language code, e.g. 'en'", + "type": "string", + "minLength": 2, + "maxLength": 2, + "pattern": "^[a-z]*$", + }, + "name": { + "title": "Model name", + "type": "string", + "minLength": 1, + "pattern": "^[a-z_]*$", + }, + "version": { + "title": "Model version", + "type": "string", + "minLength": 1, + "pattern": "^[0-9a-z.-]*$", + }, + "spacy_version": { + "title": "Compatible spaCy version identifier", + "type": "string", + "minLength": 1, + "pattern": "^[0-9a-z.-><=]*$", + }, + "parent_package": { + "title": "Name of parent spaCy package, e.g. spacy or spacy-nightly", + "type": "string", + "minLength": 1, + "default": "spacy", + }, + "pipeline": { + "title": "Names of pipeline components", + "type": "array", + "items": {"type": "string", "minLength": 1}, + }, + "description": {"title": "Model description", "type": "string"}, + "license": {"title": "Model license", "type": "string"}, + "author": {"title": "Model author name", "type": "string"}, + "email": {"title": "Model author email", "type": "string", "format": "email"}, + "url": {"title": "Model author URL", "type": "string", "format": "uri"}, + "sources": { + "title": "Training data sources", + "type": "array", + "items": {"type": "string"}, + }, + "vectors": { + "title": "Included word vectors", + "type": "object", + "properties": { + "keys": { + "title": "Number of unique keys", + "type": "integer", + "minimum": 0, + }, + "vectors": { + "title": "Number of unique vectors", + "type": "integer", + "minimum": 0, + }, + "width": { + "title": "Number of dimensions", + "type": "integer", + "minimum": 0, + }, + }, + }, + "accuracy": { + "title": "Accuracy numbers", + "type": "object", + "patternProperties": {"*": {"type": "number", "minimum": 0.0}}, + }, + "speed": { + "title": "Speed evaluation numbers", + "type": "object", + "patternProperties": { + "*": { + "oneOf": [ + {"type": "number", "minimum": 0.0}, + {"type": "integer", "minimum": 0}, + ] + } + }, + }, + }, + "required": ["lang", "name", "version"], +} diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 70acb47fa..634756441 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -11,8 +11,6 @@ from wasabi import Printer, MESSAGES from ..gold import GoldCorpus, read_json_object from ..util import load_model, get_lang_class -# from .schemas import get_schema, validate_json - # Minimum number of expected occurences of label in data to train new label NEW_LABEL_THRESHOLD = 50 @@ -76,7 +74,6 @@ def debug_data( # Validate data format using the JSON schema # TODO: update once the new format is ready - # schema = get_schema("training") train_data_errors = [] # TODO: validate_json(train_data, schema) dev_data_errors = [] # TODO: validate_json(dev_data, schema) if not train_data_errors: diff --git a/spacy/cli/schemas/__init__.py b/spacy/cli/schemas/__init__.py deleted file mode 100644 index c502c6493..000000000 --- a/spacy/cli/schemas/__init__.py +++ /dev/null @@ -1,51 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from pathlib import Path -from jsonschema import Draft4Validator -import srsly - -from ...errors import Errors - - -SCHEMAS = {} - - -def get_schema(name): - """Get the JSON schema for a given name. Looks for a .json file in - spacy.cli.schemas, validates the schema and raises ValueError if not found. - - EXAMPLE: - >>> schema = get_schema('training') - - name (unicode): The name of the schema. - RETURNS (dict): The JSON schema. - """ - if name not in SCHEMAS: - schema_path = Path(__file__).parent / "{}.json".format(name) - if not schema_path.exists(): - raise ValueError(Errors.E104.format(name=name)) - schema = srsly.read_json(schema_path) - # TODO: replace with (stable) Draft6Validator, if available - validator = Draft4Validator(schema) - validator.check_schema(schema) - SCHEMAS[name] = schema - return SCHEMAS[name] - - -def validate_json(data, schema): - """Validate data against a given JSON schema (see https://json-schema.org). - - data: JSON-serializable data to validate. - schema (dict): The JSON schema. - RETURNS (list): A list of error messages, if available. - """ - validator = Draft4Validator(schema) - errors = [] - for err in sorted(validator.iter_errors(data), key=lambda e: e.path): - if err.path: - err_path = "[{}]".format(" -> ".join([str(p) for p in err.path])) - else: - err_path = "" - errors.append(err.message + " " + err_path) - return errors diff --git a/spacy/cli/schemas/meta.json b/spacy/cli/schemas/meta.json deleted file mode 100644 index 36ee1282f..000000000 --- a/spacy/cli/schemas/meta.json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-06/schema", - "type": "object", - "properties": { - "lang": { - "title": "Two-letter language code, e.g. 'en'", - "type": "string", - "minLength": 2, - "maxLength": 2, - "pattern": "^[a-z]*$" - }, - "name": { - "title": "Model name", - "type": "string", - "minLength": 1, - "pattern": "^[a-z_]*$" - }, - "version": { - "title": "Model version", - "type": "string", - "minLength": 1, - "pattern": "^[0-9a-z.-]*$" - }, - "spacy_version": { - "title": "Compatible spaCy version identifier", - "type": "string", - "minLength": 1, - "pattern": "^[0-9a-z.-><=]*$" - }, - "parent_package": { - "title": "Name of parent spaCy package, e.g. spacy or spacy-nightly", - "type": "string", - "minLength": 1, - "default": "spacy" - }, - "pipeline": { - "title": "Names of pipeline components", - "type": "array", - "items": { - "type": "string", - "minLength": 1 - } - }, - "description": { - "title": "Model description", - "type": "string" - }, - "license": { - "title": "Model license", - "type": "string" - }, - "author": { - "title": "Model author name", - "type": "string" - }, - "email": { - "title": "Model author email", - "type": "string", - "format": "email" - }, - "url": { - "title": "Model author URL", - "type": "string", - "format": "uri" - }, - "sources": { - "title": "Training data sources", - "type": "array", - "items": { - "type": "string" - } - }, - "vectors": { - "title": "Included word vectors", - "type": "object", - "properties": { - "keys": { - "title": "Number of unique keys", - "type": "integer", - "minimum": 0 - }, - "vectors": { - "title": "Number of unique vectors", - "type": "integer", - "minimum": 0 - }, - "width": { - "title": "Number of dimensions", - "type": "integer", - "minimum": 0 - } - } - }, - "accuracy": { - "title": "Accuracy numbers", - "type": "object", - "patternProperties": { - "*": { - "type": "number", - "minimum": 0.0 - } - } - }, - "speed": { - "title": "Speed evaluation numbers", - "type": "object", - "patternProperties": { - "*": { - "oneOf": [ - { - "type": "number", - "minimum": 0.0 - }, - { - "type": "integer", - "minimum": 0 - } - ] - } - } - } - }, - "required": [ - "lang", - "name", - "version" - ] -} diff --git a/spacy/cli/schemas/training.json b/spacy/cli/schemas/training.json deleted file mode 100644 index d80ce5c7e..000000000 --- a/spacy/cli/schemas/training.json +++ /dev/null @@ -1,146 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-06/schema", - "title": "Training data for spaCy models", - "type": "array", - "items": { - "type": "object", - "properties": { - "text": { - "title": "The text of the training example", - "type": "string", - "minLength": 1 - }, - "ents": { - "title": "Named entity spans in the text", - "type": "array", - "items": { - "type": "object", - "properties": { - "start": { - "title": "Start character offset of the span", - "type": "integer", - "minimum": 0 - }, - "end": { - "title": "End character offset of the span", - "type": "integer", - "minimum": 0 - }, - "label": { - "title": "Entity label", - "type": "string", - "minLength": 1, - "pattern": "^[A-Z0-9]*$" - } - }, - "required": [ - "start", - "end", - "label" - ] - } - }, - "sents": { - "title": "Sentence spans in the text", - "type": "array", - "items": { - "type": "object", - "properties": { - "start": { - "title": "Start character offset of the span", - "type": "integer", - "minimum": 0 - }, - "end": { - "title": "End character offset of the span", - "type": "integer", - "minimum": 0 - } - }, - "required": [ - "start", - "end" - ] - } - }, - "cats": { - "title": "Text categories for the text classifier", - "type": "object", - "patternProperties": { - "*": { - "title": "A text category", - "oneOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "minimum": 0 - } - ] - } - }, - "propertyNames": { - "pattern": "^[A-Z0-9]*$", - "minLength": 1 - } - }, - "tokens": { - "title": "The tokens in the text", - "type": "array", - "items": { - "type": "object", - "minProperties": 1, - "properties": { - "id": { - "title": "Token ID, usually token index", - "type": "integer", - "minimum": 0 - }, - "start": { - "title": "Start character offset of the token", - "type": "integer", - "minimum": 0 - }, - "end": { - "title": "End character offset of the token", - "type": "integer", - "minimum": 0 - }, - "pos": { - "title": "Coarse-grained part-of-speech tag", - "type": "string", - "minLength": 1 - }, - "tag": { - "title": "Fine-grained part-of-speech tag", - "type": "string", - "minLength": 1 - }, - "dep": { - "title": "Dependency label", - "type": "string", - "minLength": 1 - }, - "head": { - "title": "Index of the token's head", - "type": "integer", - "minimum": 0 - } - }, - "required": [ - "start", - "end" - ] - } - }, - "_": { - "title": "Custom user space", - "type": "object" - } - }, - "required": [ - "text" - ] - } -} diff --git a/spacy/tests/doc/test_to_json.py b/spacy/tests/doc/test_to_json.py index 1869d0918..684791499 100644 --- a/spacy/tests/doc/test_to_json.py +++ b/spacy/tests/doc/test_to_json.py @@ -2,7 +2,8 @@ from __future__ import unicode_literals import pytest -from spacy.cli.schemas import get_schema, validate_json +from spacy.cli._schemas import TRAINING_SCHEMA +from spacy.util import validate_json from spacy.tokens import Doc from ..util import get_doc @@ -61,5 +62,5 @@ def test_doc_to_json_underscore_error_serialize(doc): def test_doc_to_json_valid_training(doc): json_doc = doc.to_json() - errors = validate_json([json_doc], get_schema("training")) + errors = validate_json([json_doc], TRAINING_SCHEMA) assert not errors diff --git a/spacy/tests/test_json_schemas.py b/spacy/tests/test_json_schemas.py index 2ddb39f20..ed1385a8b 100644 --- a/spacy/tests/test_json_schemas.py +++ b/spacy/tests/test_json_schemas.py @@ -1,20 +1,20 @@ # coding: utf-8 from __future__ import unicode_literals -from spacy.cli.schemas import validate_json, get_schema +from spacy.util import validate_json, validate_schema +from spacy.cli._schemas import META_SCHEMA, TRAINING_SCHEMA import pytest -@pytest.fixture(scope="session") -def training_schema(): - return get_schema("training") +def test_validate_schema(): + validate_schema({"type": "object"}) + with pytest.raises(Exception): + validate_schema({"type": lambda x: x}) -def test_json_schema_get(): - schema = get_schema("training") - assert schema - with pytest.raises(ValueError): - schema = get_schema("xxx") +@pytest.mark.parametrize("schema", [TRAINING_SCHEMA, META_SCHEMA]) +def test_schemas(schema): + validate_schema(schema) @pytest.mark.parametrize( @@ -24,8 +24,8 @@ def test_json_schema_get(): {"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "TEST"}]}, ], ) -def test_json_schema_training_valid(data, training_schema): - errors = validate_json([data], training_schema) +def test_json_schema_training_valid(data): + errors = validate_json([data], TRAINING_SCHEMA) assert not errors @@ -39,6 +39,6 @@ def test_json_schema_training_valid(data, training_schema): ({"text": "spaCy", "tokens": [{"pos": "PROPN"}]}, 2), ], ) -def test_json_schema_training_invalid(data, n_errors, training_schema): - errors = validate_json([data], training_schema) +def test_json_schema_training_invalid(data, n_errors): + errors = validate_json([data], TRAINING_SCHEMA) assert len(errors) == n_errors diff --git a/spacy/util.py b/spacy/util.py index 0d60fd4aa..f958c0831 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -14,6 +14,8 @@ import functools import itertools import numpy.random import srsly +from jsonschema import Draft4Validator + try: import cupy.random @@ -626,6 +628,31 @@ def fix_random_seed(seed=0): cupy.random.seed(seed) +def validate_schema(schema): + # TODO: replace with (stable) Draft6Validator, if available + validator = Draft4Validator(schema) + validator.check_schema(schema) + + +def validate_json(data, schema): + """Validate data against a given JSON schema (see https://json-schema.org). + + data: JSON-serializable data to validate. + schema (dict): The JSON schema. + RETURNS (list): A list of error messages, if available. + """ + # TODO: replace with (stable) Draft6Validator, if available + validator = Draft4Validator(schema) + errors = [] + for err in sorted(validator.iter_errors(data), key=lambda e: e.path): + if err.path: + err_path = "[{}]".format(" -> ".join([str(p) for p in err.path])) + else: + err_path = "" + errors.append(err.message + " " + err_path) + return errors + + class SimpleFrozenDict(dict): """Simplified implementation of a frozen dict, mainly used as default function or method argument (for arguments that should default to empty