mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Store JSON schemas in Python and tidy up (#3235)
This commit is contained in:
parent
1ea4df459d
commit
338d659bd0
|
@ -1,5 +1,4 @@
|
|||
recursive-include include *.h
|
||||
recursive-include spacy/cli/schemas *.json
|
||||
include LICENSE
|
||||
include README.md
|
||||
include bin/spacy
|
||||
|
|
220
spacy/cli/_schemas.py
Normal file
220
spacy/cli/_schemas.py
Normal file
|
@ -0,0 +1,220 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# NB: This schema describes the new format of the training data, see #2928
|
||||
TRAINING_SCHEMA = {
|
||||
"$schema": "http://json-schema.org/draft-06/schema",
|
||||
"title": "Training data for spaCy models",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"text": {
|
||||
"title": "The text of the training example",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
},
|
||||
"ents": {
|
||||
"title": "Named entity spans in the text",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"start": {
|
||||
"title": "Start character offset of the span",
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
},
|
||||
"end": {
|
||||
"title": "End character offset of the span",
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
},
|
||||
"label": {
|
||||
"title": "Entity label",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"pattern": "^[A-Z0-9]*$",
|
||||
},
|
||||
},
|
||||
"required": ["start", "end", "label"],
|
||||
},
|
||||
},
|
||||
"sents": {
|
||||
"title": "Sentence spans in the text",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"start": {
|
||||
"title": "Start character offset of the span",
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
},
|
||||
"end": {
|
||||
"title": "End character offset of the span",
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
},
|
||||
},
|
||||
"required": ["start", "end"],
|
||||
},
|
||||
},
|
||||
"cats": {
|
||||
"title": "Text categories for the text classifier",
|
||||
"type": "object",
|
||||
"patternProperties": {
|
||||
"*": {
|
||||
"title": "A text category",
|
||||
"oneOf": [
|
||||
{"type": "boolean"},
|
||||
{"type": "number", "minimum": 0},
|
||||
],
|
||||
}
|
||||
},
|
||||
"propertyNames": {"pattern": "^[A-Z0-9]*$", "minLength": 1},
|
||||
},
|
||||
"tokens": {
|
||||
"title": "The tokens in the text",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"minProperties": 1,
|
||||
"properties": {
|
||||
"id": {
|
||||
"title": "Token ID, usually token index",
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
},
|
||||
"start": {
|
||||
"title": "Start character offset of the token",
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
},
|
||||
"end": {
|
||||
"title": "End character offset of the token",
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
},
|
||||
"pos": {
|
||||
"title": "Coarse-grained part-of-speech tag",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
},
|
||||
"tag": {
|
||||
"title": "Fine-grained part-of-speech tag",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
},
|
||||
"dep": {
|
||||
"title": "Dependency label",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
},
|
||||
"head": {
|
||||
"title": "Index of the token's head",
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
},
|
||||
},
|
||||
"required": ["start", "end"],
|
||||
},
|
||||
},
|
||||
"_": {"title": "Custom user space", "type": "object"},
|
||||
},
|
||||
"required": ["text"],
|
||||
},
|
||||
}
|
||||
|
||||
META_SCHEMA = {
|
||||
"$schema": "http://json-schema.org/draft-06/schema",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"lang": {
|
||||
"title": "Two-letter language code, e.g. 'en'",
|
||||
"type": "string",
|
||||
"minLength": 2,
|
||||
"maxLength": 2,
|
||||
"pattern": "^[a-z]*$",
|
||||
},
|
||||
"name": {
|
||||
"title": "Model name",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"pattern": "^[a-z_]*$",
|
||||
},
|
||||
"version": {
|
||||
"title": "Model version",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"pattern": "^[0-9a-z.-]*$",
|
||||
},
|
||||
"spacy_version": {
|
||||
"title": "Compatible spaCy version identifier",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"pattern": "^[0-9a-z.-><=]*$",
|
||||
},
|
||||
"parent_package": {
|
||||
"title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"default": "spacy",
|
||||
},
|
||||
"pipeline": {
|
||||
"title": "Names of pipeline components",
|
||||
"type": "array",
|
||||
"items": {"type": "string", "minLength": 1},
|
||||
},
|
||||
"description": {"title": "Model description", "type": "string"},
|
||||
"license": {"title": "Model license", "type": "string"},
|
||||
"author": {"title": "Model author name", "type": "string"},
|
||||
"email": {"title": "Model author email", "type": "string", "format": "email"},
|
||||
"url": {"title": "Model author URL", "type": "string", "format": "uri"},
|
||||
"sources": {
|
||||
"title": "Training data sources",
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
},
|
||||
"vectors": {
|
||||
"title": "Included word vectors",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"keys": {
|
||||
"title": "Number of unique keys",
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
},
|
||||
"vectors": {
|
||||
"title": "Number of unique vectors",
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
},
|
||||
"width": {
|
||||
"title": "Number of dimensions",
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
},
|
||||
},
|
||||
},
|
||||
"accuracy": {
|
||||
"title": "Accuracy numbers",
|
||||
"type": "object",
|
||||
"patternProperties": {"*": {"type": "number", "minimum": 0.0}},
|
||||
},
|
||||
"speed": {
|
||||
"title": "Speed evaluation numbers",
|
||||
"type": "object",
|
||||
"patternProperties": {
|
||||
"*": {
|
||||
"oneOf": [
|
||||
{"type": "number", "minimum": 0.0},
|
||||
{"type": "integer", "minimum": 0},
|
||||
]
|
||||
}
|
||||
},
|
||||
},
|
||||
},
|
||||
"required": ["lang", "name", "version"],
|
||||
}
|
|
@ -11,8 +11,6 @@ from wasabi import Printer, MESSAGES
|
|||
from ..gold import GoldCorpus, read_json_object
|
||||
from ..util import load_model, get_lang_class
|
||||
|
||||
# from .schemas import get_schema, validate_json
|
||||
|
||||
|
||||
# Minimum number of expected occurences of label in data to train new label
|
||||
NEW_LABEL_THRESHOLD = 50
|
||||
|
@ -76,7 +74,6 @@ def debug_data(
|
|||
|
||||
# Validate data format using the JSON schema
|
||||
# TODO: update once the new format is ready
|
||||
# schema = get_schema("training")
|
||||
train_data_errors = [] # TODO: validate_json(train_data, schema)
|
||||
dev_data_errors = [] # TODO: validate_json(dev_data, schema)
|
||||
if not train_data_errors:
|
||||
|
|
|
@ -1,51 +0,0 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from pathlib import Path
|
||||
from jsonschema import Draft4Validator
|
||||
import srsly
|
||||
|
||||
from ...errors import Errors
|
||||
|
||||
|
||||
SCHEMAS = {}
|
||||
|
||||
|
||||
def get_schema(name):
|
||||
"""Get the JSON schema for a given name. Looks for a .json file in
|
||||
spacy.cli.schemas, validates the schema and raises ValueError if not found.
|
||||
|
||||
EXAMPLE:
|
||||
>>> schema = get_schema('training')
|
||||
|
||||
name (unicode): The name of the schema.
|
||||
RETURNS (dict): The JSON schema.
|
||||
"""
|
||||
if name not in SCHEMAS:
|
||||
schema_path = Path(__file__).parent / "{}.json".format(name)
|
||||
if not schema_path.exists():
|
||||
raise ValueError(Errors.E104.format(name=name))
|
||||
schema = srsly.read_json(schema_path)
|
||||
# TODO: replace with (stable) Draft6Validator, if available
|
||||
validator = Draft4Validator(schema)
|
||||
validator.check_schema(schema)
|
||||
SCHEMAS[name] = schema
|
||||
return SCHEMAS[name]
|
||||
|
||||
|
||||
def validate_json(data, schema):
|
||||
"""Validate data against a given JSON schema (see https://json-schema.org).
|
||||
|
||||
data: JSON-serializable data to validate.
|
||||
schema (dict): The JSON schema.
|
||||
RETURNS (list): A list of error messages, if available.
|
||||
"""
|
||||
validator = Draft4Validator(schema)
|
||||
errors = []
|
||||
for err in sorted(validator.iter_errors(data), key=lambda e: e.path):
|
||||
if err.path:
|
||||
err_path = "[{}]".format(" -> ".join([str(p) for p in err.path]))
|
||||
else:
|
||||
err_path = ""
|
||||
errors.append(err.message + " " + err_path)
|
||||
return errors
|
|
@ -1,128 +0,0 @@
|
|||
{
|
||||
"$schema": "http://json-schema.org/draft-06/schema",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"lang": {
|
||||
"title": "Two-letter language code, e.g. 'en'",
|
||||
"type": "string",
|
||||
"minLength": 2,
|
||||
"maxLength": 2,
|
||||
"pattern": "^[a-z]*$"
|
||||
},
|
||||
"name": {
|
||||
"title": "Model name",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"pattern": "^[a-z_]*$"
|
||||
},
|
||||
"version": {
|
||||
"title": "Model version",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"pattern": "^[0-9a-z.-]*$"
|
||||
},
|
||||
"spacy_version": {
|
||||
"title": "Compatible spaCy version identifier",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"pattern": "^[0-9a-z.-><=]*$"
|
||||
},
|
||||
"parent_package": {
|
||||
"title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"default": "spacy"
|
||||
},
|
||||
"pipeline": {
|
||||
"title": "Names of pipeline components",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"minLength": 1
|
||||
}
|
||||
},
|
||||
"description": {
|
||||
"title": "Model description",
|
||||
"type": "string"
|
||||
},
|
||||
"license": {
|
||||
"title": "Model license",
|
||||
"type": "string"
|
||||
},
|
||||
"author": {
|
||||
"title": "Model author name",
|
||||
"type": "string"
|
||||
},
|
||||
"email": {
|
||||
"title": "Model author email",
|
||||
"type": "string",
|
||||
"format": "email"
|
||||
},
|
||||
"url": {
|
||||
"title": "Model author URL",
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"sources": {
|
||||
"title": "Training data sources",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"vectors": {
|
||||
"title": "Included word vectors",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"keys": {
|
||||
"title": "Number of unique keys",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"vectors": {
|
||||
"title": "Number of unique vectors",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"width": {
|
||||
"title": "Number of dimensions",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
}
|
||||
}
|
||||
},
|
||||
"accuracy": {
|
||||
"title": "Accuracy numbers",
|
||||
"type": "object",
|
||||
"patternProperties": {
|
||||
"*": {
|
||||
"type": "number",
|
||||
"minimum": 0.0
|
||||
}
|
||||
}
|
||||
},
|
||||
"speed": {
|
||||
"title": "Speed evaluation numbers",
|
||||
"type": "object",
|
||||
"patternProperties": {
|
||||
"*": {
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "number",
|
||||
"minimum": 0.0
|
||||
},
|
||||
{
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"lang",
|
||||
"name",
|
||||
"version"
|
||||
]
|
||||
}
|
|
@ -1,146 +0,0 @@
|
|||
{
|
||||
"$schema": "http://json-schema.org/draft-06/schema",
|
||||
"title": "Training data for spaCy models",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"text": {
|
||||
"title": "The text of the training example",
|
||||
"type": "string",
|
||||
"minLength": 1
|
||||
},
|
||||
"ents": {
|
||||
"title": "Named entity spans in the text",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"start": {
|
||||
"title": "Start character offset of the span",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"end": {
|
||||
"title": "End character offset of the span",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"label": {
|
||||
"title": "Entity label",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"pattern": "^[A-Z0-9]*$"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"start",
|
||||
"end",
|
||||
"label"
|
||||
]
|
||||
}
|
||||
},
|
||||
"sents": {
|
||||
"title": "Sentence spans in the text",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"start": {
|
||||
"title": "Start character offset of the span",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"end": {
|
||||
"title": "End character offset of the span",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"start",
|
||||
"end"
|
||||
]
|
||||
}
|
||||
},
|
||||
"cats": {
|
||||
"title": "Text categories for the text classifier",
|
||||
"type": "object",
|
||||
"patternProperties": {
|
||||
"*": {
|
||||
"title": "A text category",
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "boolean"
|
||||
},
|
||||
{
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"propertyNames": {
|
||||
"pattern": "^[A-Z0-9]*$",
|
||||
"minLength": 1
|
||||
}
|
||||
},
|
||||
"tokens": {
|
||||
"title": "The tokens in the text",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"minProperties": 1,
|
||||
"properties": {
|
||||
"id": {
|
||||
"title": "Token ID, usually token index",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"start": {
|
||||
"title": "Start character offset of the token",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"end": {
|
||||
"title": "End character offset of the token",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"pos": {
|
||||
"title": "Coarse-grained part-of-speech tag",
|
||||
"type": "string",
|
||||
"minLength": 1
|
||||
},
|
||||
"tag": {
|
||||
"title": "Fine-grained part-of-speech tag",
|
||||
"type": "string",
|
||||
"minLength": 1
|
||||
},
|
||||
"dep": {
|
||||
"title": "Dependency label",
|
||||
"type": "string",
|
||||
"minLength": 1
|
||||
},
|
||||
"head": {
|
||||
"title": "Index of the token's head",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"start",
|
||||
"end"
|
||||
]
|
||||
}
|
||||
},
|
||||
"_": {
|
||||
"title": "Custom user space",
|
||||
"type": "object"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"text"
|
||||
]
|
||||
}
|
||||
}
|
|
@ -2,7 +2,8 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy.cli.schemas import get_schema, validate_json
|
||||
from spacy.cli._schemas import TRAINING_SCHEMA
|
||||
from spacy.util import validate_json
|
||||
from spacy.tokens import Doc
|
||||
from ..util import get_doc
|
||||
|
||||
|
@ -61,5 +62,5 @@ def test_doc_to_json_underscore_error_serialize(doc):
|
|||
|
||||
def test_doc_to_json_valid_training(doc):
|
||||
json_doc = doc.to_json()
|
||||
errors = validate_json([json_doc], get_schema("training"))
|
||||
errors = validate_json([json_doc], TRAINING_SCHEMA)
|
||||
assert not errors
|
||||
|
|
|
@ -1,20 +1,20 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.cli.schemas import validate_json, get_schema
|
||||
from spacy.util import validate_json, validate_schema
|
||||
from spacy.cli._schemas import META_SCHEMA, TRAINING_SCHEMA
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def training_schema():
|
||||
return get_schema("training")
|
||||
def test_validate_schema():
|
||||
validate_schema({"type": "object"})
|
||||
with pytest.raises(Exception):
|
||||
validate_schema({"type": lambda x: x})
|
||||
|
||||
|
||||
def test_json_schema_get():
|
||||
schema = get_schema("training")
|
||||
assert schema
|
||||
with pytest.raises(ValueError):
|
||||
schema = get_schema("xxx")
|
||||
@pytest.mark.parametrize("schema", [TRAINING_SCHEMA, META_SCHEMA])
|
||||
def test_schemas(schema):
|
||||
validate_schema(schema)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
@ -24,8 +24,8 @@ def test_json_schema_get():
|
|||
{"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "TEST"}]},
|
||||
],
|
||||
)
|
||||
def test_json_schema_training_valid(data, training_schema):
|
||||
errors = validate_json([data], training_schema)
|
||||
def test_json_schema_training_valid(data):
|
||||
errors = validate_json([data], TRAINING_SCHEMA)
|
||||
assert not errors
|
||||
|
||||
|
||||
|
@ -39,6 +39,6 @@ def test_json_schema_training_valid(data, training_schema):
|
|||
({"text": "spaCy", "tokens": [{"pos": "PROPN"}]}, 2),
|
||||
],
|
||||
)
|
||||
def test_json_schema_training_invalid(data, n_errors, training_schema):
|
||||
errors = validate_json([data], training_schema)
|
||||
def test_json_schema_training_invalid(data, n_errors):
|
||||
errors = validate_json([data], TRAINING_SCHEMA)
|
||||
assert len(errors) == n_errors
|
||||
|
|
|
@ -14,6 +14,8 @@ import functools
|
|||
import itertools
|
||||
import numpy.random
|
||||
import srsly
|
||||
from jsonschema import Draft4Validator
|
||||
|
||||
|
||||
try:
|
||||
import cupy.random
|
||||
|
@ -626,6 +628,31 @@ def fix_random_seed(seed=0):
|
|||
cupy.random.seed(seed)
|
||||
|
||||
|
||||
def validate_schema(schema):
|
||||
# TODO: replace with (stable) Draft6Validator, if available
|
||||
validator = Draft4Validator(schema)
|
||||
validator.check_schema(schema)
|
||||
|
||||
|
||||
def validate_json(data, schema):
|
||||
"""Validate data against a given JSON schema (see https://json-schema.org).
|
||||
|
||||
data: JSON-serializable data to validate.
|
||||
schema (dict): The JSON schema.
|
||||
RETURNS (list): A list of error messages, if available.
|
||||
"""
|
||||
# TODO: replace with (stable) Draft6Validator, if available
|
||||
validator = Draft4Validator(schema)
|
||||
errors = []
|
||||
for err in sorted(validator.iter_errors(data), key=lambda e: e.path):
|
||||
if err.path:
|
||||
err_path = "[{}]".format(" -> ".join([str(p) for p in err.path]))
|
||||
else:
|
||||
err_path = ""
|
||||
errors.append(err.message + " " + err_path)
|
||||
return errors
|
||||
|
||||
|
||||
class SimpleFrozenDict(dict):
|
||||
"""Simplified implementation of a frozen dict, mainly used as default
|
||||
function or method argument (for arguments that should default to empty
|
||||
|
|
Loading…
Reference in New Issue
Block a user