Store JSON schemas in Python and tidy up (#3235)

2025-07-03 11:23:12 +03:00 · 2019-02-07 09:44:31 +01:00 · 2019-02-07 09:44:31 +01:00 · 338d659bd0
commit 338d659bd0
parent 1ea4df459d
9 changed files with 263 additions and 344 deletions
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,5 +1,4 @@
 recursive-include include *.h
 recursive-include spacy/cli/schemas *.json
 include LICENSE
 include README.md
 include bin/spacy
--- a/spacy/cli/_schemas.py
+++ b/spacy/cli/_schemas.py
@ -0,0 +1,220 @@
 # coding: utf-8
 from __future__ import unicode_literals
 # NB: This schema describes the new format of the training data, see #2928
 TRAINING_SCHEMA = {
    # JSON schema for a list of training examples. Each example is an object
    # with a required "text" plus optional span, category and token annotations.
    "$schema": "http://json-schema.org/draft-06/schema",
    "title": "Training data for spaCy models",
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "text": {
                "title": "The text of the training example",
                "type": "string",
                "minLength": 1,
            },
            "ents": {
                "title": "Named entity spans in the text",
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "start": {
                            "title": "Start character offset of the span",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "end": {
                            "title": "End character offset of the span",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "label": {
                            "title": "Entity label",
                            "type": "string",
                            "minLength": 1,
                            "pattern": "^[A-Z0-9]*$",
                        },
                    },
                    "required": ["start", "end", "label"],
                },
            },
            "sents": {
                "title": "Sentence spans in the text",
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "start": {
                            "title": "Start character offset of the span",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "end": {
                            "title": "End character offset of the span",
                            "type": "integer",
                            "minimum": 0,
                        },
                    },
                    "required": ["start", "end"],
                },
            },
            "cats": {
                "title": "Text categories for the text classifier",
                "type": "object",
                # Keys of "patternProperties" are regular expressions, not
                # globs: a bare "*" is not a valid regex ("nothing to repeat"),
                # so ".*" is used to match every category name.
                "patternProperties": {
                    ".*": {
                        "title": "A text category",
                        "oneOf": [
                            {"type": "boolean"},
                            {"type": "number", "minimum": 0},
                        ],
                    }
                },
                "propertyNames": {"pattern": "^[A-Z0-9]*$", "minLength": 1},
            },
            "tokens": {
                "title": "The tokens in the text",
                "type": "array",
                "items": {
                    "type": "object",
                    "minProperties": 1,
                    "properties": {
                        "id": {
                            "title": "Token ID, usually token index",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "start": {
                            "title": "Start character offset of the token",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "end": {
                            "title": "End character offset of the token",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "pos": {
                            "title": "Coarse-grained part-of-speech tag",
                            "type": "string",
                            "minLength": 1,
                        },
                        "tag": {
                            "title": "Fine-grained part-of-speech tag",
                            "type": "string",
                            "minLength": 1,
                        },
                        "dep": {
                            "title": "Dependency label",
                            "type": "string",
                            "minLength": 1,
                        },
                        "head": {
                            "title": "Index of the token's head",
                            "type": "integer",
                            "minimum": 0,
                        },
                    },
                    "required": ["start", "end"],
                },
            },
            "_": {"title": "Custom user space", "type": "object"},
        },
        "required": ["text"],
    },
 }
 META_SCHEMA = {
    # JSON schema for a model's meta data object. Only "lang", "name" and
    # "version" are required; every other property is optional.
    "$schema": "http://json-schema.org/draft-06/schema",
    "type": "object",
    "properties": {
        "lang": {
            "title": "Two-letter language code, e.g. 'en'",
            "type": "string",
            "minLength": 2,
            "maxLength": 2,
            "pattern": "^[a-z]*$",
        },
        "name": {
            "title": "Model name",
            "type": "string",
            "minLength": 1,
            "pattern": "^[a-z_]*$",
        },
        "version": {
            "title": "Model version",
            "type": "string",
            "minLength": 1,
            "pattern": "^[0-9a-z.-]*$",
        },
        "spacy_version": {
            "title": "Compatible spaCy version identifier",
            "type": "string",
            "minLength": 1,
            # The hyphen must come last in the character class: written as
            # ".->" it forms a range from "." to ">" that excludes a literal
            # "-" and unintentionally admits "/", ":" and ";".
            "pattern": "^[0-9a-z.<>=-]*$",
        },
        "parent_package": {
            "title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
            "type": "string",
            "minLength": 1,
            "default": "spacy",
        },
        "pipeline": {
            "title": "Names of pipeline components",
            "type": "array",
            "items": {"type": "string", "minLength": 1},
        },
        "description": {"title": "Model description", "type": "string"},
        "license": {"title": "Model license", "type": "string"},
        "author": {"title": "Model author name", "type": "string"},
        "email": {"title": "Model author email", "type": "string", "format": "email"},
        "url": {"title": "Model author URL", "type": "string", "format": "uri"},
        "sources": {
            "title": "Training data sources",
            "type": "array",
            "items": {"type": "string"},
        },
        "vectors": {
            "title": "Included word vectors",
            "type": "object",
            "properties": {
                "keys": {
                    "title": "Number of unique keys",
                    "type": "integer",
                    "minimum": 0,
                },
                "vectors": {
                    "title": "Number of unique vectors",
                    "type": "integer",
                    "minimum": 0,
                },
                "width": {
                    "title": "Number of dimensions",
                    "type": "integer",
                    "minimum": 0,
                },
            },
        },
        "accuracy": {
            "title": "Accuracy numbers",
            "type": "object",
            # "patternProperties" keys are regular expressions; ".*" matches
            # any property name (a bare "*" is not a valid regex).
            "patternProperties": {".*": {"type": "number", "minimum": 0.0}},
        },
        "speed": {
            "title": "Speed evaluation numbers",
            "type": "object",
            "patternProperties": {
                ".*": {
                    # "anyOf" rather than "oneOf": an integer satisfies both
                    # the "number" and the "integer" branch, and "oneOf"
                    # rejects values that match more than one branch.
                    "anyOf": [
                        {"type": "number", "minimum": 0.0},
                        {"type": "integer", "minimum": 0},
                    ]
                }
            },
        },
    },
    "required": ["lang", "name", "version"],
 }
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@ -11,8 +11,6 @@ from wasabi import Printer, MESSAGES
 from ..gold import GoldCorpus, read_json_object
 from ..util import load_model, get_lang_class
 # from .schemas import get_schema, validate_json
 # Minimum number of expected occurrences of label in data to train new label
 NEW_LABEL_THRESHOLD = 50
@ -76,7 +74,6 @@ def debug_data(
    # Validate data format using the JSON schema
    # TODO: update once the new format is ready
    # schema = get_schema("training")
    train_data_errors = []  # TODO: validate_json(train_data, schema)
    dev_data_errors = []  # TODO: validate_json(dev_data, schema)
    if not train_data_errors:
--- a/spacy/cli/schemas/init.py
+++ b/spacy/cli/schemas/init.py
@ -1,51 +0,0 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from pathlib import Path
 from jsonschema import Draft4Validator
 import srsly
 from ...errors import Errors
 SCHEMAS = {}
 def get_schema(name):
    """Return the JSON schema stored under a given name, loading it on demand.

    The first request for a name reads "<name>.json" from the directory of
    this module, checks the loaded schema with Draft4Validator.check_schema
    and stores it in SCHEMAS. Later requests return the stored schema.
    Raises ValueError if no such .json file exists.

    EXAMPLE:
        >>> schema = get_schema('training')

    name (unicode): The name of the schema.
    RETURNS (dict): The JSON schema.
    """
    # Serve previously loaded schemas straight from the module-level cache
    if name in SCHEMAS:
        return SCHEMAS[name]
    json_file = Path(__file__).parent / "{}.json".format(name)
    if not json_file.exists():
        raise ValueError(Errors.E104.format(name=name))
    loaded = srsly.read_json(json_file)
    # TODO: replace with (stable) Draft6Validator, if available
    Draft4Validator(loaded).check_schema(loaded)
    SCHEMAS[name] = loaded
    return loaded
 def validate_json(data, schema):
    """Collect the errors found when checking data against a JSON schema
    (see https://json-schema.org).

    Errors are ordered by their path. Each message is the error's own
    message followed by a space and, if the error has a path, the path
    formatted as "[a -> b -> c]".

    data: JSON-serializable data to validate.
    schema (dict): The JSON schema.
    RETURNS (list): A list of error messages, if available.
    """

    def describe(error):
        # Errors without a path still get the separating space appended
        location = ""
        if error.path:
            location = "[{}]".format(" -> ".join([str(step) for step in error.path]))
        return error.message + " " + location

    found = list(Draft4Validator(schema).iter_errors(data))
    found.sort(key=lambda e: e.path)
    return [describe(error) for error in found]
--- a/spacy/cli/schemas/meta.json
+++ b/spacy/cli/schemas/meta.json
@ -1,128 +0,0 @@
 {
  "$schema": "http://json-schema.org/draft-06/schema",
  "type": "object",
  "properties": {
    "lang": {
      "title": "Two-letter language code, e.g. 'en'",
      "type": "string",
      "minLength": 2,
      "maxLength": 2,
      "pattern": "^[a-z]*$"
    },
    "name": {
      "title": "Model name",
      "type": "string",
      "minLength": 1,
      "pattern": "^[a-z_]*$"
    },
    "version": {
      "title": "Model version",
      "type": "string",
      "minLength": 1,
      "pattern": "^[0-9a-z.-]*$"
    },
    "spacy_version": {
      "title": "Compatible spaCy version identifier",
      "type": "string",
      "minLength": 1,
      "pattern": "^[0-9a-z.-><=]*$"
    },
    "parent_package": {
      "title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
      "type": "string",
      "minLength": 1,
      "default": "spacy"
    },
    "pipeline": {
      "title": "Names of pipeline components",
      "type": "array",
      "items": {
        "type": "string",
        "minLength": 1
      }
    },
    "description": {
      "title": "Model description",
      "type": "string"
    },
    "license": {
      "title": "Model license",
      "type": "string"
    },
    "author": {
      "title": "Model author name",
      "type": "string"
    },
    "email": {
      "title": "Model author email",
      "type": "string",
      "format": "email"
    },
    "url": {
      "title": "Model author URL",
      "type": "string",
      "format": "uri"
    },
    "sources": {
      "title": "Training data sources",
      "type": "array",
      "items": {
        "type": "string"
      }
    },
    "vectors": {
      "title": "Included word vectors",
      "type": "object",
      "properties": {
        "keys": {
          "title": "Number of unique keys",
          "type": "integer",
          "minimum": 0
        },
        "vectors": {
          "title": "Number of unique vectors",
          "type": "integer",
          "minimum": 0
        },
        "width": {
          "title": "Number of dimensions",
          "type": "integer",
          "minimum": 0
        }
      }
    },
    "accuracy": {
      "title": "Accuracy numbers",
      "type": "object",
      "patternProperties": {
        "*": {
          "type": "number",
          "minimum": 0.0
        }
      }
    },
    "speed": {
      "title": "Speed evaluation numbers",
      "type": "object",
      "patternProperties": {
        "*": {
          "oneOf": [
            {
              "type": "number",
              "minimum": 0.0
            },
            {
              "type": "integer",
              "minimum": 0
            }
          ]
        }
      }
    }
  },
  "required": [
    "lang",
    "name",
    "version"
  ]
 }
--- a/spacy/cli/schemas/training.json
+++ b/spacy/cli/schemas/training.json
@ -1,146 +0,0 @@
 {
  "$schema": "http://json-schema.org/draft-06/schema",
  "title": "Training data for spaCy models",
  "type": "array",
  "items": {
    "type": "object",
    "properties": {
      "text": {
        "title": "The text of the training example",
        "type": "string",
        "minLength": 1
      },
      "ents": {
        "title": "Named entity spans in the text",
        "type": "array",
        "items": {
          "type": "object",
          "properties": {
            "start": {
              "title": "Start character offset of the span",
              "type": "integer",
              "minimum": 0
            },
            "end": {
              "title": "End character offset of the span",
              "type": "integer",
              "minimum": 0
            },
            "label": {
              "title": "Entity label",
              "type": "string",
              "minLength": 1,
              "pattern": "^[A-Z0-9]*$"
            }
          },
          "required": [
            "start",
            "end",
            "label"
          ]
        }
      },
      "sents": {
        "title": "Sentence spans in the text",
        "type": "array",
        "items": {
          "type": "object",
          "properties": {
            "start": {
              "title": "Start character offset of the span",
              "type": "integer",
              "minimum": 0
            },
            "end": {
              "title": "End character offset of the span",
              "type": "integer",
              "minimum": 0
            }
          },
          "required": [
            "start",
            "end"
          ]
        }
      },
      "cats": {
        "title": "Text categories for the text classifier",
        "type": "object",
        "patternProperties": {
          "*": {
            "title": "A text category",
            "oneOf": [
              {
                "type": "boolean"
              },
              {
                "type": "number",
                "minimum": 0
              }
            ]
          }
        },
        "propertyNames": {
          "pattern": "^[A-Z0-9]*$",
          "minLength": 1
        }
      },
      "tokens": {
        "title": "The tokens in the text",
        "type": "array",
        "items": {
          "type": "object",
          "minProperties": 1,
          "properties": {
            "id": {
              "title": "Token ID, usually token index",
              "type": "integer",
              "minimum": 0
            },
            "start": {
              "title": "Start character offset of the token",
              "type": "integer",
              "minimum": 0
            },
            "end": {
              "title": "End character offset of the token",
              "type": "integer",
              "minimum": 0
            },
            "pos": {
              "title": "Coarse-grained part-of-speech tag",
              "type": "string",
              "minLength": 1
            },
            "tag": {
              "title": "Fine-grained part-of-speech tag",
              "type": "string",
              "minLength": 1
            },
            "dep": {
              "title": "Dependency label",
              "type": "string",
              "minLength": 1
            },
            "head": {
              "title": "Index of the token's head",
              "type": "integer",
              "minimum": 0
            }
          },
          "required": [
            "start",
            "end"
          ]
        }
      },
      "_": {
        "title": "Custom user space",
        "type": "object"
      }
    },
    "required": [
      "text"
    ]
  }
 }
--- a/spacy/tests/doc/test_to_json.py
+++ b/spacy/tests/doc/test_to_json.py
@ -2,7 +2,8 @@
 from __future__ import unicode_literals
 import pytest
-from spacy.cli.schemas import get_schema, validate_json
+from spacy.cli._schemas import TRAINING_SCHEMA
 from spacy.util import validate_json
 from spacy.tokens import Doc
 from ..util import get_doc
@ -61,5 +62,5 @@ def test_doc_to_json_underscore_error_serialize(doc):
 def test_doc_to_json_valid_training(doc):
    json_doc = doc.to_json()
-    errors = validate_json([json_doc], get_schema("training"))
+    errors = validate_json([json_doc], TRAINING_SCHEMA)
    assert not errors
--- a/spacy/tests/test_json_schemas.py
+++ b/spacy/tests/test_json_schemas.py
@ -1,20 +1,20 @@
 # coding: utf-8
 from __future__ import unicode_literals
-from spacy.cli.schemas import validate_json, get_schema
+from spacy.util import validate_json, validate_schema
 from spacy.cli._schemas import META_SCHEMA, TRAINING_SCHEMA
 import pytest
-@pytest.fixture(scope="session")
+def test_validate_schema():
-def training_schema():
+    validate_schema({"type": "object"})
-    return get_schema("training")
+    with pytest.raises(Exception):
        validate_schema({"type": lambda x: x})
-def test_json_schema_get():
+@pytest.mark.parametrize("schema", [TRAINING_SCHEMA, META_SCHEMA])
-    schema = get_schema("training")
+def test_schemas(schema):
-    assert schema
+    validate_schema(schema)
    with pytest.raises(ValueError):
        schema = get_schema("xxx")
@pytest.mark.parametrize(
@ -24,8 +24,8 @@ def test_json_schema_get():
        {"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "TEST"}]},
    ],
 )
-def test_json_schema_training_valid(data, training_schema):
+def test_json_schema_training_valid(data):
-    errors = validate_json([data], training_schema)
+    errors = validate_json([data], TRAINING_SCHEMA)
    assert not errors
@ -39,6 +39,6 @@ def test_json_schema_training_valid(data, training_schema):
        ({"text": "spaCy", "tokens": [{"pos": "PROPN"}]}, 2),
    ],
 )
-def test_json_schema_training_invalid(data, n_errors, training_schema):
+def test_json_schema_training_invalid(data, n_errors):
-    errors = validate_json([data], training_schema)
+    errors = validate_json([data], TRAINING_SCHEMA)
    assert len(errors) == n_errors
--- a/spacy/util.py
+++ b/spacy/util.py
@ -14,6 +14,8 @@ import functools
 import itertools
 import numpy.random
 import srsly
 from jsonschema import Draft4Validator
 try:
    import cupy.random
@ -626,6 +628,31 @@ def fix_random_seed(seed=0):
        cupy.random.seed(seed)
 def validate_schema(schema):
    """Check that a JSON schema is itself well-formed.

    Delegates to Draft4Validator.check_schema; nothing is returned.
    NOTE(review): an invalid schema is expected to make check_schema raise
    (the accompanying tests rely on an exception) -- confirm the exact
    exception type against the jsonschema documentation.

    schema (dict): The JSON schema to check.
    """
    # TODO: replace with (stable) Draft6Validator, if available
    Draft4Validator(schema).check_schema(schema)
 def validate_json(data, schema):
    """Check data against a JSON schema (see https://json-schema.org) and
    report every violation as a readable message.

    Violations are sorted by their path. A message consists of the error's
    message, a space and, when the error has a non-empty path, that path
    rendered as "[a -> b -> c]".

    data: JSON-serializable data to validate.
    schema (dict): The JSON schema.
    RETURNS (list): A list of error messages, if available.
    """
    # TODO: replace with (stable) Draft6Validator, if available
    checker = Draft4Validator(schema)
    ordered = sorted(checker.iter_errors(data), key=lambda e: e.path)
    messages = []
    for problem in ordered:
        # An empty path adds nothing after the separating space
        where = ""
        if problem.path:
            steps = [str(part) for part in problem.path]
            where = "[{}]".format(" -> ".join(steps))
        messages.append(problem.message + " " + where)
    return messages
 class SimpleFrozenDict(dict):
    """Simplified implementation of a frozen dict, mainly used as default
    function or method argument (for arguments that should default to empty