Store JSON schemas in Python and tidy up (#3235)

2026-03-07 13:21:46 +03:00 · 2019-02-07 09:44:31 +01:00 · 2019-02-07 09:44:31 +01:00 · 338d659bd0
commit 338d659bd0
parent 1ea4df459d
9 changed files with 263 additions and 344 deletions
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,5 +1,4 @@
 recursive-include include *.h
-recursive-include spacy/cli/schemas *.json
 include LICENSE
 include README.md
 include bin/spacy
--- a/spacy/cli/_schemas.py
+++ b/spacy/cli/_schemas.py
@@ -0,0 +1,220 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+
+# NB: This schema describes the new format of the training data, see #2928
+TRAINING_SCHEMA = {
+    "$schema": "http://json-schema.org/draft-06/schema",
+    "title": "Training data for spaCy models",
+    "type": "array",
+    "items": {
+        "type": "object",
+        "properties": {
+            "text": {
+                "title": "The text of the training example",
+                "type": "string",
+                "minLength": 1,
+            },
+            "ents": {
+                "title": "Named entity spans in the text",
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "start": {
+                            "title": "Start character offset of the span",
+                            "type": "integer",
+                            "minimum": 0,
+                        },
+                        "end": {
+                            "title": "End character offset of the span",
+                            "type": "integer",
+                            "minimum": 0,
+                        },
+                        "label": {
+                            "title": "Entity label",
+                            "type": "string",
+                            "minLength": 1,
+                            "pattern": "^[A-Z0-9]*$",
+                        },
+                    },
+                    "required": ["start", "end", "label"],
+                },
+            },
+            "sents": {
+                "title": "Sentence spans in the text",
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "start": {
+                            "title": "Start character offset of the span",
+                            "type": "integer",
+                            "minimum": 0,
+                        },
+                        "end": {
+                            "title": "End character offset of the span",
+                            "type": "integer",
+                            "minimum": 0,
+                        },
+                    },
+                    "required": ["start", "end"],
+                },
+            },
+            "cats": {
+                "title": "Text categories for the text classifier",
+                "type": "object",
+                "patternProperties": {
+                    "*": {
+                        "title": "A text category",
+                        "oneOf": [
+                            {"type": "boolean"},
+                            {"type": "number", "minimum": 0},
+                        ],
+                    }
+                },
+                "propertyNames": {"pattern": "^[A-Z0-9]*$", "minLength": 1},
+            },
+            "tokens": {
+                "title": "The tokens in the text",
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "minProperties": 1,
+                    "properties": {
+                        "id": {
+                            "title": "Token ID, usually token index",
+                            "type": "integer",
+                            "minimum": 0,
+                        },
+                        "start": {
+                            "title": "Start character offset of the token",
+                            "type": "integer",
+                            "minimum": 0,
+                        },
+                        "end": {
+                            "title": "End character offset of the token",
+                            "type": "integer",
+                            "minimum": 0,
+                        },
+                        "pos": {
+                            "title": "Coarse-grained part-of-speech tag",
+                            "type": "string",
+                            "minLength": 1,
+                        },
+                        "tag": {
+                            "title": "Fine-grained part-of-speech tag",
+                            "type": "string",
+                            "minLength": 1,
+                        },
+                        "dep": {
+                            "title": "Dependency label",
+                            "type": "string",
+                            "minLength": 1,
+                        },
+                        "head": {
+                            "title": "Index of the token's head",
+                            "type": "integer",
+                            "minimum": 0,
+                        },
+                    },
+                    "required": ["start", "end"],
+                },
+            },
+            "_": {"title": "Custom user space", "type": "object"},
+        },
+        "required": ["text"],
+    },
+}
+
+META_SCHEMA = {
+    "$schema": "http://json-schema.org/draft-06/schema",
+    "type": "object",
+    "properties": {
+        "lang": {
+            "title": "Two-letter language code, e.g. 'en'",
+            "type": "string",
+            "minLength": 2,
+            "maxLength": 2,
+            "pattern": "^[a-z]*$",
+        },
+        "name": {
+            "title": "Model name",
+            "type": "string",
+            "minLength": 1,
+            "pattern": "^[a-z_]*$",
+        },
+        "version": {
+            "title": "Model version",
+            "type": "string",
+            "minLength": 1,
+            "pattern": "^[0-9a-z.-]*$",
+        },
+        "spacy_version": {
+            "title": "Compatible spaCy version identifier",
+            "type": "string",
+            "minLength": 1,
+            "pattern": "^[0-9a-z.-><=]*$",
+        },
+        "parent_package": {
+            "title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
+            "type": "string",
+            "minLength": 1,
+            "default": "spacy",
+        },
+        "pipeline": {
+            "title": "Names of pipeline components",
+            "type": "array",
+            "items": {"type": "string", "minLength": 1},
+        },
+        "description": {"title": "Model description", "type": "string"},
+        "license": {"title": "Model license", "type": "string"},
+        "author": {"title": "Model author name", "type": "string"},
+        "email": {"title": "Model author email", "type": "string", "format": "email"},
+        "url": {"title": "Model author URL", "type": "string", "format": "uri"},
+        "sources": {
+            "title": "Training data sources",
+            "type": "array",
+            "items": {"type": "string"},
+        },
+        "vectors": {
+            "title": "Included word vectors",
+            "type": "object",
+            "properties": {
+                "keys": {
+                    "title": "Number of unique keys",
+                    "type": "integer",
+                    "minimum": 0,
+                },
+                "vectors": {
+                    "title": "Number of unique vectors",
+                    "type": "integer",
+                    "minimum": 0,
+                },
+                "width": {
+                    "title": "Number of dimensions",
+                    "type": "integer",
+                    "minimum": 0,
+                },
+            },
+        },
+        "accuracy": {
+            "title": "Accuracy numbers",
+            "type": "object",
+            "patternProperties": {"*": {"type": "number", "minimum": 0.0}},
+        },
+        "speed": {
+            "title": "Speed evaluation numbers",
+            "type": "object",
+            "patternProperties": {
+                "*": {
+                    "oneOf": [
+                        {"type": "number", "minimum": 0.0},
+                        {"type": "integer", "minimum": 0},
+                    ]
+                }
+            },
+        },
+    },
+    "required": ["lang", "name", "version"],
+}
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -11,8 +11,6 @@ from wasabi import Printer, MESSAGES
 from ..gold import GoldCorpus, read_json_object
 from ..util import load_model, get_lang_class

-# from .schemas import get_schema, validate_json
-

 # Minimum number of expected occurences of label in data to train new label
 NEW_LABEL_THRESHOLD = 50
@@ -76,7 +74,6 @@ def debug_data(

    # Validate data format using the JSON schema
    # TODO: update once the new format is ready
-    # schema = get_schema("training")
    train_data_errors = []  # TODO: validate_json(train_data, schema)
    dev_data_errors = []  # TODO: validate_json(dev_data, schema)
    if not train_data_errors:
--- a/spacy/cli/schemas/__init__.py
+++ b/spacy/cli/schemas/__init__.py
@@ -1,51 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from pathlib import Path
-from jsonschema import Draft4Validator
-import srsly
-
-from ...errors import Errors
-
-
-SCHEMAS = {}
-
-
-def get_schema(name):
-    """Get the JSON schema for a given name. Looks for a .json file in
-    spacy.cli.schemas, validates the schema and raises ValueError if not found.
-
-    EXAMPLE:
-        >>> schema = get_schema('training')
-
-    name (unicode): The name of the schema.
-    RETURNS (dict): The JSON schema.
-    """
-    if name not in SCHEMAS:
-        schema_path = Path(__file__).parent / "{}.json".format(name)
-        if not schema_path.exists():
-            raise ValueError(Errors.E104.format(name=name))
-        schema = srsly.read_json(schema_path)
-        # TODO: replace with (stable) Draft6Validator, if available
-        validator = Draft4Validator(schema)
-        validator.check_schema(schema)
-        SCHEMAS[name] = schema
-    return SCHEMAS[name]
-
-
-def validate_json(data, schema):
-    """Validate data against a given JSON schema (see https://json-schema.org).
-
-    data: JSON-serializable data to validate.
-    schema (dict): The JSON schema.
-    RETURNS (list): A list of error messages, if available.
-    """
-    validator = Draft4Validator(schema)
-    errors = []
-    for err in sorted(validator.iter_errors(data), key=lambda e: e.path):
-        if err.path:
-            err_path = "[{}]".format(" -> ".join([str(p) for p in err.path]))
-        else:
-            err_path = ""
-        errors.append(err.message + " " + err_path)
-    return errors
--- a/spacy/cli/schemas/meta.json
+++ b/spacy/cli/schemas/meta.json
@@ -1,128 +0,0 @@
-{
-  "$schema": "http://json-schema.org/draft-06/schema",
-  "type": "object",
-  "properties": {
-    "lang": {
-      "title": "Two-letter language code, e.g. 'en'",
-      "type": "string",
-      "minLength": 2,
-      "maxLength": 2,
-      "pattern": "^[a-z]*$"
-    },
-    "name": {
-      "title": "Model name",
-      "type": "string",
-      "minLength": 1,
-      "pattern": "^[a-z_]*$"
-    },
-    "version": {
-      "title": "Model version",
-      "type": "string",
-      "minLength": 1,
-      "pattern": "^[0-9a-z.-]*$"
-    },
-    "spacy_version": {
-      "title": "Compatible spaCy version identifier",
-      "type": "string",
-      "minLength": 1,
-      "pattern": "^[0-9a-z.-><=]*$"
-    },
-    "parent_package": {
-      "title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
-      "type": "string",
-      "minLength": 1,
-      "default": "spacy"
-    },
-    "pipeline": {
-      "title": "Names of pipeline components",
-      "type": "array",
-      "items": {
-        "type": "string",
-        "minLength": 1
-      }
-    },
-    "description": {
-      "title": "Model description",
-      "type": "string"
-    },
-    "license": {
-      "title": "Model license",
-      "type": "string"
-    },
-    "author": {
-      "title": "Model author name",
-      "type": "string"
-    },
-    "email": {
-      "title": "Model author email",
-      "type": "string",
-      "format": "email"
-    },
-    "url": {
-      "title": "Model author URL",
-      "type": "string",
-      "format": "uri"
-    },
-    "sources": {
-      "title": "Training data sources",
-      "type": "array",
-      "items": {
-        "type": "string"
-      }
-    },
-    "vectors": {
-      "title": "Included word vectors",
-      "type": "object",
-      "properties": {
-        "keys": {
-          "title": "Number of unique keys",
-          "type": "integer",
-          "minimum": 0
-        },
-        "vectors": {
-          "title": "Number of unique vectors",
-          "type": "integer",
-          "minimum": 0
-        },
-        "width": {
-          "title": "Number of dimensions",
-          "type": "integer",
-          "minimum": 0
-        }
-      }
-    },
-    "accuracy": {
-      "title": "Accuracy numbers",
-      "type": "object",
-      "patternProperties": {
-        "*": {
-          "type": "number",
-          "minimum": 0.0
-        }
-      }
-    },
-    "speed": {
-      "title": "Speed evaluation numbers",
-      "type": "object",
-      "patternProperties": {
-        "*": {
-          "oneOf": [
-            {
-              "type": "number",
-              "minimum": 0.0
-            },
-            {
-              "type": "integer",
-              "minimum": 0
-            }
-          ]
-        }
-      }
-    }
-  },
-  "required": [
-    "lang",
-    "name",
-    "version"
-  ]
-}
--- a/spacy/cli/schemas/training.json
+++ b/spacy/cli/schemas/training.json
@@ -1,146 +0,0 @@
-{
-  "$schema": "http://json-schema.org/draft-06/schema",
-  "title": "Training data for spaCy models",
-  "type": "array",
-  "items": {
-    "type": "object",
-    "properties": {
-      "text": {
-        "title": "The text of the training example",
-        "type": "string",
-        "minLength": 1
-      },
-      "ents": {
-        "title": "Named entity spans in the text",
-        "type": "array",
-        "items": {
-          "type": "object",
-          "properties": {
-            "start": {
-              "title": "Start character offset of the span",
-              "type": "integer",
-              "minimum": 0
-            },
-            "end": {
-              "title": "End character offset of the span",
-              "type": "integer",
-              "minimum": 0
-            },
-            "label": {
-              "title": "Entity label",
-              "type": "string",
-              "minLength": 1,
-              "pattern": "^[A-Z0-9]*$"
-            }
-          },
-          "required": [
-            "start",
-            "end",
-            "label"
-          ]
-        }
-      },
-      "sents": {
-        "title": "Sentence spans in the text",
-        "type": "array",
-        "items": {
-          "type": "object",
-          "properties": {
-            "start": {
-              "title": "Start character offset of the span",
-              "type": "integer",
-              "minimum": 0
-            },
-            "end": {
-              "title": "End character offset of the span",
-              "type": "integer",
-              "minimum": 0
-            }
-          },
-          "required": [
-            "start",
-            "end"
-          ]
-        }
-      },
-      "cats": {
-        "title": "Text categories for the text classifier",
-        "type": "object",
-        "patternProperties": {
-          "*": {
-            "title": "A text category",
-            "oneOf": [
-              {
-                "type": "boolean"
-              },
-              {
-                "type": "number",
-                "minimum": 0
-              }
-            ]
-          }
-        },
-        "propertyNames": {
-          "pattern": "^[A-Z0-9]*$",
-          "minLength": 1
-        }
-      },
-      "tokens": {
-        "title": "The tokens in the text",
-        "type": "array",
-        "items": {
-          "type": "object",
-          "minProperties": 1,
-          "properties": {
-            "id": {
-              "title": "Token ID, usually token index",
-              "type": "integer",
-              "minimum": 0
-            },
-            "start": {
-              "title": "Start character offset of the token",
-              "type": "integer",
-              "minimum": 0
-            },
-            "end": {
-              "title": "End character offset of the token",
-              "type": "integer",
-              "minimum": 0
-            },
-            "pos": {
-              "title": "Coarse-grained part-of-speech tag",
-              "type": "string",
-              "minLength": 1
-            },
-            "tag": {
-              "title": "Fine-grained part-of-speech tag",
-              "type": "string",
-              "minLength": 1
-            },
-            "dep": {
-              "title": "Dependency label",
-              "type": "string",
-              "minLength": 1
-            },
-            "head": {
-              "title": "Index of the token's head",
-              "type": "integer",
-              "minimum": 0
-            }
-          },
-          "required": [
-            "start",
-            "end"
-          ]
-        }
-      },
-      "_": {
-        "title": "Custom user space",
-        "type": "object"
-      }
-    },
-    "required": [
-      "text"
-    ]
-  }
-}
--- a/spacy/tests/doc/test_to_json.py
+++ b/spacy/tests/doc/test_to_json.py
@@ -2,7 +2,8 @@
 from __future__ import unicode_literals

 import pytest
-from spacy.cli.schemas import get_schema, validate_json
+from spacy.cli._schemas import TRAINING_SCHEMA
+from spacy.util import validate_json
 from spacy.tokens import Doc
 from ..util import get_doc

@@ -61,5 +62,5 @@ def test_doc_to_json_underscore_error_serialize(doc):

 def test_doc_to_json_valid_training(doc):
    json_doc = doc.to_json()
-    errors = validate_json([json_doc], get_schema("training"))
+    errors = validate_json([json_doc], TRAINING_SCHEMA)
    assert not errors
--- a/spacy/tests/test_json_schemas.py
+++ b/spacy/tests/test_json_schemas.py
@@ -1,20 +1,20 @@
 # coding: utf-8
 from __future__ import unicode_literals

-from spacy.cli.schemas import validate_json, get_schema
+from spacy.util import validate_json, validate_schema
+from spacy.cli._schemas import META_SCHEMA, TRAINING_SCHEMA
 import pytest


-@pytest.fixture(scope="session")
-def training_schema():
-    return get_schema("training")
+def test_validate_schema():
+    validate_schema({"type": "object"})
+    with pytest.raises(Exception):
+        validate_schema({"type": lambda x: x})


-def test_json_schema_get():
-    schema = get_schema("training")
-    assert schema
-    with pytest.raises(ValueError):
-        schema = get_schema("xxx")
+@pytest.mark.parametrize("schema", [TRAINING_SCHEMA, META_SCHEMA])
+def test_schemas(schema):
+    validate_schema(schema)


 @pytest.mark.parametrize(
@@ -24,8 +24,8 @@ def test_json_schema_get():
        {"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "TEST"}]},
    ],
 )
-def test_json_schema_training_valid(data, training_schema):
-    errors = validate_json([data], training_schema)
+def test_json_schema_training_valid(data):
+    errors = validate_json([data], TRAINING_SCHEMA)
    assert not errors


@@ -39,6 +39,6 @@ def test_json_schema_training_valid(data, training_schema):
        ({"text": "spaCy", "tokens": [{"pos": "PROPN"}]}, 2),
    ],
 )
-def test_json_schema_training_invalid(data, n_errors, training_schema):
-    errors = validate_json([data], training_schema)
+def test_json_schema_training_invalid(data, n_errors):
+    errors = validate_json([data], TRAINING_SCHEMA)
    assert len(errors) == n_errors
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -14,6 +14,8 @@ import functools
 import itertools
 import numpy.random
 import srsly
+from jsonschema import Draft4Validator
+

 try:
    import cupy.random
@@ -626,6 +628,31 @@ def fix_random_seed(seed=0):
        cupy.random.seed(seed)


+def validate_schema(schema):
+    # TODO: replace with (stable) Draft6Validator, if available
+    validator = Draft4Validator(schema)
+    validator.check_schema(schema)
+
+
+def validate_json(data, schema):
+    """Validate data against a given JSON schema (see https://json-schema.org).
+
+    data: JSON-serializable data to validate.
+    schema (dict): The JSON schema.
+    RETURNS (list): A list of error messages, if available.
+    """
+    # TODO: replace with (stable) Draft6Validator, if available
+    validator = Draft4Validator(schema)
+    errors = []
+    for err in sorted(validator.iter_errors(data), key=lambda e: e.path):
+        if err.path:
+            err_path = "[{}]".format(" -> ".join([str(p) for p in err.path]))
+        else:
+            err_path = ""
+        errors.append(err.message + " " + err_path)
+    return errors
+
+
 class SimpleFrozenDict(dict):
    """Simplified implementation of a frozen dict, mainly used as default
    function or method argument (for arguments that should default to empty