mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			221 lines
		
	
	
		
			7.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			221 lines
		
	
	
		
			7.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# coding: utf-8
 | 
						|
from __future__ import unicode_literals
 | 
						|
 | 
						|
 | 
						|
# NB: This schema describes the new format of the training data, see #2928
 | 
						|
# JSON Schema (draft-06) for training data: an array of example objects.
# Each example must carry "text"; every annotation layer below is optional.
TRAINING_SCHEMA = {
    "$schema": "http://json-schema.org/draft-06/schema",
    "title": "Training data for spaCy models",
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "text": {
                "title": "The text of the training example",
                "type": "string",
                "minLength": 1,
            },
            # Entity spans: character offsets plus a non-empty label.
            "ents": {
                "title": "Named entity spans in the text",
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "start": {
                            "title": "Start character offset of the span",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "end": {
                            "title": "End character offset of the span",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "label": {
                            "title": "Entity label",
                            "type": "string",
                            "minLength": 1,
                            "pattern": "^[A-Z0-9]*$",
                        },
                    },
                    "required": ["start", "end", "label"],
                },
            },
            # Sentence spans: character offsets only.
            "sents": {
                "title": "Sentence spans in the text",
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "start": {
                            "title": "Start character offset of the span",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "end": {
                            "title": "End character offset of the span",
                            "type": "integer",
                            "minimum": 0,
                        },
                    },
                    "required": ["start", "end"],
                },
            },
            # Mapping of category name -> boolean or non-negative number.
            "cats": {
                "title": "Text categories for the text classifier",
                "type": "object",
                "patternProperties": {
                    # Keys of "patternProperties" are regular expressions.
                    # ".*" applies the rule to every category; a bare "*" is
                    # not a valid regex and makes regex-based validators fail.
                    ".*": {
                        "title": "A text category",
                        "oneOf": [
                            {"type": "boolean"},
                            {"type": "number", "minimum": 0},
                        ],
                    }
                },
                # Category names themselves are restricted separately.
                "propertyNames": {"pattern": "^[A-Z0-9]*$", "minLength": 1},
            },
            # Per-token annotations; "start" and "end" are mandatory, the
            # linguistic attributes are optional.
            "tokens": {
                "title": "The tokens in the text",
                "type": "array",
                "items": {
                    "type": "object",
                    "minProperties": 1,
                    "properties": {
                        "id": {
                            "title": "Token ID, usually token index",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "start": {
                            "title": "Start character offset of the token",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "end": {
                            "title": "End character offset of the token",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "pos": {
                            "title": "Coarse-grained part-of-speech tag",
                            "type": "string",
                            "minLength": 1,
                        },
                        "tag": {
                            "title": "Fine-grained part-of-speech tag",
                            "type": "string",
                            "minLength": 1,
                        },
                        "dep": {
                            "title": "Dependency label",
                            "type": "string",
                            "minLength": 1,
                        },
                        "head": {
                            "title": "Index of the token's head",
                            "type": "integer",
                            "minimum": 0,
                        },
                    },
                    "required": ["start", "end"],
                },
            },
            # Free-form object for user data; contents are not validated.
            "_": {"title": "Custom user space", "type": "object"},
        },
        "required": ["text"],
    },
}
 | 
						|
 | 
						|
# JSON Schema (draft-06) for a model's meta object. Only "lang", "name" and
# "version" are required; all other properties are optional.
META_SCHEMA = {
    "$schema": "http://json-schema.org/draft-06/schema",
    "type": "object",
    "properties": {
        "lang": {
            "title": "Two-letter language code, e.g. 'en'",
            "type": "string",
            "minLength": 2,
            "maxLength": 2,
            "pattern": "^[a-z]*$",
        },
        "name": {
            "title": "Model name",
            "type": "string",
            "minLength": 1,
            "pattern": "^[a-z_]*$",
        },
        "version": {
            "title": "Model version",
            "type": "string",
            "minLength": 1,
            "pattern": "^[0-9a-z.-]*$",
        },
        "spacy_version": {
            "title": "Compatible spaCy version identifier",
            "type": "string",
            "minLength": 1,
            # The hyphen must be last in the character class. Written as
            # ".->" it forms a range from "." to ">" that rejects a literal
            # "-" and accepts "/", ":" and ";".
            "pattern": "^[0-9a-z.<>=-]*$",
        },
        "parent_package": {
            "title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
            "type": "string",
            "minLength": 1,
            "default": "spacy",
        },
        "pipeline": {
            "title": "Names of pipeline components",
            "type": "array",
            "items": {"type": "string", "minLength": 1},
        },
        "description": {"title": "Model description", "type": "string"},
        "license": {"title": "Model license", "type": "string"},
        "author": {"title": "Model author name", "type": "string"},
        "email": {"title": "Model author email", "type": "string", "format": "email"},
        "url": {"title": "Model author URL", "type": "string", "format": "uri"},
        "sources": {
            "title": "Training data sources",
            "type": "array",
            "items": {"type": "string"},
        },
        "vectors": {
            "title": "Included word vectors",
            "type": "object",
            "properties": {
                "keys": {
                    "title": "Number of unique keys",
                    "type": "integer",
                    "minimum": 0,
                },
                "vectors": {
                    "title": "Number of unique vectors",
                    "type": "integer",
                    "minimum": 0,
                },
                "width": {
                    "title": "Number of dimensions",
                    "type": "integer",
                    "minimum": 0,
                },
            },
        },
        "accuracy": {
            "title": "Accuracy numbers",
            "type": "object",
            # Keys of "patternProperties" are regular expressions; ".*"
            # matches every property name (a bare "*" is not a valid regex).
            "patternProperties": {".*": {"type": "number", "minimum": 0.0}},
        },
        "speed": {
            "title": "Speed evaluation numbers",
            "type": "object",
            "patternProperties": {
                # "number" already covers integers. A oneOf over "number" and
                # "integer" rejects every integer, because it matches both
                # alternatives and oneOf requires exactly one match.
                ".*": {"type": "number", "minimum": 0.0}
            },
        },
    },
    "required": ["lang", "name", "version"],
}
 |