mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			221 lines
		
	
	
		
			7.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			221 lines
		
	
	
		
			7.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # coding: utf-8
 | |
| from __future__ import unicode_literals
 | |
| 
 | |
| 
 | |
| # NB: This schema describes the new format of the training data, see #2928
 | |
| TRAINING_SCHEMA = {
 | |
|     "$schema": "http://json-schema.org/draft-06/schema",
 | |
|     "title": "Training data for spaCy models",
 | |
|     "type": "array",
 | |
|     "items": {
 | |
|         "type": "object",
 | |
|         "properties": {
 | |
|             "text": {
 | |
|                 "title": "The text of the training example",
 | |
|                 "type": "string",
 | |
|                 "minLength": 1,
 | |
|             },
 | |
|             "ents": {
 | |
|                 "title": "Named entity spans in the text",
 | |
|                 "type": "array",
 | |
|                 "items": {
 | |
|                     "type": "object",
 | |
|                     "properties": {
 | |
|                         "start": {
 | |
|                             "title": "Start character offset of the span",
 | |
|                             "type": "integer",
 | |
|                             "minimum": 0,
 | |
|                         },
 | |
|                         "end": {
 | |
|                             "title": "End character offset of the span",
 | |
|                             "type": "integer",
 | |
|                             "minimum": 0,
 | |
|                         },
 | |
|                         "label": {
 | |
|                             "title": "Entity label",
 | |
|                             "type": "string",
 | |
|                             "minLength": 1,
 | |
|                             "pattern": "^[A-Z0-9]*$",
 | |
|                         },
 | |
|                     },
 | |
|                     "required": ["start", "end", "label"],
 | |
|                 },
 | |
|             },
 | |
|             "sents": {
 | |
|                 "title": "Sentence spans in the text",
 | |
|                 "type": "array",
 | |
|                 "items": {
 | |
|                     "type": "object",
 | |
|                     "properties": {
 | |
|                         "start": {
 | |
|                             "title": "Start character offset of the span",
 | |
|                             "type": "integer",
 | |
|                             "minimum": 0,
 | |
|                         },
 | |
|                         "end": {
 | |
|                             "title": "End character offset of the span",
 | |
|                             "type": "integer",
 | |
|                             "minimum": 0,
 | |
|                         },
 | |
|                     },
 | |
|                     "required": ["start", "end"],
 | |
|                 },
 | |
|             },
 | |
|             "cats": {
 | |
|                 "title": "Text categories for the text classifier",
 | |
|                 "type": "object",
 | |
|                 "patternProperties": {
 | |
|                     "*": {
 | |
|                         "title": "A text category",
 | |
|                         "oneOf": [
 | |
|                             {"type": "boolean"},
 | |
|                             {"type": "number", "minimum": 0},
 | |
|                         ],
 | |
|                     }
 | |
|                 },
 | |
|                 "propertyNames": {"pattern": "^[A-Z0-9]*$", "minLength": 1},
 | |
|             },
 | |
|             "tokens": {
 | |
|                 "title": "The tokens in the text",
 | |
|                 "type": "array",
 | |
|                 "items": {
 | |
|                     "type": "object",
 | |
|                     "minProperties": 1,
 | |
|                     "properties": {
 | |
|                         "id": {
 | |
|                             "title": "Token ID, usually token index",
 | |
|                             "type": "integer",
 | |
|                             "minimum": 0,
 | |
|                         },
 | |
|                         "start": {
 | |
|                             "title": "Start character offset of the token",
 | |
|                             "type": "integer",
 | |
|                             "minimum": 0,
 | |
|                         },
 | |
|                         "end": {
 | |
|                             "title": "End character offset of the token",
 | |
|                             "type": "integer",
 | |
|                             "minimum": 0,
 | |
|                         },
 | |
|                         "pos": {
 | |
|                             "title": "Coarse-grained part-of-speech tag",
 | |
|                             "type": "string",
 | |
|                             "minLength": 1,
 | |
|                         },
 | |
|                         "tag": {
 | |
|                             "title": "Fine-grained part-of-speech tag",
 | |
|                             "type": "string",
 | |
|                             "minLength": 1,
 | |
|                         },
 | |
|                         "dep": {
 | |
|                             "title": "Dependency label",
 | |
|                             "type": "string",
 | |
|                             "minLength": 1,
 | |
|                         },
 | |
|                         "head": {
 | |
|                             "title": "Index of the token's head",
 | |
|                             "type": "integer",
 | |
|                             "minimum": 0,
 | |
|                         },
 | |
|                     },
 | |
|                     "required": ["start", "end"],
 | |
|                 },
 | |
|             },
 | |
|             "_": {"title": "Custom user space", "type": "object"},
 | |
|         },
 | |
|         "required": ["text"],
 | |
|     },
 | |
| }
 | |
| 
 | |
| META_SCHEMA = {
 | |
|     "$schema": "http://json-schema.org/draft-06/schema",
 | |
|     "type": "object",
 | |
|     "properties": {
 | |
|         "lang": {
 | |
|             "title": "Two-letter language code, e.g. 'en'",
 | |
|             "type": "string",
 | |
|             "minLength": 2,
 | |
|             "maxLength": 2,
 | |
|             "pattern": "^[a-z]*$",
 | |
|         },
 | |
|         "name": {
 | |
|             "title": "Model name",
 | |
|             "type": "string",
 | |
|             "minLength": 1,
 | |
|             "pattern": "^[a-z_]*$",
 | |
|         },
 | |
|         "version": {
 | |
|             "title": "Model version",
 | |
|             "type": "string",
 | |
|             "minLength": 1,
 | |
|             "pattern": "^[0-9a-z.-]*$",
 | |
|         },
 | |
|         "spacy_version": {
 | |
|             "title": "Compatible spaCy version identifier",
 | |
|             "type": "string",
 | |
|             "minLength": 1,
 | |
|             "pattern": "^[0-9a-z.-><=]*$",
 | |
|         },
 | |
|         "parent_package": {
 | |
|             "title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
 | |
|             "type": "string",
 | |
|             "minLength": 1,
 | |
|             "default": "spacy",
 | |
|         },
 | |
|         "pipeline": {
 | |
|             "title": "Names of pipeline components",
 | |
|             "type": "array",
 | |
|             "items": {"type": "string", "minLength": 1},
 | |
|         },
 | |
|         "description": {"title": "Model description", "type": "string"},
 | |
|         "license": {"title": "Model license", "type": "string"},
 | |
|         "author": {"title": "Model author name", "type": "string"},
 | |
|         "email": {"title": "Model author email", "type": "string", "format": "email"},
 | |
|         "url": {"title": "Model author URL", "type": "string", "format": "uri"},
 | |
|         "sources": {
 | |
|             "title": "Training data sources",
 | |
|             "type": "array",
 | |
|             "items": {"type": "string"},
 | |
|         },
 | |
|         "vectors": {
 | |
|             "title": "Included word vectors",
 | |
|             "type": "object",
 | |
|             "properties": {
 | |
|                 "keys": {
 | |
|                     "title": "Number of unique keys",
 | |
|                     "type": "integer",
 | |
|                     "minimum": 0,
 | |
|                 },
 | |
|                 "vectors": {
 | |
|                     "title": "Number of unique vectors",
 | |
|                     "type": "integer",
 | |
|                     "minimum": 0,
 | |
|                 },
 | |
|                 "width": {
 | |
|                     "title": "Number of dimensions",
 | |
|                     "type": "integer",
 | |
|                     "minimum": 0,
 | |
|                 },
 | |
|             },
 | |
|         },
 | |
|         "accuracy": {
 | |
|             "title": "Accuracy numbers",
 | |
|             "type": "object",
 | |
|             "patternProperties": {"*": {"type": "number", "minimum": 0.0}},
 | |
|         },
 | |
|         "speed": {
 | |
|             "title": "Speed evaluation numbers",
 | |
|             "type": "object",
 | |
|             "patternProperties": {
 | |
|                 "*": {
 | |
|                     "oneOf": [
 | |
|                         {"type": "number", "minimum": 0.0},
 | |
|                         {"type": "integer", "minimum": 0},
 | |
|                     ]
 | |
|                 }
 | |
|             },
 | |
|         },
 | |
|     },
 | |
|     "required": ["lang", "name", "version"],
 | |
| }
 |