mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 05:31:15 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			212 lines
		
	
	
		
			8.3 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			212 lines
		
	
	
		
			8.3 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| include ../../_includes/_mixins
 | |
| 
 | |
| +h(2, "101") Serialization 101
 | |
| 
 | |
| include _spacy-101/_serialization
 | |
| 
 | |
| +infobox("Important note")
 | |
|     |  In spaCy v2.0, the API for saving and loading has changed to only use the
 | |
|     |  four methods listed above consistently across objects and classes. For an
 | |
|     |  overview of the changes, see #[+a("/docs/usage/v2#incompat") this table]
 | |
|     |  and the notes on #[+a("/docs/usage/v2#migrating-saving-loading") migrating].
 | |
| 
 | |
| +h(3, "example-doc") Example: Saving and loading a document
 | |
| 
 | |
| p
 | |
|     |  For simplicity, let's assume you've
 | |
|     |  #[+a("/docs/usage/entity-recognition#setting") added custom entities] to
 | |
|     |  a #[code Doc], either manually, or by using a
 | |
|     |  #[+a("/docs/usage/rule-based-matching#on_match") match pattern]. You can
 | |
|     |  save it locally by calling #[+api("doc#to_disk") #[code Doc.to_disk()]],
 | |
|     |  and load it again via #[+api("doc#from_disk") #[code Doc.from_disk()]].
 | |
|     |  This will overwrite the existing object and return it.
 | |
| 
 | |
| +code.
 | |
|     import spacy
 | |
|     from spacy.tokens import Span
 | |
| 
 | |
|     text = u'Netflix is hiring a new VP of global policy'
 | |
| 
 | |
|     nlp = spacy.load('en')
 | |
|     doc = nlp(text)
 | |
|     assert len(doc.ents) == 0 # Doc has no entities
 | |
|     doc.ents += (Span(doc, 0, 1, label=doc.vocab.strings[u'ORG']),) # add entity
 | |
|     doc.to_disk('/path/to/doc') # save Doc to disk
 | |
| 
 | |
|     new_doc = nlp(text)
 | |
|     assert len(new_doc.ents) == 0 # new Doc has no entities
 | |
|     new_doc = new_doc.from_disk('/path/to/doc') # load from disk and overwrite
 | |
|     assert len(new_doc.ents) == 1 # entity is now recognised!
 | |
|     assert [(ent.text, ent.label_) for ent in new_doc.ents] == [(u'Netflix', u'ORG')]
 | |
| 
 | |
| +h(2, "models") Saving models
 | |
| 
 | |
| p
 | |
|     |  After training your model, you'll usually want to save its state, and load
 | |
|     |  it back later. You can do this with the
 | |
|     |  #[+api("language#to_disk") #[code Language.to_disk()]]
 | |
|     |  method:
 | |
| 
 | |
| +code.
 | |
|     nlp.to_disk('/home/me/data/en_example_model')
 | |
| 
 | |
| p
 | |
|     |  The directory will be created if it doesn't exist, and the whole pipeline
 | |
|     |  will be written out. To make the model more convenient to deploy, we
 | |
|     |  recommend wrapping it as a Python package.
 | |
| 
 | |
| +h(3, "models-generating") Generating a model package
 | |
| 
 | |
| +infobox("Important note")
 | |
|     |  The model packages are #[strong not suitable] for the public
 | |
|     |  #[+a("https://pypi.python.org") pypi.python.org] directory, which is not
 | |
|     |  designed for binary data and files over 50 MB. However, if your company
 | |
|     |  is running an #[strong internal installation] of PyPI, publishing your
 | |
|     |  models on there can be a convenient way to share them with your team.
 | |
| 
 | |
| p
 | |
|     |  spaCy comes with a handy CLI command that will create all required files,
 | |
|     |  and walk you through generating the meta data. You can also create the
 | |
|     |  meta.json manually and place it in the model data directory, or supply a
 | |
|     |  path to it using the #[code --meta] flag. For more info on this, see
 | |
|     |  the #[+api("cli#package") #[code package]] docs.
 | |
| 
 | |
| +aside-code("meta.json", "json").
 | |
|     {
 | |
|         "name": "example_model",
 | |
|         "lang": "en",
 | |
|         "version": "1.0.0",
 | |
|         "spacy_version": ">=2.0.0,<3.0.0",
 | |
|         "description": "Example model for spaCy",
 | |
|         "author": "You",
 | |
|         "email": "you@example.com",
 | |
|         "license": "CC BY-SA 3.0",
 | |
|         "pipeline": ["token_vectors", "tagger"]
 | |
|     }
 | |
| 
 | |
| +code(false, "bash").
 | |
|     python -m spacy package /home/me/data/en_example_model /home/me/my_models
 | |
| 
 | |
| p This command will create a model package directory that should look like this:
 | |
| 
 | |
| +code("Directory structure", "yaml").
 | |
|     └── /
 | |
|         ├── MANIFEST.in                   # to include meta.json
 | |
|         ├── meta.json                     # model meta data
 | |
|         ├── setup.py                      # setup file for pip installation
 | |
|         └── en_example_model              # model directory
 | |
|             ├── __init__.py               # init for pip installation
 | |
|             └── en_example_model-1.0.0    # model data
 | |
| 
 | |
| p
 | |
|     |  You can also find templates for all files in our
 | |
|     |  #[+src(gh("spacy-dev-resources", "templates/model")) spaCy dev resources].
 | |
|     |  If you're creating the package manually, keep in mind that the directories
 | |
|     |  need to be named according to the naming conventions of
 | |
|     |  #[code lang_name] and #[code lang_name-version].
 | |
| 
 | |
| +h(3, "models-custom") Customising the model setup
 | |
| 
 | |
| p
 | |
|     |  The meta.json includes the model details, like name, requirements and
 | |
|     |  license, and lets you customise how the model should be initialised and
 | |
|     |  loaded. You can define the language data to be loaded and the
 | |
|     |  #[+a("/docs/usage/language-processing-pipeline") processing pipeline] to
 | |
|     |  execute.
 | |
| 
 | |
| +table(["Setting", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code lang]
 | |
|         +cell unicode
 | |
|         +cell ID of the language class to initialise.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code pipeline]
 | |
|         +cell list
 | |
|         +cell
 | |
|             |  A list of strings mapping to the IDs of pipeline factories to
 | |
|             |  apply in that order. If not set, spaCy's
 | |
|             |  #[+a("/docs/usage/language-processing-pipeline") default pipeline]
 | |
|             |  will be used.
 | |
| 
 | |
| p
 | |
|     |  The #[code load()] method that comes with our model package
 | |
|     |  templates will take care of putting all this together and returning a
 | |
|     |  #[code Language] object with the loaded pipeline and data. If your model
 | |
|     |  requires custom pipeline components, you should
 | |
|     |  #[strong ship them with your model] and register their
 | |
|     |  #[+a("/docs/usage/language-processing-pipeline#creating-factory") factories]
 | |
|     |  via  #[+api("spacy#set_factory") #[code set_factory()]].
 | |
| 
 | |
| +aside-code("Factory example").
 | |
|     def my_factory(vocab):
 | |
|         # load some state
 | |
|         def my_component(doc):
 | |
|             # process the doc
 | |
|             return doc
 | |
|         return my_component
 | |
| 
 | |
| +code.
 | |
|     spacy.set_factory('custom_component', custom_component_factory)
 | |
| 
 | |
| +infobox("Custom models with pipeline components")
 | |
|     |  For more details and an example of how to package a sentiment model
 | |
|     |  with a custom pipeline component, see the usage workflow on
 | |
|     |  #[+a("/docs/usage/language-processing-pipeline#example2") language processing pipelines].
 | |
| 
 | |
| +h(3, "models-building") Building the model package
 | |
| 
 | |
| p
 | |
|     |  To build the package, run the following command from within the
 | |
|     |  directory. For more information on building Python packages, see the
 | |
|     |  docs on Python's
 | |
|     |  #[+a("https://setuptools.readthedocs.io/en/latest/") Setuptools].
 | |
| 
 | |
| +code(false, "bash").
 | |
|     python setup.py sdist
 | |
| 
 | |
| p
 | |
|     |  This will create a #[code .tar.gz] archive in a directory #[code /dist].
 | |
|     |  The model can be installed by pointing pip to the path of the archive:
 | |
| 
 | |
| +code(false, "bash").
 | |
|     pip install /path/to/en_example_model-1.0.0.tar.gz
 | |
| 
 | |
| p
 | |
|     |  You can then load the model via its name, #[code en_example_model], or
 | |
|     |  import it directly as a module and then call its #[code load()] method.
 | |
| 
 | |
| +h(2, "loading") Loading a custom model package
 | |
| 
 | |
| p
 | |
|     |  To load a model from a data directory, you can use
 | |
|     |  #[+api("spacy#load") #[code spacy.load()]] with the local path. This will
 | |
|     |  look for a meta.json in the directory and use the #[code lang] and
 | |
|     |  #[code pipeline] settings to initialise a #[code Language] class with a
 | |
|     |  processing pipeline and load in the model data.
 | |
| 
 | |
| +code.
 | |
|     nlp = spacy.load('/path/to/model')
 | |
| 
 | |
| p
 | |
|     |  If you want to #[strong load only the binary data], you'll have to create
 | |
|     |  a #[code Language] class and call
 | |
|     |  #[+api("language#from_disk") #[code from_disk]] instead.
 | |
| 
 | |
| +code.
 | |
|     from spacy.lang.en import English
 | |
|     nlp = English().from_disk('/path/to/data')
 | |
| 
 | |
| +infobox("Important note: Loading data in v2.x")
 | |
|     .o-block
 | |
|         |  In spaCy 1.x, the distinction between #[code spacy.load()] and the
 | |
|         |  #[code Language] class constructor was quite unclear. You could call
 | |
|         |  #[code spacy.load()] when no model was present, and it would silently
 | |
|         |  return an empty object. Likewise, you could pass a path to
 | |
|         |  #[code English], even if the model required a different language.
 | |
|         |  spaCy v2.0 solves this with a clear distinction between setting up
 | |
|         |  the instance and loading the data.
 | |
| 
 | |
|     +code-new nlp = English().from_disk('/path/to/data')
 | |
|     +code-old nlp = spacy.load('en', path='/path/to/data')
 |