//- 💫 DOCS > USAGE > SPACY 101 > SERIALIZATION

p
    |  If you've been modifying the pipeline, vocabulary, vectors and entities, or made
    |  updates to the model, you'll eventually want
    |  to #[strong save your progress] – for example, everything that's in your #[code nlp]
    |  object. This means you'll have to translate its contents and structure
    |  into a format that can be saved, like a file or a byte string. This
    |  process is called serialization. spaCy comes with
    |  #[strong built-in serialization methods] and supports the
    |  #[+a("http://www.diveintopython3.net/serializing.html#dump") Pickle protocol].

+aside("What's pickle?")
    |  Pickle is Python's built-in object persistence system. It lets you
    |  transfer arbitrary Python objects between processes. This is usually used
    |  to load an object to and from disk, but it's also used for distributed
    |  computing, e.g. with
    |  #[+a("https://spark.apache.org/docs/0.9.0/python-programming-guide.html") PySpark]
    |  or #[+a("http://dask.pydata.org/en/latest/") Dask]. When you unpickle an
    |  object, you're agreeing to execute whatever code it contains. It's like
    |  calling #[code eval()] on a string – so don't unpickle objects from
    |  untrusted sources.
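
p
    |  As a quick sketch of what that looks like in practice – assuming a loaded
    |  #[code nlp] object, a spaCy version that supports pickling its containers,
    |  and data you trust – a processed #[code Doc] can be pickled like any other
    |  Python object:

+code.
    import pickle

    doc = nlp(u'This is a sentence.')
    doc_data = pickle.dumps(doc)      # serialize the Doc via the pickle protocol
    doc_copy = pickle.loads(doc_data) # only unpickle data you trust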

p
    |  All container classes, i.e. #[+api("language") #[code Language]],
    |  #[+api("doc") #[code Doc]], #[+api("vocab") #[code Vocab]] and
    |  #[+api("stringstore") #[code StringStore]] have the following methods
    |  available:

+table(["Method", "Returns", "Example"])
    - style = [1, 0, 1]
    +annotation-row(["to_bytes", "bytes", "nlp.to_bytes()"], style)
    +annotation-row(["from_bytes", "object", "nlp.from_bytes(bytes)"], style)
    +annotation-row(["to_disk", "-", "nlp.to_disk('/path')"], style)
    +annotation-row(["from_disk", "object", "nlp.from_disk('/path')"], style)
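
p
    |  For instance – as a minimal sketch, assuming you've loaded a model into
    |  #[code nlp] – the byte methods round-trip the pipeline in memory, while
    |  the disk methods write to and read from a directory:

+code.
    nlp_bytes = nlp.to_bytes() # serialize the pipeline to a byte string
    nlp.from_bytes(nlp_bytes)  # restore state from a byte string

    nlp.to_disk('/path')       # save the pipeline to a directory
    nlp.from_disk('/path')     # load it back from that directory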

p
    |  For example, if you've processed a very large document, you can use
    |  #[+api("doc#to_disk") #[code Doc.to_disk]] to save it to a file on your
    |  local machine. This will save the document and its tokens, as well as
    |  the vocabulary associated with the #[code Doc].

+aside("Why save the vocab?")
    |  Saving the vocabulary with the #[code Doc] is important, because the
    |  #[code Vocab] holds the context-independent information about the words,
    |  tags and labels, and their #[strong hash values]. If the #[code Vocab]
    |  wasn't saved with the #[code Doc], spaCy wouldn't know how to resolve
    |  those IDs – for example, the word text or the dependency labels. You
    |  might be saving #[code 446] for "whale", but in a different vocabulary,
    |  this ID could map to "VERB". Similarly, if your document was processed by
    |  a German model, its vocab will include the specific
    |  #[+a("/docs/api/annotation#dependency-parsing-german") German dependency labels].

+code.
    moby_dick = open('moby_dick.txt', 'r').read() # open and read a large document
    doc = nlp(moby_dick) # process it
    doc.to_disk('/moby_dick.bin') # save the processed Doc

p
    |  If you need it again later, you can load it back into an empty #[code Doc]
    |  with an empty #[code Vocab] by calling
    |  #[+api("doc#from_disk") #[code from_disk()]]:

+code.
    from spacy.tokens import Doc # to create empty Doc
    from spacy.vocab import Vocab # to create empty Vocab

    doc = Doc(Vocab()).from_disk('/moby_dick.bin') # load processed Doc
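
p
    |  As a rough check – a sketch rather than an official example, assuming the
    |  save and load above succeeded and that "whale" made it into the stored
    |  vocab – the restored #[code Doc] resolves strings and hash values through
    |  the vocab that was saved with it:

+code.
    whale_hash = doc.vocab.strings[u'whale']         # look up the ID for a string
    assert doc.vocab.strings[whale_hash] == u'whale' # the stored vocab resolves it back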