mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 21:51:24 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			155 lines
		
	
	
		
			3.7 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			155 lines
		
	
	
		
			3.7 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| //- 💫 DOCS > API > LANGUAGE
 | |
| 
 | |
| include ../../_includes/_mixins
 | |
| 
 | |
| p A text processing pipeline.
 | |
| 
 | |
| +h(2, "attributes") Attributes
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code vocab]
 | |
|         +cell #[code Vocab]
 | |
|         +cell A container for the lexical types.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code tokenizer]
 | |
|         +cell #[code Tokenizer]
 | |
|         +cell Find word boundaries and create a #[code Doc] object.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code tagger]
 | |
|         +cell #[code Tagger]
 | |
|         +cell Annotate #[code Doc] objects with POS tags.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code parser]
 | |
|         +cell #[code DependencyParser]
 | |
|         +cell Annotate #[code Doc] objects with syntactic dependencies.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code entity]
 | |
|         +cell #[code EntityRecognizer]
 | |
|         +cell Annotate #[code Doc] objects with named entities.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code matcher]
 | |
|         +cell #[code Matcher]
 | |
|         +cell Rule-based sequence matcher.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code make_doc]
 | |
|         +cell #[code lambda text: Doc]
 | |
|         +cell Create a #[code Doc] object from unicode text.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code pipeline]
 | |
|         +cell -
 | |
|         +cell Sequence of annotation functions.
 | |
| 
 | |
| 
 | |
| +h(2, "init") Language.__init__
 | |
|     +tag method
 | |
| 
 | |
| p Create or load the pipeline.
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code **overrides]
 | |
|         +cell -
 | |
|         +cell Keyword arguments indicating which defaults to override.
 | |
| 
 | |
|     +footrow
 | |
|         +cell return
 | |
|         +cell #[code Language]
 | |
|         +cell The newly constructed object.
 | |
| 
 | |
| +h(2, "call") Language.__call__
 | |
|     +tag method
 | |
| 
 | |
| p Apply the pipeline to a single text.
 | |
| 
 | |
| +aside-code("Example").
 | |
|     from spacy.en import English
 | |
|     nlp = English()
 | |
|     doc = nlp('An example sentence. Another example sentence.')
 | |
|     doc[0].orth_, doc[0].head.tag_
 | |
|     # ('An', 'NN')
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code text]
 | |
|         +cell unicode
 | |
|         +cell The text to be processed.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code tag]
 | |
|         +cell bool
 | |
|         +cell Whether to apply the part-of-speech tagger.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code parse]
 | |
|         +cell bool
 | |
|         +cell Whether to apply the syntactic dependency parser.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code entity]
 | |
|         +cell bool
 | |
|         +cell Whether to apply the named entity recognizer.
 | |
| 
 | |
|     +footrow
 | |
|         +cell return
 | |
|         +cell #[code Doc]
 | |
|         +cell A container for accessing the linguistic annotations.
 | |
| 
 | |
| +h(2, "pipe") Language.pipe
 | |
|     +tag method
 | |
| 
 | |
| p
 | |
|     |  Process texts as a stream, and yield #[code Doc] objects in order.
 | |
|     |  Supports GIL-free multi-threading.
 | |
| 
 | |
| +aside-code("Example").
 | |
|     texts = [u'One document.', u'...', u'Lots of documents']
 | |
|     for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
 | |
|         assert doc.is_parsed
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code texts]
 | |
|         +cell -
 | |
|         +cell A sequence of unicode objects.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code n_threads]
 | |
|         +cell int
 | |
|         +cell
 | |
|             |  The number of worker threads to use. If #[code -1], OpenMP will
 | |
|             |  decide how many to use at run time. Default is #[code 2].
 | |
| 
 | |
|     +row
 | |
|         +cell #[code batch_size]
 | |
|         +cell int
 | |
|         +cell The number of texts to buffer.
 | |
| 
 | |
|     +footrow
 | |
|         +cell yield
 | |
|         +cell #[code Doc]
 | |
|         +cell Containers for accessing the linguistic annotations.
 | |
| 
 | |
| +h(2, "save_to_directory") Language.save_to_directory
 | |
|     +tag method
 | |
| 
 | |
| p Save the #[code Vocab], #[code StringStore] and pipeline to a directory.
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code path]
 | |
|         +cell string or #[code pathlib.Path]
 | |
|         +cell Path to save the model.
 | |
| 
 | |
|     +footrow
 | |
|         +cell return
 | |
|         +cell #[code None]
 | |
|         +cell -
 |