mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 21:51:24 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			250 lines
		
	
	
		
			6.4 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			250 lines
		
	
	
		
			6.4 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| //- 💫 DOCS > API > TOKENIZER
 | |
| 
 | |
| include ../../_includes/_mixins
 | |
| 
 | |
| p
 | |
|     |  Segment text, and create #[code Doc] objects with the discovered segment
 | |
|     |  boundaries.
 | |
| 
 | |
| +h(2, "attributes") Attributes
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code vocab]
 | |
|         +cell #[code Vocab]
 | |
|         +cell A storage container for lexical types.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code prefix_search]
 | |
|         +cell callable
 | |
|         +cell
 | |
|             |  A function to find segment boundaries from the start of a
 | |
|             |  string. Returns the length of the segment, or #[code None].
 | |
| 
 | |
|     +row
 | |
|         +cell #[code suffix_search]
 | |
|         +cell callable
 | |
|         +cell
 | |
|             |  A function to find segment boundaries from the end of a string.
 | |
|             |  Returns the length of the segment, or #[code None].
 | |
| 
 | |
|     +row
 | |
|         +cell #[code infix_finditer]
 | |
|         +cell callable
 | |
|         +cell
 | |
|             |  A function to find internal segment separators, e.g. hyphens.
 | |
|             |  Returns a (possibly empty) iterator of #[code re.MatchObject]
 | |
|             |  objects.
 | |
| 
 | |
| +h(2, "load") Tokenizer.load
 | |
|     +tag classmethod
 | |
| 
 | |
| p Load a #[code Tokenizer], reading unsupplied components from the path.
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code path]
 | |
|         +cell #[code Path]
 | |
|         +cell The path to load from.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code vocab]
 | |
|         +cell #[code Vocab]
 | |
|         +cell A storage container for lexical types.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code rules]
 | |
|         +cell dict
 | |
|         +cell Exceptions and special-cases for the tokenizer.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code prefix_search]
 | |
|         +cell callable
 | |
|         +cell
 | |
|             |  A function matching the signature of
 | |
|             |  #[code re.compile(string).search] to match prefixes.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code suffix_search]
 | |
|         +cell callable
 | |
|         +cell
 | |
|             |  A function matching the signature of
 | |
|             |  #[code re.compile(string).search] to match suffixes.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code infix_finditer]
 | |
|         +cell callable
 | |
|         +cell
 | |
|             |  A function matching the signature of
 | |
|             |  #[code re.compile(string).finditer] to find infixes.
 | |
| 
 | |
|     +footrow
 | |
|         +cell return
 | |
|         +cell #[code Tokenizer]
 | |
|         +cell The newly constructed object.
 | |
| 
 | |
| +h(2, "init") Tokenizer.__init__
 | |
|     +tag method
 | |
| 
 | |
| p Create a #[code Tokenizer] that creates #[code Doc] objects from unicode text.
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code vocab]
 | |
|         +cell #[code Vocab]
 | |
|         +cell A storage container for lexical types.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code rules]
 | |
|         +cell dict
 | |
|         +cell Exceptions and special-cases for the tokenizer.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code prefix_search]
 | |
|         +cell callable
 | |
|         +cell
 | |
|             |  A function matching the signature of
 | |
|             |  #[code re.compile(string).search] to match prefixes.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code suffix_search]
 | |
|         +cell callable
 | |
|         +cell
 | |
|             |  A function matching the signature of
 | |
|             |  #[code re.compile(string).search] to match suffixes.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code infix_finditer]
 | |
|         +cell callable
 | |
|         +cell
 | |
|             |  A function matching the signature of
 | |
|             |  #[code re.compile(string).finditer] to find infixes.
 | |
| 
 | |
|     +footrow
 | |
|         +cell return
 | |
|         +cell #[code Tokenizer]
 | |
|         +cell The newly constructed object.
 | |
| 
 | |
| +h(2, "call") Tokenizer.__call__
 | |
|     +tag method
 | |
| 
 | |
| p Tokenize a string.
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code string]
 | |
|         +cell unicode
 | |
|         +cell The string to tokenize.
 | |
| 
 | |
|     +footrow
 | |
|         +cell return
 | |
|         +cell #[code Doc]
 | |
|         +cell A container for linguistic annotations.
 | |
| 
 | |
| +h(2, "pipe") Tokenizer.pipe
 | |
|     +tag method
 | |
| 
 | |
| p Tokenize a stream of texts.
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code texts]
 | |
|         +cell -
 | |
|         +cell A sequence of unicode texts.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code batch_size]
 | |
|         +cell int
 | |
|         +cell The number of texts to accumulate in an internal buffer.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code n_threads]
 | |
|         +cell int
 | |
|         +cell
 | |
|             |  The number of threads to use, if the implementation supports
 | |
|             |  multi-threading. The default tokenizer is single-threaded.
 | |
| 
 | |
|     +footrow
 | |
|         +cell yield
 | |
|         +cell #[code Doc]
 | |
|         +cell A sequence of #[code Doc] objects, in order.
 | |
| 
 | |
| +h(2, "find_infix") Tokenizer.find_infix
 | |
|     +tag method
 | |
| 
 | |
| p Find internal split points of the string.
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code string]
 | |
|         +cell unicode
 | |
|         +cell The string to split.
 | |
| 
 | |
|     +footrow
 | |
|         +cell return
 | |
|         +cell #[code List[re.MatchObject]]
 | |
|         +cell
 | |
|             |  A list of objects that have #[code .start()] and #[code .end()]
 | |
|             |  methods, denoting the placement of internal segment separators,
 | |
|             |  e.g. hyphens.
 | |
| 
 | |
| +h(2, "find_prefix") Tokenizer.find_prefix
 | |
|     +tag method
 | |
| 
 | |
| p
 | |
|     |  Find the length of a prefix that should be segmented from the string, or
 | |
|     |  #[code None] if no prefix rules match.
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code string]
 | |
|         +cell unicode
 | |
|         +cell The string to segment.
 | |
| 
 | |
|     +footrow
 | |
|         +cell return
 | |
|         +cell int / #[code None]
 | |
|         +cell The length of the prefix if present, otherwise #[code None].
 | |
| 
 | |
| +h(2, "find_suffix") Tokenizer.find_suffix
 | |
|     +tag method
 | |
| 
 | |
| p
 | |
|     |  Find the length of a suffix that should be segmented from the string, or
 | |
|     |  #[code None] if no suffix rules match.
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code string]
 | |
|         +cell unicode
 | |
|         +cell The string to segment.
 | |
| 
 | |
|     +footrow
 | |
|         +cell return
 | |
|         +cell int / #[code None]
 | |
|         +cell The length of the suffix if present, otherwise #[code None].
 | |
| 
 | |
| +h(2, "add_special_case") Tokenizer.add_special_case
 | |
|     +tag method
 | |
| 
 | |
| p Add a special-case tokenization rule.
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code string]
 | |
|         +cell unicode
 | |
|         +cell The string to specially tokenize.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code token_attrs]
 | |
|         +cell -
 | |
|         +cell
 | |
|             |  A sequence of dicts, where each dict describes a token and its
 | |
|             |  attributes. The #[code ORTH] fields of the attributes must
 | |
|             |  exactly match the string when they are concatenated.
 | |
| 
 | |
|     +footrow
 | |
|         +cell return
 | |
|         +cell #[code None]
 | |
|         +cell -
 |