//- 💫 DOCS > API > TOKENIZER

include ../../_includes/_mixins

p
    |  Segment text, and create #[code Doc] objects with the discovered segment
    |  boundaries.

+h(2, "init") Tokenizer.__init__
    +tag method

p Create a #[code Tokenizer], which creates #[code Doc] objects given unicode text.

+aside-code("Example").
    # Construction 1
    from spacy.tokenizer import Tokenizer
    tokenizer = Tokenizer(nlp.vocab)

    # Construction 2
    from spacy.lang.en import English
    tokenizer = English().Defaults.create_tokenizer(nlp)

+table(["Name", "Type", "Description"])
    +row
        +cell #[code vocab]
        +cell #[code Vocab]
        +cell A storage container for lexical types.

    +row
        +cell #[code rules]
        +cell dict
        +cell Exceptions and special-cases for the tokenizer.

    +row
        +cell #[code prefix_search]
        +cell callable
        +cell
            |  A function matching the signature of
            |  #[code re.compile(string).search] to match prefixes.

    +row
        +cell #[code suffix_search]
        +cell callable
        +cell
            |  A function matching the signature of
            |  #[code re.compile(string).search] to match suffixes.

    +row
        +cell #[code infix_finditer]
        +cell callable
        +cell
            |  A function matching the signature of
            |  #[code re.compile(string).finditer] to find infixes.

    +row
        +cell #[code token_match]
        +cell callable
        +cell A boolean function matching strings to be recognised as tokens.

    +footrow
        +cell returns
        +cell #[code Tokenizer]
        +cell The newly constructed object.

+h(2, "call") Tokenizer.__call__
    +tag method

p Tokenize a string.

+aside-code("Example").
    tokens = tokenizer(u'This is a sentence')
    assert len(tokens) == 4

+table(["Name", "Type", "Description"])
    +row
        +cell #[code string]
        +cell unicode
        +cell The string to tokenize.

    +footrow
        +cell returns
        +cell #[code Doc]
        +cell A container for linguistic annotations.

+h(2, "pipe") Tokenizer.pipe
    +tag method

p Tokenize a stream of texts.

+aside-code("Example").
    texts = [u'One document.', u'...', u'Lots of documents']
    for doc in tokenizer.pipe(texts, batch_size=50):
        pass

+table(["Name", "Type", "Description"])
    +row
        +cell #[code texts]
        +cell -
        +cell A sequence of unicode texts.

    +row
        +cell #[code batch_size]
        +cell int
        +cell The number of texts to accumulate in an internal buffer.

    +row
        +cell #[code n_threads]
        +cell int
        +cell
            |  The number of threads to use, if the implementation supports
            |  multi-threading. The default tokenizer is single-threaded.

    +footrow
        +cell yields
        +cell #[code Doc]
        +cell A sequence of #[code Doc] objects, in order.

+h(2, "find_infix") Tokenizer.find_infix
    +tag method

p Find internal split points of the string.
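p
    |  For example, assuming a #[code tokenizer] created from the English
    |  defaults as in the constructor example above, the returned match
    |  objects can be inspected for their offsets (the input string here is
    |  purely illustrative):

+aside-code("Example").
    # Sketch: assumes `tokenizer` was created from the English defaults above
    matches = tokenizer.find_infix(u'well-known')
    for match in matches:
        print(match.start(), match.end())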
+table(["Name", "Type", "Description"])
    +row
        +cell #[code string]
        +cell unicode
        +cell The string to split.

    +footrow
        +cell returns
        +cell list
        +cell
            |  A list of #[code re.MatchObject] objects that have #[code .start()]
            |  and #[code .end()] methods, denoting the placement of internal
            |  segment separators, e.g. hyphens.

+h(2, "find_prefix") Tokenizer.find_prefix
    +tag method

p
    |  Find the length of a prefix that should be segmented from the string, or
    |  #[code None] if no prefix rules match.

+table(["Name", "Type", "Description"])
    +row
        +cell #[code string]
        +cell unicode
        +cell The string to segment.

    +footrow
        +cell returns
        +cell int / #[code None]
        +cell The length of the prefix if present, otherwise #[code None].

+h(2, "find_suffix") Tokenizer.find_suffix
    +tag method

p
    |  Find the length of a suffix that should be segmented from the string, or
    |  #[code None] if no suffix rules match.

+table(["Name", "Type", "Description"])
    +row
        +cell #[code string]
        +cell unicode
        +cell The string to segment.

    +footrow
        +cell returns
        +cell int / #[code None]
        +cell The length of the suffix if present, otherwise #[code None].

+h(2, "add_special_case") Tokenizer.add_special_case
    +tag method

p
    |  Add a special-case tokenization rule. This mechanism is also used to add
    |  custom tokenizer exceptions to the language data. See the usage workflow
    |  on #[+a("/docs/usage/adding-languages#tokenizer-exceptions") adding languages]
    |  for more details and examples.

+aside-code("Example").
    from spacy.attrs import ORTH, LEMMA
    case = [{ORTH: "do"}, {ORTH: "n't", LEMMA: "not"}]
    tokenizer.add_special_case("don't", case)

+table(["Name", "Type", "Description"])
    +row
        +cell #[code string]
        +cell unicode
        +cell The string to specially tokenize.

    +row
        +cell #[code token_attrs]
        +cell iterable
        +cell
            |  A sequence of dicts, where each dict describes a token and its
            |  attributes. The #[code ORTH] fields of the attributes must
            |  exactly match the string when they are concatenated.

+h(2, "attributes") Attributes

+table(["Name", "Type", "Description"])
    +row
        +cell #[code vocab]
        +cell #[code Vocab]
        +cell The vocab object of the parent #[code Doc].

    +row
        +cell #[code prefix_search]
        +cell -
        +cell
            |  A function to find segment boundaries from the start of a
            |  string. Returns the length of the segment, or #[code None].

    +row
        +cell #[code suffix_search]
        +cell -
        +cell
            |  A function to find segment boundaries from the end of a string.
            |  Returns the length of the segment, or #[code None].

    +row
        +cell #[code infix_finditer]
        +cell -
        +cell
            |  A function to find internal segment separators, e.g. hyphens.
            |  Returns a (possibly empty) list of #[code re.MatchObject]
            |  objects.
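p
    |  These attributes correspond to the values passed to the constructor. A
    |  minimal sketch of supplying your own rules when creating the tokenizer
    |  (the regular expressions below are illustrative, not spaCy's defaults):

+aside-code("Example").
    import re
    from spacy.tokenizer import Tokenizer

    # Illustrative patterns only -- not the defaults shipped with spaCy
    prefix_re = re.compile(r'''^[\[\("']''')
    suffix_re = re.compile(r'''[\]\)"']$''')
    infix_re = re.compile(r'''[-~]''')

    tokenizer = Tokenizer(nlp.vocab,
                          prefix_search=prefix_re.search,
                          suffix_search=suffix_re.search,
                          infix_finditer=infix_re.finditer)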