//- 💫 DOCS > API > TOKEN

include ../_includes/_mixins

p An individual token — i.e. a word, punctuation symbol, whitespace, etc.

+h(2, "init") Token.__init__
    +tag method

p Construct a #[code Token] object.

+aside-code("Example").
    doc = nlp(u'Give it back! He pleaded.')
    token = doc[0]
    assert token.text == u'Give'

+table(["Name", "Type", "Description"])
    +row
        +cell #[code vocab]
        +cell #[code Vocab]
        +cell A storage container for lexical types.

    +row
        +cell #[code doc]
        +cell #[code Doc]
        +cell The parent document.

    +row
        +cell #[code offset]
        +cell int
        +cell The index of the token within the document.

    +row("foot")
        +cell returns
        +cell #[code Token]
        +cell The newly constructed object.

+h(2, "len") Token.__len__
    +tag method

p The number of unicode characters in the token, i.e. #[code token.text].

+aside-code("Example").
    doc = nlp(u'Give it back! He pleaded.')
    token = doc[0]
    assert len(token) == 4

+table(["Name", "Type", "Description"])
    +row("foot")
        +cell returns
        +cell int
        +cell The number of unicode characters in the token.

+h(2, "set_extension") Token.set_extension
    +tag classmethod
    +tag-new(2)

p
    |  Define a custom attribute on the #[code Token] which becomes available
    |  via #[code Token._]. For details, see the documentation on
    |  #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes].

+aside-code("Example").
    from spacy.tokens import Token
    fruit_getter = lambda token: token.text in ('apple', 'pear', 'banana')
    Token.set_extension('is_fruit', getter=fruit_getter)
    doc = nlp(u'I have an apple')
    assert doc[3]._.is_fruit

+table(["Name", "Type", "Description"])
    +row
        +cell #[code name]
        +cell unicode
        +cell
            |  Name of the attribute to set by the extension. For example,
            |  #[code 'my_attr'] will be available as #[code token._.my_attr].

    +row
        +cell #[code default]
        +cell -
        +cell
            |  Optional default value of the attribute if no getter or method
            |  is defined.

    +row
        +cell #[code method]
        +cell callable
        +cell
            |  Set a custom method on the object, for example
            |  #[code token._.compare(other_token)].

    +row
        +cell #[code getter]
        +cell callable
        +cell
            |  Getter function that takes the object and returns an attribute
            |  value. Is called when the user accesses the #[code ._] attribute.

    +row
        +cell #[code setter]
        +cell callable
        +cell
            |  Setter function that takes the #[code Token] and a value, and
            |  modifies the object. Is called when the user writes to the
            |  #[code Token._] attribute.

+h(2, "get_extension") Token.get_extension
    +tag classmethod
    +tag-new(2)

p
    |  Look up a previously registered extension by name. Returns a 4-tuple
    |  #[code.u-break (default, method, getter, setter)] if the extension is
    |  registered. Raises a #[code KeyError] otherwise.

+aside-code("Example").
    from spacy.tokens import Token
    Token.set_extension('is_fruit', default=False)
    extension = Token.get_extension('is_fruit')
    assert extension == (False, None, None, None)

+table(["Name", "Type", "Description"])
    +row
        +cell #[code name]
        +cell unicode
        +cell Name of the extension.

    +row("foot")
        +cell returns
        +cell tuple
        +cell
            |  A #[code.u-break (default, method, getter, setter)] tuple of the
            |  extension.

+h(2, "has_extension") Token.has_extension
    +tag classmethod
    +tag-new(2)

p Check whether an extension has been registered on the #[code Token] class.

+aside-code("Example").
    from spacy.tokens import Token
    Token.set_extension('is_fruit', default=False)
    assert Token.has_extension('is_fruit')

+table(["Name", "Type", "Description"])
    +row
        +cell #[code name]
        +cell unicode
        +cell Name of the extension to check.
+row("foot") +cell returns +cell bool +cell Whether the extension has been registered. +h(2, "remove_extension") Token.remove_extension +tag classmethod +tag-new("2.0.11") p Remove a previously registered extension. +aside-code("Example"). from spacy.tokens import Token Token.set_extension('is_fruit', default=False) removed = Token.remove_extension('is_fruit') assert not Token.has_extension('is_fruit') +table(["Name", "Type", "Description"]) +row +cell #[code name] +cell unicode +cell Name of the extension. +row("foot") +cell returns +cell tuple +cell | A #[code.u-break (default, method, getter, setter)] tuple of the | removed extension. +h(2, "check_flag") Token.check_flag +tag method p Check the value of a boolean flag. +aside-code("Example"). from spacy.attrs import IS_TITLE doc = nlp(u'Give it back! He pleaded.') token = doc[0] assert token.check_flag(IS_TITLE) == True +table(["Name", "Type", "Description"]) +row +cell #[code flag_id] +cell int +cell The attribute ID of the flag to check. +row("foot") +cell returns +cell bool +cell Whether the flag is set. +h(2, "similarity") Token.similarity +tag method +tag-model("vectors") p Compute a semantic similarity estimate. Defaults to cosine over vectors. +aside-code("Example"). apples, _, oranges = nlp(u'apples and oranges') apples_oranges = apples.similarity(oranges) oranges_apples = oranges.similarity(apples) assert apples_oranges == oranges_apples +table(["Name", "Type", "Description"]) +row +cell other +cell - +cell | The object to compare with. By default, accepts #[code Doc], | #[code Span], #[code Token] and #[code Lexeme] objects. +row("foot") +cell returns +cell float +cell A scalar similarity score. Higher is more similar. +h(2, "nbor") Token.nbor +tag method p Get a neighboring token. +aside-code("Example"). doc = nlp(u'Give it back! He pleaded.') give_nbor = doc[0].nbor() assert give_nbor.text == u'it' +table(["Name", "Type", "Description"]) +row +cell #[code i] +cell int +cell The relative position of the token to get. Defaults to #[code 1]. +row("foot") +cell returns +cell #[code Token] +cell The token at position #[code self.doc[self.i+i]]. +h(2, "is_ancestor") Token.is_ancestor +tag method +tag-model("parse") p | Check whether this token is a parent, grandparent, etc. of another | in the dependency tree. +aside-code("Example"). doc = nlp(u'Give it back! He pleaded.') give = doc[0] it = doc[1] assert give.is_ancestor(it) +table(["Name", "Type", "Description"]) +row +cell descendant +cell #[code Token] +cell Another token. +row("foot") +cell returns +cell bool +cell Whether this token is the ancestor of the descendant. +h(2, "ancestors") Token.ancestors +tag property +tag-model("parse") p The rightmost token of this token's syntactic descendants. +aside-code("Example"). doc = nlp(u'Give it back! He pleaded.') it_ancestors = doc[1].ancestors assert [t.text for t in it_ancestors] == [u'Give'] he_ancestors = doc[4].ancestors assert [t.text for t in he_ancestors] == [u'pleaded'] +table(["Name", "Type", "Description"]) +row("foot") +cell yields +cell #[code Token] +cell | A sequence of ancestor tokens such that | #[code ancestor.is_ancestor(self)]. +h(2, "conjuncts") Token.conjuncts +tag property +tag-model("parse") p A sequence of coordinated tokens, including the token itself. +aside-code("Example"). 
    doc = nlp(u'I like apples and oranges')
    apples_conjuncts = doc[2].conjuncts
    assert [t.text for t in apples_conjuncts] == [u'oranges']

+table(["Name", "Type", "Description"])
    +row("foot")
        +cell yields
        +cell #[code Token]
        +cell A coordinated token.

+h(2, "children") Token.children
    +tag property
    +tag-model("parse")

p A sequence of the token's immediate syntactic children.

+aside-code("Example").
    doc = nlp(u'Give it back! He pleaded.')
    give_children = doc[0].children
    assert [t.text for t in give_children] == [u'it', u'back', u'!']

+table(["Name", "Type", "Description"])
    +row("foot")
        +cell yields
        +cell #[code Token]
        +cell A child token such that #[code child.head==self].

+h(2, "lefts") Token.lefts
    +tag property
    +tag-model("parse")

p
    |  The leftward immediate children of the word, in the syntactic dependency
    |  parse.

+aside-code("Example").
    doc = nlp(u'I like New York in Autumn.')
    lefts = [t.text for t in doc[3].lefts]
    assert lefts == [u'New']

+table(["Name", "Type", "Description"])
    +row("foot")
        +cell yields
        +cell #[code Token]
        +cell A left-child of the token.

+h(2, "rights") Token.rights
    +tag property
    +tag-model("parse")

p
    |  The rightward immediate children of the word, in the syntactic
    |  dependency parse.

+aside-code("Example").
    doc = nlp(u'I like New York in Autumn.')
    rights = [t.text for t in doc[3].rights]
    assert rights == [u'in']

+table(["Name", "Type", "Description"])
    +row("foot")
        +cell yields
        +cell #[code Token]
        +cell A right-child of the token.

+h(2, "n_lefts") Token.n_lefts
    +tag property
    +tag-model("parse")

p
    |  The number of leftward immediate children of the word, in the syntactic
    |  dependency parse.

+aside-code("Example").
    doc = nlp(u'I like New York in Autumn.')
    assert doc[3].n_lefts == 1

+table(["Name", "Type", "Description"])
    +row("foot")
        +cell returns
        +cell int
        +cell The number of left-child tokens.

+h(2, "n_rights") Token.n_rights
    +tag property
    +tag-model("parse")

p
    |  The number of rightward immediate children of the word, in the syntactic
    |  dependency parse.

+aside-code("Example").
    doc = nlp(u'I like New York in Autumn.')
    assert doc[3].n_rights == 1

+table(["Name", "Type", "Description"])
    +row("foot")
        +cell returns
        +cell int
        +cell The number of right-child tokens.

+h(2, "subtree") Token.subtree
    +tag property
    +tag-model("parse")

p A sequence containing the token and all the token's syntactic descendants.

+aside-code("Example").
    doc = nlp(u'Give it back! He pleaded.')
    give_subtree = doc[0].subtree
    assert [t.text for t in give_subtree] == [u'Give', u'it', u'back', u'!']

+table(["Name", "Type", "Description"])
    +row("foot")
        +cell yields
        +cell #[code Token]
        +cell A descendant token such that #[code self.is_ancestor(descendant)].

+h(2, "is_sent_start") Token.is_sent_start
    +tag property
    +tag-new(2)

p
    |  A boolean value indicating whether the token starts a sentence.
    |  #[code None] if unknown.

+aside-code("Example").
    doc = nlp(u'Give it back! He pleaded.')
    assert doc[4].is_sent_start
    assert not doc[5].is_sent_start

+table(["Name", "Type", "Description"])
    +row("foot")
        +cell returns
        +cell bool
        +cell Whether the token starts a sentence.

+infobox("Changed in v2.0", "⚠️")
    |  As of spaCy v2.0, the #[code Token.sent_start] property is deprecated and
    |  has been replaced with #[code Token.is_sent_start], which returns a
    |  boolean value instead of a misleading #[code 0] for #[code False] and
    |  #[code 1] for #[code True]. It also now returns #[code None] if the
    |  answer is unknown, and fixes a quirk in the old logic that would always
    |  set the property to #[code 0] for the first word of the document.

    +code-wrapper
        +code-new assert doc[4].is_sent_start == True
        +code-old assert doc[4].sent_start == 1

+h(2, "has_vector") Token.has_vector
    +tag property
    +tag-model("vectors")

p
    |  A boolean value indicating whether a word vector is associated with the
    |  token.

+aside-code("Example").
    doc = nlp(u'I like apples')
    apples = doc[2]
    assert apples.has_vector

+table(["Name", "Type", "Description"])
    +row("foot")
        +cell returns
        +cell bool
        +cell Whether the token has a word vector.

+h(2, "vector") Token.vector
    +tag property
    +tag-model("vectors")

p A real-valued meaning representation.

+aside-code("Example").
    doc = nlp(u'I like apples')
    apples = doc[2]
    assert apples.vector.dtype == 'float32'
    assert apples.vector.shape == (300,)

+table(["Name", "Type", "Description"])
    +row("foot")
        +cell returns
        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
        +cell A 1D numpy array representing the token's semantics.

+h(2, "vector_norm") Token.vector_norm
    +tag property
    +tag-model("vectors")

p The L2 norm of the token's vector representation.

+aside-code("Example").
    doc = nlp(u'I like apples and pasta')
    apples = doc[2]
    pasta = doc[4]
    apples.vector_norm  # 6.89589786529541
    pasta.vector_norm  # 7.759851932525635
    assert apples.vector_norm != pasta.vector_norm

+table(["Name", "Type", "Description"])
    +row("foot")
        +cell returns
        +cell float
        +cell The L2 norm of the vector representation.

+h(2, "attributes") Attributes

+table(["Name", "Type", "Description"])
    +row
        +cell #[code text]
        +cell unicode
        +cell Verbatim text content.

    +row
        +cell #[code text_with_ws]
        +cell unicode
        +cell Text content, with trailing space character if present.

    +row
        +cell #[code whitespace_]
        +cell unicode
        +cell Trailing space character if present.

    +row
        +cell #[code orth]
        +cell int
        +cell ID of the verbatim text content.

    +row
        +cell #[code orth_]
        +cell unicode
        +cell
            |  Verbatim text content (identical to #[code Token.text]). Exists
            |  mostly for consistency with the other attributes.

    +row
        +cell #[code vocab]
        +cell #[code Vocab]
        +cell The vocab object of the parent #[code Doc].

    +row
        +cell #[code doc]
        +cell #[code Doc]
        +cell The parent document.

    +row
        +cell #[code head]
        +cell #[code Token]
        +cell The syntactic parent, or "governor", of this token.

    +row
        +cell #[code left_edge]
        +cell #[code Token]
        +cell The leftmost token of this token's syntactic descendants.

    +row
        +cell #[code right_edge]
        +cell #[code Token]
        +cell The rightmost token of this token's syntactic descendants.

    +row
        +cell #[code i]
        +cell int
        +cell The index of the token within the parent document.

    +row
        +cell #[code ent_type]
        +cell int
        +cell Named entity type.

    +row
        +cell #[code ent_type_]
        +cell unicode
        +cell Named entity type.

    +row
        +cell #[code ent_iob]
        +cell int
        +cell
            |  IOB code of named entity tag. #[code 3]
            |  means the token begins an entity, #[code 1] means it is inside
            |  an entity, #[code 2] means it is outside an entity, and
            |  #[code 0] means no entity tag is set.

    +row
        +cell #[code ent_iob_]
        +cell unicode
        +cell
            |  IOB code of named entity tag. #[code "B"]
            |  means the token begins an entity, #[code "I"] means it is inside
            |  an entity, #[code "O"] means it is outside an entity, and
            |  #[code ""] means no entity tag is set.

    +row
        +cell #[code ent_id]
        +cell int
        +cell
            |  ID of the entity the token is an instance of, if any. Currently
            |  not used, but potentially for coreference resolution.

    +row
        +cell #[code ent_id_]
        +cell unicode
        +cell
            |  ID of the entity the token is an instance of, if any. Currently
            |  not used, but potentially for coreference resolution.

    +row
        +cell #[code lemma]
        +cell int
        +cell Base form of the token, with no inflectional suffixes.

    +row
        +cell #[code lemma_]
        +cell unicode
        +cell Base form of the token, with no inflectional suffixes.

    +row
        +cell #[code norm]
        +cell int
        +cell
            |  The token's norm, i.e. a normalised form of the token text.
            |  Usually set in the language's
            |  #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or
            |  #[+a("/usage/adding-languages#norm-exceptions") norm exceptions].

    +row
        +cell #[code norm_]
        +cell unicode
        +cell
            |  The token's norm, i.e. a normalised form of the token text.
            |  Usually set in the language's
            |  #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or
            |  #[+a("/usage/adding-languages#norm-exceptions") norm exceptions].

    +row
        +cell #[code lower]
        +cell int
        +cell Lowercase form of the token.

    +row
        +cell #[code lower_]
        +cell unicode
        +cell
            |  Lowercase form of the token text. Equivalent to
            |  #[code Token.text.lower()].

    +row
        +cell #[code shape]
        +cell int
        +cell
            |  Transform of the token's string, to show orthographic features.
            |  For example, "Xxxx" or "dd".

    +row
        +cell #[code shape_]
        +cell unicode
        +cell
            |  Transform of the token's string, to show orthographic features.
            |  For example, "Xxxx" or "dd".

    +row
        +cell #[code prefix]
        +cell int
        +cell
            |  Hash value of a length-N substring from the start of the
            |  token. Defaults to #[code N=1].

    +row
        +cell #[code prefix_]
        +cell unicode
        +cell
            |  A length-N substring from the start of the token. Defaults to
            |  #[code N=1].

    +row
        +cell #[code suffix]
        +cell int
        +cell
            |  Hash value of a length-N substring from the end of the token.
            |  Defaults to #[code N=3].

    +row
        +cell #[code suffix_]
        +cell unicode
        +cell
            |  Length-N substring from the end of the token. Defaults to
            |  #[code N=3].

    +row
        +cell #[code is_alpha]
        +cell bool
        +cell
            |  Does the token consist of alphabetic characters? Equivalent to
            |  #[code token.text.isalpha()].

    +row
        +cell #[code is_ascii]
        +cell bool
        +cell
            |  Does the token consist of ASCII characters? Equivalent to
            |  #[code all(ord(c) < 128 for c in token.text)].

    +row
        +cell #[code is_digit]
        +cell bool
        +cell
            |  Does the token consist of digits? Equivalent to
            |  #[code token.text.isdigit()].

    +row
        +cell #[code is_lower]
        +cell bool
        +cell
            |  Is the token in lowercase? Equivalent to
            |  #[code token.text.islower()].

    +row
        +cell #[code is_upper]
        +cell bool
        +cell
            |  Is the token in uppercase? Equivalent to
            |  #[code token.text.isupper()].

    +row
        +cell #[code is_title]
        +cell bool
        +cell
            |  Is the token in titlecase? Equivalent to
            |  #[code token.text.istitle()].

    +row
        +cell #[code is_punct]
        +cell bool
        +cell Is the token punctuation?

    +row
        +cell #[code is_left_punct]
        +cell bool
        +cell Is the token a left punctuation mark, e.g. #[code (]?

    +row
        +cell #[code is_right_punct]
        +cell bool
        +cell Is the token a right punctuation mark, e.g. #[code )]?

    +row
        +cell #[code is_space]
        +cell bool
        +cell
            |  Does the token consist of whitespace characters? Equivalent to
            |  #[code token.text.isspace()].

    +row
        +cell #[code is_bracket]
        +cell bool
        +cell Is the token a bracket?

    +row
        +cell #[code is_quote]
        +cell bool
        +cell Is the token a quotation mark?

    +row
        +cell #[code is_currency]
            +tag-new("2.0.8")
        +cell bool
        +cell Is the token a currency symbol?

    +row
        +cell #[code like_url]
        +cell bool
        +cell Does the token resemble a URL?

    +row
        +cell #[code like_num]
        +cell bool
        +cell Does the token represent a number? e.g. "10.9", "10", "ten", etc.

    +row
        +cell #[code like_email]
        +cell bool
        +cell Does the token resemble an email address?

    +row
        +cell #[code is_oov]
        +cell bool
        +cell Is the token out-of-vocabulary?

    +row
        +cell #[code is_stop]
        +cell bool
        +cell Is the token part of a "stop list"?

    +row
        +cell #[code pos]
        +cell int
        +cell Coarse-grained part-of-speech.

    +row
        +cell #[code pos_]
        +cell unicode
        +cell Coarse-grained part-of-speech.

    +row
        +cell #[code tag]
        +cell int
        +cell Fine-grained part-of-speech.

    +row
        +cell #[code tag_]
        +cell unicode
        +cell Fine-grained part-of-speech.

    +row
        +cell #[code dep]
        +cell int
        +cell Syntactic dependency relation.

    +row
        +cell #[code dep_]
        +cell unicode
        +cell Syntactic dependency relation.

    +row
        +cell #[code lang]
        +cell int
        +cell Language of the parent document's vocabulary.

    +row
        +cell #[code lang_]
        +cell unicode
        +cell Language of the parent document's vocabulary.

    +row
        +cell #[code prob]
        +cell float
        +cell Smoothed log probability estimate of token's type.

    +row
        +cell #[code idx]
        +cell int
        +cell The character offset of the token within the parent document.

    +row
        +cell #[code sentiment]
        +cell float
        +cell
            |  A scalar value indicating the positivity or negativity of the
            |  token.

    +row
        +cell #[code lex_id]
        +cell int
        +cell Sequential ID of the token's lexical type.

    +row
        +cell #[code rank]
        +cell int
        +cell
            |  Sequential ID of the token's lexical type, used to index into
            |  tables, e.g. for word vectors.

    +row
        +cell #[code cluster]
        +cell int
        +cell Brown cluster ID.

    +row
        +cell #[code _]
        +cell #[code Underscore]
        +cell
            |  User space for adding custom
            |  #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions].
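
p
    |  For illustration, a few of the lexical attributes above can be checked
    |  directly on a token. This is a minimal sketch, assuming an English model
    |  is loaded as #[code nlp] and that the example sentence tokenizes as shown.

+aside-code("Example").
    # Hypothetical text; exact tokenization and flags depend on the language data.
    doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
    apple = doc[0]
    assert apple.text == u'Apple'
    assert apple.is_alpha and apple.is_title and not apple.like_num
    assert apple.shape_ == u'Xxxxx'  # orthographic shape of 'Apple'
    billion = doc[-1]
    assert billion.like_num          # 'billion' reads like a number word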