spaCy/website/api/token.jade

848 lines
22 KiB
Plaintext
Raw Normal View History

2016-10-31 21:04:15 +03:00
//- 💫 DOCS > API > TOKEN
2017-10-03 15:27:22 +03:00
include ../_includes/_mixins
2016-10-31 21:04:15 +03:00
p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
+h(2, "init") Token.__init__
+tag method
p Construct a #[code Token] object.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
token = doc[0]
2017-05-19 20:59:02 +03:00
assert token.text == u'Give'
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code doc]
+cell #[code Doc]
+cell The parent document.
+row
+cell #[code offset]
+cell int
+cell The index of the token within the document.
2017-10-03 15:27:22 +03:00
+row("foot")
+cell returns
+cell #[code Token]
+cell The newly constructed object.
+h(2, "len") Token.__len__
+tag method
p The number of unicode characters in the token, i.e. #[code len(token.text)].
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
token = doc[0]
assert len(token) == 4
+table(["Name", "Type", "Description"])
2017-10-03 15:27:22 +03:00
+row("foot")
+cell returns
+cell int
+cell The number of unicode characters in the token.
+h(2, "set_extension") Token.set_extension
+tag classmethod
+tag-new(2)
p
| Define a custom attribute on the #[code Token] which becomes available
| via #[code Token._]. For details, see the documentation on
| #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes].
+aside-code("Example").
from spacy.tokens import Token
fruit_getter = lambda token: token.text in ('apple', 'pear', 'banana')
Token.set_extension('is_fruit', getter=fruit_getter)
doc = nlp(u'I have an apple')
assert doc[3]._.is_fruit
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell
| Name of the attribute to set by the extension. For example,
| #[code 'my_attr'] will be available as #[code token._.my_attr].
+row
+cell #[code default]
+cell -
+cell
| Optional default value of the attribute if no getter or method
| is defined.
+row
+cell #[code method]
+cell callable
+cell
| Set a custom method on the object, for example
| #[code token._.compare(other_token)].
+row
+cell #[code getter]
+cell callable
+cell
| Getter function that takes the object and returns an attribute
| value. Is called when the user accesses the #[code ._] attribute.
+row
+cell #[code setter]
+cell callable
+cell
| Setter function that takes the #[code Token] and a value, and
| modifies the object. Is called when the user writes to the
| #[code Token._] attribute.
+h(2, "get_extension") Token.get_extension
+tag classmethod
+tag-new(2)
p
| Look up a previously registered extension by name. Returns a 4-tuple
| #[code.u-break (default, method, getter, setter)] if the extension is
| registered. Raises a #[code KeyError] otherwise.
+aside-code("Example").
from spacy.tokens import Token
Token.set_extension('is_fruit', default=False)
extension = Token.get_extension('is_fruit')
assert extension == (False, None, None, None)
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell Name of the extension.
+row("foot")
+cell returns
+cell tuple
+cell
| A #[code.u-break (default, method, getter, setter)] tuple of the
| extension.
+h(2, "has_extension") Token.has_extension
+tag classmethod
+tag-new(2)
p Check whether an extension has been registered on the #[code Token] class.
+aside-code("Example").
from spacy.tokens import Token
Token.set_extension('is_fruit', default=False)
assert Token.has_extension('is_fruit')
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell Name of the extension to check.
+row("foot")
+cell returns
+cell bool
+cell Whether the extension has been registered.
+h(2, "check_flag") Token.check_flag
+tag method
p Check the value of a boolean flag.
+aside-code("Example").
from spacy.attrs import IS_TITLE
doc = nlp(u'Give it back! He pleaded.')
token = doc[0]
2017-05-19 20:59:02 +03:00
assert token.check_flag(IS_TITLE) == True
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to check.
2017-10-03 15:27:22 +03:00
+row("foot")
+cell returns
+cell bool
+cell Whether the flag is set.
+h(2, "similarity") Token.similarity
+tag method
2017-05-19 21:24:46 +03:00
+tag-model("vectors")
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
+aside-code("Example").
2017-05-19 20:59:02 +03:00
apples, _, oranges = nlp(u'apples and oranges')
apples_oranges = apples.similarity(oranges)
oranges_apples = oranges.similarity(apples)
assert apples_oranges == oranges_apples
+table(["Name", "Type", "Description"])
+row
+cell other
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
2017-10-03 15:27:22 +03:00
+row("foot")
+cell returns
+cell float
+cell A scalar similarity score. Higher is more similar.
2017-05-19 20:59:02 +03:00
+h(2, "nbor") Token.nbor
+tag method
2017-05-19 20:59:02 +03:00
p Get a neighboring token.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
give_nbor = doc[0].nbor()
assert give_nbor.text == u'it'
+table(["Name", "Type", "Description"])
+row
2017-05-19 20:59:02 +03:00
+cell #[code i]
+cell int
+cell The relative position of the token to get. Defaults to #[code 1].
2017-10-03 15:27:22 +03:00
+row("foot")
+cell returns
2017-05-19 20:59:02 +03:00
+cell #[code Token]
+cell The token at position #[code self.doc[self.i+i]].
2017-05-19 20:59:02 +03:00
+h(2, "is_ancestor") Token.is_ancestor
+tag method
2017-05-19 21:24:46 +03:00
+tag-model("parse")
p
2017-05-19 20:59:02 +03:00
| Check whether this token is a parent, grandparent, etc. of another
| in the dependency tree.
+aside-code("Example").
2017-05-19 20:59:02 +03:00
doc = nlp(u'Give it back! He pleaded.')
give = doc[0]
it = doc[1]
assert give.is_ancestor(it)
+table(["Name", "Type", "Description"])
2017-05-19 20:59:02 +03:00
+row
+cell descendant
+cell #[code Token]
+cell Another token.
2017-10-03 15:27:22 +03:00
+row("foot")
+cell returns
+cell bool
2017-05-19 20:59:02 +03:00
+cell Whether this token is the ancestor of the descendant.
2017-05-19 20:59:02 +03:00
+h(2, "ancestors") Token.ancestors
+tag property
2017-05-19 21:24:46 +03:00
+tag-model("parse")
2017-05-19 20:59:02 +03:00
p A sequence of the token's syntactic ancestors (parents, grandparents, etc.).
+aside-code("Example").
2017-05-19 20:59:02 +03:00
doc = nlp(u'Give it back! He pleaded.')
it_ancestors = doc[1].ancestors
assert [t.text for t in it_ancestors] == [u'Give']
he_ancestors = doc[4].ancestors
assert [t.text for t in he_ancestors] == [u'pleaded']
+table(["Name", "Type", "Description"])
2017-10-03 15:27:22 +03:00
+row("foot")
2017-05-19 20:59:02 +03:00
+cell yields
+cell #[code Token]
+cell
| A sequence of ancestor tokens such that
| #[code ancestor.is_ancestor(self)].
+h(2, "conjuncts") Token.conjuncts
+tag property
2017-05-19 21:24:46 +03:00
+tag-model("parse")
p A sequence of coordinated tokens, not including the token itself.
2017-05-19 20:59:02 +03:00
+aside-code("Example").
doc = nlp(u'I like apples and oranges')
apples_conjuncts = doc[2].conjuncts
assert [t.text for t in apples_conjuncts] == [u'oranges']
+table(["Name", "Type", "Description"])
2017-10-03 15:27:22 +03:00
+row("foot")
+cell yields
+cell #[code Token]
+cell A coordinated token.
+h(2, "children") Token.children
+tag property
2017-05-19 21:24:46 +03:00
+tag-model("parse")
p A sequence of the token's immediate syntactic children.
2017-05-19 20:59:02 +03:00
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
give_children = doc[0].children
assert [t.text for t in give_children] == [u'it', u'back', u'!']
+table(["Name", "Type", "Description"])
2017-10-03 15:27:22 +03:00
+row("foot")
+cell yields
+cell #[code Token]
+cell A child token such that #[code child.head==self].
+h(2, "lefts") Token.lefts
+tag property
+tag-model("parse")
p
| The leftward immediate children of the word, in the syntactic dependency
| parse.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
lefts = [t.text for t in doc[3].lefts]
assert lefts == [u'New']
+table(["Name", "Type", "Description"])
+row("foot")
+cell yields
+cell #[code Token]
+cell A left-child of the token.
+h(2, "rights") Token.rights
+tag property
+tag-model("parse")
p
| The rightward immediate children of the word, in the syntactic
| dependency parse.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
rights = [t.text for t in doc[3].rights]
assert rights == [u'in']
+table(["Name", "Type", "Description"])
+row("foot")
+cell yields
+cell #[code Token]
+cell A right-child of the token.
+h(2, "n_lefts") Token.n_lefts
+tag property
+tag-model("parse")
p
| The number of leftward immediate children of the word, in the syntactic
| dependency parse.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
assert doc[3].n_lefts == 1
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell int
+cell The number of left-child tokens.
+h(2, "n_rights") Token.n_rights
+tag property
+tag-model("parse")
p
| The number of rightward immediate children of the word, in the syntactic
| dependency parse.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
assert doc[3].n_rights == 1
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell int
+cell The number of right-child tokens.
+h(2, "subtree") Token.subtree
+tag property
2017-05-19 21:24:46 +03:00
+tag-model("parse")
p A sequence of all the token's syntactic descendants.
2017-05-19 20:59:02 +03:00
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
give_subtree = doc[0].subtree
assert [t.text for t in give_subtree] == [u'Give', u'it', u'back', u'!']
+table(["Name", "Type", "Description"])
2017-10-03 15:27:22 +03:00
+row("foot")
+cell yields
+cell #[code Token]
+cell A descendant token such that #[code self.is_ancestor(descendant)].
2017-11-01 16:13:22 +03:00
+h(2, "is_sent_start") Token.is_sent_start
+tag property
+tag-new(2)
p
| A boolean value indicating whether the token starts a sentence.
| #[code None] if unknown.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
assert doc[4].is_sent_start
assert not doc[5].is_sent_start
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell bool
+cell Whether the token starts a sentence.
2017-11-07 14:00:43 +03:00
+infobox("Changed in v2.0", "⚠️")
2017-11-01 16:13:22 +03:00
| As of spaCy v2.0, the #[code Token.sent_start] property is deprecated and
| has been replaced with #[code Token.is_sent_start], which returns a
| boolean value instead of a misleading #[code 0] for #[code False] and
| #[code 1] for #[code True]. It also now returns #[code None] if the
| answer is unknown, and fixes a quirk in the old logic that would always
| set the property to #[code 0] for the first word of the document.
+code-wrapper
+code-new assert doc[4].is_sent_start == True
+code-old assert doc[4].sent_start == 1
2017-05-19 20:59:02 +03:00
+h(2, "has_vector") Token.has_vector
+tag property
2017-05-19 21:24:46 +03:00
+tag-model("vectors")
2017-05-19 20:59:02 +03:00
p
| A boolean value indicating whether a word vector is associated with the
| token.
+aside-code("Example").
doc = nlp(u'I like apples')
apples = doc[2]
assert apples.has_vector
+table(["Name", "Type", "Description"])
2017-10-03 15:27:22 +03:00
+row("foot")
2017-05-19 20:59:02 +03:00
+cell returns
+cell bool
+cell Whether the token has vector data attached.
+h(2, "vector") Token.vector
+tag property
2017-05-19 21:24:46 +03:00
+tag-model("vectors")
2017-05-19 20:59:02 +03:00
p A real-valued meaning representation.
2017-05-19 20:59:02 +03:00
+aside-code("Example").
doc = nlp(u'I like apples')
apples = doc[2]
assert apples.vector.dtype == 'float32'
assert apples.vector.shape == (300,)
+table(["Name", "Type", "Description"])
2017-10-03 15:27:22 +03:00
+row("foot")
2017-05-19 20:59:02 +03:00
+cell returns
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
2017-05-19 20:59:02 +03:00
+cell A 1D numpy array representing the token's semantics.
+h(2, "vector_norm") Token.vector_norm
+tag property
2017-05-19 21:24:46 +03:00
+tag-model("vectors")
2017-05-19 20:59:02 +03:00
p The L2 norm of the token's vector representation.
2017-05-19 20:59:02 +03:00
+aside-code("Example").
doc = nlp(u'I like apples and pasta')
apples = doc[2]
pasta = doc[4]
apples.vector_norm # 6.89589786529541
pasta.vector_norm # 7.759851932525635
assert apples.vector_norm != pasta.vector_norm
+table(["Name", "Type", "Description"])
2017-10-03 15:27:22 +03:00
+row("foot")
2017-05-19 20:59:02 +03:00
+cell returns
+cell float
+cell The L2 norm of the vector representation.
2016-10-31 21:04:15 +03:00
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code text]
+cell unicode
+cell Verbatim text content.
2017-10-03 15:27:22 +03:00
+row
+cell #[code text_with_ws]
+cell unicode
+cell Text content, with trailing space character if present.
+row
+cell #[code whitespace_]
+cell unicode
+cell Trailing space character if present.
2017-10-03 15:27:22 +03:00
+row
+cell #[code orth]
+cell int
+cell ID of the verbatim text content.
+row
+cell #[code orth_]
+cell unicode
+cell
| Verbatim text content (identical to #[code Token.text]). Exists
| mostly for consistency with the other attributes.
2016-10-31 21:04:15 +03:00
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The vocab object of the parent #[code Doc].
+row
+cell #[code doc]
+cell #[code Doc]
+cell The parent document.
+row
+cell #[code head]
+cell #[code Token]
+cell The syntactic parent, or "governor", of this token.
+row
+cell #[code left_edge]
+cell #[code Token]
+cell The leftmost token of this token's syntactic descendants.
+row
+cell #[code right_edge]
+cell #[code Token]
+cell The rightmost token of this token's syntactic descendants.
2016-10-31 21:04:15 +03:00
+row
+cell #[code i]
+cell int
+cell The index of the token within the parent document.
2016-10-31 21:04:15 +03:00
+row
+cell #[code ent_type]
+cell int
+cell Named entity type.
2016-10-31 21:04:15 +03:00
+row
+cell #[code ent_type_]
+cell unicode
+cell Named entity type.
+row
+cell #[code ent_iob]
+cell int
+cell
2017-05-24 00:15:50 +03:00
| IOB code of named entity tag. #[code 3]
| means the token begins an entity, #[code 1] means it is inside
| an entity, #[code 2] means it is outside an entity, and
| #[code 0] means no entity tag is set.
2016-10-31 21:04:15 +03:00
+row
+cell #[code ent_iob_]
+cell unicode
+cell
| IOB code of named entity tag. #[code "B"]
| means the token begins an entity, #[code "I"] means it is inside
| an entity, #[code "O"] means it is outside an entity, and
2016-10-31 21:04:15 +03:00
| #[code ""] means no entity tag is set.
+row
+cell #[code ent_id]
+cell int
+cell
| ID of the entity the token is an instance of, if any. Usually
| assigned by patterns in the Matcher.
2016-10-31 21:04:15 +03:00
+row
+cell #[code ent_id_]
+cell unicode
+cell
| ID of the entity the token is an instance of, if any. Usually
| assigned by patterns in the Matcher.
2016-10-31 21:04:15 +03:00
+row
+cell #[code lemma]
+cell int
+cell
2017-05-26 13:43:16 +03:00
| Base form of the token, with no inflectional suffixes.
2016-10-31 21:04:15 +03:00
+row
+cell #[code lemma_]
+cell unicode
2017-05-26 13:43:16 +03:00
+cell Base form of the token, with no inflectional suffixes.
2016-10-31 21:04:15 +03:00
+row
+cell #[code norm]
+cell int
+cell
| The token's norm, i.e. a normalised form of the token text.
| Usually set in the language's
| #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or
| #[+a("/usage/adding-languages#norm-exceptions") norm exceptions].
+row
+cell #[code norm_]
+cell unicode
+cell
| The token's norm, i.e. a normalised form of the token text.
| Usually set in the language's
| #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or
| #[+a("/usage/adding-languages#norm-exceptions") norm exceptions].
2016-10-31 21:04:15 +03:00
+row
+cell #[code lower]
+cell int
+cell Lowercase form of the token.
2016-10-31 21:04:15 +03:00
+row
+cell #[code lower_]
+cell unicode
+cell
| Lowercase form of the token text. Equivalent to
| #[code Token.text.lower()].
2016-10-31 21:04:15 +03:00
+row
+cell #[code shape]
+cell int
2017-05-26 13:43:16 +03:00
+cell
| Transform of the token's string, to show orthographic features.
| For example, "Xxxx" or "dd".
2016-10-31 21:04:15 +03:00
+row
+cell #[code shape_]
+cell unicode
+cell
2017-05-26 13:43:16 +03:00
| Transform of the token's string, to show orthographic features.
| For example, "Xxxx" or "dd".
2016-10-31 21:04:15 +03:00
+row
+cell #[code prefix]
+cell int
+cell
| Hash value of a length-N substring from the start of the
2017-05-26 13:43:16 +03:00
| token. Defaults to #[code N=1].
2016-10-31 21:04:15 +03:00
+row
+cell #[code prefix_]
+cell unicode
+cell
2017-05-26 13:43:16 +03:00
| A length-N substring from the start of the token. Defaults to
2016-10-31 21:04:15 +03:00
| #[code N=1].
+row
+cell #[code suffix]
+cell int
+cell
| Hash value of a length-N substring from the end of the token.
| Defaults to #[code N=3].
2016-10-31 21:04:15 +03:00
+row
+cell #[code suffix_]
+cell unicode
+cell
| Length-N substring from the end of the token. Defaults to
| #[code N=3].
2016-10-31 21:04:15 +03:00
+row
+cell #[code is_alpha]
+cell bool
2017-05-26 13:43:16 +03:00
+cell
| Does the token consist of alphabetic characters? Equivalent to
| #[code token.text.isalpha()].
2016-10-31 21:04:15 +03:00
+row
+cell #[code is_ascii]
+cell bool
2017-05-26 13:43:16 +03:00
+cell
| Does the token consist of ASCII characters? Equivalent to
| #[code not any(ord(c) >= 128 for c in token.text)].
2016-10-31 21:04:15 +03:00
+row
+cell #[code is_digit]
+cell bool
2017-05-26 13:43:16 +03:00
+cell
| Does the token consist of digits? Equivalent to
| #[code token.text.isdigit()].
2016-10-31 21:04:15 +03:00
+row
+cell #[code is_lower]
+cell bool
2017-05-26 13:43:16 +03:00
+cell
| Is the token in lowercase? Equivalent to
| #[code token.text.islower()].
2016-10-31 21:04:15 +03:00
2017-10-07 16:04:16 +03:00
+row
+cell #[code is_upper]
+cell bool
+cell
| Is the token in uppercase? Equivalent to
| #[code token.text.isupper()].
2016-10-31 21:04:15 +03:00
+row
+cell #[code is_title]
+cell bool
2017-05-26 13:43:16 +03:00
+cell
| Is the token in titlecase? Equivalent to
| #[code token.text.istitle()].
2016-10-31 21:04:15 +03:00
+row
+cell #[code is_punct]
+cell bool
2017-05-26 13:43:16 +03:00
+cell Is the token punctuation?
2016-10-31 21:04:15 +03:00
+row
+cell #[code is_left_punct]
+cell bool
+cell Is the token a left punctuation mark, e.g. #[code (]?
+row
+cell #[code is_right_punct]
+cell bool
+cell Is the token a right punctuation mark, e.g. #[code )]?
2016-10-31 21:04:15 +03:00
+row
+cell #[code is_space]
+cell bool
2017-05-26 13:43:16 +03:00
+cell
| Does the token consist of whitespace characters? Equivalent to
| #[code token.text.isspace()].
2016-10-31 21:04:15 +03:00
+row
+cell #[code is_bracket]
+cell bool
+cell Is the token a bracket?
+row
+cell #[code is_quote]
+cell bool
+cell Is the token a quotation mark?
2016-10-31 21:04:15 +03:00
+row
+cell #[code like_url]
+cell bool
2017-05-26 13:43:16 +03:00
+cell Does the token resemble a URL?
2016-10-31 21:04:15 +03:00
+row
+cell #[code like_num]
+cell bool
2017-05-26 13:43:16 +03:00
+cell Does the token represent a number? e.g. "10.9", "10", "ten", etc.
2016-10-31 21:04:15 +03:00
+row
+cell #[code like_email]
+cell bool
2017-05-26 13:43:16 +03:00
+cell Does the token resemble an email address?
2016-10-31 21:04:15 +03:00
+row
+cell #[code is_oov]
+cell bool
2017-05-26 13:43:16 +03:00
+cell Is the token out-of-vocabulary?
2016-10-31 21:04:15 +03:00
+row
+cell #[code is_stop]
+cell bool
2017-05-26 13:43:16 +03:00
+cell Is the token part of a "stop list"?
2016-10-31 21:04:15 +03:00
+row
+cell #[code pos]
+cell int
+cell Coarse-grained part-of-speech.
+row
+cell #[code pos_]
+cell unicode
+cell Coarse-grained part-of-speech.
+row
+cell #[code tag]
+cell int
+cell Fine-grained part-of-speech.
+row
+cell #[code tag_]
+cell unicode
+cell Fine-grained part-of-speech.
+row
+cell #[code dep]
+cell int
+cell Syntactic dependency relation.
+row
+cell #[code dep_]
+cell unicode
+cell Syntactic dependency relation.
+row
+cell #[code lang]
+cell int
+cell Language of the parent document's vocabulary.
2016-10-31 21:04:15 +03:00
+row
+cell #[code lang_]
+cell unicode
+cell Language of the parent document's vocabulary.
+row
+cell #[code prob]
+cell float
+cell Smoothed log probability estimate of token's type.
+row
+cell #[code idx]
+cell int
+cell The character offset of the token within the parent document.
+row
+cell #[code sentiment]
+cell float
+cell
| A scalar value indicating the positivity or negativity of the
| token.
2016-10-31 21:04:15 +03:00
+row
+cell #[code lex_id]
+cell int
+cell Sequential ID of the token's lexical type.
+row
+cell #[code rank]
+cell int
+cell
| Sequential ID of the token's lexical type, used to index into
2017-10-27 22:07:50 +03:00
| tables, e.g. for word vectors.
+row
+cell #[code cluster]
+cell int
+cell Brown cluster ID.
+row
+cell #[code _]
+cell #[code Underscore]
+cell
| User space for adding custom
| #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions].