Update docstrings and API docs for Token

This commit is contained in:
ines 2017-05-19 18:47:56 +02:00
parent 62ceec4fc6
commit e9e62b01b0
2 changed files with 374 additions and 302 deletions

View File

@ -23,10 +23,14 @@ from .. import about
cdef class Token:
"""
An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
"""
"""An individual token — i.e. a word, punctuation symbol, whitespace, etc."""
def __cinit__(self, Vocab vocab, Doc doc, int offset):
"""Construct a `Token` object.
vocab (Vocab): A storage container for lexical types.
doc (Doc): The parent document.
offset (int): The index of the token within the document.
"""
self.vocab = vocab
self.doc = doc
self.c = &self.doc.c[offset]
@ -36,8 +40,9 @@ cdef class Token:
return hash((self.doc, self.i))
def __len__(self):
"""
Number of unicode characters in token.text.
"""The number of unicode characters in the token, i.e. `token.text`.
RETURNS (int): The number of unicode characters in the token.
"""
return self.c.lex.length
@ -75,37 +80,35 @@ cdef class Token:
raise ValueError(op)
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
"""
Check the value of a boolean flag.
"""Check the value of a boolean flag.
Arguments:
flag_id (int): The ID of the flag attribute.
Returns:
is_set (bool): Whether the flag is set.
flag_id (int): The ID of the flag attribute.
RETURNS (bool): Whether the flag is set.
EXAMPLE:
>>> from spacy.attrs import IS_TITLE
>>> doc = nlp(u'Give it back! He pleaded.')
>>> token = doc[0]
>>> token.check_flag(IS_TITLE)
True
"""
return Lexeme.c_check_flag(self.c.lex, flag_id)
def nbor(self, int i=1):
"""
Get a neighboring token.
"""Get a neighboring token.
Arguments:
i (int): The relative position of the token to get. Defaults to 1.
Returns:
neighbor (Token): The token at position self.doc[self.i+i]
i (int): The relative position of the token to get. Defaults to 1.
RETURNS (Token): The token at position `self.doc[self.i+i]`.
"""
return self.doc[self.i+i]
def similarity(self, other):
"""
Compute a semantic similarity estimate. Defaults to cosine over vectors.
"""Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors.
Arguments:
other:
The object to compare with. By default, accepts Doc, Span,
Token and Lexeme objects.
Returns:
score (float): A scalar similarity score. Higher is more similar.
other (object): The object to compare with. By default, accepts `Doc`,
`Span`, `Token` and `Lexeme` objects.
RETURNS (float): A scalar similarity score. Higher is more similar.
"""
if 'similarity' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['similarity'](self)
@ -114,10 +117,14 @@ cdef class Token:
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
property lex_id:
"""ID of the token's lexical type.
RETURNS (int): ID of the token's lexical type."""
def __get__(self):
return self.c.lex.id
property rank:
# TODO: add docstring
def __get__(self):
return self.c.lex.id
@ -126,10 +133,19 @@ cdef class Token:
return self.text_with_ws
property text:
"""A unicode representation of the token text.
RETURNS (unicode): The original verbatim text of the token.
"""
def __get__(self):
return self.orth_
property text_with_ws:
"""The text content of the token with a trailing whitespace character if
it has one.
RETURNS (unicode): The text content of the token (with trailing whitespace).
"""
def __get__(self):
cdef unicode orth = self.vocab.strings[self.c.lex.orth]
if self.c.spacy:
@ -184,6 +200,10 @@ cdef class Token:
return self.c.lex.suffix
property lemma:
"""Base form of the word, with no inflectional suffixes.
RETURNS (int): Token lemma.
"""
def __get__(self):
return self.c.lemma
def __set__(self, int lemma):
@ -206,8 +226,10 @@ cdef class Token:
self.c.dep = label
property has_vector:
"""
A boolean value indicating whether a word vector is associated with the object.
"""A boolean value indicating whether a word vector is associated with
the object.
RETURNS (bool): Whether a word vector is associated with the object.
"""
def __get__(self):
if 'has_vector' in self.doc.user_token_hooks:
@ -220,10 +242,10 @@ cdef class Token:
return False
property vector:
"""
A real-valued meaning representation.
"""A real-valued meaning representation.
Type: numpy.ndarray[ndim=1, dtype='float32']
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
representing the token's semantics.
"""
def __get__(self):
if 'vector' in self.doc.user_token_hooks:
@ -239,15 +261,11 @@ cdef class Token:
vector_view = <float[:length,]>self.c.lex.vector
return numpy.asarray(vector_view)
property repvec:
def __get__(self):
raise AttributeError("repvec was renamed to vector in v0.100")
property has_repvec:
def __get__(self):
raise AttributeError("has_repvec was renamed to has_vector in v0.100")
property vector_norm:
"""The L2 norm of the token's vector representation.
RETURNS (float): The L2 norm of the vector representation.
"""
def __get__(self):
if 'vector_norm' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['vector_norm'](self)
@ -324,28 +342,26 @@ cdef class Token:
yield from word.subtree
property left_edge:
"""
The leftmost token of this token's syntactic descendents.
"""The leftmost token of this token's syntactic descendants.
Returns: Token The first token such that self.is_ancestor(token)
RETURNS (Token): The first token such that `self.is_ancestor(token)`.
"""
def __get__(self):
return self.doc[self.c.l_edge]
property right_edge:
"""
The rightmost token of this token's syntactic descendents.
"""The rightmost token of this token's syntactic descendants.
Returns: Token The last token such that self.is_ancestor(token)
RETURNS (Token): The last token such that `self.is_ancestor(token)`.
"""
def __get__(self):
return self.doc[self.c.r_edge]
property ancestors:
"""
A sequence of this token's syntactic ancestors.
"""A sequence of this token's syntactic ancestors.
Yields: Token A sequence of ancestor tokens such that ancestor.is_ancestor(self)
YIELDS (Token): A sequence of ancestor tokens such that
`ancestor.is_ancestor(self)`.
"""
def __get__(self):
cdef const TokenC* head_ptr = self.c
@ -357,33 +373,25 @@ cdef class Token:
yield self.doc[head_ptr - (self.c - self.i)]
i += 1
def is_ancestor_of(self, descendant):
# TODO: Remove after backward compatibility check.
return self.is_ancestor(descendant)
def is_ancestor(self, descendant):
"""
Check whether this token is a parent, grandparent, etc. of another
"""Check whether this token is a parent, grandparent, etc. of another
in the dependency tree.
Arguments:
descendant (Token): Another token.
Returns:
is_ancestor (bool): Whether this token is the ancestor of the descendant.
descendant (Token): Another token.
RETURNS (bool): Whether this token is the ancestor of the descendant.
"""
if self.doc is not descendant.doc:
return False
return any( ancestor.i == self.i for ancestor in descendant.ancestors )
property head:
"""
The syntactic parent, or "governor", of this token.
"""The syntactic parent, or "governor", of this token.
Returns: Token
RETURNS (Token): The token head.
"""
def __get__(self):
"""
The token predicted by the parser to be the head of the current token.
"""The token predicted by the parser to be the head of the current
token.
"""
return self.doc[self.i + self.c.head]
def __set__(self, Token new_head):
@ -477,10 +485,9 @@ cdef class Token:
self.c.head = rel_newhead_i
property conjuncts:
"""
A sequence of coordinated tokens, including the token itself.
"""A sequence of coordinated tokens, including the token itself.
Yields: Token A coordinated token
YIELDS (Token): A coordinated token.
"""
def __get__(self):
"""Get a list of conjoined words."""
@ -495,25 +502,46 @@ cdef class Token:
yield from word.conjuncts
property ent_type:
"""Named entity type.
RETURNS (int): Named entity type.
"""
def __get__(self):
return self.c.ent_type
property ent_iob:
"""IOB code of named entity tag. `1="I", 2="O", 3="B"`. 0 means no tag
is assigned.
RETURNS (int): IOB code of named entity tag.
"""
def __get__(self):
return self.c.ent_iob
property ent_type_:
"""Named entity type.
RETURNS (unicode): Named entity type.
"""
def __get__(self):
return self.vocab.strings[self.c.ent_type]
property ent_iob_:
"""IOB code of named entity tag. "B" means the token begins an entity,
"I" means it is inside an entity, "O" means it is outside an entity, and
"" means no entity tag is set.
RETURNS (unicode): IOB code of named entity tag.
"""
def __get__(self):
iob_strings = ('', 'I', 'O', 'B')
return iob_strings[self.c.ent_iob]
property ent_id:
"""
An (integer) entity ID. Usually assigned by patterns in the Matcher.
"""ID of the entity the token is an instance of, if any. Usually
assigned by patterns in the Matcher.
RETURNS (int): ID of the entity.
"""
def __get__(self):
return self.c.ent_id
@ -522,8 +550,10 @@ cdef class Token:
self.c.ent_id = key
property ent_id_:
"""
A (string) entity ID. Usually assigned by patterns in the Matcher.
"""ID of the entity the token is an instance of, if any. Usually
assigned by patterns in the Matcher.
RETURNS (unicode): ID of the entity.
"""
def __get__(self):
return self.vocab.strings[self.c.ent_id]
@ -564,6 +594,10 @@ cdef class Token:
return self.vocab.strings[self.c.lex.lang]
property lemma_:
"""Base form of the word, with no inflectional suffixes.
RETURNS (unicode): Token lemma.
"""
def __get__(self):
return self.vocab.strings[self.c.lemma]
def __set__(self, unicode lemma_):

View File

@ -4,9 +4,255 @@ include ../../_includes/_mixins
p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
+h(2, "init") Token.__init__
+tag method
p Construct a #[code Token] object.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
token = doc[0]
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code doc]
+cell #[code Doc]
+cell The parent document.
+row
+cell #[code offset]
+cell int
+cell The index of the token within the document.
+footrow
+cell returns
+cell #[code Token]
+cell The newly constructed object.
+h(2, "len") Token.__len__
+tag method
p The number of unicode characters in the token, i.e. #[code token.text].
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
token = doc[0]
assert len(token) == 4
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell int
+cell The number of unicode characters in the token.
+h(2, "check_flag") Token.check_flag
+tag method
p Check the value of a boolean flag.
+aside-code("Example").
from spacy.attrs import IS_TITLE
doc = nlp(u'Give it back! He pleaded.')
token = doc[0]
token.check_flag(IS_TITLE)
# True
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to check.
+footrow
+cell returns
+cell bool
+cell Whether the flag is set.
+h(2, "nbor") Token.nbor
+tag method
p Get a neighboring token.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
token = doc[0]
token.nbor()
# it
+table(["Name", "Type", "Description"])
+row
+cell #[code i]
+cell int
+cell The relative position of the token to get. Defaults to #[code 1].
+footrow
+cell returns
+cell #[code Token]
+cell The token at position #[code self.doc[self.i+i]].
+h(2, "similarity") Token.similarity
+tag method
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
+aside-code("Example").
apples, _, oranges = nlp(u'apples and oranges')
apples_oranges = apples.similarity(oranges)
oranges_apples = oranges.similarity(apples)
assert apples_oranges == oranges_apples
+table(["Name", "Type", "Description"])
+row
+cell other
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell returns
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "is_ancestor") Token.is_ancestor
+tag method
p
| Check whether this token is a parent, grandparent, etc. of another
| in the dependency tree.
+table(["Name", "Type", "Description"])
+row
+cell descendant
+cell #[code Token]
+cell Another token.
+footrow
+cell returns
+cell bool
+cell Whether this token is the ancestor of the descendant.
+h(2, "has_vector") Token.has_vector
+tag property
+tag requires model
p
| A boolean value indicating whether a word vector is associated with the
| token.
+aside-code("Example").
apple = nlp(u'apple')
assert apple.has_vector
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell bool
+cell Whether the token has vector data attached.
+h(2, "vector") Token.vector
+tag property
+tag requires model
p
| A real-valued meaning representation.
+aside-code("Example").
apple = nlp(u'apple')
(apple.vector.dtype, apple.vector.shape)
# (dtype('float32'), (300,))
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the token's semantics.
+h(2, "vector_norm") Token.vector_norm
+tag property
+tag requires model
p
| The L2 norm of the token's vector representation.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell float
+cell The L2 norm of the vector representation.
+h(2, "conjuncts") Token.conjuncts
+tag property
p A sequence of coordinated tokens, including the token itself.
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A coordinated token.
+h(2, "children") Token.children
+tag property
p A sequence of the token's immediate syntactic children.
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A child token such that #[code child.head==self].
+h(2, "subtree") Token.subtree
+tag property
p A sequence of all the token's syntactic descendants.
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A descendant token such that #[code self.is_ancestor(descendant)].
+h(2, "ancestors") Token.ancestors
+tag property
p A sequence of this token's syntactic ancestors.
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell
| A sequence of ancestor tokens such that
| #[code ancestor.is_ancestor(self)].
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code text]
+cell unicode
+cell Verbatim text content.
+row
+cell #[code text_with_ws]
+cell unicode
+cell Text content, with trailing space character if present.
+row
+cell #[code whitespace]
+cell int
+cell Trailing space character if present.
+row
+cell #[code whitespace_]
+cell unicode
+cell Trailing space character if present.
+row
+cell #[code vocab]
+cell #[code Vocab]
@ -17,14 +263,31 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
+cell #[code Doc]
+cell The parent document.
+row
+cell #[code head]
+cell #[code Token]
+cell The syntactic parent, or "governor", of this token.
+row
+cell #[code left_edge]
+cell #[code Token]
+cell The leftmost token of this token's syntactic descendants.
+row
+cell #[code right_edge]
+cell #[code Token]
+cell The rightmost token of this token's syntactic descendants.
+row
+cell #[code i]
+cell int
+cell The index of the token within the parent document.
+row
+cell #[code ent_type]
+cell int
+cell Named entity type.
+row
+cell #[code ent_type_]
+cell unicode
@ -42,19 +305,23 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
+cell unicode
+cell
| IOB code of named entity tag. #[code "B"]
| means the token begins an entity, #[code "I"] means it inside an
| entity, #[code "O"] means it is outside an entity, and
| means the token begins an entity, #[code "I"] means it is inside
| an entity, #[code "O"] means it is outside an entity, and
| #[code ""] means no entity tag is set.
+row
+cell #[code ent_id]
+cell int
+cell ID of the entity the token is an instance of, if any.
+cell
| ID of the entity the token is an instance of, if any. Usually
| assigned by patterns in the Matcher.
+row
+cell #[code ent_id_]
+cell unicode
+cell ID of the entity the token is an instance of, if any.
+cell
| ID of the entity the token is an instance of, if any. Usually
| assigned by patterns in the Matcher.
+row
+cell #[code lemma]
@ -229,232 +496,3 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
+cell #[code lex_id]
+cell int
+cell ID of the token's lexical type.
+row
+cell #[code text]
+cell unicode
+cell Verbatim text content.
+row
+cell #[code text_with_ws]
+cell unicode
+cell Text content, with trailing space character if present.
+row
+cell #[code whitespace]
+cell int
+cell Trailing space character if present.
+row
+cell #[code whitespace_]
+cell unicode
+cell Trailing space character if present.
+h(2, "init") Token.__init__
+tag method
p Construct a #[code Token] object.
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code doc]
+cell #[code Doc]
+cell The parent document.
+row
+cell #[code offset]
+cell int
+cell The index of the token within the document.
+footrow
+cell returns
+cell #[code Token]
+cell The newly constructed object.
+h(2, "len") Token.__len__
+tag method
p Get the number of unicode characters in the token.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell int
+cell The number of unicode characters in the token.
+h(2, "check_flag") Token.check_flag
+tag method
p Check the value of a boolean flag.
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to check.
+footrow
+cell returns
+cell bool
+cell Whether the flag is set.
+h(2, "nbor") Token.nbor
+tag method
p Get a neighboring token.
+table(["Name", "Type", "Description"])
+row
+cell #[code i]
+cell int
+cell The relative position of the token to get. Defaults to #[code 1].
+footrow
+cell returns
+cell #[code Token]
+cell The token at position #[code self.doc[self.i+i]]
+h(2, "similarity") Token.similarity
+tag method
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
+table(["Name", "Type", "Description"])
+row
+cell other
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell returns
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "is_ancestor") Token.is_ancestor
+tag method
p
| Check whether this token is a parent, grandparent, etc. of another
| in the dependency tree.
+table(["Name", "Type", "Description"])
+row
+cell descendant
+cell #[code Token]
+cell Another token.
+footrow
+cell returns
+cell bool
+cell Whether this token is the ancestor of the descendant.
+h(2, "vector") Token.vector
+tag property
p A real-valued meaning representation.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the token's semantics.
+h(2, "has_vector") Token.has_vector
+tag property
p
| A boolean value indicating whether a word vector is associated with the
| object.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell bool
+cell Whether the token has a vector data attached.
+h(2, "head") Token.head
+tag property
p The syntactic parent, or "governor", of this token.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code Token]
+cell The head.
+h(2, "conjuncts") Token.conjuncts
+tag property
p A sequence of coordinated tokens, including the token itself.
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A coordinated token.
+h(2, "children") Token.children
+tag property
p A sequence of the token's immediate syntactic children.
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A child token such that #[code child.head==self].
+h(2, "subtree") Token.subtree
+tag property
p A sequence of all the token's syntactic descendents.
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A descendant token such that #[code self.is_ancestor(descendant)].
+h(2, "left_edge") Token.left_edge
+tag property
p The leftmost token of this token's syntactic descendants.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code Token]
+cell The first token such that #[code self.is_ancestor(token)].
+h(2, "right_edge") Token.right_edge
+tag property
p The rightmost token of this token's syntactic descendents.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code Token]
+cell The last token such that #[code self.is_ancestor(token)].
+h(2, "ancestors") Token.ancestors
+tag property
p The rightmost token of this token's syntactic descendants.
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell
| A sequence of ancestor tokens such that
| #[code ancestor.is_ancestor(self)].