Update docstrings and API docs for Token

Mirror of https://github.com/explosion/spaCy.git
parent 62ceec4fc6
commit e9e62b01b0
@@ -23,10 +23,14 @@ from .. import about


 cdef class Token:
-    """
-    An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
-    """
+    """An individual token – i.e. a word, punctuation symbol, whitespace, etc."""
     def __cinit__(self, Vocab vocab, Doc doc, int offset):
+        """Construct a `Token` object.
+
+        vocab (Vocab): A storage container for lexical types.
+        doc (Doc): The parent document.
+        offset (int): The index of the token within the document.
+        """
         self.vocab = vocab
         self.doc = doc
         self.c = &self.doc.c[offset]
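Usage sketch (illustrative, not part of the diff): a `Token` is normally created by indexing into a `Doc` rather than by calling the constructor directly. This assumes spaCy with an English model installed and loaded as `nlp`.

    import spacy

    nlp = spacy.load('en')                       # assumes an installed English model
    doc = nlp(u'Give it back! He pleaded.')
    token = doc[0]                               # the Doc builds Token views over its own storage
    print(token.i, token.text)                   # 0 Give
    print(token.vocab is doc.vocab)              # True -- the token shares the Doc's Vocab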
@@ -36,8 +40,9 @@ cdef class Token:
         return hash((self.doc, self.i))

     def __len__(self):
-        """
-        Number of unicode characters in token.text.
+        """The number of unicode characters in the token, i.e. `token.text`.
+
+        RETURNS (int): The number of unicode characters in the token.
         """
         return self.c.lex.length

@@ -75,37 +80,35 @@ cdef class Token:
             raise ValueError(op)

     cpdef bint check_flag(self, attr_id_t flag_id) except -1:
-        """
-        Check the value of a boolean flag.
+        """Check the value of a boolean flag.

-        Arguments:
-            flag_id (int): The ID of the flag attribute.
-        Returns:
-            is_set (bool): Whether the flag is set.
+        flag_id (int): The ID of the flag attribute.
+        RETURNS (bool): Whether the flag is set.
+
+        EXAMPLE:
+            >>> from spacy.attrs import IS_TITLE
+            >>> doc = nlp(u'Give it back! He pleaded.')
+            >>> token = doc[0]
+            >>> token.check_flag(IS_TITLE)
+            True
         """
         return Lexeme.c_check_flag(self.c.lex, flag_id)

     def nbor(self, int i=1):
-        """
-        Get a neighboring token.
+        """Get a neighboring token.

-        Arguments:
-            i (int): The relative position of the token to get. Defaults to 1.
-        Returns:
-            neighbor (Token): The token at position self.doc[self.i+i]
+        i (int): The relative position of the token to get. Defaults to 1.
+        RETURNS (Token): The token at position `self.doc[self.i+i]`.
         """
         return self.doc[self.i+i]

     def similarity(self, other):
-        """
-        Compute a semantic similarity estimate. Defaults to cosine over vectors.
+        """Make a semantic similarity estimate. The default estimate is cosine
+        similarity using an average of word vectors.

-        Arguments:
-            other:
-                The object to compare with. By default, accepts Doc, Span,
-                Token and Lexeme objects.
-        Returns:
-            score (float): A scalar similarity score. Higher is more similar.
+        other (object): The object to compare with. By default, accepts `Doc`,
+            `Span`, `Token` and `Lexeme` objects.
+        RETURNS (float): A scalar similarity score. Higher is more similar.
         """
         if 'similarity' in self.doc.user_token_hooks:
             return self.doc.user_token_hooks['similarity'](self)
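A runnable sketch of the three methods documented above (check_flag, nbor, similarity), reusing the `nlp` object from the earlier sketch; the similarity call only gives meaningful scores with a model that ships word vectors.

    from spacy.attrs import IS_TITLE

    doc = nlp(u'Give it back! He pleaded.')
    give = doc[0]
    print(give.check_flag(IS_TITLE))             # True -- 'Give' is titlecased
    print(give.nbor().text)                      # 'it', i.e. doc[give.i + 1]
    # cosine similarity over word vectors; symmetric by construction
    print(give.similarity(doc[1]), doc[1].similarity(give))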
@@ -114,10 +117,14 @@ cdef class Token:
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)

     property lex_id:
+        """ID of the token's lexical type.
+
+        RETURNS (int): ID of the token's lexical type."""
         def __get__(self):
             return self.c.lex.id

     property rank:
+        # TODO: add docstring
         def __get__(self):
             return self.c.lex.id

@@ -126,10 +133,19 @@ cdef class Token:
         return self.text_with_ws

     property text:
+        """A unicode representation of the token text.
+
+        RETURNS (unicode): The original verbatim text of the token.
+        """
         def __get__(self):
             return self.orth_

     property text_with_ws:
+        """The text content of the token with a trailing whitespace character if
+        it has one.
+
+        RETURNS (unicode): The text content of the span (with trailing whitespace).
+        """
         def __get__(self):
             cdef unicode orth = self.vocab.strings[self.c.lex.orth]
             if self.c.spacy:
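Sketch of the difference between text and text_with_ws documented above; joining text_with_ws over a Doc reproduces the original string.

    doc = nlp(u'Give it back! He pleaded.')
    bang = doc[3]
    print(repr(bang.text))                       # '!'  -- verbatim text only
    print(repr(bang.text_with_ws))               # '! ' -- includes the trailing space, if any
    assert ''.join(t.text_with_ws for t in doc) == doc.text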
@@ -184,6 +200,10 @@ cdef class Token:
             return self.c.lex.suffix

     property lemma:
+        """Base form of the word, with no inflectional suffixes.
+
+        RETURNS (int): Token lemma.
+        """
         def __get__(self):
             return self.c.lemma
         def __set__(self, int lemma):
@@ -206,8 +226,10 @@ cdef class Token:
             self.c.dep = label

     property has_vector:
-        """
-        A boolean value indicating whether a word vector is associated with the object.
+        """A boolean value indicating whether a word vector is associated with
+        the object.
+
+        RETURNS (bool): Whether a word vector is associated with the object.
         """
         def __get__(self):
             if 'has_vector' in self.doc.user_token_hooks:
@@ -220,10 +242,10 @@ cdef class Token:
                 return False

     property vector:
-        """
-        A real-valued meaning representation.
+        """A real-valued meaning representation.

-        Type: numpy.ndarray[ndim=1, dtype='float32']
+        RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
+            representing the token's semantics.
         """
         def __get__(self):
             if 'vector' in self.doc.user_token_hooks:
@@ -239,15 +261,11 @@ cdef class Token:
                 vector_view = <float[:length,]>self.c.lex.vector
             return numpy.asarray(vector_view)

-    property repvec:
-        def __get__(self):
-            raise AttributeError("repvec was renamed to vector in v0.100")
-
-    property has_repvec:
-        def __get__(self):
-            raise AttributeError("has_repvec was renamed to has_vector in v0.100")
-
     property vector_norm:
+        """The L2 norm of the document's vector representation.
+
+        RETURNS (float): The L2 norm of the vector representation.
+        """
         def __get__(self):
             if 'vector_norm' in self.doc.user_token_hooks:
                 return self.doc.user_token_hooks['vector_norm'](self)
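Sketch relating has_vector, vector and vector_norm to the cosine expression visible in the similarity context earlier in the diff; it requires a model with word vectors.

    import numpy

    doc = nlp(u'apples and oranges')
    apples, oranges = doc[0], doc[2]
    if apples.has_vector and oranges.has_vector:
        # same cosine estimate the similarity() method computes from the vectors
        cosine = numpy.dot(apples.vector, oranges.vector) / (apples.vector_norm * oranges.vector_norm)
        print(cosine, apples.similarity(oranges))   # the two estimates should agree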
@@ -324,28 +342,26 @@ cdef class Token:
                 yield from word.subtree

     property left_edge:
-        """
-        The leftmost token of this token's syntactic descendents.
+        """The leftmost token of this token's syntactic descendents.

-        Returns: Token The first token such that self.is_ancestor(token)
+        RETURNS (Token): The first token such that `self.is_ancestor(token)`.
         """
         def __get__(self):
             return self.doc[self.c.l_edge]

     property right_edge:
-        """
-        The rightmost token of this token's syntactic descendents.
+        """The rightmost token of this token's syntactic descendents.

-        Returns: Token The last token such that self.is_ancestor(token)
+        RETURNS (Token): The last token such that `self.is_ancestor(token)`.
         """
         def __get__(self):
             return self.doc[self.c.r_edge]

     property ancestors:
-        """
-        A sequence of this token's syntactic ancestors.
+        """A sequence of this token's syntactic ancestors.

-        Yields: Token A sequence of ancestor tokens such that ancestor.is_ancestor(self)
+        YIELDS (Token): A sequence of ancestor tokens such that
+            `ancestor.is_ancestor(self)`.
         """
         def __get__(self):
             cdef const TokenC* head_ptr = self.c
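Sketch of left_edge, right_edge and ancestors for a parsed Doc; the two edges bound the token's subtree, so slicing between them yields the covered span.

    doc = nlp(u'Give it back! He pleaded.')
    pleaded = doc[5]
    span = doc[pleaded.left_edge.i : pleaded.right_edge.i + 1]
    print(span.text)                             # the text covered by pleaded's subtree
    for ancestor in doc[4].ancestors:            # walk upwards from 'He'
        print(ancestor.text)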
@@ -357,33 +373,25 @@ cdef class Token:
                 yield self.doc[head_ptr - (self.c - self.i)]
                 i += 1

-    def is_ancestor_of(self, descendant):
-        # TODO: Remove after backward compatibility check.
-        return self.is_ancestor(descendant)
-
     def is_ancestor(self, descendant):
-        """
-        Check whether this token is a parent, grandparent, etc. of another
+        """Check whether this token is a parent, grandparent, etc. of another
         in the dependency tree.

-        Arguments:
-            descendant (Token): Another token.
-        Returns:
-            is_ancestor (bool): Whether this token is the ancestor of the descendant.
+        descendant (Token): Another token.
+        RETURNS (bool): Whether this token is the ancestor of the descendant.
         """
         if self.doc is not descendant.doc:
             return False
         return any( ancestor.i == self.i for ancestor in descendant.ancestors )

     property head:
-        """
-        The syntactic parent, or "governor", of this token.
+        """The syntactic parent, or "governor", of this token.

-        Returns: Token
+        RETURNS (Token): The token head.
         """
         def __get__(self):
-            """
-            The token predicted by the parser to be the head of the current token.
+            """The token predicted by the parser to be the head of the current
+            token.
             """
             return self.doc[self.i + self.c.head]
         def __set__(self, Token new_head):
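Sketch of head and is_ancestor: following .head until a token heads itself reaches the sentence root, which dominates every other token in its sentence.

    doc = nlp(u'Give it back! He pleaded.')
    he = doc[4]
    root = he
    while root.head.i != root.i:                 # the root is its own head
        root = root.head
    print(root.text)                             # e.g. 'pleaded'
    print(root.is_ancestor(he))                  # True if 'He' hangs off the root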
@@ -477,10 +485,9 @@ cdef class Token:
             self.c.head = rel_newhead_i

     property conjuncts:
-        """
-        A sequence of coordinated tokens, including the token itself.
+        """A sequence of coordinated tokens, including the token itself.

-        Yields: Token A coordinated token
+        YIELDS (Token): A coordinated token.
         """
         def __get__(self):
             """Get a list of conjoined words."""
@@ -495,25 +502,46 @@ cdef class Token:
                         yield from word.conjuncts

     property ent_type:
+        """Named entity type.
+
+        RETURNS (int): Named entity type.
+        """
         def __get__(self):
             return self.c.ent_type

     property ent_iob:
+        """IOB code of named entity tag. `1="I", 2="O", 3="B"`. 0 means no tag
+        is assigned.
+
+        RETURNS (int): IOB code of named entity tag.
+        """
         def __get__(self):
             return self.c.ent_iob

     property ent_type_:
+        """Named entity type.
+
+        RETURNS (unicode): Named entity type.
+        """
        def __get__(self):
            return self.vocab.strings[self.c.ent_type]

     property ent_iob_:
+        """IOB code of named entity tag. "B" means the token begins an entity,
+        "I" means it is inside an entity, "O" means it is outside an entity, and
+        "" means no entity tag is set.
+
+        RETURNS (unicode): IOB code of named entity tag.
+        """
         def __get__(self):
             iob_strings = ('', 'I', 'O', 'B')
             return iob_strings[self.c.ent_iob]

     property ent_id:
-        """
-        An (integer) entity ID. Usually assigned by patterns in the Matcher.
+        """ID of the entity the token is an instance of, if any. Usually
+        assigned by patterns in the Matcher.
+
+        RETURNS (int): ID of the entity.
         """
         def __get__(self):
             return self.c.ent_id
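Sketch of the per-token entity attributes documented above; the printed values depend on the statistical model, so the comment below is only an illustration.

    doc = nlp(u'San Francisco considers banning sidewalk delivery robots')
    for token in doc[:3]:
        # ent_iob maps onto ('', 'I', 'O', 'B'), mirroring iob_strings above
        print(token.text, token.ent_iob, token.ent_iob_, token.ent_type_)
    # e.g. San 3 B GPE / Francisco 1 I GPE / considers 2 O ''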
@@ -522,8 +550,10 @@ cdef class Token:
             self.c.ent_id = key

     property ent_id_:
-        """
-        A (string) entity ID. Usually assigned by patterns in the Matcher.
+        """ID of the entity the token is an instance of, if any. Usually
+        assigned by patterns in the Matcher.
+
+        RETURNS (unicode): ID of the entity.
         """
         def __get__(self):
             return self.vocab.strings[self.c.ent_id]
@@ -564,6 +594,10 @@ cdef class Token:
             return self.vocab.strings[self.c.lex.lang]

     property lemma_:
+        """Base form of the word, with no inflectional suffixes.
+
+        RETURNS (unicode): Token lemma.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.lemma]
         def __set__(self, unicode lemma_):
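Sketch of lemma vs. lemma_: the integer attribute is the StringStore ID of the unicode lemma, matching the int/unicode RETURNS types above.

    doc = nlp(u'He pleaded.')
    pleaded = doc[1]
    print(pleaded.lemma_)                                        # e.g. 'plead'
    print(pleaded.lemma == doc.vocab.strings[pleaded.lemma_])    # True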
@@ -4,9 +4,255 @@ include ../../_includes/_mixins

 p An individual token — i.e. a word, punctuation symbol, whitespace, etc.

++h(2, "init") Token.__init__
+    +tag method
+
+p Construct a #[code Token] object.
+
++aside-code("Example").
+    doc = nlp(u'Give it back! He pleaded.')
+    token = doc[0]
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code vocab]
+        +cell #[code Vocab]
+        +cell A storage container for lexical types.
+
+    +row
+        +cell #[code doc]
+        +cell #[code Doc]
+        +cell The parent document.
+
+    +row
+        +cell #[code offset]
+        +cell int
+        +cell The index of the token within the document.
+
+    +footrow
+        +cell returns
+        +cell #[code Token]
+        +cell The newly constructed object.
+
++h(2, "len") Token.__len__
+    +tag method
+
+p The number of unicode characters in the token, i.e. #[code token.text].
+
++aside-code("Example").
+    doc = nlp(u'Give it back! He pleaded.')
+    token = doc[0]
+    assert len(token) == 4
+
++table(["Name", "Type", "Description"])
+    +footrow
+        +cell returns
+        +cell int
+        +cell The number of unicode characters in the token.
+
++h(2, "check_flag") Token.check_flag
+    +tag method
+
+p Check the value of a boolean flag.
+
++aside-code("Example").
+    from spacy.attrs import IS_TITLE
+    doc = nlp(u'Give it back! He pleaded.')
+    token = doc[0]
+    token.check_flag(IS_TITLE)
+    # True
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code flag_id]
+        +cell int
+        +cell The attribute ID of the flag to check.
+
+    +footrow
+        +cell returns
+        +cell bool
+        +cell Whether the flag is set.
+
++h(2, "nbor") Token.nbor
+    +tag method
+
+p Get a neighboring token.
+
++aside-code("Example").
+    doc = nlp(u'Give it back! He pleaded.')
+    token = doc[0]
+    token.nbor()
+    # it
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code i]
+        +cell int
+        +cell The relative position of the token to get. Defaults to #[code 1].
+
+    +footrow
+        +cell returns
+        +cell #[code Token]
+        +cell The token at position #[code self.doc[self.i+i]].
+
++h(2, "similarity") Token.similarity
+    +tag method
+
+p Compute a semantic similarity estimate. Defaults to cosine over vectors.
+
++aside-code("Example").
+    apples, and, oranges = nlp(u'apples and oranges')
+    apples_oranges = apples.similarity(oranges)
+    oranges_apples = oranges.similarity(apples)
+    assert apples_oranges == oranges_apples
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell other
+        +cell -
+        +cell
+            | The object to compare with. By default, accepts #[code Doc],
+            | #[code Span], #[code Token] and #[code Lexeme] objects.
+
+    +footrow
+        +cell returns
+        +cell float
+        +cell A scalar similarity score. Higher is more similar.
+
++h(2, "is_ancestor") Token.is_ancestor
+    +tag method
+
+p
+    | Check whether this token is a parent, grandparent, etc. of another
+    | in the dependency tree.
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell descendant
+        +cell #[code Token]
+        +cell Another token.
+
+    +footrow
+        +cell returns
+        +cell bool
+        +cell Whether this token is the ancestor of the descendant.
+
++h(2, "has_vector") Token.has_vector
+    +tag property
+    +tag requires model
+
+p
+    | A boolean value indicating whether a word vector is associated with the
+    | token.
+
++aside-code("Example").
+    apple = nlp(u'apple')
+    assert apple.has_vector
+
++table(["Name", "Type", "Description"])
+    +footrow
+        +cell returns
+        +cell bool
+        +cell Whether the token has a vector data attached.
+
++h(2, "vector") Token.vector
+    +tag property
+    +tag requires model
+
+p
+    | A real-valued meaning representation.
+
++aside-code("Example").
+    apple = nlp(u'apple')
+    (apple.vector.dtype, apple.vector.shape)
+    # (dtype('float32'), (300,))
+
++table(["Name", "Type", "Description"])
+    +footrow
+        +cell returns
+        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell A 1D numpy array representing the token's semantics.
+
++h(2, "vector_norm") Span.vector_norm
+    +tag property
+    +tag requires model
+
+p
+    | The L2 norm of the token's vector representation.
+
++table(["Name", "Type", "Description"])
+    +footrow
+        +cell returns
+        +cell float
+        +cell The L2 norm of the vector representation.
+
++h(2, "conjuncts") Token.conjuncts
+    +tag property
+
+p A sequence of coordinated tokens, including the token itself.
+
++table(["Name", "Type", "Description"])
+    +footrow
+        +cell yields
+        +cell #[code Token]
+        +cell A coordinated token.
+
++h(2, "children") Token.children
+    +tag property
+
+p A sequence of the token's immediate syntactic children.
+
++table(["Name", "Type", "Description"])
+    +footrow
+        +cell yields
+        +cell #[code Token]
+        +cell A child token such that #[code child.head==self].
+
++h(2, "subtree") Token.subtree
+    +tag property
+
+p A sequence of all the token's syntactic descendents.
+
++table(["Name", "Type", "Description"])
+    +footrow
+        +cell yields
+        +cell #[code Token]
+        +cell A descendant token such that #[code self.is_ancestor(descendant)].
+
++h(2, "ancestors") Token.ancestors
+    +tag property
+
+p The rightmost token of this token's syntactic descendants.
+
++table(["Name", "Type", "Description"])
+    +footrow
+        +cell yields
+        +cell #[code Token]
+        +cell
+            | A sequence of ancestor tokens such that
+            | #[code ancestor.is_ancestor(self)].
+
 +h(2, "attributes") Attributes

 +table(["Name", "Type", "Description"])
+    +row
+        +cell #[code text]
+        +cell unicode
+        +cell Verbatim text content.
+    +row
+        +cell #[code text_with_ws]
+        +cell unicode
+        +cell Text content, with trailing space character if present.
+
+    +row
+        +cell #[code whitespace]
+        +cell int
+        +cell Trailing space character if present.
+    +row
+        +cell #[code whitespace_]
+        +cell unicode
+        +cell Trailing space character if present.
+
     +row
         +cell #[code vocab]
         +cell #[code Vocab]
@@ -17,14 +263,31 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
         +cell #[code Doc]
         +cell The parent document.

+    +row
+        +cell #[code head]
+        +cell #[code Token]
+        +cell The syntactic parent, or "governor", of this token.
+
+    +row
+        +cell #[code left_edge]
+        +cell #[code Token]
+        +cell The leftmost token of this token's syntactic descendants.
+
+    +row
+        +cell #[code right_edge]
+        +cell #[code Token]
+        +cell The rightmost token of this token's syntactic descendents.
+
     +row
         +cell #[code i]
         +cell int
         +cell The index of the token within the parent document.

     +row
         +cell #[code ent_type]
         +cell int
         +cell Named entity type.

     +row
         +cell #[code ent_type_]
         +cell unicode
@@ -42,19 +305,23 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
         +cell unicode
         +cell
             | IOB code of named entity tag. #[code "B"]
-            | means the token begins an entity, #[code "I"] means it inside an
-            | entity, #[code "O"] means it is outside an entity, and
+            | means the token begins an entity, #[code "I"] means it is inside
+            | an entity, #[code "O"] means it is outside an entity, and
             | #[code ""] means no entity tag is set.

     +row
         +cell #[code ent_id]
         +cell int
-        +cell ID of the entity the token is an instance of, if any.
+        +cell
+            | ID of the entity the token is an instance of, if any. Usually
+            | assigned by patterns in the Matcher.

     +row
         +cell #[code ent_id_]
         +cell unicode
-        +cell ID of the entity the token is an instance of, if any.
+        +cell
+            | ID of the entity the token is an instance of, if any. Usually
+            | assigned by patterns in the Matcher.

     +row
         +cell #[code lemma]
@@ -229,232 +496,3 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
         +cell #[code lex_id]
         +cell int
         +cell ID of the token's lexical type.
-
-    +row
-        +cell #[code text]
-        +cell unicode
-        +cell Verbatim text content.
-    +row
-        +cell #[code text_with_ws]
-        +cell unicode
-        +cell Text content, with trailing space character if present.
-
-    +row
-        +cell #[code whitespace]
-        +cell int
-        +cell Trailing space character if present.
-    +row
-        +cell #[code whitespace_]
-        +cell unicode
-        +cell Trailing space character if present.
-
-
-+h(2, "init") Token.__init__
-    +tag method
-
-p Construct a #[code Token] object.
-
-+table(["Name", "Type", "Description"])
-    +row
-        +cell #[code vocab]
-        +cell #[code Vocab]
-        +cell A storage container for lexical types.
-
-    +row
-        +cell #[code doc]
-        +cell #[code Doc]
-        +cell The parent document.
-
-    +row
-        +cell #[code offset]
-        +cell int
-        +cell The index of the token within the document.
-
-    +footrow
-        +cell returns
-        +cell #[code Token]
-        +cell The newly constructed object.
-
-+h(2, "len") Token.__len__
-    +tag method
-
-p Get the number of unicode characters in the token.
-
-+table(["Name", "Type", "Description"])
-    +footrow
-        +cell returns
-        +cell int
-        +cell The number of unicode characters in the token.
-
-
-+h(2, "check_flag") Token.check_flag
-    +tag method
-
-p Check the value of a boolean flag.
-
-+table(["Name", "Type", "Description"])
-    +row
-        +cell #[code flag_id]
-        +cell int
-        +cell The attribute ID of the flag to check.
-
-    +footrow
-        +cell returns
-        +cell bool
-        +cell Whether the flag is set.
-
-+h(2, "nbor") Token.nbor
-    +tag method
-
-p Get a neighboring token.
-
-+table(["Name", "Type", "Description"])
-    +row
-        +cell #[code i]
-        +cell int
-        +cell The relative position of the token to get. Defaults to #[code 1].
-
-    +footrow
-        +cell returns
-        +cell #[code Token]
-        +cell The token at position #[code self.doc[self.i+i]]
-
-+h(2, "similarity") Token.similarity
-    +tag method
-
-p Compute a semantic similarity estimate. Defaults to cosine over vectors.
-
-+table(["Name", "Type", "Description"])
-    +row
-        +cell other
-        +cell -
-        +cell
-            | The object to compare with. By default, accepts #[code Doc],
-            | #[code Span], #[code Token] and #[code Lexeme] objects.
-
-    +footrow
-        +cell returns
-        +cell float
-        +cell A scalar similarity score. Higher is more similar.
-
-+h(2, "is_ancestor") Token.is_ancestor
-    +tag method
-
-p
-    | Check whether this token is a parent, grandparent, etc. of another
-    | in the dependency tree.
-
-+table(["Name", "Type", "Description"])
-    +row
-        +cell descendant
-        +cell #[code Token]
-        +cell Another token.
-
-    +footrow
-        +cell returns
-        +cell bool
-        +cell Whether this token is the ancestor of the descendant.
-
-
-+h(2, "vector") Token.vector
-    +tag property
-
-p A real-valued meaning representation.
-
-+table(["Name", "Type", "Description"])
-    +footrow
-        +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
-        +cell A 1D numpy array representing the token's semantics.
-
-+h(2, "has_vector") Token.has_vector
-    +tag property
-
-p
-    | A boolean value indicating whether a word vector is associated with the
-    | object.
-
-+table(["Name", "Type", "Description"])
-    +footrow
-        +cell returns
-        +cell bool
-        +cell Whether the token has a vector data attached.
-
-+h(2, "head") Token.head
-    +tag property
-
-p The syntactic parent, or "governor", of this token.
-
-+table(["Name", "Type", "Description"])
-    +footrow
-        +cell returns
-        +cell #[code Token]
-        +cell The head.
-
-+h(2, "conjuncts") Token.conjuncts
-    +tag property
-
-p A sequence of coordinated tokens, including the token itself.
-
-+table(["Name", "Type", "Description"])
-    +footrow
-        +cell yields
-        +cell #[code Token]
-        +cell A coordinated token.
-
-+h(2, "children") Token.children
-    +tag property
-
-p A sequence of the token's immediate syntactic children.
-
-+table(["Name", "Type", "Description"])
-    +footrow
-        +cell yields
-        +cell #[code Token]
-        +cell A child token such that #[code child.head==self].
-
-+h(2, "subtree") Token.subtree
-    +tag property
-
-p A sequence of all the token's syntactic descendents.
-
-+table(["Name", "Type", "Description"])
-    +footrow
-        +cell yields
-        +cell #[code Token]
-        +cell A descendant token such that #[code self.is_ancestor(descendant)].
-
-+h(2, "left_edge") Token.left_edge
-    +tag property
-
-p The leftmost token of this token's syntactic descendants.
-
-+table(["Name", "Type", "Description"])
-    +footrow
-        +cell returns
-        +cell #[code Token]
-        +cell The first token such that #[code self.is_ancestor(token)].
-
-+h(2, "right_edge") Token.right_edge
-    +tag property
-
-p The rightmost token of this token's syntactic descendents.
-
-+table(["Name", "Type", "Description"])
-    +footrow
-        +cell returns
-        +cell #[code Token]
-        +cell The last token such that #[code self.is_ancestor(token)].
-
-+h(2, "ancestors") Token.ancestors
-    +tag property
-
-p The rightmost token of this token's syntactic descendants.
-
-+table(["Name", "Type", "Description"])
-    +footrow
-        +cell yields
-        +cell #[code Token]
-        +cell
-            | A sequence of ancestor tokens such that
-            | #[code ancestor.is_ancestor(self)].