Tidy up Doc, Token and Span and add missing docs

ines 2017-10-27 17:07:26 +02:00
parent a6135336f5
commit 544a407b93
7 changed files with 384 additions and 237 deletions

View File

@ -326,7 +326,8 @@ cdef class Doc:
if self._vector is not None:
return self._vector
elif not len(self):
self._vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
self._vector = numpy.zeros((self.vocab.vectors_length,),
dtype='f')
return self._vector
elif self.has_vector:
vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
@ -338,7 +339,8 @@ cdef class Doc:
self._vector = self.tensor.mean(axis=0)
return self._vector
else:
return numpy.zeros((self.vocab.vectors_length,), dtype='float32')
return numpy.zeros((self.vocab.vectors_length,),
dtype='float32')
def __set__(self, value):
self._vector = value
@ -424,7 +426,8 @@ cdef class Doc:
def __set__(self, ents):
# TODO:
# 1. Allow negative matches
# 2. Ensure pre-set NERs are not over-written during statistical prediction
# 2. Ensure pre-set NERs are not over-written during statistical
# prediction
# 3. Test basic data-driven ORTH gazetteer
# 4. Test more nuanced date and currency regex
cdef int i
@ -433,7 +436,7 @@ cdef class Doc:
# At this point we don't know whether the NER has run over the
# Doc. If the ent_iob is missing, leave it missing.
if self.c[i].ent_iob != 0:
self.c[i].ent_iob = 2 # Means O. Non-O are set from ents.
self.c[i].ent_iob = 2 # Means O. Non-O are set from ents.
cdef attr_t ent_type
cdef int start, end
for ent_info in ents:
@ -574,18 +577,19 @@ cdef class Doc:
# Allow strings, e.g. 'lemma' or 'LEMMA'
py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, 'upper') else id_)
for id_ in py_attr_ids]
# Make an array from the attributes --- otherwise our inner loop is Python
# dict iteration.
# Make an array from the attributes --- otherwise our inner loop is
# Python dict iteration.
attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
output = numpy.ndarray(shape=(self.length, len(attr_ids)),
dtype=numpy.uint64)
for i in range(self.length):
for j, feature in enumerate(attr_ids):
output[i, j] = get_token_attr(&self.c[i], feature)
# Handle 1d case
return output if len(attr_ids) >= 2 else output.reshape((self.length,))
def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
def count_by(self, attr_id_t attr_id, exclude=None,
PreshCounter counts=None):
"""Count the frequencies of a given attribute. Produces a dict of
`{attribute (int): count (ints)}` frequencies, keyed by the values of
the given attribute ID.
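For orientation, a quick usage sketch of `Doc.to_array` and `Doc.count_by`
(a sketch assuming the small English model is installed):

    import spacy
    from spacy.attrs import LOWER, POS

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'Give it back! He pleaded.')
    # Attribute IDs and string names ('lower', 'LOWER') are both accepted.
    arr = doc.to_array([LOWER, POS])
    assert arr.shape == (len(doc), 2)
    # A single attribute takes the 1d reshape branch above.
    assert doc.to_array([LOWER]).shape == (len(doc),)
    # count_by returns {attribute ID (int): count (int)}.
    pos_counts = doc.count_by(POS)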
@ -708,7 +712,8 @@ cdef class Doc:
elif (token_j.head == token_j) and (token_k.head == token_k):
lca_index = -1
else:
lca_index = __pairwise_lca(token_j.head, token_k.head, lca_matrix)
lca_index = __pairwise_lca(token_j.head, token_k.head,
lca_matrix)
lca_matrix[token_j.i][token_k.i] = lca_index
lca_matrix[token_k.i][token_j.i] = lca_index
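For context, `__pairwise_lca` backs the public `Doc.get_lca_matrix()`. A
minimal sketch of the result, assuming a parsed `doc` (pairs with no common
ancestor are marked -1):

    lca = doc.get_lca_matrix()
    assert lca.shape == (len(doc), len(doc))
    # A token should be its own lowest common ancestor with itself.
    assert lca[0][0] == 0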
@ -728,7 +733,7 @@ cdef class Doc:
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
it doesn't exist. Paths may be either strings or Path-like objects.
"""
with path.open('wb') as file_:
file_.write(self.to_bytes(**exclude))
@ -751,7 +756,7 @@ cdef class Doc:
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
all annotations.
"""
array_head = [LENGTH,SPACY,TAG,LEMMA,HEAD,DEP,ENT_IOB,ENT_TYPE]
array_head = [LENGTH, SPACY, TAG, LEMMA, HEAD, DEP, ENT_IOB, ENT_TYPE]
# Msgpack doesn't distinguish between lists and tuples, which is
# vexing for user data. As a best guess, we *know* that within
# keys, we must have tuples. In values we just have to hope
@ -794,7 +799,8 @@ cdef class Doc:
# keys, we must have tuples. In values we just have to hope
# users don't mind getting a list instead of a tuple.
if 'user_data' not in exclude and 'user_data_keys' in msg:
user_data_keys = msgpack.loads(msg['user_data_keys'], use_list=False)
user_data_keys = msgpack.loads(msg['user_data_keys'],
use_list=False)
user_data_values = msgpack.loads(msg['user_data_values'])
for key, value in zip(user_data_keys, user_data_values):
self.user_data[key] = value
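A round-trip sketch of the serialization above; per the msgpack caveat,
tuple values in `user_data` come back as lists, while keys are restored as
tuples:

    from spacy.tokens import Doc

    data = doc.to_bytes()
    doc2 = Doc(doc.vocab).from_bytes(data)
    assert doc2.text == doc.text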
@ -853,7 +859,8 @@ cdef class Doc:
"Doc.merge received %d non-keyword arguments. Expected either "
"3 arguments (deprecated), or 0 (use keyword arguments). "
"Arguments supplied:\n%s\n"
"Keyword arguments: %s\n" % (len(args), repr(args), repr(attributes)))
"Keyword arguments: %s\n" % (len(args), repr(args),
repr(attributes)))
# More deprecated attribute handling =/
if 'label' in attributes:
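To illustrate the keyword-argument form the message above recommends (start
and end are character offsets into the text; a sketch assuming a loaded
pipeline):

    doc = nlp(u'Los Angeles starts.')
    doc.merge(0, len(u'Los Angeles'), tag=u'NNP', lemma=u'Los Angeles',
              ent_type=u'GPE')
    assert doc[0].text == u'Los Angeles'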

View File

@ -128,14 +128,17 @@ cdef class Span:
@property
def _(self):
"""User space for adding custom attribute extensions."""
return Underscore(Underscore.span_extensions, self,
start=self.start_char, end=self.end_char)
def as_doc(self):
'''Create a Doc object view of the Span's data.
This is mostly useful for C-typed interfaces.
'''
# TODO: fix
"""Create a `Doc` object view of the Span's data. This is mostly
useful for C-typed interfaces.
RETURNS (Doc): The `Doc` view of the span.
"""
cdef Doc doc = Doc(self.doc.vocab)
doc.length = self.end-self.start
doc.c = &self.doc.c[self.start]
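A usage sketch for `Span.as_doc()`; note the TODO above, and that the
returned `Doc` is a view sharing the parent's token data rather than a copy:

    doc = nlp(u'I like New York in Autumn.')
    span_doc = doc[2:4].as_doc()
    assert span_doc[0].text == u'New'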
@ -259,10 +262,7 @@ cdef class Span:
self.end = end + 1
property sent:
"""The sentence span that this span is a part of.
RETURNS (Span): The sentence span that the span is a part of.
"""
"""RETURNS (Span): The sentence span that the span is a part of."""
def __get__(self):
if 'sent' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['sent'](self)
@ -275,13 +275,10 @@ cdef class Span:
n += 1
if n >= self.doc.length:
raise RuntimeError
return self.doc[root.l_edge : root.r_edge + 1]
return self.doc[root.l_edge:root.r_edge + 1]
property has_vector:
"""A boolean value indicating whether a word vector is associated with
the object.
RETURNS (bool): Whether a word vector is associated with the object.
"""RETURNS (bool): Whether a word vector is associated with the object.
"""
def __get__(self):
if 'has_vector' in self.doc.user_span_hooks:
@ -303,10 +300,7 @@ cdef class Span:
return self._vector
property vector_norm:
"""The L2 norm of the document's vector representation.
RETURNS (float): The L2 norm of the vector representation.
"""
"""RETURNS (float): The L2 norm of the vector representation."""
def __get__(self):
if 'vector_norm' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['vector'](self)
@ -320,7 +314,9 @@ cdef class Span:
return self._vector_norm
property sentiment:
# TODO: docstring
"""RETURNS (float): A scalar value indicating the positivity or
negativity of the span.
"""
def __get__(self):
if 'sentiment' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['sentiment'](self)
@ -328,10 +324,7 @@ cdef class Span:
return sum([token.sentiment for token in self]) / len(self)
property text:
"""A unicode representation of the span text.
RETURNS (unicode): The original verbatim text of the span.
"""
"""RETURNS (unicode): The original verbatim text of the span."""
def __get__(self):
text = self.text_with_ws
if self[-1].whitespace_:
@ -364,10 +357,11 @@ cdef class Span:
"requires a statistical model to be installed and loaded. "
"For more info, see the "
"documentation: \n%s\n" % about.__docs_models__)
# Accumulate the result before beginning to iterate over it. This prevents
# the tokenisation from being changed out from under us during the iteration.
# The tricky thing here is that Span accepts its tokenisation changing,
# so it's okay once we have the Span objects. See Issue #375
# Accumulate the result before beginning to iterate over it. This
# prevents the tokenisation from being changed out from under us
# during the iteration. The tricky thing here is that Span accepts
# its tokenisation changing, so it's okay once we have the Span
# objects. See Issue #375
spans = []
cdef attr_t label
for start, end, label in self.doc.noun_chunks_iterator(self):
@ -459,7 +453,7 @@ cdef class Span:
YIELDS (Token): A left-child of a token of the span.
"""
def __get__(self):
for token in reversed(self): # Reverse, so we get tokens in order
for token in reversed(self): # Reverse, so we get tokens in order
for left in token.lefts:
if left.i < self.start:
yield left
@ -476,6 +470,20 @@ cdef class Span:
if right.i >= self.end:
yield right
property n_lefts:
"""RETURNS (int): The number of leftward immediate children of the
span, in the syntactic dependency parse.
"""
# TODO: implement
raise NotImplementedError()
property n_rights:
"""RETURNS (int): The number of rightward immediate children of the
span, in the syntactic dependency parse.
"""
# TODO: implement
raise NotImplementedError()
property subtree:
"""Tokens that descend from tokens in the span, but fall outside it.
@ -489,29 +497,21 @@ cdef class Span:
yield from word.subtree
property ent_id:
"""An (integer) entity ID.
RETURNS (uint64): The entity ID.
"""
"""RETURNS (uint64): The entity ID."""
def __get__(self):
return self.root.ent_id
def __set__(self, hash_t key):
# TODO
raise NotImplementedError(
"Can't yet set ent_id from Span. Vote for this feature on "
"the issue tracker: http://github.com/explosion/spaCy/issues")
property ent_id_:
"""A (string) entity ID. Usually assigned by patterns in the `Matcher`.
RETURNS (unicode): The entity ID.
"""
"""RETURNS (unicode): The (string) entity ID."""
def __get__(self):
return self.root.ent_id_
def __set__(self, hash_t key):
# TODO
raise NotImplementedError(
"Can't yet set ent_id_ from Span. Vote for this feature on the "
"issue tracker: http://github.com/explosion/spaCy/issues")
@ -525,10 +525,7 @@ cdef class Span:
return ''.join([t.orth_ for t in self]).strip()
property lemma_:
"""The span's lemma.
RETURNS (unicode): The span's lemma.
"""
"""RETURNS (unicode): The span's lemma."""
def __get__(self):
return ' '.join([t.lemma_ for t in self]).strip()
@ -543,15 +540,12 @@ cdef class Span:
return ''.join([t.text_with_ws.lower() for t in self]).strip()
property string:
"""Deprecated: Use Span.text instead."""
"""Deprecated: Use Span.text_with_ws instead."""
def __get__(self):
return ''.join([t.text_with_ws for t in self])
property label_:
"""The span's label.
RETURNS (unicode): The span's label.
"""
"""RETURNS (unicode): The span's label."""
def __get__(self):
return self.doc.vocab.strings[self.label]
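A short sketch tying together the string-typed span properties documented
above (entity output depends on the loaded model):

    doc = nlp(u'San Francisco considers banning delivery robots')
    span = doc.ents[0]
    assert span.text == u'San Francisco'
    assert span.label_ == u'GPE'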

View File

@ -145,37 +145,32 @@ cdef class Token:
return self.doc.user_token_hooks['similarity'](self)
if self.vector_norm == 0 or other.vector_norm == 0:
return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
return (numpy.dot(self.vector, other.vector) /
(self.vector_norm * other.vector_norm))
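The expression above is cosine similarity; a calling sketch, assuming a
model with word vectors is loaded:

    doc = nlp(u'apple orange')
    sim = doc[0].similarity(doc[1])
    assert -1.0 <= sim <= 1.0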
property lex_id:
"""ID of the token's lexical type.
RETURNS (int): ID of the token's lexical type."""
"""RETURNS (int): Sequential ID of the token's lexical type."""
def __get__(self):
return self.c.lex.id
property rank:
# TODO: add docstring
"""RETURNS (int): Sequential ID of the token's lexical type, used to
index into tables, e.g. for word vectors."""
def __get__(self):
return self.c.lex.id
property string:
"""Deprecated: Use Token.text_with_ws instead."""
def __get__(self):
return self.text_with_ws
property text:
"""A unicode representation of the token text.
RETURNS (unicode): The original verbatim text of the token.
"""
"""RETURNS (unicode): The original verbatim text of the token."""
def __get__(self):
return self.orth_
property text_with_ws:
"""The text content of the token with a trailing whitespace character
if it has one.
RETURNS (unicode): The text content of the span (with trailing
"""RETURNS (unicode): The text content of the span (with trailing
whitespace).
"""
def __get__(self):
@ -186,74 +181,104 @@ cdef class Token:
return orth
property prob:
"""RETURNS (float): Smoothed log probability estimate of token type."""
def __get__(self):
return self.c.lex.prob
property sentiment:
"""RETURNS (float): A scalar value indicating the positivity or
negativity of the token."""
def __get__(self):
if 'sentiment' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['sentiment'](self)
return self.c.lex.sentiment
property lang:
"""RETURNS (uint64): ID of the language of the parent document's
vocabulary.
"""
def __get__(self):
return self.c.lex.lang
property idx:
"""RETURNS (int): The character offset of the token within the parent
document.
"""
def __get__(self):
return self.c.idx
property cluster:
"""RETURNS (int): Brown cluster ID."""
def __get__(self):
return self.c.lex.cluster
property orth:
"""RETURNS (uint64): ID of the verbatim text content."""
def __get__(self):
return self.c.lex.orth
property lower:
"""RETURNS (uint64): ID of the lowercase token text."""
def __get__(self):
return self.c.lex.lower
property norm:
"""RETURNS (uint64): ID of the token's norm, i.e. a normalised form of
the token text. Usually set in the language's tokenizer exceptions
or norm exceptions.
"""
def __get__(self):
return self.c.lex.norm
property shape:
"""RETURNS (uint64): ID of the token's shape, a transform of the
token's string, to show orthographic features (e.g. "Xxxx", "dd").
"""
def __get__(self):
return self.c.lex.shape
property prefix:
"""RETURNS (uint64): ID of a length-N substring from the start of the
token. Defaults to `N=1`.
"""
def __get__(self):
return self.c.lex.prefix
property suffix:
"""RETURNS (uint64): ID of a length-N substring from the end of the
token. Defaults to `N=3`.
"""
def __get__(self):
return self.c.lex.suffix
property lemma:
"""Base form of the word, with no inflectional suffixes.
RETURNS (uint64): Token lemma.
"""RETURNS (uint64): ID of the base form of the word, with no
inflectional suffixes.
"""
def __get__(self):
return self.c.lemma
def __set__(self, attr_t lemma):
self.c.lemma = lemma
property pos:
"""RETURNS (uint64): ID of coarse-grained part-of-speech tag."""
def __get__(self):
return self.c.pos
property tag:
"""RETURNS (uint64): ID of fine-grained part-of-speech tag."""
def __get__(self):
return self.c.tag
def __set__(self, attr_t tag):
self.vocab.morphology.assign_tag(self.c, tag)
property dep:
"""RETURNS (uint64): ID of syntactic dependency label."""
def __get__(self):
return self.c.dep
def __set__(self, attr_t label):
self.c.dep = label
@ -294,14 +319,21 @@ cdef class Token:
return numpy.sqrt((vector ** 2).sum())
property n_lefts:
"""RETURNS (int): The number of leftward immediate children of the
word, in the syntactic dependency parse.
"""
def __get__(self):
return self.c.l_kids
property n_rights:
"""RETURNS (int): The number of rightward immediate children of the
word, in the syntactic dependency parse.
"""
def __get__(self):
return self.c.r_kids
property sent_start:
# TODO: fix and document
def __get__(self):
return self.c.sent_start
@ -321,10 +353,12 @@ cdef class Token:
"one of: None, True, False")
property lefts:
"""The leftward immediate children of the word, in the syntactic
dependency parse.
YIELDS (Token): A left-child of the token.
"""
def __get__(self):
"""The leftward immediate children of the word, in the syntactic
dependency parse.
"""
cdef int nr_iter = 0
cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge)
while ptr < self.c:
@ -338,10 +372,12 @@ cdef class Token:
"while looking for token.lefts")
property rights:
"""The rightward immediate children of the word, in the syntactic
dependency parse.
YIELDS (Token): A right-child of the token.
"""
def __get__(self):
"""The rightward immediate children of the word, in the syntactic
dependency parse.
"""
cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
tokens = []
cdef int nr_iter = 0
@ -420,18 +456,17 @@ cdef class Token:
"""
if self.doc is not descendant.doc:
return False
return any( ancestor.i == self.i for ancestor in descendant.ancestors )
return any(ancestor.i == self.i for ancestor in descendant.ancestors)
property head:
"""The syntactic parent, or "governor", of this token.
RETURNS (Token): The token head.
RETURNS (Token): The token predicted by the parser to be the head of
the current token.
"""
def __get__(self):
"""The token predicted by the parser to be the head of the current
token.
"""
return self.doc[self.i + self.c.head]
def __set__(self, Token new_head):
# this function sets the head of self to new_head
# and updates the counters for left/right dependents
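A small sketch of reading and re-setting `Token.head` (the initial
attachment depends on the loaded model):

    doc = nlp(u'I like New York')
    assert doc[2].head.text == u'York'  # 'New' attaches to 'York'
    doc[2].head = doc[1]                # reattach; l_kids/r_kids update
    assert doc[2].head.text == u'like'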
@ -451,7 +486,7 @@ cdef class Token:
cdef Token anc, child
# update number of deps of old head
if self.c.head > 0: # left dependent
if self.c.head > 0: # left dependent
old_head.c.l_kids -= 1
if self.c.l_edge == old_head.c.l_edge:
# the token dominates the left edge so the left edge of
@ -543,12 +578,10 @@ cdef class Token:
yield from word.conjuncts
property ent_type:
"""Named entity type.
RETURNS (uint64): Named entity type.
"""
"""RETURNS (uint64): Named entity type."""
def __get__(self):
return self.c.ent_type
def __set__(self, ent_type):
self.c.ent_type = ent_type
@ -562,12 +595,10 @@ cdef class Token:
return self.c.ent_iob
property ent_type_:
"""Named entity type.
RETURNS (unicode): Named entity type.
"""
"""RETURNS (unicode): Named entity type."""
def __get__(self):
return self.vocab.strings[self.c.ent_type]
def __set__(self, ent_type):
self.c.ent_type = self.vocab.strings.add(ent_type)
@ -583,9 +614,8 @@ cdef class Token:
return iob_strings[self.c.ent_iob]
property ent_id:
"""ID of the entity the token is an instance of, if any.
RETURNS (uint64): ID of the entity.
"""RETURNS (uint64): ID of the entity the token is an instance of,
if any.
"""
def __get__(self):
return self.c.ent_id
@ -594,9 +624,8 @@ cdef class Token:
self.c.ent_id = key
property ent_id_:
"""ID of the entity the token is an instance of, if any.
RETURNS (unicode): ID of the entity.
"""RETURNS (unicode): ID of the entity the token is an instance of,
if any.
"""
def __get__(self):
return self.vocab.strings[self.c.ent_id]
@ -605,230 +634,192 @@ cdef class Token:
self.c.ent_id = self.vocab.strings.add(name)
property whitespace_:
"""Trailing space character if present.
RETURNS (unicode): The whitespace character.
"""RETURNS (unicode): The trailing whitespace character, if present.
"""
def __get__(self):
return ' ' if self.c.spacy else ''
property orth_:
"""Verbatim text content (identical to `Token.text`). Existst mostly
for consistency with the other attributes.
RETURNS (unicode): The token text.
"""RETURNS (unicode): Verbatim text content (identical to
`Token.text`). Exists mostly for consistency with the other
attributes.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.orth]
property lower_:
"""Lowercase form of the token text. Equivalent to
`Token.text.lower()`.
RETURNS (unicode): The lowercase token text.
"""RETURNS (unicode): The lowercase token text. Equivalent to
`Token.text.lower()`.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.lower]
property norm_:
"""The token's norm, i.e. a normalised form of the token text.
Usually set in the language's tokenizer exceptions or norm exceptions.
RETURNS (unicode): The norm.
"""RETURNS (unicode): The token's norm, i.e. a normalised form of the
token text. Usually set in the language's tokenizer exceptions or
norm exceptions.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.norm]
property shape_:
"""Transform of the tokens's string, to show orthographic features.
For example, "Xxxx" or "dd".
RETURNS (unicode): The token shape.
"""RETURNS (unicode): Transform of the tokens's string, to show
orthographic features. For example, "Xxxx" or "dd".
"""
def __get__(self):
return self.vocab.strings[self.c.lex.shape]
property prefix_:
"""A length-N substring from the start of the token. Defaults to `N=1`.
RETURNS (unicode): The token's prefix.
"""RETURNS (unicode): A length-N substring from the start of the token.
Defaults to `N=1`.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.prefix]
property suffix_:
"""A length-N substring from the end of the token. Defaults to `N=3`.
RETURNS (unicode): The token's suffix.
"""RETURNS (unicode): A length-N substring from the end of the token.
Defaults to `N=3`.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.suffix]
property lang_:
"""Language of the parent document's vocabulary, e.g. 'en'.
RETURNS (unicode): The language code.
"""RETURNS (unicode): Language of the parent document's vocabulary,
e.g. 'en'.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.lang]
property lemma_:
"""Base form of the word, with no inflectional suffixes.
RETURNS (unicode): Token lemma.
"""RETURNS (unicode): The token lemma, i.e. the base form of the word,
with no inflectional suffixes.
"""
def __get__(self):
return self.vocab.strings[self.c.lemma]
def __set__(self, unicode lemma_):
self.c.lemma = self.vocab.strings.add(lemma_)
property pos_:
"""Coarse-grained part-of-speech.
RETURNS (unicode): The part-of-speech tag.
"""
"""RETURNS (unicode): Coarse-grained part-of-speech tag."""
def __get__(self):
return parts_of_speech.NAMES[self.c.pos]
property tag_:
"""Fine-grained part-of-speech.
RETURNS (unicode): The part-of-speech tag.
"""
"""RETURNS (unicode): Fine-grained part-of-speech tag."""
def __get__(self):
return self.vocab.strings[self.c.tag]
def __set__(self, tag):
self.tag = self.vocab.strings.add(tag)
property dep_:
"""Syntactic dependency relation.
RETURNS (unicode): The dependency label.
"""
"""RETURNS (unicode): The syntactic dependency label."""
def __get__(self):
return self.vocab.strings[self.c.dep]
def __set__(self, unicode label):
self.c.dep = self.vocab.strings.add(label)
property is_oov:
"""Is the token out-of-vocabulary?
RETURNS (bool): Whether the token is out-of-vocabulary.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)
"""RETURNS (bool): Whether the token is out-of-vocabulary."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_OOV)
property is_stop:
"""Is the token part of a "stop list"? (defined by the language data)
RETURNS (bool): Whether the token is a stop word.
"""RETURNS (bool): Whether the token is a stop word, i.e. part of a
"stop list" defined by the language data.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_STOP)
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_STOP)
property is_alpha:
"""Does the token consist of alphabetic characters? Equivalent to
`token.text.isalpha()`.
RETURNS (bool): Whether the token consists of alpha characters.
"""RETURNS (bool): Whether the token consists of alpha characters.
Equivalent to `token.text.isalpha()`.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ALPHA)
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_ALPHA)
property is_ascii:
"""Does the token consist of ASCII characters? Equivalent to
`[any(ord(c) >= 128 for c in token.text)]`.
RETURNS (bool): Whether the token consists of ASCII characters.
"""RETURNS (bool): Whether the token consists of ASCII characters.
Equivalent to `all(ord(c) < 128 for c in token.text)`.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ASCII)
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_ASCII)
property is_digit:
"""Does the token consist of digits? Equivalent to
`token.text.isdigit()`.
RETURNS (bool): Whether the token consists of digits.
"""RETURNS (bool): Whether the token consists of digits. Equivalent to
`token.text.isdigit()`.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_DIGIT)
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_DIGIT)
property is_lower:
"""Is the token in lowercase? Equivalent to `token.text.islower()`.
RETURNS (bool): Whether the token is in lowercase.
"""RETURNS (bool): Whether the token is in lowercase. Equivalent to
`token.text.islower()`.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LOWER)
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_LOWER)
property is_upper:
"""Is the token in uppercase? Equivalent to `token.text.isupper()`.
RETURNS (bool): Whether the token is in uppercase.
"""RETURNS (bool): Whether the token is in uppercase. Equivalent to
`token.text.isupper()`.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_UPPER)
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_UPPER)
property is_title:
"""Is the token in titlecase? Equivalent to `token.text.istitle()`.
RETURNS (bool): Whether the token is in titlecase.
"""RETURNS (bool): Whether the token is in titlecase. Equivalent to
`token.text.istitle()`.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_TITLE)
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_TITLE)
property is_punct:
"""Is the token punctuation?
RETURNS (bool): Whether the token is punctuation.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
"""RETURNS (bool): Whether the token is punctuation."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
property is_space:
"""Does the token consist of whitespace characters? Equivalent to
`token.text.isspace()`.
RETURNS (bool): Whether the token consists of whitespace characters.
"""RETURNS (bool): Whether the token consists of whitespace characters.
Equivalent to `token.text.isspace()`.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
property is_bracket:
"""Is the token a bracket?
RETURNS (bool): Whether the token is a bracket.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
"""RETURNS (bool): Whether the token is a bracket."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
property is_quote:
"""Is the token a quotation mark?
RETURNS (bool): Whether the token is a quotation mark.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
"""RETURNS (bool): Whether the token is a quotation mark."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
property is_left_punct:
"""Is the token a left punctuation mark, e.g. "("?
RETURNS (bool): Whether the token is a left punctuation mark.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
"""RETURNS (bool): Whether the token is a left punctuation mark."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
property is_right_punct:
"""Is the token a left punctuation mark, e.g. "("?
RETURNS (bool): Whether the token is a left punctuation mark.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
"""RETURNS (bool): Whether the token is a left punctuation mark."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
property like_url:
"""Does the token resemble a URL?
RETURNS (bool): Whether the token resembles a URL.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL)
"""RETURNS (bool): Whether the token resembles a URL."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, LIKE_URL)
property like_num:
"""Does the token represent a number? e.g. "10.9", "10", "ten", etc.
RETURNS (bool): Whether the token resembles a number.
"""RETURNS (bool): Whether the token resembles a number, e.g. "10.9",
"10", "ten", etc.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_NUM)
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, LIKE_NUM)
property like_email:
"""Does the token resemble an email address?
RETURNS (bool): Whether the token resembles an email address.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)
"""RETURNS (bool): Whether the token resembles an email address."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)

View File

@ -784,3 +784,10 @@ p
+cell
| A dictionary that allows customisation of properties of
| #[code Span] children.
+row
+cell #[code _]
+cell #[code Underscore]
+cell
| User space for adding custom
| #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions].

View File

@ -369,7 +369,7 @@ p
+tag property
+tag-model("parse")
p Tokens that are to the left of the span, whose head is within the span.
p Tokens that are to the left of the span, whose heads are within the span.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
@ -386,7 +386,7 @@ p Tokens that are to the left of the span, whose head is within the span.
+tag property
+tag-model("parse")
p Tokens that are to the right of the span, whose head is within the span.
p Tokens that are to the right of the span, whose heads are within the span.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
@ -399,6 +399,42 @@ p Tokens that are to the right of the span, whose head is within the span.
+cell #[code Token]
+cell A right-child of a token of the span.
+h(2, "n_lefts") Span.n_lefts
+tag property
+tag-model("parse")
p
| The number of tokens that are to the left of the span, whose heads are
| within the span.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
assert doc[3:7].n_lefts == 1
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell int
+cell The number of left-child tokens.
+h(2, "n_rights") Span.n_rights
+tag property
+tag-model("parse")
p
| The number of tokens that are to the right of the span, whose heads are
| within the span.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
assert doc[2:4].n_rights == 1
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell int
+cell The number of right-child tokens.
+h(2, "subtree") Span.subtree
+tag property
+tag-model("parse")
@ -553,3 +589,17 @@ p
+cell #[code ent_id_]
+cell unicode
+cell The string ID of the named entity the token is an instance of.
+row
+cell #[code sentiment]
+cell float
+cell
| A scalar value indicating the positivity or negativity of the
| span.
+row
+cell #[code _]
+cell #[code Underscore]
+cell
| User space for adding custom
| #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions].

View File

@ -302,6 +302,80 @@ p A sequence of the token's immediate syntactic children.
+cell #[code Token]
+cell A child token such that #[code child.head==self].
+h(2, "lefts") Token.lefts
+tag property
+tag-model("parse")
p
| The leftward immediate children of the word, in the syntactic dependency
| parse.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
lefts = [t.text for t in doc[3].lefts]
assert lefts == [u'New']
+table(["Name", "Type", "Description"])
+row("foot")
+cell yields
+cell #[code Token]
+cell A left-child of the token.
+h(2, "rights") Token.rights
+tag property
+tag-model("parse")
p
| The rightward immediate children of the word, in the syntactic
| dependency parse.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
rights = [t.text for t in doc[3].rights]
assert rights == [u'in']
+table(["Name", "Type", "Description"])
+row("foot")
+cell yields
+cell #[code Token]
+cell A right-child of the token.
+h(2, "n_lefts") Token.n_lefts
+tag property
+tag-model("parse")
p
| The number of leftward immediate children of the word, in the syntactic
| dependency parse.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
assert doc[3].n_lefts == 1
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell int
+cell The number of left-child tokens.
+h(2, "n_rights") Token.n_rights
+tag property
+tag-model("parse")
p
| The number of rightward immediate children of the word, in the syntactic
| dependency parse.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
assert doc[3].n_rights == 1
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell int
+cell The number of right-child tokens.
+h(2, "subtree") Token.subtree
+tag property
+tag-model("parse")
@ -713,9 +787,30 @@ p The L2 norm of the token's vector representation.
+row
+cell #[code sentiment]
+cell float
+cell A scalar value indicating the positivity or negativity of the token.
+cell
| A scalar value indicating the positivity or negativity of the
| token.
+row
+cell #[code lex_id]
+cell int
+cell ID of the token's lexical type.
+cell Sequential ID of the token's lexical type.
+row
+cell #[code rank]
+cell int
+cell
| Sequential ID of the token's lexical type, used to index into
| tables, e.g. for word vectors.
+row
+cell #[code cluster]
+cell int
+cell Brown cluster ID.
+row
+cell #[code _]
+cell #[code Underscore]
+cell
| User space for adding custom
| #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions].

View File

@ -111,11 +111,13 @@ p
p
| A few more convenience attributes are provided for iterating around the
| local tree from the token. The #[code .lefts] and #[code .rights]
| attributes provide sequences of syntactic children that occur before and
| after the token. Both sequences are in sentences order. There are also
| two integer-typed attributes, #[code .n_rights] and #[code .n_lefts],
| that give the number of left and right children.
| local tree from the token. The #[+api("token#lefts") #[code Token.lefts]]
| and #[+api("token#rights") #[code Token.rights]] attributes provide
| sequences of syntactic children that occur before and after the token.
| Both sequences are in sentence order. There are also two integer-typed
| attributes, #[+api("token#n_rights") #[code Token.n_rights]] and
| #[+api("token#n_lefts") #[code Token.n_lefts]], that give the number of
| left and right children.
+code.
doc = nlp(u'bright red apples on the tree')
@ -126,10 +128,11 @@ p
p
| You can get a whole phrase by its syntactic head using the
| #[code .subtree] attribute. This returns an ordered sequence of tokens.
| You can walk up the tree with the #[code .ancestors] attribute, and
| check dominance with the #[+api("token#is_ancestor") #[code .is_ancestor()]]
| method.
| #[+api("token#subtree") #[code Token.subtree]] attribute. This returns an
| ordered sequence of tokens. You can walk up the tree with the
| #[+api("token#ancestors") #[code Token.ancestors]] attribute, and
| check dominance with
| #[+api("token#is_ancestor") #[code Token.is_ancestor()]].
+aside("Projective vs. non-projective")
| For the #[+a("/models/en") default English model], the