Tidy up and document Doc, Token and Span
This commit is contained in:
parent 1a559d4c95 · commit 6a0483b7aa
diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py
@@ -2,4 +2,4 @@ from .doc import Doc
 from .token import Token
 from .span import Span
 
-__all__ = [Doc, Token, Span]
+__all__ = ['Doc', 'Token', 'Span']
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
@@ -23,9 +23,9 @@ from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
 from ..attrs import intify_attrs, IDS
 from ..attrs cimport attr_id_t
-from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
-from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
-from ..attrs cimport SENT_START
+from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
+from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
+from ..attrs cimport ENT_TYPE, SENT_START
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
 from ..util import normalize_slice
 from ..compat import is_config, copy_reg, pickle
@@ -78,17 +78,18 @@ def _get_chunker(lang):
 
 cdef class Doc:
     """A sequence of Token objects. Access sentences and named entities, export
-    annotations to numpy arrays, losslessly serialize to compressed binary strings.
-    The `Doc` object holds an array of `TokenC` structs. The Python-level
-    `Token` and `Span` objects are views of this array, i.e. they don't own
-    the data themselves.
+    annotations to numpy arrays, losslessly serialize to compressed binary
+    strings. The `Doc` object holds an array of `TokenC` structs. The
+    Python-level `Token` and `Span` objects are views of this array, i.e.
+    they don't own the data themselves.
 
     EXAMPLE: Construction 1
         >>> doc = nlp(u'Some text')
 
         Construction 2
         >>> from spacy.tokens import Doc
-        >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
+        >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
+                      spaces=[True, False, False])
     """
     @classmethod
     def set_extension(cls, name, default=None, method=None,
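The constructor documented above supports two paths. A minimal runnable sketch (the model name `en_core_web_sm` is an assumption, any installed pipeline package works; later sketches reuse this `nlp` object):

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.load('en_core_web_sm')    # Construction 1: run a pipeline
    doc1 = nlp(u'Some text')

    # Construction 2: build a Doc directly from words and trailing-space flags
    doc2 = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
               spaces=[True, False, False])
    assert doc2.text == u'hello world!'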
@@ -109,15 +110,14 @@ cdef class Doc:
                  orths_and_spaces=None):
         """Create a Doc object.
 
-        vocab (Vocab): A vocabulary object, which must match any models you want
-            to use (e.g. tokenizer, parser, entity recognizer).
+        vocab (Vocab): A vocabulary object, which must match any models you
+            want to use (e.g. tokenizer, parser, entity recognizer).
         words (list or None): A list of unicode strings to add to the document
             as words. If `None`, defaults to empty list.
         spaces (list or None): A list of boolean values, of the same length as
             words. True means that the word is followed by a space, False means
             it is not. If `None`, defaults to `[True]*len(words)`
         user_data (dict or None): Optional extra data to attach to the Doc.
 
         RETURNS (Doc): The newly constructed object.
         """
         self.vocab = vocab
@@ -153,10 +153,10 @@ cdef class Doc:
                 spaces = [True] * len(words)
             elif len(spaces) != len(words):
                 raise ValueError(
-                    "Arguments 'words' and 'spaces' should be sequences of the "
-                    "same length, or 'spaces' should be left default at None. "
-                    "spaces should be a sequence of booleans, with True meaning "
-                    "that the word owns a ' ' character following it.")
+                    "Arguments 'words' and 'spaces' should be sequences of "
+                    "the same length, or 'spaces' should be left default at "
+                    "None. spaces should be a sequence of booleans, with True "
+                    "meaning that the word owns a ' ' character following it.")
             orths_and_spaces = zip(words, spaces)
         if orths_and_spaces is not None:
             for orth_space in orths_and_spaces:
@@ -166,7 +166,8 @@ cdef class Doc:
                 elif isinstance(orth_space, bytes):
                     raise ValueError(
                         "orths_and_spaces expects either List(unicode) or "
-                        "List((unicode, bool)). Got bytes instance: %s" % (str(orth_space)))
+                        "List((unicode, bool)). "
+                        "Got bytes instance: %s" % (str(orth_space)))
                 else:
                     orth, has_space = orth_space
                 # Note that we pass self.mem here --- we have ownership, if LexemeC
@@ -186,7 +187,8 @@ cdef class Doc:
     def __getitem__(self, object i):
         """Get a `Token` or `Span` object.
 
-        i (int or tuple) The index of the token, or the slice of the document to get.
+        i (int or tuple) The index of the token, or the slice of the document
+            to get.
         RETURNS (Token or Span): The token at `doc[i]]`, or the span at
             `doc[start : end]`.
 
@@ -199,11 +201,11 @@ cdef class Doc:
             >>> doc[start : end]]
             Get a `Span` object, starting at position `start` and ending at
             position `end`, where `start` and `end` are token indices. For
-            instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4.
-            Stepped slices (e.g. `doc[start : end : step]`) are not supported,
-            as `Span` objects must be contiguous (cannot have gaps). You can use
-            negative indices and open-ended ranges, which have their normal
-            Python semantics.
+            instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and
+            4. Stepped slices (e.g. `doc[start : end : step]`) are not
+            supported, as `Span` objects must be contiguous (cannot have gaps).
+            You can use negative indices and open-ended ranges, which have
+            their normal Python semantics.
         """
         if isinstance(i, slice):
             start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
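A short sketch of the indexing semantics documented above: integer indices yield `Token` objects, contiguous slices yield `Span` objects (token positions here assume the default English tokenizer):

    doc = nlp(u'Give it back! He pleaded.')
    token = doc[0]                   # Token: u'Give'
    span = doc[2:5]                  # Span over tokens 2, 3 and 4
    assert span.text == u'back! He'
    last = doc[-2]                   # negative indices behave as in Python lists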
@@ -262,8 +264,10 @@ cdef class Doc:
         doc (Doc): The parent document.
         start (int): The index of the first character of the span.
         end (int): The index of the first character after the span.
-        label (uint64 or string): A label to attach to the Span, e.g. for named entities.
-        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
+        label (uint64 or string): A label to attach to the Span, e.g. for
+            named entities.
+        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
+            the span.
         RETURNS (Span): The newly constructed object.
         """
         if not isinstance(label, int):
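For contrast with `doc[start : end]`, `char_span` takes character offsets into `doc.text`; it returns `None` when the offsets don't line up with token boundaries. A sketch:

    doc = nlp(u'I like New York')
    span = doc.char_span(7, 15, label=u'GPE')   # doc.text[7:15] == u'New York'
    assert span.text == u'New York'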
@@ -377,13 +381,14 @@ cdef class Doc:
         return self.text
 
     property ents:
-        """Iterate over the entities in the document. Yields named-entity `Span`
-        objects, if the entity recognizer has been applied to the document.
+        """Iterate over the entities in the document. Yields named-entity
+        `Span` objects, if the entity recognizer has been applied to the
+        document.
 
         YIELDS (Span): Entities in the document.
 
-        EXAMPLE: Iterate over the span to get individual Token objects, or access
-            the label:
+        EXAMPLE: Iterate over the span to get individual Token objects,
+            or access the label:
 
             >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
             >>> ents = list(tokens.ents)
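Continuing the docstring's example, hedged on whatever statistical model is loaded (entity predictions vary by model):

    tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
    for ent in tokens.ents:
        print(ent.text, ent.label_)   # e.g. u'Best' PERSON, u'New York' GPE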
@@ -456,10 +461,11 @@ cdef class Doc:
 
     property noun_chunks:
         """Iterate over the base noun phrases in the document. Yields base
-        noun-phrase #[code Span] objects, if the document has been syntactically
-        parsed. A base noun phrase, or "NP chunk", is a noun phrase that does
-        not permit other NPs to be nested within it – so no NP-level
-        coordination, no prepositional phrases, and no relative clauses.
+        noun-phrase #[code Span] objects, if the document has been
+        syntactically parsed. A base noun phrase, or "NP chunk", is a noun
+        phrase that does not permit other NPs to be nested within it – so no
+        NP-level coordination, no prepositional phrases, and no relative
+        clauses.
 
         YIELDS (Span): Noun chunks in the document.
         """
@@ -467,12 +473,14 @@ cdef class Doc:
         if not self.is_parsed:
             raise ValueError(
                 "noun_chunks requires the dependency parse, which "
-                "requires data to be installed. For more info, see the "
+                "requires a statistical model to be installed and loaded. "
+                "For more info, see the "
                 "documentation: \n%s\n" % about.__docs_models__)
-        # Accumulate the result before beginning to iterate over it. This prevents
-        # the tokenisation from being changed out from under us during the iteration.
-        # The tricky thing here is that Span accepts its tokenisation changing,
-        # so it's okay once we have the Span objects. See Issue #375
+        # Accumulate the result before beginning to iterate over it. This
+        # prevents the tokenisation from being changed out from under us
+        # during the iteration. The tricky thing here is that Span accepts
+        # its tokenisation changing, so it's okay once we have the Span
+        # objects. See Issue #375.
         spans = []
         for start, end, label in self.noun_chunks_iterator(self):
             spans.append(Span(self, start, end, label=label))
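A sketch of the property in use, assuming a pipeline whose parser has run (chunk boundaries depend on the model):

    doc = nlp(u'The quick brown fox jumps over the lazy dog.')
    print([chunk.text for chunk in doc.noun_chunks])
    # with an English parser, roughly: [u'The quick brown fox', u'the lazy dog']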
@@ -497,8 +505,9 @@ cdef class Doc:
 
         if not self.is_parsed:
             raise ValueError(
-                "sentence boundary detection requires the dependency parse, which "
-                "requires data to be installed. For more info, see the "
+                "Sentence boundary detection requires the dependency "
+                "parse, which requires a statistical model to be "
+                "installed and loaded. For more info, see the "
                 "documentation: \n%s\n" % about.__docs_models__)
         cdef int i
         start = 0
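Sentence iteration, which the error message above guards, sketched under the same assumption of a parsed document:

    doc = nlp(u'This is a sentence. Here is another one.')
    for sent in doc.sents:
        print(sent.text)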
@@ -537,12 +546,11 @@ cdef class Doc:
     @cython.boundscheck(False)
     cpdef np.ndarray to_array(self, object py_attr_ids):
         """Export given token attributes to a numpy `ndarray`.
-
-        If `attr_ids` is a sequence of M attributes, the output array will
-        be of shape `(N, M)`, where N is the length of the `Doc`
-        (in tokens). If `attr_ids` is a single attribute, the output shape will
-        be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA)
-        or string name (e.g. 'LEMMA' or 'lemma').
+        If `attr_ids` is a sequence of M attributes, the output array will be
+        of shape `(N, M)`, where N is the length of the `Doc` (in tokens). If
+        `attr_ids` is a single attribute, the output shape will be (N,). You
+        can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) or
+        string name (e.g. 'LEMMA' or 'lemma').
 
         attr_ids (list[]): A list of attributes (int IDs or string names).
         RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
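A sketch of the export semantics the docstring describes; per the docstring, integer IDs and string names are interchangeable:

    from spacy.attrs import LOWER, POS

    doc = nlp(u'Give it back!')
    arr = doc.to_array([LOWER, POS])   # shape (len(doc), 2)
    assert arr.shape == (len(doc), 2)
    lowers = doc.to_array('LOWER')     # single attribute, shape (len(doc),)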
@@ -641,13 +649,12 @@ cdef class Doc:
     def from_array(self, attrs, array):
         if SENT_START in attrs and HEAD in attrs:
             raise ValueError(
-                "Conflicting attributes specified in doc.from_array():\n"
+                "Conflicting attributes specified in doc.from_array(): "
                 "(HEAD, SENT_START)\n"
-                "The HEAD attribute currently sets sentence boundaries implicitly,\n"
-                "based on the tree structure. This means the HEAD attribute would "
-                "potentially override the sentence boundaries set by SENT_START.\n"
-                "See https://github.com/spacy-io/spaCy/issues/235 for details and "
-                "workarounds, and to propose solutions.")
+                "The HEAD attribute currently sets sentence boundaries "
+                "implicitly, based on the tree structure. This means the HEAD "
+                "attribute would potentially override the sentence boundaries "
+                "set by SENT_START.")
         cdef int i, col
         cdef attr_id_t attr_id
         cdef TokenC* tokens = self.c
@@ -675,18 +682,14 @@ cdef class Doc:
         return self
 
     def get_lca_matrix(self):
-        '''
-        Calculates the lowest common ancestor matrix
-        for a given Spacy doc.
-        Returns LCA matrix containing the integer index
-        of the ancestor, or -1 if no common ancestor is
-        found (ex if span excludes a necessary ancestor).
-        Apologies about the recursion, but the
-        impact on performance is negligible given
-        the natural limitations on the depth of a typical human sentence.
-        '''
+        """Calculates the lowest common ancestor matrix for a given `Doc`.
+        Returns LCA matrix containing the integer index of the ancestor, or -1
+        if no common ancestor is found (ex if span excludes a necessary
+        ancestor). Apologies about the recursion, but the impact on
+        performance is negligible given the natural limitations on the depth
+        of a typical human sentence.
+        """
         # Efficiency notes:
-        #
         # We can easily improve the performance here by iterating in Cython.
         # To loop over the tokens in Cython, the easiest way is:
         # for token in doc.c[:doc.c.length]:
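The website docs added later in this commit include an example for the `Span` variant; the `Doc` method behaves analogously. A sketch:

    doc = nlp(u'I like New York in Autumn')
    matrix = doc.get_lca_matrix()
    # matrix[i][j] holds the token index of the lowest common ancestor of
    # tokens i and j; matrix[i][i] == i, and -1 means no common ancestor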
@@ -719,7 +722,6 @@ cdef class Doc:
                 token_k = self[k]
                 lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix)
                 lca_matrix[k][j] = lca_matrix[j][k]
-
         return lca_matrix
 
     def to_disk(self, path, **exclude):
@@ -819,14 +821,15 @@ cdef class Doc:
         return self
 
     def merge(self, int start_idx, int end_idx, *args, **attributes):
-        """Retokenize the document, such that the span at `doc.text[start_idx : end_idx]`
-        is merged into a single token. If `start_idx` and `end_idx `do not mark
-        start and end token boundaries, the document remains unchanged.
+        """Retokenize the document, such that the span at
+        `doc.text[start_idx : end_idx]` is merged into a single token. If
+        `start_idx` and `end_idx `do not mark start and end token boundaries,
+        the document remains unchanged.
 
-        start_idx (int): The character index of the start of the slice to merge.
-        end_idx (int): The character index after the end of the slice to merge.
+        start_idx (int): Character index of the start of the slice to merge.
+        end_idx (int): Character index after the end of the slice to merge.
         **attributes: Attributes to assign to the merged token. By default,
-            attributes are inherited from the syntactic root token of the span.
+            attributes are inherited from the syntactic root of the span.
         RETURNS (Token): The newly merged token, or `None` if the start and end
             indices did not fall at token boundaries.
         """
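A hedged sketch of the keyword form of `merge` (the arguments are character offsets into `doc.text`, not token indices):

    doc = nlp(u'I like New York in Autumn')
    start = doc.text.index(u'New')
    end = start + len(u'New York')
    doc.merge(start, end, lemma=u'New York', ent_type=u'GPE')
    assert doc[2].text == u'New York'   # one token now covers the span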
@@ -847,10 +850,10 @@ cdef class Doc:
             attributes[ENT_TYPE] = attributes['ent_type']
         elif args:
             raise ValueError(
-                "Doc.merge received %d non-keyword arguments. "
-                "Expected either 3 arguments (deprecated), or 0 (use keyword arguments). "
+                "Doc.merge received %d non-keyword arguments. Expected either "
+                "3 arguments (deprecated), or 0 (use keyword arguments). "
                 "Arguments supplied:\n%s\n"
-                "Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
+                "Keyword arguments: %s\n" % (len(args), repr(args), repr(attributes)))
 
         # More deprecated attribute handling =/
         if 'label' in attributes:
@@ -882,8 +885,9 @@ cdef class Doc:
                 Token.set_struct_attr(token, attr_name, attr_value)
         # Begin by setting all the head indices to absolute token positions
         # This is easier to work with for now than the offsets
-        # Before thinking of something simpler, beware the case where a dependency
-        # bridges over the entity. Here the alignment of the tokens changes.
+        # Before thinking of something simpler, beware the case where a
+        # dependency bridges over the entity. Here the alignment of the
+        # tokens changes.
         span_root = span.root.i
         token.dep = span.root.dep
         # We update token.lex after keeping span root and dep, since
@@ -932,8 +936,9 @@ cdef class Doc:
             >>> trees = doc.print_tree()
             >>> trees[1]
             {'modifiers': [
-                {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
-                'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
+                {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice',
+                'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP',
+                'lemma': 'Alice'},
                 {'modifiers': [
                     {'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
                     'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
@@ -1018,4 +1023,3 @@ def unpickle_doc(vocab, hooks_and_data, bytes_data):
 
-
 copy_reg.pickle(Doc, pickle_doc, unpickle_doc)
 
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
@@ -35,15 +35,16 @@ cdef class Span:
     def has_extension(cls, name):
         return name in Underscore.span_extensions
 
-    def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None,
-                  vector_norm=None):
+    def __cinit__(self, Doc doc, int start, int end, attr_t label=0,
+                  vector=None, vector_norm=None):
         """Create a `Span` object from the slice `doc[start : end]`.
 
         doc (Doc): The parent document.
         start (int): The index of the first token of the span.
         end (int): The index of the first token after the span.
         label (uint64): A label to attach to the Span, e.g. for named entities.
-        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
+        vector (ndarray[ndim=1, dtype='float32']): A meaning representation
+            of the span.
         RETURNS (Span): The newly constructed object.
         """
         if not (0 <= start <= end <= len(doc)):
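Direct `Span` construction, equivalent to slicing the parent `Doc`. A sketch:

    from spacy.tokens import Span

    doc = nlp(u'Give it back! He pleaded.')
    span = Span(doc, 1, 4)               # the same tokens as doc[1:4]
    assert span.text == doc[1:4].text == u'it back!'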
@@ -162,7 +163,8 @@ cdef class Span:
             attributes are inherited from the syntactic root token of the span.
         RETURNS (Token): The newly merged token.
         """
-        return self.doc.merge(self.start_char, self.end_char, *args, **attributes)
+        return self.doc.merge(self.start_char, self.end_char, *args,
+                              **attributes)
 
     def similarity(self, other):
         """Make a semantic similarity estimate. The default estimate is cosine
@@ -179,24 +181,19 @@ cdef class Span:
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
 
     def get_lca_matrix(self):
-        '''
-        Calculates the lowest common ancestor matrix
-        for a given Spacy span.
-        Returns LCA matrix containing the integer index
-        of the ancestor, or -1 if no common ancestor is
-        found (ex if span excludes a necessary ancestor).
-        Apologies about the recursion, but the
-        impact on performance is negligible given
-        the natural limitations on the depth of a typical human sentence.
-        '''
-
+        """Calculates the lowest common ancestor matrix for a given `Span`.
+        Returns LCA matrix containing the integer index of the ancestor, or -1
+        if no common ancestor is found (ex if span excludes a necessary
+        ancestor). Apologies about the recursion, but the impact on
+        performance is negligible given the natural limitations on the depth
+        of a typical human sentence.
+        """
         def __pairwise_lca(token_j, token_k, lca_matrix, margins):
             offset = margins[0]
             token_k_head = token_k.head if token_k.head.i in range(*margins) else token_k
             token_j_head = token_j.head if token_j.head.i in range(*margins) else token_j
             token_j_i = token_j.i - offset
             token_k_i = token_k.i - offset
 
             if lca_matrix[token_j_i][token_k_i] != -2:
                 return lca_matrix[token_j_i][token_k_i]
             elif token_j == token_k:
@@ -209,23 +206,19 @@ cdef class Span:
                 lca_index = -1
             else:
                 lca_index = __pairwise_lca(token_j_head, token_k_head, lca_matrix, margins)
 
             lca_matrix[token_j_i][token_k_i] = lca_index
             lca_matrix[token_k_i][token_j_i] = lca_index
 
             return lca_index
 
         lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32)
         lca_matrix.fill(-2)
         margins = [self.start, self.end]
 
         for j in range(len(self)):
             token_j = self[j]
             for k in range(len(self)):
                 token_k = self[k]
                 lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix, margins)
                 lca_matrix[k][j] = lca_matrix[j][k]
 
         return lca_matrix
 
     cpdef np.ndarray to_array(self, object py_attr_ids):
@@ -349,7 +342,8 @@ cdef class Span:
         """The text content of the span with a trailing whitespace character if
         the last token has one.
 
-        RETURNS (unicode): The text content of the span (with trailing whitespace).
+        RETURNS (unicode): The text content of the span (with trailing
+            whitespace).
         """
         def __get__(self):
             return u''.join([t.text_with_ws for t in self])
@@ -358,7 +352,8 @@ cdef class Span:
         """Yields base noun-phrase `Span` objects, if the document has been
         syntactically parsed. A base noun phrase, or "NP chunk", is a noun
         phrase that does not permit other NPs to be nested within it – so no
-        NP-level coordination, no prepositional phrases, and no relative clauses.
+        NP-level coordination, no prepositional phrases, and no relative
+        clauses.
 
         YIELDS (Span): Base noun-phrase `Span` objects
         """
@@ -366,7 +361,8 @@ cdef class Span:
         if not self.doc.is_parsed:
             raise ValueError(
                 "noun_chunks requires the dependency parse, which "
-                "requires data to be installed. For more info, see the "
+                "requires a statistical model to be installed and loaded. "
+                "For more info, see the "
                 "documentation: \n%s\n" % about.__docs_models__)
         # Accumulate the result before beginning to iterate over it. This prevents
         # the tokenisation from being changed out from under us during the iteration.
@@ -385,9 +381,9 @@ cdef class Span:
 
         RETURNS (Token): The root token.
 
-        EXAMPLE: The root token has the shortest path to the root of the sentence
-            (or is the root itself). If multiple words are equally high in the
-            tree, the first word is taken. For example:
+        EXAMPLE: The root token has the shortest path to the root of the
+            sentence (or is the root itself). If multiple words are equally
+            high in the tree, the first word is taken. For example:
 
             >>> toks = nlp(u'I like New York in Autumn.')
 
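Completing the docstring's example, hedged on the English parser's analysis:

    toks = nlp(u'I like New York in Autumn.')
    new_york = toks[2:4]                  # u'New York'
    assert new_york.root.text == u'York'  # u'York' heads u'New' in the parse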
@@ -437,11 +433,11 @@ cdef class Span:
             if self.doc.c[i].head == 0:
                 return self.doc[i]
         # If we don't have a sentence root, we do something that's not so
-        # algorithmically clever, but I think should be quite fast, especially
-        # for short spans.
+        # algorithmically clever, but I think should be quite fast,
+        # especially for short spans.
         # For each word, we count the path length, and arg min this measure.
-        # We could use better tree logic to save steps here...But I think this
-        # should be okay.
+        # We could use better tree logic to save steps here...But I
+        # think this should be okay.
         cdef int current_best = self.doc.length
         cdef int root = -1
         for i in range(self.start, self.end):
@@ -463,7 +459,7 @@ cdef class Span:
         YIELDS (Token):A left-child of a token of the span.
         """
         def __get__(self):
-            for token in reversed(self): # Reverse, so we get the tokens in order
+            for token in reversed(self): # Reverse, so we get tokens in order
                 for left in token.lefts:
                     if left.i < self.start:
                         yield left
@@ -493,7 +489,7 @@ cdef class Span:
                 yield from word.subtree
 
     property ent_id:
-        """An (integer) entity ID. Usually assigned by patterns in the `Matcher`.
+        """An (integer) entity ID.
 
         RETURNS (uint64): The entity ID.
         """
@@ -503,8 +499,8 @@ cdef class Span:
         def __set__(self, hash_t key):
             # TODO
             raise NotImplementedError(
-                "Can't yet set ent_id from Span. Vote for this feature on the issue "
-                "tracker: http://github.com/explosion/spaCy/issues")
+                "Can't yet set ent_id from Span. Vote for this feature on "
+                "the issue tracker: http://github.com/explosion/spaCy/issues")
 
     property ent_id_:
         """A (string) entity ID. Usually assigned by patterns in the `Matcher`.
@@ -517,13 +513,16 @@ cdef class Span:
         def __set__(self, hash_t key):
             # TODO
             raise NotImplementedError(
-                "Can't yet set ent_id_ from Span. Vote for this feature on the issue "
-                "tracker: http://github.com/explosion/spaCy/issues")
+                "Can't yet set ent_id_ from Span. Vote for this feature on the "
+                "issue tracker: http://github.com/explosion/spaCy/issues")
 
     property orth_:
-        # TODO: docstring
+        """Verbatim text content (identical to Span.text). Exists mostly for
+        consistency with other attributes.
+
+        RETURNS (unicode): The span's text."""
         def __get__(self):
-            return ''.join([t.string for t in self]).strip()
+            return ''.join([t.orth_ for t in self]).strip()
 
     property lemma_:
         """The span's lemma.
@@ -534,19 +533,19 @@ cdef class Span:
             return ' '.join([t.lemma_ for t in self]).strip()
 
     property upper_:
-        # TODO: docstring
+        """Deprecated. Use Span.text.upper() instead."""
         def __get__(self):
-            return ''.join([t.string.upper() for t in self]).strip()
+            return ''.join([t.text_with_ws.upper() for t in self]).strip()
 
     property lower_:
-        # TODO: docstring
+        """Deprecated. Use Span.text.lower() instead."""
         def __get__(self):
-            return ''.join([t.string.lower() for t in self]).strip()
+            return ''.join([t.text_with_ws.lower() for t in self]).strip()
 
     property string:
-        # TODO: docstring
+        """Deprecated: Use Span.text instead."""
         def __get__(self):
-            return ''.join([t.string for t in self])
+            return ''.join([t.text_with_ws for t in self])
 
     property label_:
         """The span's label.
@@ -570,7 +569,8 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
         n += 1
         if n >= sent_length:
             raise RuntimeError(
-                "Array bounds exceeded while searching for root word. This likely "
-                "means the parse tree is in an invalid state. Please report this "
-                "issue here: http://github.com/explosion/spaCy/issues")
+                "Array bounds exceeded while searching for root word. This "
+                "likely means the parse tree is in an invalid state. Please "
+                "report this issue here: "
+                "http://github.com/explosion/spaCy/issues")
     return n
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
@@ -14,17 +14,18 @@ from ..typedefs cimport hash_t
 from ..lexeme cimport Lexeme
 from .. import parts_of_speech
 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
-from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV
-from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
-from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
-from ..attrs cimport LEMMA, POS, TAG, DEP
+from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
+from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL
+from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
+from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
 from ..compat import is_config
 from .. import about
 from .underscore import Underscore
 
 
 cdef class Token:
-    """An individual token – i.e. a word, punctuation symbol, whitespace, etc."""
+    """An individual token – i.e. a word, punctuation symbol, whitespace,
+    etc."""
     @classmethod
     def set_extension(cls, name, default=None, method=None,
                       getter=None, setter=None):
@@ -171,10 +172,11 @@ cdef class Token:
         return self.orth_
 
     property text_with_ws:
-        """The text content of the token with a trailing whitespace character if
-        it has one.
+        """The text content of the token with a trailing whitespace character
+        if it has one.
 
-        RETURNS (unicode): The text content of the span (with trailing whitespace).
+        RETURNS (unicode): The text content of the span (with trailing
+            whitespace).
         """
         def __get__(self):
             cdef unicode orth = self.vocab.strings[self.c.lex.orth]
@@ -306,9 +308,8 @@ cdef class Token:
         def __set__(self, value):
             if self.doc.is_parsed:
                 raise ValueError(
-                    'Refusing to write to token.sent_start if its document is parsed, '
-                    'because this may cause inconsistent state. '
-                    'See https://github.com/spacy-io/spaCy/issues/235 for workarounds.')
+                    "Refusing to write to token.sent_start if its document "
+                    "is parsed, because this may cause inconsistent state.")
             if value is None:
                 self.c.sent_start = 0
             elif value is True:
@@ -316,13 +317,12 @@ cdef class Token:
             elif value is False:
                 self.c.sent_start = -1
             else:
-                raise ValueError("Invalid value for token.sent_start -- must be one of "
-                                 "None, True, False")
+                raise ValueError("Invalid value for token.sent_start. Must be "
+                                 "one of: None, True, False")
 
     property lefts:
         def __get__(self):
-            """
-            The leftward immediate children of the word, in the syntactic
+            """The leftward immediate children of the word, in the syntactic
             dependency parse.
             """
             cdef int nr_iter = 0
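As the guard above implies, `sent_start` may only be written before a document is parsed. A sketch on a freshly built, unparsed `Doc`:

    from spacy.tokens import Doc

    doc = Doc(nlp.vocab, words=[u'Hello', u'world', u'.', u'Bye', u'now'])
    doc[3].sent_start = True   # allowed: no dependency parse has been set yet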
@@ -334,13 +334,12 @@ cdef class Token:
                 nr_iter += 1
                 # This is ugly, but it's a way to guard out infinite loops
                 if nr_iter >= 10000000:
-                    raise RuntimeError(
-                        "Possibly infinite loop encountered while looking for token.lefts")
+                    raise RuntimeError("Possibly infinite loop encountered "
+                                       "while looking for token.lefts")
 
     property rights:
         def __get__(self):
-            """
-            The rightward immediate children of the word, in the syntactic
+            """The rightward immediate children of the word, in the syntactic
             dependency parse.
             """
             cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
@@ -352,27 +351,26 @@ cdef class Token:
                 ptr -= 1
                 nr_iter += 1
                 if nr_iter >= 10000000:
-                    raise RuntimeError(
-                        "Possibly infinite loop encountered while looking for token.rights")
+                    raise RuntimeError("Possibly infinite loop encountered "
+                                       "while looking for token.rights")
             tokens.reverse()
             for t in tokens:
                 yield t
 
     property children:
-        """
-        A sequence of the token's immediate syntactic children.
+        """A sequence of the token's immediate syntactic children.
 
-        Yields: Token A child token such that child.head==self
+        YIELDS (Token): A child token such that child.head==self
         """
         def __get__(self):
             yield from self.lefts
             yield from self.rights
 
     property subtree:
-        """
-        A sequence of all the token's syntactic descendents.
+        """A sequence of all the token's syntactic descendents.
 
-        Yields: Token A descendent token such that self.is_ancestor(descendent)
+        YIELDS (Token): A descendent token such that
+            `self.is_ancestor(descendent)`.
         """
         def __get__(self):
             for word in self.lefts:
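Navigating the parse with these properties, sketched with a loaded parser (the exact tree depends on the model):

    doc = nlp(u'I like New York in Autumn.')
    like = doc[1]
    print([t.text for t in like.lefts])     # e.g. [u'I']
    print([t.text for t in like.children])  # immediate dependents of u'like'
    print([t.text for t in like.subtree])   # u'like' and all its descendents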
@@ -456,13 +454,15 @@ cdef class Token:
             if self.c.head > 0: # left dependent
                 old_head.c.l_kids -= 1
                 if self.c.l_edge == old_head.c.l_edge:
-                    # the token dominates the left edge so the left edge of the head
-                    # may change when the token is reattached
-                    # it may not change if the new head is a descendant of the current head
+                    # the token dominates the left edge so the left edge of
+                    # the head may change when the token is reattached, it may
+                    # not change if the new head is a descendant of the current
+                    # head
 
                     new_edge = self.c.l_edge
-                    # the new l_edge is the left-most l_edge on any of the other dependents
-                    # where the l_edge is left of the head, otherwise it is the head
+                    # the new l_edge is the left-most l_edge on any of the
+                    # other dependents where the l_edge is left of the head,
+                    # otherwise it is the head
                     if not is_desc:
                         new_edge = old_head.i
                         for child in old_head.children:
@@ -472,14 +472,15 @@ cdef class Token:
                             new_edge = child.c.l_edge
                     old_head.c.l_edge = new_edge
 
-                    # walk up the tree from old_head and assign new l_edge to ancestors
-                    # until an ancestor already has an l_edge that's further left
+                    # walk up the tree from old_head and assign new l_edge to
+                    # ancestors until an ancestor already has an l_edge that's
+                    # further left
                     for anc in old_head.ancestors:
                         if anc.c.l_edge <= new_edge:
                             break
                         anc.c.l_edge = new_edge
 
             elif self.c.head < 0: # right dependent
                 old_head.c.r_kids -= 1
                 # do the same thing as for l_edge
                 if self.c.r_edge == old_head.c.r_edge:
@@ -500,7 +501,7 @@ cdef class Token:
                         anc.c.r_edge = new_edge
 
             # update number of deps of new head
             if rel_newhead_i > 0: # left dependent
                 new_head.c.l_kids += 1
                 # walk up the tree from new head and set l_edge to self.l_edge
                 # until you hit a token with an l_edge further to the left
@@ -511,7 +512,7 @@ cdef class Token:
                             break
                         anc.c.l_edge = self.c.l_edge
 
             elif rel_newhead_i < 0: # right dependent
                 new_head.c.r_kids += 1
                 # do the same as for l_edge
                 if self.c.r_edge > new_head.c.r_edge:
@@ -572,8 +573,8 @@ cdef class Token:
 
     property ent_iob_:
         """IOB code of named entity tag. "B" means the token begins an entity,
-        "I" means it is inside an entity, "O" means it is outside an entity, and
-        "" means no entity tag is set.
+        "I" means it is inside an entity, "O" means it is outside an entity,
+        and "" means no entity tag is set.
 
         RETURNS (unicode): IOB code of named entity tag.
         """
@@ -582,8 +583,7 @@ cdef class Token:
             return iob_strings[self.c.ent_iob]
 
     property ent_id:
-        """ID of the entity the token is an instance of, if any. Usually
-        assigned by patterns in the Matcher.
+        """ID of the entity the token is an instance of, if any.
 
         RETURNS (uint64): ID of the entity.
         """
@@ -594,8 +594,7 @@ cdef class Token:
             self.c.ent_id = key
 
     property ent_id_:
-        """ID of the entity the token is an instance of, if any. Usually
-        assigned by patterns in the Matcher.
+        """ID of the entity the token is an instance of, if any.
 
         RETURNS (unicode): ID of the entity.
         """
@@ -606,34 +605,70 @@ cdef class Token:
             self.c.ent_id = self.vocab.strings.add(name)
 
     property whitespace_:
+        """Trailing space character if present.
+
+        RETURNS (unicode): The whitespace character.
+        """
         def __get__(self):
             return ' ' if self.c.spacy else ''
 
     property orth_:
+        """Verbatim text content (identical to `Token.text`). Existst mostly
+        for consistency with the other attributes.
+
+        RETURNS (unicode): The token text.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.lex.orth]
 
     property lower_:
+        """Lowercase form of the token text. Equivalent to
+        `Token.text.lower()`.
+
+        RETURNS (unicode): The lowercase token text.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.lex.lower]
 
     property norm_:
+        """The token's norm, i.e. a normalised form of the token text.
+        Usually set in the language's tokenizer exceptions or norm exceptions.
+
+        RETURNS (unicode): The norm.
+        """
        def __get__(self):
             return self.vocab.strings[self.c.lex.norm]
 
     property shape_:
+        """Transform of the tokens's string, to show orthographic features.
+        For example, "Xxxx" or "dd".
+
+        RETURNS (unicode): The token shape.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.lex.shape]
 
     property prefix_:
+        """A length-N substring from the start of the token. Defaults to `N=1`.
+
+        RETURNS (unicode): The token's prefix.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.lex.prefix]
 
     property suffix_:
+        """A length-N substring from the end of the token. Defaults to `N=3`.
+
+        RETURNS (unicode): The token's suffix.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.lex.suffix]
 
     property lang_:
+        """Language of the parent document's vocabulary, e.g. 'en'.
+
+        RETURNS (unicode): The language code.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.lex.lang]
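The newly documented string views, sketched on a single token; the shape, prefix and suffix values follow the rules stated in the docstrings (runs of the same character class are capped at four in the shape):

    token = nlp(u'Apples')[0]
    assert token.orth_ == u'Apples'
    assert token.lower_ == u'apples'
    assert token.shape_ == u'Xxxxx'
    assert token.prefix_ == u'A'     # N=1 characters from the start
    assert token.suffix_ == u'les'   # N=3 characters from the end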
@@ -648,65 +683,152 @@ cdef class Token:
             self.c.lemma = self.vocab.strings.add(lemma_)
 
     property pos_:
+        """Coarse-grained part-of-speech.
+
+        RETURNS (unicode): The part-of-speech tag.
+        """
         def __get__(self):
             return parts_of_speech.NAMES[self.c.pos]
 
     property tag_:
+        """Fine-grained part-of-speech.
+
+        RETURNS (unicode): The part-of-speech tag.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.tag]
         def __set__(self, tag):
             self.tag = self.vocab.strings.add(tag)
 
     property dep_:
+        """Syntactic dependency relation.
+
+        RETURNS (unicode): The dependency label.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.dep]
         def __set__(self, unicode label):
             self.c.dep = self.vocab.strings.add(label)
 
     property is_oov:
+        """Is the token out-of-vocabulary?
+
+        RETURNS (bool): Whether the token is out-of-vocabulary.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)
 
     property is_stop:
+        """Is the token part of a "stop list"? (defined by the language data)
+
+        RETURNS (bool): Whether the token is a stop word.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_STOP)
 
     property is_alpha:
+        """Does the token consist of alphabetic characters? Equivalent to
+        `token.text.isalpha()`.
+
+        RETURNS (bool): Whether the token consists of alpha characters.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ALPHA)
 
     property is_ascii:
+        """Does the token consist of ASCII characters? Equivalent to
+        `[any(ord(c) >= 128 for c in token.text)]`.
+
+        RETURNS (bool): Whether the token consists of ASCII characters.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ASCII)
 
     property is_digit:
+        """Does the token consist of digits? Equivalent to
+        `token.text.isdigit()`.
+
+        RETURNS (bool): Whether the token consists of digits.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_DIGIT)
 
     property is_lower:
+        """Is the token in lowercase? Equivalent to `token.text.islower()`.
+
+        RETURNS (bool): Whether the token is in lowercase.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LOWER)
 
+    property is_upper:
+        """Is the token in uppercase? Equivalent to `token.text.isupper()`.
+
+        RETURNS (bool): Whether the token is in uppercase.
+        """
+        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_UPPER)
+
     property is_title:
+        """Is the token in titlecase? Equivalent to `token.text.istitle()`.
+
+        RETURNS (bool): Whether the token is in titlecase.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_TITLE)
 
     property is_punct:
+        """Is the token punctuation?
+
+        RETURNS (bool): Whether the token is punctuation.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
 
     property is_space:
+        """Does the token consist of whitespace characters? Equivalent to
+        `token.text.isspace()`.
+
+        RETURNS (bool): Whether the token consists of whitespace characters.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
 
     property is_bracket:
+        """Is the token a bracket?
+
+        RETURNS (bool): Whether the token is a bracket.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
 
     property is_quote:
+        """Is the token a quotation mark?
+
+        RETURNS (bool): Whether the token is a quotation mark.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
 
     property is_left_punct:
+        """Is the token a left punctuation mark, e.g. "("?
+
+        RETURNS (bool): Whether the token is a left punctuation mark.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
 
     property is_right_punct:
+        """Is the token a left punctuation mark, e.g. "("?
+
+        RETURNS (bool): Whether the token is a left punctuation mark.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
 
     property like_url:
+        """Does the token resemble a URL?
+
+        RETURNS (bool): Whether the token resembles a URL.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL)
 
     property like_num:
+        """Does the token represent a number? e.g. "10.9", "10", "ten", etc.
+
+        RETURNS (bool): Whether the token resembles a number.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_NUM)
 
     property like_email:
+        """Does the token resemble an email address?
+
+        RETURNS (bool): Whether the token resembles an email address.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)
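And the boolean flags, which all delegate to the underlying lexeme. A sketch:

    doc = nlp(u'Hello world!')
    hello, world, excl = doc[0], doc[1], doc[2]
    assert hello.is_title and hello.is_alpha
    assert world.is_lower and not world.is_punct
    assert excl.is_punct and not excl.is_alpha
    assert nlp(u'10.9')[0].like_num   # u'10.9' resembles a number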
diff --git a/website/api/span.jade b/website/api/span.jade
@@ -248,6 +248,28 @@ p
         +cell float
         +cell A scalar similarity score. Higher is more similar.
 
++h(2, "get_lca_matrix") Span.get_lca_matrix
+    +tag method
+
+p
+    | Calculates the lowest common ancestor matrix for a given #[code Span].
+    | Returns LCA matrix containing the integer index of the ancestor, or
+    | #[code -1] if no common ancestor is found, e.g. if span excludes a
+    | necessary ancestor.
+
++aside-code("Example").
+    doc = nlp(u'I like New York in Autumn')
+    span = doc[1:4]
+    matrix = span.get_lca_matrix()
+    # array([[0, 0, 0], [0, 1, 2], [0, 2, 2]], dtype=int32)
+
++table(["Name", "Type", "Description"])
+    +row("foot")
+        +cell returns
+        +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
+        +cell The lowest common ancestor matrix of the #[code Span].
+
+
 +h(2, "to_array") Span.to_array
     +tag method
     +tag-new(2)
@@ -495,6 +517,18 @@ p
             | The text content of the span with a trailing whitespace character
             | if the last token has one.
 
+    +row
+        +cell #[code orth]
+        +cell int
+        +cell ID of the verbatim text content.
+
+    +row
+        +cell #[code orth_]
+        +cell unicode
+        +cell
+            | Verbatim text content (identical to #[code Span.text]). Existst
+            | mostly for consistency with the other attributes.
+
     +row
         +cell #[code label]
         +cell int
diff --git a/website/api/token.jade b/website/api/token.jade
@@ -489,15 +489,35 @@ p The L2 norm of the token's vector representation.
         +cell unicode
         +cell Base form of the token, with no inflectional suffixes.
 
+    +row
+        +cell #[code norm]
+        +cell int
+        +cell
+            | The token's norm, i.e. a normalised form of the token text.
+            | Usually set in the language's
+            | #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or
+            | #[+a("/usage/adding-languages#norm-exceptions") norm exceptions].
+
+    +row
+        +cell #[code norm_]
+        +cell unicode
+        +cell
+            | The token's norm, i.e. a normalised form of the token text.
+            | Usually set in the language's
+            | #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or
+            | #[+a("/usage/adding-languages#norm-exceptions") norm exceptions].
+
     +row
         +cell #[code lower]
         +cell int
-        +cell Lower-case form of the token.
+        +cell Lowercase form of the token.
 
     +row
         +cell #[code lower_]
         +cell unicode
-        +cell Lower-case form of the token.
+        +cell
+            | Lowercase form of the token text. Equivalent to
+            | #[code Token.text.lower()].
 
     +row
         +cell #[code shape]
@@ -537,7 +557,9 @@ p The L2 norm of the token's vector representation.
     +row
         +cell #[code suffix_]
         +cell unicode
-        +cell Length-N substring from the end of the token. Defaults to #[code N=3].
+        +cell
+            | Length-N substring from the end of the token. Defaults to
+            | #[code N=3].
 
     +row
         +cell #[code is_alpha]
@@ -672,6 +694,7 @@ p The L2 norm of the token's vector representation.
         +cell #[code lang]
         +cell int
         +cell Language of the parent document's vocabulary.
 
     +row
         +cell #[code lang_]
         +cell unicode