Update docstrings and API docs for Doc class

This commit is contained in:
ines 2017-05-18 22:17:09 +02:00
parent 0f513850ab
commit b87066ff10
3 changed files with 684 additions and 531 deletions

View File

@@ -63,40 +63,30 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
cdef class Doc:
"""A sequence of Token objects. Access sentences and named entities, export
annotations to numpy arrays, losslessly serialize to compressed binary strings.
The `Doc` object holds an array of `TokenC` structs. The Python-level
`Token` and `Span` objects are views of this array, i.e. they don't own
the data themselves.
EXAMPLE: Construction 1
>>> doc = nlp(u'Some text')
Construction 2
>>> from spacy.tokens import Doc
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
"""
def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
"""Create a Doc object.
vocab (Vocab): A vocabulary object, which must match any models you want
to use (e.g. tokenizer, parser, entity recognizer).
words (list or None): A list of unicode strings to add to the document
as words. If `None`, defaults to an empty list.
spaces (list or None): A list of boolean values, of the same length as
words. True means that the word is followed by a space, False means
it is not. If `None`, defaults to `[True]*len(words)`.
RETURNS (Doc): The newly constructed object.
"""
self.vocab = vocab
size = 20
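A minimal sketch of the two construction routes described above, assuming spaCy with an installed English model (`spacy.load('en')` and the sample strings are illustrative, not part of this commit):

import spacy
from spacy.tokens import Doc

nlp = spacy.load('en')   # assumption: an English model is installed
doc = nlp(u'Some text')  # construction 1: run the full pipeline

# Construction 2: build a Doc directly from words and trailing-space flags.
doc2 = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
           spaces=[True, False, False])
assert doc2.text == u'hello world!'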
@@ -158,20 +148,22 @@ cdef class Doc:
self.is_parsed = True
def __getitem__(self, object i):
"""Get a `Token` or `Span` object.
EXAMPLE:
>>> doc[i]
Get the `Token` object at position `i`, where `i` is an integer.
Negative indexing is supported, and follows the usual Python
semantics, i.e. `doc[-2]` is `doc[len(doc) - 2]`.
>>> doc[start : end]
Get a `Span` object, starting at position `start` and ending at
position `end`, where `start` and `end` are token indices. For
instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4.
Stepped slices (e.g. `doc[start : end : step]`) are not supported,
as `Span` objects must be contiguous (cannot have gaps). You can use
negative indices and open-ended ranges, which have their normal
Python semantics.
"""
if isinstance(i, slice):
start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
@@ -186,14 +178,14 @@ cdef class Doc:
return Token.cinit(self.vocab, &self.c[i], i, self)
def __iter__(self):
"""Iterate over `Token` objects, from which the annotations can be
easily accessed. This is the main way of accessing `Token` objects,
which are the main way annotations are accessed from Python. If
faster-than-Python speeds are required, you can instead access the
annotations as a numpy array, or access the underlying C data directly
from Cython.
EXAMPLE:
>>> for token in doc:
"""
cdef int i
for i in range(self.length):
@@ -203,9 +195,10 @@ cdef class Doc:
yield Token.cinit(self.vocab, &self.c[i], i, self)
def __len__(self):
"""The number of tokens in the document.
EXAMPLE:
>>> len(doc)
"""
return self.length
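A minimal sketch of indexing, slicing, iteration and `len()` together, assuming an installed English model; the sample sentence is the one used in the API docs further down:

import spacy

nlp = spacy.load('en')  # assumption: an English model is installed
doc = nlp(u'Give it back! He pleaded.')
assert doc[0].text == u'Give'        # integer index -> Token
assert doc[-1].text == u'.'          # negative indices, usual Python semantics
assert doc[1:3].text == u'it back'   # slice -> contiguous Span
assert len(doc) == 7
for token in doc:                    # iteration yields Token objects
    print(token.text, token.tag_)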
@@ -228,16 +221,12 @@ cdef class Doc:
return self
def similarity(self, other):
"""Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors.
other (object): The object to compare with. By default, accepts `Doc`,
`Span`, `Token` and `Lexeme` objects.
RETURNS (float): A scalar similarity score. Higher is more similar.
"""
if 'similarity' in self.user_hooks:
return self.user_hooks['similarity'](self, other)
@@ -246,8 +235,10 @@ cdef class Doc:
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
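A sketch of the similarity estimate, assuming a model with word vectors is installed; the symmetry assertion mirrors the example in the API docs below:

import spacy

nlp = spacy.load('en')  # assumption: a model with word vectors
doc1 = nlp(u'apples and oranges')
doc2 = nlp(u'bananas and pears')
score = doc1.similarity(doc2)           # cosine over averaged word vectors
assert score == doc2.similarity(doc1)   # the estimate is symmetric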
property has_vector:
"""A boolean value indicating whether a word vector is associated with
the object.
RETURNS (bool): Whether a word vector is associated with the object.
"""
def __get__(self):
if 'has_vector' in self.user_hooks:
@@ -256,10 +247,11 @@ cdef class Doc:
return any(token.has_vector for token in self)
property vector:
"""A real-valued meaning representation. Defaults to an average of the
token vectors.
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
representing the document's semantics.
"""
def __get__(self):
if 'vector' in self.user_hooks:
@@ -275,6 +267,7 @@ cdef class Doc:
self._vector = value
property vector_norm:
# TODO: docstrings / docs
def __get__(self):
if 'vector_norm' in self.user_hooks:
return self.user_hooks['vector_norm'](self)
@@ -295,34 +288,37 @@ cdef class Doc:
return self.text
property text:
"""A unicode representation of the document text.
RETURNS (unicode): The original verbatim text of the document.
"""
def __get__(self):
return u''.join(t.text_with_ws for t in self)
property text_with_ws:
"""An alias of `Doc.text`, provided for duck-type compatibility with
`Span` and `Token`.
RETURNS (unicode): The original verbatim text of the document.
"""
def __get__(self):
return self.text
property ents:
"""Iterate over the entities in the document. Yields named-entity `Span`
objects, if the entity recognizer has been applied to the document.
YIELDS (Span): Entities in the document.
EXAMPLE: Iterate over the span to get individual Token objects, or access
the label:
>>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
>>> ents = list(tokens.ents)
>>> assert ents[0].label == 346
>>> assert ents[0].label_ == 'PERSON'
>>> assert ents[0].text == 'Mr. Best'
"""
def __get__(self):
cdef int i
@@ -387,12 +383,13 @@ cdef class Doc:
self.c[start].ent_iob = 3
property noun_chunks:
"""Iterate over the base noun phrases in the document. Yields base
noun-phrase `Span` objects, if the document has been syntactically
parsed. A base noun phrase, or "NP chunk", is a noun phrase that does
not permit other NPs to be nested within it, so no NP-level
coordination, no prepositional phrases, and no relative clauses.
YIELDS (Span): Noun chunks in the document.
"""
def __get__(self):
if not self.is_parsed:
@@ -411,17 +408,15 @@ cdef class Doc:
yield span
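A sketch of noun chunk iteration, assuming a model with a syntactic parser; it mirrors the example in the API docs below:

import spacy

nlp = spacy.load('en')  # assumption: a model with a parser is installed
doc = nlp(u'A phrase with another phrase occurs.')
chunks = list(doc.noun_chunks)
assert chunks[0].text == u'A phrase'
assert chunks[1].text == u'another phrase'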
property sents:
"""Iterate over the sentences in the document. Yields sentence `Span`
objects. Sentence spans have no label. To improve accuracy on informal
texts, spaCy calculates sentence boundaries from the syntactic
dependency parse. If the parser is disabled, the `sents` iterator will
be unavailable.
EXAMPLE:
>>> doc = nlp(u"This is a sentence. Here's another...")
>>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
"""
def __get__(self):
if 'sents' in self.user_hooks:
@@ -467,24 +462,20 @@ cdef class Doc:
@cython.boundscheck(False)
cpdef np.ndarray to_array(self, object py_attr_ids):
"""Given a list of M attribute IDs, export the tokens to a numpy
`ndarray` of shape `(N, M)`, where `N` is the length of the document.
The values will be 32-bit integers.
attr_ids (list[int]): A list of attribute ID ints.
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
per word, and one column per attribute indicated in the input
`attr_ids`.
EXAMPLE:
>>> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
>>> doc = nlp(text)
>>> # All strings mapped to integers, for easy export to numpy
>>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
"""
cdef int i, j
cdef attr_id_t feature
@@ -499,27 +490,20 @@ cdef class Doc:
return output
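A runnable variant of the `to_array` example, with a concrete sentence standing in for the `text` placeholder (the attribute choice here is illustrative):

import spacy
from spacy.attrs import ORTH, LOWER, IS_ALPHA

nlp = spacy.load('en')  # assumption: an English model is installed
doc = nlp(u'apple apple orange banana')
np_array = doc.to_array([ORTH, LOWER, IS_ALPHA])
assert np_array.shape == (len(doc), 3)  # one row per token, one column per attribute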
def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
"""Count the frequencies of a given attribute. Produces a dict of
`{attribute (int): count (ints)}` frequencies, keyed by the values of
the given attribute ID.
attr_id (int): The attribute ID to key the counts.
RETURNS (dict): A dictionary mapping attributes to integer counts.
EXAMPLE:
>>> from spacy import attrs
>>> doc = nlp(u'apple apple orange banana')
>>> doc.count_by(attrs.ORTH)
{12800L: 1, 11880L: 2, 7561L: 1}
>>> doc.to_array([attrs.ORTH])
array([[11880], [11880], [7561], [12800]])
"""
cdef int i
cdef attr_t attr
@@ -567,8 +551,12 @@ cdef class Doc:
self.c[i] = parsed[i]
def from_array(self, attrs, int[:, :] array):
"""Load attributes from a numpy array. Write to a `Doc` object, from an
`(M, N)` array of attributes.
attrs (ints): A list of attribute ID ints.
array (numpy.ndarray[ndim=2, dtype='int32']): The attribute values to load.
RETURNS (Doc): Itself.
"""
cdef int i, col
cdef attr_id_t attr_id
@@ -597,8 +585,10 @@ cdef class Doc:
return self
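A round-trip sketch pairing `to_array` with `from_array`; giving `doc2` explicit `words` (so it has tokens to write into) is an assumption, mirroring the API docs example further down:

import spacy
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
from spacy.tokens import Doc

nlp = spacy.load('en')  # assumption: an English model is installed
doc = nlp(u'Give it back! He pleaded.')
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
doc2 = Doc(doc.vocab, words=[t.text for t in doc])  # same tokens, no annotations
doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)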
def to_bytes(self):
"""Serialize, i.e. export the document contents to a binary string.
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
all annotations.
"""
return dill.dumps(
(self.text,
@@ -611,8 +601,10 @@ cdef class Doc:
protocol=-1)
def from_bytes(self, data):
"""Deserialize, i.e. import the document contents from a binary string.
data (bytes): The string to load from.
RETURNS (Doc): Itself.
"""
if self.length != 0:
raise ValueError("Cannot load into non-empty Doc")
@@ -640,21 +632,16 @@ cdef class Doc:
return self
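A serialization round trip; note that `from_bytes()` must be called on an empty `Doc`, per the length check above:

import spacy
from spacy.tokens import Doc

nlp = spacy.load('en')  # assumption: an English model is installed
doc = nlp(u'Give it back! He pleaded.')
data = doc.to_bytes()                   # lossless binary export
doc2 = Doc(doc.vocab).from_bytes(data)  # load into a fresh, empty Doc
assert doc2.text == doc.text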
def merge(self, int start_idx, int end_idx, *args, **attributes):
"""Retokenize the document, such that the span at `doc.text[start_idx : end_idx]`
is merged into a single token. If `start_idx` and `end_idx` do not mark
start and end token boundaries, the document remains unchanged.
start_idx (int): The character index of the start of the slice to merge.
end_idx (int): The character index after the end of the slice to merge.
**attributes: Attributes to assign to the merged token. By default,
attributes are inherited from the syntactic root token of the span.
RETURNS (Token): The newly merged token, or `None` if the start and end
indices did not fall at token boundaries.
"""
cdef unicode tag, lemma, ent_type
if len(args) == 3:
@@ -758,7 +745,29 @@ cdef class Doc:
return self[start]
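A merge sketch using the positional `(tag, lemma, ent_type)` form handled above; the sentence is the one from the API docs example below:

import spacy

nlp = spacy.load('en')  # assumption: an English model is installed
doc = nlp(u'Los Angeles start.')
doc.merge(0, len('Los Angeles'), 'NNP', 'Los Angeles', 'GPE')
assert [t.text for t in doc] == [u'Los Angeles', u'start', u'.']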
def print_tree(self, light=False, flat=False):
"""Returns the parse trees in JSON (dict) format.
light (bool): Don't include lemmas or entities.
flat (bool): Don't include arcs or modifiers.
RETURNS (dict): Parse tree as dict.
EXAMPLE:
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
>>> trees = doc.print_tree()
>>> trees[1]
{'modifiers': [
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
{'modifiers': [
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
'POS_fine': 'NN', 'lemma': 'pizza'},
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
'POS_fine': 'VBD', 'lemma': 'eat'}
"""
return parse_tree(self, light=light, flat=flat)

View File

@@ -6,18 +6,14 @@ from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE
def merge_ents(doc):
"""Helper: merge adjacent entities into single tokens; modifies the doc."""
for ent in doc.ents:
ent.merge(ent.root.tag_, ent.text, ent.label_)
return doc
def format_POS(token, light, flat):
"""Helper: form the POS output for a token."""
subtree = dict([
("word", token.text),
("lemma", token.lemma_), # trigger
@@ -37,9 +33,8 @@ def format_POS(token, light, flat):
def POS_tree(root, light=False, flat=False):
"""Helper: generate a POS tree for a root token. The doc must have had
`merge_ents(doc)` run on it.
"""
subtree = format_POS(root, light=light, flat=flat)
for c in root.children:
@@ -48,21 +43,28 @@ def POS_tree(root, light=False, flat=False):
def parse_tree(doc, light=False, flat=False):
"""Makes a copy of the doc, then constructs a syntactic parse tree similar
to the one used in displaCy. Generates the POS tree for all sentences in
a doc.
doc (Doc): The doc for parsing.
RETURNS (dict): The parse tree.
EXAMPLE:
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
>>> trees = doc.print_tree()
>>> trees[1]
>>> trees[1]
{'modifiers': [
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
{'modifiers': [
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
'POS_fine': 'NN', 'lemma': 'pizza'},
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
'POS_fine': 'VBD', 'lemma': 'eat'}
"""
doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE],

View File

@@ -4,6 +4,503 @@ include ../../_includes/_mixins
p A container for accessing linguistic annotations.
p
| A #[code Doc] is a sequence of #[+api("token") #[code Token]] objects.
| Access sentences and named entities, export annotations to numpy arrays,
| losslessly serialize to compressed binary strings. The #[code Doc] object
| holds an array of #[code TokenC] structs. The Python-level #[code Token]
| and #[+api("span") #[code Span]] objects are views of this array, i.e.
| they don't own the data themselves.
+aside-code("Example").
# Construction 1
doc = nlp(u'Some text')
# Construction 2
from spacy.tokens import Doc
doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
spaces=[True, False, False])
+h(2, "init") Doc.__init__
+tag method
p
| Construct a #[code Doc] object. The most common way to get a #[code Doc]
| object is via the #[code nlp] object.
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code words]
+cell -
+cell A list of strings to add to the container.
+row
+cell #[code spaces]
+cell -
+cell
| A list of boolean values indicating whether each word has a
| subsequent space. Must have the same length as #[code words], if
| specified. Defaults to a sequence of #[code True].
+footrow
+cell return
+cell #[code Doc]
+cell The newly constructed object.
+h(2, "getitem") Doc.__getitem__
+tag method
p
| Get a #[+api("token") #[code Token]] object at position #[code i], where
| #[code i] is an integer. Negative indexing is supported, and follows the
| usual Python semantics, i.e. #[code doc[-2]] is #[code doc[len(doc) - 2]].
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
assert doc[0].text == 'Give'
assert doc[-1].text == '.'
span = doc[1:3]
assert span.text == 'it back'
+table(["Name", "Type", "Description"])
+row
+cell #[code i]
+cell int
+cell The index of the token.
+footrow
+cell return
+cell #[code Token]
+cell The token at #[code doc[i]].
p
| Get a #[+api("span") #[code Span]] object, starting at position
| #[code start] (token index) and ending at position #[code end] (token
| index).
p
| For instance, #[code doc[2:5]] produces a span consisting of tokens 2, 3
| and 4. Stepped slices (e.g. #[code doc[start : end : step]]) are not
| supported, as #[code Span] objects must be contiguous (cannot have gaps).
| You can use negative indices and open-ended ranges, which have their
| normal Python semantics.
+table(["Name", "Type", "Description"])
+row
+cell #[code start_end]
+cell tuple
+cell The slice of the document to get.
+footrow
+cell return
+cell #[code Span]
+cell The span at #[code doc[start : end]].
+h(2, "iter") Doc.__iter__
+tag method
p
| Iterate over #[code Token] objects, from which the annotations can be
| easily accessed.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
for token in doc:
print(token.text, token.tag_)
p
| This is the main way of accessing #[+api("token") #[code Token]] objects,
| which are the main way annotations are accessed from Python. If
| faster-than-Python speeds are required, you can instead access the
| annotations as a numpy array, or access the underlying C data directly
| from Cython.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Token]
+cell A #[code Token] object.
+h(2, "len") Doc.__len__
+tag method
p Get the number of tokens in the document.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
assert len(doc) == 7
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell int
+cell The number of tokens in the document.
+h(2, "similarity") Doc.similarity
+tag method
+tag requires model
p
| Make a semantic similarity estimate. The default estimate is cosine
| similarity using an average of word vectors.
+aside-code("Example").
apples, _, oranges = nlp(u'apples and oranges')
apples_oranges = apples.similarity(oranges)
oranges_apples = oranges.similarity(apples)
assert apples_oranges == oranges_apples
+table(["Name", "Type", "Description"])
+row
+cell #[code other]
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell return
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "count_by") Doc.count_by
+tag method
p
| Count the frequencies of a given attribute. Produces a dict of
| #[code {attr (int): count (ints)}] frequencies, keyed by the values
| of the given attribute ID.
+aside-code("Example").
from spacy import attrs
doc = nlp(u'apple apple orange banana')
doc.count_by(attrs.ORTH)
# {12800L: 1, 11880L: 2, 7561L: 1}
doc.to_array([attrs.ORTH])
# array([[11880], [11880], [7561], [12800]])
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_id]
+cell int
+cell The attribute ID
+footrow
+cell return
+cell dict
+cell A dictionary mapping attributes to integer counts.
+h(2, "to_array") Doc.to_array
+tag method
p
| Export the document annotations to a numpy array of shape #[code N*M]
| where #[code N] is the length of the document and #[code M] is the number
| of attribute IDs to export. The values will be 32-bit integers.
+aside-code("Example").
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
doc = nlp(u'Give it back! He pleaded.')
# All strings mapped to integers, for easy export to numpy
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_ids]
+cell ints
+cell A list of attribute ID ints.
+footrow
+cell return
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+cell
| The exported attributes as a 2D numpy array, with one row per
| token and one column per attribute.
+h(2, "from_array") Doc.from_array
+tag method
p
| Load attributes from a numpy array. Write to a #[code Doc] object, from
| an #[code (M, N)] array of attributes.
+aside-code("Example").
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
from spacy.tokens import Doc
doc = nlp(u'Give it back! He pleaded.')
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
doc2 = Doc(doc.vocab, words=[t.text for t in doc])
doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)
+table(["Name", "Type", "Description"])
+row
+cell #[code attrs]
+cell ints
+cell A list of attribute ID ints.
+row
+cell #[code array]
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+cell The attribute values to load.
+footrow
+cell return
+cell #[code Doc]
+cell Itself.
+h(2, "to_bytes") Doc.to_bytes
+tag method
p Serialize, i.e. export the document contents to a binary string.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
doc_bytes = doc.to_bytes()
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell bytes
+cell
| A losslessly serialized copy of the #[code Doc], including all
| annotations.
+h(2, "from_bytes") Doc.from_bytes
+tag method
p Deserialize, i.e. import the document contents from a binary string.
+aside-code("Example").
from spacy.tokens import Doc
text = u'Give it back! He pleaded.'
doc = nlp(text)
doc_bytes = doc.to_bytes()
doc2 = Doc(doc.vocab).from_bytes(doc_bytes)
assert doc.text == doc2.text
+table(["Name", "Type", "Description"])
+row
+cell #[code data]
+cell bytes
+cell The string to load from.
+footrow
+cell return
+cell #[code Doc]
+cell Itself.
+h(2, "merge") Doc.merge
+tag method
p
| Retokenize the document, such that the span at
| #[code doc.text[start_idx : end_idx]] is merged into a single token. If
| #[code start_idx] and #[code end_idx] do not mark start and end token
| boundaries, the document remains unchanged.
+aside-code("Example").
doc = nlp(u'Los Angeles start.')
doc.merge(0, len('Los Angeles'), 'NNP', 'Los Angeles', 'GPE')
print([token.text for token in doc])
# ['Los Angeles', 'start', '.']
+table(["Name", "Type", "Description"])
+row
+cell #[code start_idx]
+cell int
+cell The character index of the start of the slice to merge.
+row
+cell #[code end_idx]
+cell int
+cell The character index after the end of the slice to merge.
+row
+cell #[code **attributes]
+cell -
+cell
| Attributes to assign to the merged token. By default,
| attributes are inherited from the syntactic root token of
| the span.
+footrow
+cell return
+cell #[code Token]
+cell
| The newly merged token, or #[code None] if the start and end
| indices did not fall at token boundaries.
+h(2, "print_tree") Doc.print_tree
+tag method
+tag requires model
p
| Returns the parse trees in JSON (dict) format. Especially useful for
| web applications.
+aside-code("Example").
doc = nlp(u'Alice ate the pizza.')
trees = doc.print_tree()
# {'modifiers': [
# {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
# {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'},
# {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}
# ], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}
+table(["Name", "Type", "Description"])
+row
+cell #[code light]
+cell bool
+cell Don't include lemmas or entities.
+row
+cell #[code flat]
+cell bool
+cell Don't include arcs or modifiers.
+footrow
+cell return
+cell dict
+cell Parse tree as dict.
+h(2, "text") Doc.text
+tag property
p A unicode representation of the document text.
+aside-code("Example").
text = u'Give it back! He pleaded.'
doc = nlp(text)
assert doc.text == text
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell unicode
+cell The original verbatim text of the document.
+h(2, "text_with_ws") Doc.text_with_ws
+tag property
p
| An alias of #[code Doc.text], provided for duck-type compatibility with
| #[code Span] and #[code Token].
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell unicode
+cell The original verbatim text of the document.
+h(2, "ents") Doc.ents
+tag property
+tag requires model
p
| Iterate over the entities in the document. Yields named-entity
| #[code Span] objects, if the entity recognizer has been applied to the
| document.
+aside-code("Example").
tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
ents = list(tokens.ents)
assert ents[0].label == 346
assert ents[0].label_ == 'PERSON'
assert ents[0].text == 'Mr. Best'
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Span]
+cell Entities in the document.
+h(2, "noun_chunks") Doc.noun_chunks
+tag property
+tag requires model
p
| Iterate over the base noun phrases in the document. Yields base
| noun-phrase #[code Span] objects, if the document has been syntactically
| parsed. A base noun phrase, or "NP chunk", is a noun phrase that does not
| permit other NPs to be nested within it so no NP-level coordination, no
| prepositional phrases, and no relative clauses.
+aside-code("Example").
doc = nlp(u'A phrase with another phrase occurs.')
chunks = list(doc.noun_chunks)
assert chunks[0].text == "A phrase"
assert chunks[1].text == "another phrase"
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Span]
+cell Noun chunks in the document.
+h(2, "sents") Doc.sents
+tag property
+tag requires model
p
| Iterate over the sentences in the document. Sentence spans have no label.
| To improve accuracy on informal texts, spaCy calculates sentence boundaries
| from the syntactic dependency parse. If the parser is disabled,
| the #[code sents] iterator will be unavailable.
+aside-code("Example").
doc = nlp(u"This is a sentence. Here's another...")
sents = list(doc.sents)
assert len(sents) == 2
assert [s.root.text for s in sents] == ["is", "'s"]
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Span]
+cell Sentences in the document.
+h(2, "has_vector") Doc.has_vector
+tag property
+tag requires model
p
| A boolean value indicating whether a word vector is associated with the
| object.
+aside-code("Example").
apple = nlp(u'apple')
assert apple.has_vector
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell bool
+cell Whether the document has vector data attached.
+h(2, "vector") Doc.vector
+tag property
+tag requires model
p
| A real-valued meaning representation. Defaults to an average of the
| token vectors.
+aside-code("Example").
apple = nlp(u'apple')
(apple.vector.dtype, apple.vector.shape)
# (dtype('float32'), (300,))
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the document's semantics.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
@@ -59,358 +556,3 @@ p A container for accessing linguistic annotations.
+cell
| A dictionary that allows customisation of properties of
| #[code Span] children.