mirror of https://github.com/explosion/spaCy.git (synced 2025-01-11 17:56:30 +03:00)

Update docstrings and API docs for Doc class

commit b87066ff10
parent 0f513850ab
spacy/tokens/doc.pyx

@@ -63,40 +63,30 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:

cdef class Doc:
    """A sequence of Token objects. Access sentences and named entities, export
    annotations to numpy arrays, losslessly serialize to compressed binary strings.

    The `Doc` object holds an array of `TokenC` structs. The Python-level
    `Token` and `Span` objects are views of this array, i.e. they don't own
    the data themselves.

    EXAMPLE: Construction 1
        >>> doc = nlp(u'Some text')

        Construction 2
        >>> from spacy.tokens import Doc
        >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
    """
    def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
        """Create a Doc object.

        vocab (Vocab): A vocabulary object, which must match any models you want
            to use (e.g. tokenizer, parser, entity recognizer).
        words (list or None): A list of unicode strings to add to the document
            as words. If `None`, defaults to empty list.
        spaces (list or None): A list of boolean values, of the same length as
            words. True means that the word is followed by a space, False means
            it is not. If `None`, defaults to `[True]*len(words)`.
        RETURNS (Doc): The newly constructed object.
        """
        self.vocab = vocab
        size = 20
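
A minimal usage sketch for the two construction routes documented in the hunk above. It assumes spaCy and an English model are installed; `spacy.load('en')` is the v1-era loading call and is an assumption about the reader's setup:

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.load('en')                 # assumption: English model installed
    doc1 = nlp(u'Some text')               # Construction 1: run the full pipeline
    doc2 = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
               spaces=[True, False, False])  # Construction 2: build from words
    assert doc2.text == u'hello world!'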
@@ -158,20 +148,22 @@ cdef class Doc:
        self.is_parsed = True

    def __getitem__(self, object i):
        """Get a `Token` or `Span` object.

        EXAMPLE:
            >>> doc[i]
            Get the `Token` object at position `i`, where `i` is an integer.
            Negative indexing is supported, and follows the usual Python
            semantics, i.e. `doc[-2]` is `doc[len(doc) - 2]`.

            >>> doc[start : end]
            Get a `Span` object, starting at position `start` and ending at
            position `end`, where `start` and `end` are token indices. For
            instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4.
            Stepped slices (e.g. `doc[start : end : step]`) are not supported,
            as `Span` objects must be contiguous (cannot have gaps). You can use
            negative indices and open-ended ranges, which have their normal
            Python semantics.
        """
        if isinstance(i, slice):
            start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
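
A sketch of the indexing semantics described above, assuming spaCy's default English tokenization of the example sentence:

    import spacy

    nlp = spacy.load('en')                 # assumption: English model installed
    doc = nlp(u'Give it back! He pleaded.')
    assert doc[0].text == 'Give'           # integer index -> Token
    assert doc[-2].text == doc[len(doc) - 2].text   # negative indexing
    span = doc[1:3]                        # slice -> contiguous Span
    assert span.text == 'it back'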
@@ -186,14 +178,14 @@ cdef class Doc:
            return Token.cinit(self.vocab, &self.c[i], i, self)

    def __iter__(self):
        """Iterate over `Token` objects, from which the annotations can be
        easily accessed. This is the main way of accessing `Token` objects,
        which are the main way annotations are accessed from Python. If faster-
        than-Python speeds are required, you can instead access the annotations
        as a numpy array, or access the underlying C data directly from Cython.

        EXAMPLE:
            >>> for token in doc
        """
        cdef int i
        for i in range(self.length):
@@ -203,9 +195,10 @@ cdef class Doc:
            yield Token.cinit(self.vocab, &self.c[i], i, self)

    def __len__(self):
        """The number of tokens in the document.

        EXAMPLE:
            >>> len(doc)
        """
        return self.length
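
Iteration and `len()` from the two hunks above, in one small sketch (same assumptions as before):

    import spacy

    nlp = spacy.load('en')                 # assumption: English model installed
    doc = nlp(u'Give it back! He pleaded.')
    assert len(doc) == 7                   # number of tokens
    words = [token.text for token in doc]  # __iter__ yields Token objects
    assert words[0] == 'Give' and words[-1] == '.'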
@@ -228,16 +221,12 @@ cdef class Doc:
        return self

    def similarity(self, other):
        """Make a semantic similarity estimate. The default estimate is cosine
        similarity using an average of word vectors.

        other (object): The object to compare with. By default, accepts `Doc`,
            `Span`, `Token` and `Lexeme` objects.
        RETURNS (float): A scalar similarity score. Higher is more similar.
        """
        if 'similarity' in self.user_hooks:
            return self.user_hooks['similarity'](self, other)
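
A short sketch of the similarity estimate above. It requires a model with word vectors; since the default estimate is cosine similarity of averaged vectors, the comparison is symmetric:

    import spacy

    nlp = spacy.load('en')                 # assumption: model with word vectors
    doc1 = nlp(u'I like apples')
    doc2 = nlp(u'I like oranges')
    score = doc1.similarity(doc2)          # float, higher is more similar
    assert abs(score - doc2.similarity(doc1)) < 1e-6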
@@ -246,8 +235,10 @@ cdef class Doc:
            return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)

    property has_vector:
        """A boolean value indicating whether a word vector is associated with
        the object.

        RETURNS (bool): Whether a word vector is associated with the object.
        """
        def __get__(self):
            if 'has_vector' in self.user_hooks:
@@ -256,10 +247,11 @@ cdef class Doc:
            return any(token.has_vector for token in self)

    property vector:
        """A real-valued meaning representation. Defaults to an average of the
        token vectors.

        RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
            representing the document's semantics.
        """
        def __get__(self):
            if 'vector' in self.user_hooks:
@@ -275,6 +267,7 @@ cdef class Doc:
            self._vector = value

    property vector_norm:
        # TODO: docstrings / docs
        def __get__(self):
            if 'vector_norm' in self.user_hooks:
                return self.user_hooks['vector_norm'](self)
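
The three vector-related properties above, sketched together. The 300-dimension shape is typical of the bundled English vectors but is an assumption about the loaded model:

    import spacy

    nlp = spacy.load('en')                 # assumption: model with word vectors
    doc = nlp(u'apples and oranges')
    if doc.has_vector:                     # only meaningful with vectors loaded
        assert doc.vector.dtype == 'float32'   # 1D average of token vectors
        assert doc.vector.shape == (300,)      # assumption: 300-d vectors
        assert doc.vector_norm > 0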
@@ -295,34 +288,37 @@ cdef class Doc:
        return self.text

    property text:
        """A unicode representation of the document text.

        RETURNS (unicode): The original verbatim text of the document.
        """
        def __get__(self):
            return u''.join(t.text_with_ws for t in self)

    property text_with_ws:
        """An alias of `Doc.text`, provided for duck-type compatibility with
        `Span` and `Token`.

        RETURNS (unicode): The original verbatim text of the document.
        """
        def __get__(self):
            return self.text

    property ents:
        """Iterate over the entities in the document. Yields named-entity `Span`
        objects, if the entity recognizer has been applied to the document.

        YIELDS (Span): Entities in the document.

        EXAMPLE: Iterate over the span to get individual Token objects, or access
            the label:

            >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
            >>> ents = list(tokens.ents)
            >>> assert ents[0].label == 346
            >>> assert ents[0].label_ == 'PERSON'
            >>> assert ents[0].orth_ == 'Best'
            >>> assert ents[0].text == 'Mr. Best'
        """
        def __get__(self):
            cdef int i
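
The entity example from the docstring above, written out as a runnable sketch. The integer label (346) is vocabulary-dependent, so only the string label and text are checked here:

    import spacy

    nlp = spacy.load('en')                 # assumption: model with an entity recognizer
    doc = nlp(u'Mr. Best flew to New York on Saturday morning.')
    ents = list(doc.ents)                  # named-entity Span objects
    assert ents[0].label_ == 'PERSON'
    assert ents[0].text == 'Mr. Best'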
@@ -387,12 +383,13 @@ cdef class Doc:
        self.c[start].ent_iob = 3

    property noun_chunks:
        """Iterate over the base noun phrases in the document. Yields base
        noun-phrase `Span` objects, if the document has been syntactically
        parsed. A base noun phrase, or "NP chunk", is a noun phrase that does
        not permit other NPs to be nested within it – so no NP-level
        coordination, no prepositional phrases, and no relative clauses.

        YIELDS (Span): Noun chunks in the document.
        """
        def __get__(self):
            if not self.is_parsed:
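
A sketch of noun-chunk iteration as documented above (requires the syntactic parser; the expected chunks assume default English parsing of the example sentence):

    import spacy

    nlp = spacy.load('en')                 # assumption: model with a parser
    doc = nlp(u'A phrase with another phrase occurs.')
    chunks = [chunk.text for chunk in doc.noun_chunks]
    assert chunks[:2] == ['A phrase', 'another phrase']   # base NPs, no nesting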
@@ -411,17 +408,15 @@ cdef class Doc:
                yield span

    property sents:
        """Iterate over the sentences in the document. Yields sentence `Span`
        objects. Sentence spans have no label. To improve accuracy on informal
        texts, spaCy calculates sentence boundaries from the syntactic
        dependency parse. If the parser is disabled, the `sents` iterator will
        be unavailable.

        EXAMPLE:
            >>> doc = nlp("This is a sentence. Here's another...")
            >>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
        """
        def __get__(self):
            if 'sents' in self.user_hooks:
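
Sentence iteration from the docstring above, as a self-contained sketch:

    import spacy

    nlp = spacy.load('en')                 # assumption: model with a parser
    doc = nlp(u"This is a sentence. Here's another...")
    sents = list(doc.sents)                # sentence Span objects, no label
    assert len(sents) == 2
    assert [s.root.text for s in sents] == ['is', "'s"]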
@@ -467,24 +462,20 @@ cdef class Doc:

    @cython.boundscheck(False)
    cpdef np.ndarray to_array(self, object py_attr_ids):
        """Given a list of M attribute IDs, export the tokens to a numpy
        `ndarray` of shape `(N, M)`, where `N` is the length of the document.
        The values will be 32-bit integers.

        attr_ids (list[int]): A list of attribute ID ints.
        RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
            per word, and one column per attribute indicated in the input
            `attr_ids`.

        EXAMPLE:
            >>> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
            >>> doc = nlp(text)
            >>> # All strings mapped to integers, for easy export to numpy
            >>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
        """
        cdef int i, j
        cdef attr_id_t feature
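
The export described above, as a small end-to-end sketch:

    import spacy
    from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA

    nlp = spacy.load('en')                 # assumption: English model installed
    doc = nlp(u'Give it back! He pleaded.')
    np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
    assert np_array.shape == (len(doc), 4)  # one row per token, one column per attribute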
@@ -499,27 +490,20 @@ cdef class Doc:
        return output

    def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
        """Count the frequencies of a given attribute. Produces a dict of
        `{attribute (int): count (ints)}` frequencies, keyed by the values of
        the given attribute ID.

        attr_id (int): The attribute ID to key the counts.
        RETURNS (dict): A dictionary mapping attributes to integer counts.

        EXAMPLE:
            >>> from spacy import attrs
            >>> doc = nlp(u'apple apple orange banana')
            >>> doc.count_by(attrs.ORTH)
            {12800L: 1, 11880L: 2, 7561L: 1}
            >>> doc.to_array([attrs.ORTH])
            array([[11880], [11880], [7561], [12800]])
        """
        cdef int i
        cdef attr_t attr
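
A runnable version of the `count_by` example. The exact integer keys are vocabulary-dependent, so the sketch checks frequencies rather than string IDs:

    import spacy
    from spacy.attrs import ORTH

    nlp = spacy.load('en')                 # assumption: English model installed
    doc = nlp(u'apple apple orange banana')
    counts = doc.count_by(ORTH)            # {orth_id: frequency}
    assert sorted(counts.values()) == [1, 1, 2]   # 'apple' occurs twice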
@@ -567,8 +551,12 @@ cdef class Doc:
            self.c[i] = parsed[i]

    def from_array(self, attrs, int[:, :] array):
        """Load attributes from a numpy array. Write to a `Doc` object, from an
        `(M, N)` array of attributes.

        attrs (ints): A list of attribute ID ints.
        array (numpy.ndarray[ndim=2, dtype='int32']): The attribute values to load.
        RETURNS (Doc): Itself.
        """
        cdef int i, col
        cdef attr_id_t attr_id
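
A round trip through `to_array`/`from_array`, using the same attribute set this commit's `parse_tree` helper loads (HEAD, TAG, DEP, ENT_IOB, ENT_TYPE):

    import spacy
    from spacy.attrs import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE
    from spacy.tokens import Doc

    nlp = spacy.load('en')                 # assumption: English model installed
    doc = nlp(u'Give it back! He pleaded.')
    np_array = doc.to_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE])
    doc2 = Doc(doc.vocab, words=[t.text for t in doc])
    doc2.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE], np_array)
    assert doc2[0].tag_ == doc[0].tag_     # annotations restored from the array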
@@ -597,8 +585,10 @@ cdef class Doc:
        return self

    def to_bytes(self):
        """Serialize, i.e. export the document contents to a binary string.

        RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
            all annotations.
        """
        return dill.dumps(
            (self.text,
@@ -611,8 +601,10 @@ cdef class Doc:
            protocol=-1)

    def from_bytes(self, data):
        """Deserialize, i.e. import the document contents from a binary string.

        data (bytes): The string to load from.
        RETURNS (Doc): Itself.
        """
        if self.length != 0:
            raise ValueError("Cannot load into non-empty Doc")
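
A serialization round trip matching the two methods above (and the API-docs example further down):

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.load('en')                 # assumption: English model installed
    doc = nlp(u'Give it back! He pleaded.')
    data = doc.to_bytes()                  # losslessly serialized Doc
    doc2 = Doc(doc.vocab).from_bytes(data) # load into an empty Doc
    assert doc2.text == doc.text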
@@ -640,21 +632,16 @@ cdef class Doc:
        return self

    def merge(self, int start_idx, int end_idx, *args, **attributes):
        """Retokenize the document, such that the span at `doc.text[start_idx : end_idx]`
        is merged into a single token. If `start_idx` and `end_idx` do not mark
        start and end token boundaries, the document remains unchanged.

        start_idx (int): The character index of the start of the slice to merge.
        end_idx (int): The character index after the end of the slice to merge.
        **attributes: Attributes to assign to the merged token. By default,
            attributes are inherited from the syntactic root token of the span.
        RETURNS (Token): The newly merged token, or `None` if the start and end
            indices did not fall at token boundaries.
        """
        cdef unicode tag, lemma, ent_type
        if len(args) == 3:
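
The merge behaviour described above, using this version's character-index interface with `tag`, `lemma` and `ent_type` passed positionally, as in the API docs:

    import spacy

    nlp = spacy.load('en')                 # assumption: English model installed
    doc = nlp(u'Los Angeles start.')
    doc.merge(0, len('Los Angeles'), 'NNP', 'Los Angeles', 'GPE')
    assert [t.text for t in doc] == ['Los Angeles', 'start', '.']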
@@ -758,7 +745,29 @@ cdef class Doc:
        return self[start]

    def print_tree(self, light=False, flat=False):
        """Returns the parse trees in JSON (dict) format.

        light (bool): Don't include lemmas or entities.
        flat (bool): Don't include arcs or modifiers.
        RETURNS (dict): Parse tree as dict.

        EXAMPLE:
            >>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
            >>> trees = doc.print_tree()
            >>> trees[1]
            {'modifiers': [
                {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
                 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
                {'modifiers': [
                    {'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
                     'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
                 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
                 'POS_fine': 'NN', 'lemma': 'pizza'},
                {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
                 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
             'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
             'POS_fine': 'VBD', 'lemma': 'eat'}
        """
        return parse_tree(self, light=light, flat=flat)
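
A sketch of handling `print_tree` output. The exact dictionary contents depend on the model, so only the documented keys and the root arc are inspected:

    import spacy

    nlp = spacy.load('en')                 # assumption: model with a parser
    doc = nlp(u'Alice ate the pizza.')
    trees = doc.print_tree(light=False, flat=False)   # one dict per sentence
    root = trees[0]
    assert root['arc'] == 'ROOT'
    assert {'word', 'lemma', 'NE', 'POS_coarse', 'POS_fine', 'modifiers'} <= set(root.keys())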
spacy/tokens/printers.py

@@ -6,18 +6,14 @@ from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE


def merge_ents(doc):
    """Helper: merge adjacent entities into single tokens; modifies the doc."""
    for ent in doc.ents:
        ent.merge(ent.root.tag_, ent.text, ent.label_)
    return doc


def format_POS(token, light, flat):
    """Helper: form the POS output for a token."""
    subtree = dict([
        ("word", token.text),
        ("lemma", token.lemma_), # trigger
@@ -37,9 +33,8 @@ def format_POS(token, light, flat):


def POS_tree(root, light=False, flat=False):
    """Helper: generate a POS tree for a root token. The doc must have
    `merge_ents(doc)` run on it.
    """
    subtree = format_POS(root, light=light, flat=flat)
    for c in root.children:
@@ -48,21 +43,28 @@ def POS_tree(root, light=False, flat=False):


def parse_tree(doc, light=False, flat=False):
    """Make a copy of the doc and construct a syntactic parse tree, similar to
    the one used in displaCy. Generates the POS tree for all sentences in a doc.

    doc (Doc): The doc for parsing.
    RETURNS (dict): The parse tree.

    EXAMPLE:
        >>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
        >>> trees = doc.print_tree()
        >>> trees[1]
        {'modifiers': [
            {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
             'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
            {'modifiers': [
                {'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
                 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
             'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
             'POS_fine': 'NN', 'lemma': 'pizza'},
            {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
             'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
         'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
         'POS_fine': 'VBD', 'lemma': 'eat'}
    """
    doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
    doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE],
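
The helpers above can also be used directly. A sketch, assuming the module lives at `spacy.tokens.printers` (the import path is inferred from the relative imports in the hunk and is an assumption):

    import spacy
    from spacy.tokens.printers import merge_ents, parse_tree  # assumption: module path

    nlp = spacy.load('en')                 # assumption: model with parser and NER
    doc = nlp(u'Bob brought Alice the pizza.')
    merge_ents(doc)                        # merge adjacent entity tokens in place
    trees = parse_tree(doc)                # same structure as doc.print_tree()
    assert trees[0]['word'] == 'brought'   # sentence root of the first tree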
website/docs/api/doc.jade

@@ -4,6 +4,503 @@ include ../../_includes/_mixins

p A container for accessing linguistic annotations.

p
    | A #[code Doc] is a sequence of #[+api("token") #[code Token]] objects.
    | Access sentences and named entities, export annotations to numpy arrays,
    | losslessly serialize to compressed binary strings. The #[code Doc] object
    | holds an array of #[code TokenC] structs. The Python-level #[code Token]
    | and #[+api("span") #[code Span]] objects are views of this array, i.e.
    | they don't own the data themselves.

+aside-code("Example").
    # Construction 1
    doc = nlp(u'Some text')

    # Construction 2
    from spacy.tokens import Doc
    doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
              spaces=[True, False, False])

+h(2, "init") Doc.__init__
    +tag method

p
    | Construct a #[code Doc] object. The most common way to get a #[code Doc]
    | object is via the #[code nlp] object.

+table(["Name", "Type", "Description"])
    +row
        +cell #[code vocab]
        +cell #[code Vocab]
        +cell A storage container for lexical types.

    +row
        +cell #[code words]
        +cell -
        +cell A list of strings to add to the container.

    +row
        +cell #[code spaces]
        +cell -
        +cell
            | A list of boolean values indicating whether each word has a
            | subsequent space. Must have the same length as #[code words], if
            | specified. Defaults to a sequence of #[code True].

    +footrow
        +cell return
        +cell #[code Doc]
        +cell The newly constructed object.
+h(2, "getitem") Doc.__getitem__
    +tag method

p
    | Get a #[+api("token") #[code Token]] object at position #[code i], where
    | #[code i] is an integer. Negative indexing is supported, and follows the
    | usual Python semantics, i.e. #[code doc[-2]] is #[code doc[len(doc) - 2]].

+aside-code("Example").
    doc = nlp(u'Give it back! He pleaded.')
    assert doc[0].text == 'Give'
    assert doc[-1].text == '.'
    span = doc[1:3]
    assert span.text == 'it back'

+table(["Name", "Type", "Description"])
    +row
        +cell #[code i]
        +cell int
        +cell The index of the token.

    +footrow
        +cell return
        +cell #[code Token]
        +cell The token at #[code doc[i]].

p
    | Get a #[+api("span") #[code Span]] object, starting at position
    | #[code start] (token index) and ending at position #[code end] (token
    | index).

p
    | For instance, #[code doc[2:5]] produces a span consisting of tokens 2, 3
    | and 4. Stepped slices (e.g. #[code doc[start : end : step]]) are not
    | supported, as #[code Span] objects must be contiguous (cannot have gaps).
    | You can use negative indices and open-ended ranges, which have their
    | normal Python semantics.

+table(["Name", "Type", "Description"])
    +row
        +cell #[code start_end]
        +cell tuple
        +cell The slice of the document to get.

    +footrow
        +cell return
        +cell #[code Span]
        +cell The span at #[code doc[start : end]].

+h(2, "iter") Doc.__iter__
    +tag method

p
    | Iterate over #[code Token] objects, from which the annotations can be
    | easily accessed.

+aside-code("Example").
    doc = nlp(u'Give it back! He pleaded.')
    for token in doc:
        print(token.text, token.tag_)

p
    | This is the main way of accessing #[+api("token") #[code Token]] objects,
    | which are the main way annotations are accessed from Python. If
    | faster-than-Python speeds are required, you can instead access the
    | annotations as a numpy array, or access the underlying C data directly
    | from Cython.

+table(["Name", "Type", "Description"])
    +footrow
        +cell yield
        +cell #[code Token]
        +cell A #[code Token] object.

+h(2, "len") Doc.__len__
    +tag method

p Get the number of tokens in the document.

+aside-code("Example").
    doc = nlp(u'Give it back! He pleaded.')
    assert len(doc) == 7

+table(["Name", "Type", "Description"])
    +footrow
        +cell return
        +cell int
        +cell The number of tokens in the document.

+h(2, "similarity") Doc.similarity
    +tag method
    +tag requires model

p
    | Make a semantic similarity estimate. The default estimate is cosine
    | similarity using an average of word vectors.

+aside-code("Example").
    apples, _, oranges = nlp(u'apples and oranges')
    apples_oranges = apples.similarity(oranges)
    oranges_apples = oranges.similarity(apples)
    assert apples_oranges == oranges_apples

+table(["Name", "Type", "Description"])
    +row
        +cell #[code other]
        +cell -
        +cell
            | The object to compare with. By default, accepts #[code Doc],
            | #[code Span], #[code Token] and #[code Lexeme] objects.

    +footrow
        +cell return
        +cell float
        +cell A scalar similarity score. Higher is more similar.

+h(2, "count_by") Doc.count_by
    +tag method

p
    | Count the frequencies of a given attribute. Produces a dict of
    | #[code {attr (int): count (ints)}] frequencies, keyed by the values
    | of the given attribute ID.

+aside-code("Example").
    from spacy import attrs
    doc = nlp(u'apple apple orange banana')
    doc.count_by(attrs.ORTH)
    # {12800L: 1, 11880L: 2, 7561L: 1}
    doc.to_array([attrs.ORTH])
    # array([[11880], [11880], [7561], [12800]])

+table(["Name", "Type", "Description"])
    +row
        +cell #[code attr_id]
        +cell int
        +cell The attribute ID

    +footrow
        +cell return
        +cell dict
        +cell A dictionary mapping attributes to integer counts.
+h(2, "to_array") Doc.to_array
    +tag method

p
    | Export the document annotations to a numpy array of shape #[code N*M]
    | where #[code N] is the length of the document and #[code M] is the number
    | of attribute IDs to export. The values will be 32-bit integers.

+aside-code("Example").
    from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
    doc = nlp(text)
    # All strings mapped to integers, for easy export to numpy
    np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])

+table(["Name", "Type", "Description"])
    +row
        +cell #[code attr_ids]
        +cell ints
        +cell A list of attribute ID ints.

    +footrow
        +cell return
        +cell #[code numpy.ndarray[ndim=2, dtype='int32']]
        +cell
            | The exported attributes as a 2D numpy array, with one row per
            | token and one column per attribute.

+h(2, "from_array") Doc.from_array
    +tag method

p
    | Load attributes from a numpy array. Write to a #[code Doc] object, from
    | an #[code (M, N)] array of attributes.

+aside-code("Example").
    from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
    from spacy.tokens import Doc
    doc = nlp(text)
    np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
    doc2 = Doc(doc.vocab)
    doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)

+table(["Name", "Type", "Description"])
    +row
        +cell #[code attrs]
        +cell ints
        +cell A list of attribute ID ints.

    +row
        +cell #[code array]
        +cell #[code numpy.ndarray[ndim=2, dtype='int32']]
        +cell The attribute values to load.

    +footrow
        +cell return
        +cell #[code Doc]
        +cell Itself.

+h(2, "to_bytes") Doc.to_bytes
    +tag method

p Serialize, i.e. export the document contents to a binary string.

+aside-code("Example").
    doc = nlp(u'Give it back! He pleaded.')
    doc_bytes = doc.to_bytes()

+table(["Name", "Type", "Description"])
    +footrow
        +cell return
        +cell bytes
        +cell
            | A losslessly serialized copy of the #[code Doc], including all
            | annotations.

+h(2, "from_bytes") Doc.from_bytes
    +tag method

p Deserialize, i.e. import the document contents from a binary string.

+aside-code("Example").
    from spacy.tokens import Doc
    text = u'Give it back! He pleaded.'
    doc = nlp(text)
    doc_bytes = doc.to_bytes()
    doc2 = Doc(doc.vocab).from_bytes(doc_bytes)
    assert doc.text == doc2.text

+table(["Name", "Type", "Description"])
    +row
        +cell #[code data]
        +cell bytes
        +cell The string to load from.

    +footrow
        +cell return
        +cell #[code Doc]
        +cell Itself.
+h(2, "merge") Doc.merge
    +tag method

p
    | Retokenize the document, such that the span at
    | #[code doc.text[start_idx : end_idx]] is merged into a single token. If
    | #[code start_idx] and #[code end_idx] do not mark start and end token
    | boundaries, the document remains unchanged.

+aside-code("Example").
    doc = nlp(u'Los Angeles start.')
    doc.merge(0, len('Los Angeles'), 'NNP', 'Los Angeles', 'GPE')
    print([token.text for token in doc])
    # ['Los Angeles', 'start', '.']

+table(["Name", "Type", "Description"])
    +row
        +cell #[code start_idx]
        +cell int
        +cell The character index of the start of the slice to merge.

    +row
        +cell #[code end_idx]
        +cell int
        +cell The character index after the end of the slice to merge.

    +row
        +cell #[code **attributes]
        +cell -
        +cell
            | Attributes to assign to the merged token. By default,
            | attributes are inherited from the syntactic root token of
            | the span.

    +footrow
        +cell return
        +cell #[code Token]
        +cell
            | The newly merged token, or #[code None] if the start and end
            | indices did not fall at token boundaries.

+h(2, "print_tree") Doc.print_tree
    +tag method
    +tag requires model

p
    | Returns the parse trees in JSON (dict) format. Especially useful for
    | web applications.

+aside-code("Example").
    doc = nlp('Alice ate the pizza.')
    trees = doc.print_tree()
    # {'modifiers': [
    #   {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
    #   {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'},
    #   {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}
    # ], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}

+table(["Name", "Type", "Description"])
    +row
        +cell #[code light]
        +cell bool
        +cell Don't include lemmas or entities.

    +row
        +cell #[code flat]
        +cell bool
        +cell Don't include arcs or modifiers.

    +footrow
        +cell return
        +cell dict
        +cell Parse tree as dict.
+h(2, "text") Doc.text
    +tag property

p A unicode representation of the document text.

+aside-code("Example").
    text = u'Give it back! He pleaded.'
    doc = nlp(text)
    assert doc.text == text

+table(["Name", "Type", "Description"])
    +footrow
        +cell return
        +cell unicode
        +cell The original verbatim text of the document.

+h(2, "text_with_ws") Doc.text_with_ws
    +tag property

p
    | An alias of #[code Doc.text], provided for duck-type compatibility with
    | #[code Span] and #[code Token].

+table(["Name", "Type", "Description"])
    +footrow
        +cell return
        +cell unicode
        +cell The original verbatim text of the document.

+h(2, "ents") Doc.ents
    +tag property
    +tag requires model

p
    | Iterate over the entities in the document. Yields named-entity
    | #[code Span] objects, if the entity recognizer has been applied to the
    | document.

+aside-code("Example").
    tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
    ents = list(tokens.ents)
    assert ents[0].label == 346
    assert ents[0].label_ == 'PERSON'
    assert ents[0].text == 'Mr. Best'

+table(["Name", "Type", "Description"])
    +footrow
        +cell yield
        +cell #[code Span]
        +cell Entities in the document.

+h(2, "noun_chunks") Doc.noun_chunks
    +tag property
    +tag requires model

p
    | Iterate over the base noun phrases in the document. Yields base
    | noun-phrase #[code Span] objects, if the document has been syntactically
    | parsed. A base noun phrase, or "NP chunk", is a noun phrase that does not
    | permit other NPs to be nested within it – so no NP-level coordination, no
    | prepositional phrases, and no relative clauses.

+aside-code("Example").
    doc = nlp(u'A phrase with another phrase occurs.')
    chunks = list(doc.noun_chunks)
    assert chunks[0].text == "A phrase"
    assert chunks[1].text == "another phrase"

+table(["Name", "Type", "Description"])
    +footrow
        +cell yield
        +cell #[code Span]
        +cell Noun chunks in the document.
+h(2, "sents") Doc.sents
    +tag property
    +tag requires model

p
    | Iterate over the sentences in the document. Sentence spans have no label.
    | To improve accuracy on informal texts, spaCy calculates sentence boundaries
    | from the syntactic dependency parse. If the parser is disabled,
    | the #[code sents] iterator will be unavailable.

+aside-code("Example").
    doc = nlp(u"This is a sentence. Here's another...")
    sents = list(doc.sents)
    assert len(sents) == 2
    assert [s.root.text for s in sents] == ["is", "'s"]

+table(["Name", "Type", "Description"])
    +footrow
        +cell yield
        +cell #[code Span]
        +cell Sentences in the document.

+h(2, "has_vector") Doc.has_vector
    +tag property
    +tag requires model

p
    | A boolean value indicating whether a word vector is associated with the
    | object.

+aside-code("Example").
    apple = nlp(u'apple')
    assert apple.has_vector

+table(["Name", "Type", "Description"])
    +footrow
        +cell return
        +cell bool
        +cell Whether the document has vector data attached.

+h(2, "vector") Doc.vector
    +tag property
    +tag requires model

p
    | A real-valued meaning representation. Defaults to an average of the
    | token vectors.

+aside-code("Example").
    apple = nlp(u'apple')
    (apple.vector.dtype, apple.vector.shape)
    # (dtype('float32'), (300,))

+table(["Name", "Type", "Description"])
    +footrow
        +cell return
        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
        +cell A 1D numpy array representing the document's semantics.

+h(2, "attributes") Attributes

+table(["Name", "Type", "Description"])
@@ -59,358 +556,3 @@ p A container for accessing linguistic annotations.
        +cell
            | A dictionary that allows customisation of properties of
            | #[code Span] children.

Removed in this hunk: the previous, example-free versions of the sections now
documented above (Doc.__init__, Doc.__getitem__, Doc.__iter__, Doc.__len__,
Doc.similarity, Doc.to_array, Doc.count_by, Doc.from_array, Doc.to_bytes,
Doc.from_bytes, Doc.merge, Doc.text, Doc.text_with_ws, Doc.sents, Doc.ents,
Doc.noun_chunks, Doc.vector and Doc.has_vector), together with the
Doc.read_bytes section below, which is not re-added:

+h(2, "read_bytes") Doc.read_bytes
    +tag staticmethod

p A static method, used to read serialized #[code Doc] objects from a file.

+aside-code("Example").
    from spacy.tokens.doc import Doc
    loc = 'test_serialize.bin'
    with open(loc, 'wb') as file_:
        file_.write(nlp(u'This is a document.').to_bytes())
        file_.write(nlp(u'This is another.').to_bytes())
    docs = []
    with open(loc, 'rb') as file_:
        for byte_string in Doc.read_bytes(file_):
            docs.append(Doc(nlp.vocab).from_bytes(byte_string))
    assert len(docs) == 2

+table(["Name", "Type", "Description"])
    +row
        +cell file
        +cell buffer
        +cell A binary buffer to read the serialized annotations from.

    +footrow
        +cell yield
        +cell bytes
        +cell Binary strings from which documents can be loaded.