spaCy/website/docs/_api-doc.jade

//- ----------------------------------
//- 💫 DOCS > API > DOC
//- ----------------------------------

+section("doc")
    +h(2, "doc", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/tokens/doc.pyx")
        | #[+tag class] Doc

    p
        | A sequence of #[code Token] objects. Access sentences and named entities,
        | export annotations to numpy arrays, losslessly serialize to compressed
        | binary strings.

        +aside.
            Internally, the #[code Doc] object holds an array of #[code TokenC] structs.
            The Python-level #[code Token] and #[code Span] objects are views of this
            array, i.e. they don't own the data themselves.

    +code("python", "Overview").
        class Doc:
            def __init__(self, vocab, orths_and_spaces=None):
                return self

            def __getitem__(self, int i):
                return Token()
            def __getitem__(self, slice i_j):
                return Span()
            def __iter__(self):
                yield Token()
            def __len__(self):
                return int

            def __unicode__(self):
                return unicode
            def __bytes__(self):
                return utf8
            def __repr__(self):
                return unicode

            @property
            def text(self):
                return unicode
            @property
            def text_with_ws(self):
                return unicode

            @property
            def vector(self):
                return numpy.ndarray(dtype='float32')
            @property
            def vector_norm(self):
                return float
            @property
            def ents(self):
                yield Span()
            @property
            def noun_chunks(self):
                yield Span()
            @property
            def sents(self):
                yield Span()

            def similarity(self, other):
                return float

            def merge(self, start_char, end_char, tag, lemma, ent_type):
                return None

            def to_array(self, attr_ids):
                return numpy.ndarray(shape=(len(self), len(attr_ids)), dtype='int64')

            def count_by(self, attr_id, exclude=None, counts=None):
                return dict

            def to_bytes(self):
                return bytes

            def from_array(self, attrs, array):
                return None

            def from_bytes(self, data):
                return self

            @staticmethod
            def read_bytes(file_):
                yield bytes

    +section("doc-init")
        +h(3, "doc-init")
            | #[+tag method] Doc.__init__

        .has-aside
            +code("python", "Definition").
                def __init__(self, vocab, orths_and_spaces=None):
                    return Doc

            +aside("Implementation").
                This method of constructing a #[code Doc] object is usually only used
                for deserialization. Standard usage is to construct the document via
                a call to the language object.

        +table(["Name", "Type", "Description"])
            +row
                +cell vocab
                +cell.
                    A Vocabulary object, which must match any models you want to
                    use (e.g. tokenizer, parser, entity recognizer).

            +row
                +cell orths_and_spaces
                +cell.
                    A list of tokens in the document as a sequence of
                    #[code (orth_id, has_space)] tuples, where #[code orth_id]
                    is an integer and #[code has_space] is a boolean, indicating
                    whether the token has a trailing space.

    +section("doc-sequenceapi")
        +h(3, "doc-sequenceapi")
            | #[+tag Section] Sequence API

        +table(["Example", "Description"])
            +row
                +cell #[code doc[i]]
                +cell.
                    Get the Token object at position i, where i is an integer.
                    Negative indexing is supported, and follows the usual Python
                    semantics, i.e. doc[-2] is doc[len(doc) - 2].

            +row
                +cell #[code doc[start : end]]
                +cell.
                    Get a #[code Span] object, starting at position #[code start]
                    and ending at position #[code end], where #[code start] and
                    #[code end] are token indices. For instance,
                    #[code doc[2:5]] produces a span consisting of
                    tokens 2, 3 and 4. Stepped slices (e.g. #[code doc[start : end : step]])
                    are not supported, as #[code Span] objects must be contiguous
                    (cannot have gaps). You can use negative indices and open-ended
                    ranges, which have their normal Python semantics.

            +row
                +cell #[code for token in doc]
                +cell.
                    Iterate over Token  objects, from which the annotations can
                    be easily accessed. This is the main way of accessing Token
                    objects, which are the main way annotations are accessed from
                    Python. If faster-than-Python speeds are required, you can
                    instead access the annotations as a numpy array, or access the
                    underlying C data directly from Cython.

            +row
                +cell #[code len(doc)]
                +cell.
                    The number of tokens in the document.

    +section("doc-spans")
        +h(3, "doc-spans-sents")
            | #[+tag property] Doc.sents

        p.
            Yields sentence #[code Span] objects. Sentence spans have no label.
            To improve accuracy on informal texts, spaCy calculates sentence
            boundaries from the syntactic dependency parse. If the parser is disabled,
            the #[code sents] iterator will be unavailable.

        +code("python", "Example").
            from spacy.en import English
            nlp = English()
            doc = nlp("This is a sentence. Here's another...")
            assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]

        +h(3, "doc-spans-ents")
            | #[+tag property] Doc.ents

        p.
            Yields named-entity #[code Span] objects, if the entity recognizer
            has been applied to the document. Iterate over the span to get
            individual Token objects, or access the label:

        +code("python", "Example").
            from spacy.en import English
            nlp = English()
            tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
            ents = list(tokens.ents)
            assert ents[0].label == 346
            assert ents[0].label_ == 'PERSON'
            assert ents[0].orth_ == 'Best'
            assert ents[0].text == 'Mr. Best'

        +h(3, "doc-spans-nounchunks")
            | #[+tag property] Doc.noun_chunks

        p.
            Yields base noun-phrase #[code Span] objects, if the document
            has been syntactically parsed. A base noun phrase, or
            'NP chunk', is a noun phrase that does not permit other NPs to
            be nested within it – so no NP-level coordination, no prepositional
            phrases, and no relative clauses. For example:

        +code("python", "Example").
            from spacy.en import English
            nlp = English()
            doc = nlp(u'The sentence in this example has three noun chunks.')
            for chunk in doc.noun_chunks:
                print(chunk.label_, chunk.orth_, '&lt;--', chunk.root.head.orth_)

    +section("doc-exportimport-toarray")
        +h(3, "doc-exportimport-toarray")
            | #[+tag method] Doc.to_array

        p.
            Given a list of M attribute IDs, export the tokens to a numpy
            #[code ndarray] of shape #[code N*M], where #[code N] is the length
            of the document. The values will be 32-bit integers.

        +code("python", "Example").
            from spacy import attrs
            doc = nlp(text)
            # All strings mapped to integers, for easy export to numpy
            np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA])

        +code("python", "Definition").
            def to_array(self, attr_ids):
                return numpy.ndarray(shape=(len(self), len(attr_ids)), dtype='int64')

        +table(["Name", "Type", "Description"])
            +row
                +cell attr_ids
                +cell list of ints
                +cell.
                    A list of attribute ID ints. Attribute IDs can be imported
                    from #[code spacy.attrs] or #[code spacy.symbols].

    +section("doc-exportimport-countby")
        +h(4, "doc-exportimport-countby")
            | #[+tag method] Doc.count_by

        p.
            Produce a dict of #[code {attribute (int): count (ints)}] frequencies,
            keyed by the values of the given attribute ID.

        +code("python", "Example").
            def count_by(self, attr_id):
                return dict

        +table(["Name", "Type", "Description"])
            +row
                +cell attr_id
                +cell int
                +cell.
                    The attribute ID to key the counts.

    +section("doc-exportimport-fromarray")
        +h(4, "doc-exportimport-fromarray")
            | #[+tag method] Doc.from_array

        p Write to a #[code Doc] object, from an M*N array of attributes.

        +code("python", "Definition").
            def from_array(self, attrs, array):
                return None

    +section("doc-exportimport-frombytes")
        +h(4, "doc-exportimport-frombytes") Doc.from_bytes

        p Deserialize, loading from bytes.

        +code("python", "Definition").
            def from_bytes(self, byte_string):
                return Doc

    +section("doc-exportimport-tobytes")
        +h(4, "doc-exportimport-tobytes")
            | #[+tag method] Doc.to_bytes

        p Serialize, producing a byte string.

        +code("python", "Definition").
            def to_bytes(self):
                return bytes

    +section("doc-exportimport-readbytes")
        +h(4, "doc-exportimport-readbytes")
            | #[+tag method] Doc.read_bytes

        p.
            A static method, used to read serialized #[code Doc] objects from
            a file. For example:

        +code("python", "Example").
            from spacy.tokens.doc import Doc
            loc = 'test_serialize.bin'
            with open(loc, 'wb') as file_:
                file_.write(nlp(u'This is a document.').to_bytes())
                file_.write(nlp(u'This is another.').to_bytes())
            docs = []
            with open(loc, 'rb') as file_:
                for byte_string in Doc.read_bytes(file_):
                    docs.append(Doc(nlp.vocab).from_bytes(byte_string))
            assert len(docs) == 2

        +code("python", "Definition").
            @staticmethod
            def read_bytes(file_):
                yield bytes
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								//- ----------------------------------
 								//- 💫 DOCS > API > DOC
 								//- ----------------------------------
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								+section("doc")
 								    +h(2, "doc", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/tokens/doc.pyx")
 								        | #[+tag class] Doc
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
 								    p
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								        | A sequence of #[code Token] objects. Access sentences and named entities,
 								        | export annotations to numpy arrays, losslessly serialize to compressed
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								        | binary strings.
 								        +aside.
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								            Internally, the #[code Doc] object holds an array of #[code TokenC] structs.
 								            The Python-level #[code Token] and #[code Span] objects are views of this
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								            array, i.e. they don't own the data themselves.
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								    +code("python", "Overview").
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								        class Doc:
 								            def __init__(self, vocab, orths_and_spaces=None):
 								                return self
 								            def __getitem__(self, int i):
 								                return Token()
 								            def __getitem__(self, slice i_j):
 								                return Span()
 								            def __iter__(self):
 								                yield Token()
 								            def __len__(self):
 								                return int
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								            def __unicode__(self):
 								                return unicode
 								            def __bytes__(self):
 								                return utf8
 								            def __repr__(self):
 								                return unicode
 								            @property
 								            def text(self):
 								                return unicode
 								            @property
 								            def text_with_ws(self):
 								                return unicode
 								            @property
 								            def vector(self):
 								                return numpy.ndarray(dtype='float32')
 								            @property
 								            def vector_norm(self):
 								                return float
 								            @property
 								            def ents(self):
 								                yield Span()
 								            @property
 								            def noun_chunks(self):
 								                yield Span()
 								            @property
 								            def sents(self):
 								                yield Span()
 								            def similarity(self, other):
 								                return float
 								            def merge(self, start_char, end_char, tag, lemma, ent_type):
 								                return None
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								            def to_array(self, attr_ids):
 								                return numpy.ndarray(shape=(len(self), len(attr_ids)), dtype='int64')
 								            def count_by(self, attr_id, exclude=None, counts=None):
 								                return dict
 								            def to_bytes(self):
 								                return bytes
 								            def from_array(self, attrs, array):
 								                return None
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								            def from_bytes(self, data):
 								                return self
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								            @staticmethod
 								            def read_bytes(file_):
 								                yield bytes
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
 								    +section("doc-init")
 								        +h(3, "doc-init")
 								            | #[+tag method] Doc.__init__
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
 								        .has-aside
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								            +code("python", "Definition").
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								                def __init__(self, vocab, orths_and_spaces=None):
 								                    return Doc
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								            +aside("Implementation").
 								                This method of constructing a #[code Doc] object is usually only used
 								                for deserialization. Standard usage is to construct the document via
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								                a call to the language object.
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								        +table(["Name", "Type", "Description"])
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								            +row
 								                +cell vocab
 								                +cell.
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								                    A Vocabulary object, which must match any models you want to
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								                    use (e.g. tokenizer, parser, entity recognizer).
 								            +row
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								                +cell orths_and_spaces
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								                +cell.
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								                    A list of tokens in the document as a sequence of
 								                    #[code (orth_id, has_space)] tuples, where #[code orth_id]
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								                    is an integer and #[code has_space] is a boolean, indicating
 								                    whether the token has a trailing space.
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								    +section("doc-sequenceapi")
 								        +h(3, "doc-sequenceapi")
 								            | #[+tag Section] Sequence API
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								        +table(["Example", "Description"])
 								            +row
 								                +cell #[code doc[i]]
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								                +cell.
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								                    Get the Token object at position i, where i is an integer.
 								                    Negative indexing is supported, and follows the usual Python
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								                    semantics, i.e. doc[-2] is doc[len(doc) - 2].
 								            +row
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								                +cell #[code doc[start : end]]
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								                +cell.
 								                    Get a #[code Span] object, starting at position #[code start]
 								                    and ending at position #[code end], where #[code start] and
 								                    #[code end] are token indices. For instance,
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								                    #[code doc[2:5]] produces a span consisting of
 								                    tokens 2, 3 and 4. Stepped slices (e.g. #[code doc[start : end : step]])
 								                    are not supported, as #[code Span] objects must be contiguous
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								                    (cannot have gaps). You can use negative indices and open-ended
 								                    ranges, which have their normal Python semantics.
 								            +row
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								                +cell #[code for token in doc]
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								                +cell.
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								                    Iterate over Token  objects, from which the annotations can
 								                    be easily accessed. This is the main way of accessing Token
 								                    objects, which are the main way annotations are accessed from
 								                    Python. If faster-than-Python speeds are required, you can
 								                    instead access the annotations as a numpy array, or access the
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								                    underlying C data directly from Cython.
 								            +row
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								                +cell #[code len(doc)]
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								                +cell.
 								                    The number of tokens in the document.
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								    +section("doc-spans")
 								        +h(3, "doc-spans-sents")
 								            | #[+tag property] Doc.sents
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
 								        p.
 								            Yields sentence #[code Span] objects. Sentence spans have no label.
 								            To improve accuracy on informal texts, spaCy calculates sentence
 								            boundaries from the syntactic dependency parse. If the parser is disabled,
 								            the #[code sents] iterator will be unavailable.
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								        +code("python", "Example").
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								            from spacy.en import English
 								            nlp = English()
 								            doc = nlp("This is a sentence. Here's another...")
 								            assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								        +h(3, "doc-spans-ents")
 								            | #[+tag property] Doc.ents
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
 								        p.
 								            Yields named-entity #[code Span] objects, if the entity recognizer
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								            has been applied to the document. Iterate over the span to get
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								            individual Token objects, or access the label:
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								        +code("python", "Example").
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								            from spacy.en import English
 								            nlp = English()
 								            tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
 								            ents = list(tokens.ents)
 								            assert ents[0].label == 346
 								            assert ents[0].label_ == 'PERSON'
 								            assert ents[0].orth_ == 'Best'
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								            assert ents[0].text == 'Mr. Best'
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								        +h(3, "doc-spans-nounchunks")
 								            | #[+tag property] Doc.noun_chunks
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
 								        p.
 								            Yields base noun-phrase #[code Span] objects, if the document
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								            has been syntactically parsed. A base noun phrase, or
 								            'NP chunk', is a noun phrase that does not permit other NPs to
 								            be nested within it – so no NP-level coordination, no prepositional
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								            phrases, and no relative clauses. For example:
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								        +code("python", "Example").
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								            from spacy.en import English
 								            nlp = English()
 								            doc = nlp(u'The sentence in this example has three noun chunks.')
 								            for chunk in doc.noun_chunks:
 								                print(chunk.label_, chunk.orth_, '&lt;--', chunk.root.head.orth_)
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								    +section("doc-exportimport-toarray")
 								        +h(3, "doc-exportimport-toarray")
 								            | #[+tag method] Doc.to_array
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								        p.
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								            Given a list of M attribute IDs, export the tokens to a numpy
 								            #[code ndarray] of shape #[code N*M], where #[code N] is the length
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								            of the document. The values will be 32-bit integers.
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								        +code("python", "Example").
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								            from spacy import attrs
 								            doc = nlp(text)
 								            # All strings mapped to integers, for easy export to numpy
 								            np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA])
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
 								        +code("python", "Definition").
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								            def to_array(self, attr_ids):
 								                return numpy.ndarray(shape=(len(self), len(attr_ids)), dtype='int64')
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
 								        +table(["Name", "Type", "Description"])
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								            +row
 								                +cell attr_ids
 								                +cell list of ints
 								                +cell.
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								                    A list of attribute ID ints. Attribute IDs can be imported
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								                    from #[code spacy.attrs] or #[code spacy.symbols].
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								    +section("doc-exportimport-countby")
 								        +h(4, "doc-exportimport-countby")
 								            | #[+tag method] Doc.count_by
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
 								        p.
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								            Produce a dict of #[code {attribute (int): count (ints)}] frequencies,
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								            keyed by the values of the given attribute ID.
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								        +code("python", "Example").
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								            def count_by(self, attr_id):
 								                return dict
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								        +table(["Name", "Type", "Description"])
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								            +row
 								                +cell attr_id
 								                +cell int
 								                +cell.
 								                    The attribute ID to key the counts.
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								    +section("doc-exportimport-fromarray")
 								        +h(4, "doc-exportimport-fromarray")
 								            | #[+tag method] Doc.from_array
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								        p Write to a #[code Doc] object, from an M*N array of attributes.
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								        +code("python", "Definition").
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								            def from_array(self, attrs, array):
 								                return None
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								    +section("doc-exportimport-frombytes")
 								        +h(4, "doc-exportimport-frombytes") Doc.from_bytes
 								        p Deserialize, loading from bytes.
 								        +code("python", "Definition").
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								            def from_bytes(self, byte_string):
 								                return Doc
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								    +section("doc-exportimport-tobytes")
 								        +h(4, "doc-exportimport-tobytes")
 								            | #[+tag method] Doc.to_bytes
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								        p Serialize, producing a byte string.
 								        +code("python", "Definition").
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								            def to_bytes(self):
 								                return bytes
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								    +section("doc-exportimport-readbytes")
 								        +h(4, "doc-exportimport-readbytes")
 								            | #[+tag method] Doc.read_bytes
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
 								        p.
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								            A static method, used to read serialized #[code Doc] objects from
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								            a file. For example:
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								        +code("python", "Example").
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								            from spacy.tokens.doc import Doc
 								            loc = 'test_serialize.bin'
 								            with open(loc, 'wb') as file_:
 								                file_.write(nlp(u'This is a document.').to_bytes())
 								                file_.write(nlp(u'This is another.').to_bytes())
 								            docs = []
 								            with open(loc, 'rb') as file_:
 								                for byte_string in Doc.read_bytes(file_):
 								                    docs.append(Doc(nlp.vocab).from_bytes(byte_string))
 								            assert len(docs) == 2
-												Update website

											
										
										
											2016-10-03 21:19:13 +03:00
+								        +code("python", "Definition").
-												Replace website with new version

											
										
										
											2016-03-31 17:24:48 +03:00
+								            @staticmethod
 								            def read_bytes(file_):
 								                yield bytes