From b87066ff10136cfae37c30deaa25a2f4b1c0df3e Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 18 May 2017 22:17:09 +0200 Subject: [PATCH] Update docstrings and API docs for Doc class --- spacy/tokens/doc.pyx | 319 +++++++------- spacy/tokens/printers.py | 44 +- website/docs/api/doc.jade | 852 ++++++++++++++++++++++---------------- 3 files changed, 684 insertions(+), 531 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index f9325cded..949fdea29 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -63,40 +63,30 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: cdef class Doc: - """ - A sequence of `Token` objects. Access sentences and named entities, - export annotations to numpy arrays, losslessly serialize to compressed - binary strings. + """A sequence of Token objects. Access sentences and named entities, export + annotations to numpy arrays, losslessly serialize to compressed binary strings. + The `Doc` object holds an array of `TokenC` structs. The Python-level + `Token` and `Span` objects are views of this array, i.e. they don't own + the data themselves. - Aside: Internals - The `Doc` object holds an array of `TokenC` structs. - The Python-level `Token` and `Span` objects are views of this - array, i.e. they don't own the data themselves. - - Code: Construction 1 - doc = nlp.tokenizer(u'Some text') - - Code: Construction 2 - doc = Doc(nlp.vocab, orths_and_spaces=[(u'Some', True), (u'text', True)]) + EXAMPLE: Construction 1 + >>> doc = nlp(u'Some text') + Construction 2 + >>> from spacy.tokens import Doc + >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False]) """ def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None): - """ - Create a Doc object. + """Create a Doc object. - Arguments: - vocab: - A Vocabulary object, which must match any models you want to - use (e.g. tokenizer, parser, entity recognizer). 
- - words: - A list of unicode strings to add to the document as words. If None, - defaults to empty list. - - spaces: - A list of boolean values, of the same length as words. True - means that the word is followed by a space, False means it is not. - If None, defaults to [True]*len(words) + vocab (Vocab): A vocabulary object, which must match any models you want + to use (e.g. tokenizer, parser, entity recognizer). + words (list or None): A list of unicode strings to add to the document + as words. If `None`, defaults to empty list. + spaces (list or None): A list of boolean values, of the same length as + words. True means that the word is followed by a space, False means + it is not. If `None`, defaults to `[True]*len(words)`. + RETURNS (Doc): The newly constructed object. """ self.vocab = vocab size = 20 @@ -158,20 +148,22 @@ cdef class Doc: self.is_parsed = True def __getitem__(self, object i): - """ - doc[i] - Get the Token object at position i, where i is an integer. + """Get a `Token` or `Span` object. + + EXAMPLE: + >>> doc[i] + Get the `Token` object at position `i`, where `i` is an integer. Negative indexing is supported, and follows the usual Python - semantics, i.e. doc[-2] is doc[len(doc) - 2]. - doc[start : end]] - Get a `Span` object, starting at position `start` - and ending at position `end`, where `start` and - `end` are token indices. For instance, - `doc[2:5]` produces a span consisting of - tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`) - are not supported, as `Span` objects must be contiguous (cannot have gaps). - You can use negative indices and open-ended ranges, which have their - normal Python semantics. + semantics, i.e. `doc[-2]` is `doc[len(doc) - 2]`. + + >>> doc[start : end] + Get a `Span` object, starting at position `start` and ending at + position `end`, where `start` and `end` are token indices. For + instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4. + Stepped slices (e.g. 
`doc[start : end : step]`) are not supported, + as `Span` objects must be contiguous (cannot have gaps). You can use + negative indices and open-ended ranges, which have their normal + Python semantics. """ if isinstance(i, slice): start, stop = normalize_slice(len(self), i.start, i.stop, i.step) @@ -186,14 +178,14 @@ cdef class Doc: return Token.cinit(self.vocab, &self.c[i], i, self) def __iter__(self): - """ - for token in doc - Iterate over `Token` objects, from which the annotations can - be easily accessed. This is the main way of accessing Token - objects, which are the main way annotations are accessed from - Python. If faster-than-Python speeds are required, you can - instead access the annotations as a numpy array, or access the - underlying C data directly from Cython. + """Iterate over `Token` objects, from which the annotations can be + easily accessed. This is the main way of accessing `Token` objects, + which are the main way annotations are accessed from Python. If faster- + than-Python speeds are required, you can instead access the annotations + as a numpy array, or access the underlying C data directly from Cython. + + EXAMPLE: + >>> for token in doc """ cdef int i for i in range(self.length): @@ -203,9 +195,10 @@ cdef class Doc: yield Token.cinit(self.vocab, &self.c[i], i, self) def __len__(self): - """ - len(doc) - The number of tokens in the document. + """The number of tokens in the document. + + EXAMPLE: + >>> len(doc) """ return self.length @@ -228,16 +221,12 @@ cdef class Doc: return self def similarity(self, other): - """ - Make a semantic similarity estimate. The default estimate is cosine + """Make a semantic similarity estimate. The default estimate is cosine similarity using an average of word vectors. - Arguments: - other (object): The object to compare with. By default, accepts Doc, - Span, Token and Lexeme objects. - - Return: - score (float): A scalar similarity score. Higher is more similar. 
+ other (object): The object to compare with. By default, accepts `Doc`, + `Span`, `Token` and `Lexeme` objects. + RETURNS (float): A scalar similarity score. Higher is more similar. """ if 'similarity' in self.user_hooks: return self.user_hooks['similarity'](self, other) @@ -246,8 +235,10 @@ cdef class Doc: return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) property has_vector: - """ - A boolean value indicating whether a word vector is associated with the object. + """A boolean value indicating whether a word vector is associated with + the object. + + RETURNS (bool): Whether a word vector is associated with the object. """ def __get__(self): if 'has_vector' in self.user_hooks: @@ -256,10 +247,11 @@ cdef class Doc: return any(token.has_vector for token in self) property vector: - """ - A real-valued meaning representation. Defaults to an average of the token vectors. + """A real-valued meaning representation. Defaults to an average of the + token vectors. - Type: numpy.ndarray[ndim=1, dtype='float32'] + RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array + representing the document's semantics. """ def __get__(self): if 'vector' in self.user_hooks: @@ -275,6 +267,7 @@ cdef class Doc: self._vector = value property vector_norm: + # TODO: docstrings / docs def __get__(self): if 'vector_norm' in self.user_hooks: return self.user_hooks['vector_norm'](self) @@ -295,34 +288,37 @@ cdef class Doc: return self.text property text: - """ - A unicode representation of the document text. + """A unicode representation of the document text. + + RETURNS (unicode): The original verbatim text of the document. """ def __get__(self): return u''.join(t.text_with_ws for t in self) property text_with_ws: - """ - An alias of Doc.text, provided for duck-type compatibility with Span and Token. + """An alias of `Doc.text`, provided for duck-type compatibility with + `Span` and `Token`. 
+ + RETURNS (unicode): The original verbatim text of the document. """ def __get__(self): return self.text property ents: - """ - Yields named-entity `Span` objects, if the entity recognizer - has been applied to the document. Iterate over the span to get - individual Token objects, or access the label: + """Iterate over the entities in the document. Yields named-entity `Span` + objects, if the entity recognizer has been applied to the document. - Example: - from spacy.en import English - nlp = English() - tokens = nlp(u'Mr. Best flew to New York on Saturday morning.') - ents = list(tokens.ents) - assert ents[0].label == 346 - assert ents[0].label_ == 'PERSON' - assert ents[0].orth_ == 'Best' - assert ents[0].text == 'Mr. Best' + YIELDS (Span): Entities in the document. + + EXAMPLE: Iterate over the span to get individual Token objects, or access + the label: + + >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.') + >>> ents = list(tokens.ents) + >>> assert ents[0].label == 346 + >>> assert ents[0].label_ == 'PERSON' + >>> assert ents[0].orth_ == 'Best' + >>> assert ents[0].text == 'Mr. Best' """ def __get__(self): cdef int i @@ -387,12 +383,13 @@ cdef class Doc: self.c[start].ent_iob = 3 property noun_chunks: - """ - Yields base noun-phrase #[code Span] objects, if the document - has been syntactically parsed. A base noun phrase, or - 'NP chunk', is a noun phrase that does not permit other NPs to - be nested within it – so no NP-level coordination, no prepositional - phrases, and no relative clauses. + """Iterate over the base noun phrases in the document. Yields base + noun-phrase `Span` objects, if the document has been syntactically + parsed. A base noun phrase, or "NP chunk", is a noun phrase that does + not permit other NPs to be nested within it – so no NP-level + coordination, no prepositional phrases, and no relative clauses. + + YIELDS (Span): Noun chunks in the document. 
""" def __get__(self): if not self.is_parsed: @@ -411,17 +408,15 @@ cdef class Doc: yield span property sents: - """ - Yields sentence `Span` objects. Sentence spans have no label. - To improve accuracy on informal texts, spaCy calculates sentence - boundaries from the syntactic dependency parse. If the parser is disabled, - `sents` iterator will be unavailable. + """Iterate over the sentences in the document. Yields sentence `Span` + objects. Sentence spans have no label. To improve accuracy on informal + texts, spaCy calculates sentence boundaries from the syntactic + dependency parse. If the parser is disabled, the `sents` iterator will + be unavailable. - Example: - from spacy.en import English - nlp = English() - doc = nlp("This is a sentence. Here's another...") - assert [s.root.orth_ for s in doc.sents] == ["is", "'s"] + EXAMPLE: + >>> doc = nlp("This is a sentence. Here's another...") + >>> assert [s.root.text for s in doc.sents] == ["is", "'s"] """ def __get__(self): if 'sents' in self.user_hooks: @@ -467,24 +462,20 @@ cdef class Doc: @cython.boundscheck(False) cpdef np.ndarray to_array(self, object py_attr_ids): - """ - Given a list of M attribute IDs, export the tokens to a numpy - `ndarray` of shape (N, M), where `N` is the length - of the document. The values will be 32-bit integers. + """Given a list of M attribute IDs, export the tokens to a numpy + `ndarray` of shape `(N, M)`, where `N` is the length of the document. + The values will be 32-bit integers. - Example: - from spacy import attrs - doc = nlp(text) - # All strings mapped to integers, for easy export to numpy - np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA]) + attr_ids (list[int]): A list of attribute ID ints. + RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row + per word, and one column per attribute indicated in the input + `attr_ids`. - Arguments: - attr_ids (list[int]): A list of attribute ID ints. 
- - Returns: - feat_array (numpy.ndarray[long, ndim=2]): - A feature matrix, with one row per word, and one column per attribute - indicated in the input attr_ids. + EXAMPLE: + >>> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA + >>> doc = nlp(text) + >>> # All strings mapped to integers, for easy export to numpy + >>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]) """ cdef int i, j cdef attr_id_t feature @@ -499,27 +490,20 @@ cdef class Doc: return output def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): - """ - Produce a dict of {attribute (int): count (ints)} frequencies, keyed - by the values of the given attribute ID. + """Count the frequencies of a given attribute. Produces a dict of + `{attribute (int): count (ints)}` frequencies, keyed by the values of + the given attribute ID. - Example: - from spacy.en import English - from spacy import attrs - nlp = English() - tokens = nlp(u'apple apple orange banana') - tokens.count_by(attrs.ORTH) - # {12800L: 1, 11880L: 2, 7561L: 1} - tokens.to_array([attrs.ORTH]) - # array([[11880], - # [11880], - # [ 7561], - # [12800]]) + attr_id (int): The attribute ID to key the counts. + RETURNS (dict): A dictionary mapping attributes to integer counts. - Arguments: - attr_id - int - The attribute ID to key the counts. + EXAMPLE: + >>> from spacy import attrs + >>> doc = nlp(u'apple apple orange banana') + >>> doc.count_by(attrs.ORTH) + {12800L: 1, 11880L: 2, 7561L: 1} + >>> doc.to_array([attrs.ORTH]) + array([[11880], [11880], [7561], [12800]]) """ cdef int i cdef attr_t attr @@ -567,8 +551,12 @@ cdef class Doc: self.c[i] = parsed[i] def from_array(self, attrs, int[:, :] array): - """ - Write to a `Doc` object, from an `(M, N)` array of attributes. + """Load attributes from a numpy array. Write to a `Doc` object, from an + `(M, N)` array of attributes. + + attrs (ints): A list of attribute ID ints. + array (numpy.ndarray[ndim=2, dtype='int32']): The attribute values to load. 
+ RETURNS (Doc): Itself. """ cdef int i, col cdef attr_id_t attr_id @@ -597,8 +585,10 @@ cdef class Doc: return self def to_bytes(self): - """ - Serialize, producing a byte string. + """Serialize, i.e. export the document contents to a binary string. + + RETURNS (bytes): A losslessly serialized copy of the `Doc`, including + all annotations. """ return dill.dumps( (self.text, @@ -611,8 +601,10 @@ cdef class Doc: protocol=-1) def from_bytes(self, data): - """ - Deserialize, loading from bytes. + """Deserialize, i.e. import the document contents from a binary string. + + data (bytes): The string to load from. + RETURNS (Doc): Itself. """ if self.length != 0: raise ValueError("Cannot load into non-empty Doc") @@ -640,21 +632,16 @@ cdef class Doc: return self def merge(self, int start_idx, int end_idx, *args, **attributes): - """ - Retokenize the document, such that the span at doc.text[start_idx : end_idx] - is merged into a single token. If start_idx and end_idx do not mark start - and end token boundaries, the document remains unchanged. + """Retokenize the document, such that the span at `doc.text[start_idx : end_idx]` + is merged into a single token. If `start_idx` and `end_idx` do not mark + start and end token boundaries, the document remains unchanged. - Arguments: - start_idx (int): The character index of the start of the slice to merge. - end_idx (int): The character index after the end of the slice to merge. - **attributes: - Attributes to assign to the merged token. By default, attributes - are inherited from the syntactic root token of the span. - Returns: - token (Token): - The newly merged token, or None if the start and end indices did - not fall at token boundaries. + start_idx (int): The character index of the start of the slice to merge. + end_idx (int): The character index after the end of the slice to merge. + **attributes: Attributes to assign to the merged token. By default, + attributes are inherited from the syntactic root token of the span. 
+ RETURNS (Token): The newly merged token, or `None` if the start and end + indices did not fall at token boundaries. """ cdef unicode tag, lemma, ent_type if len(args) == 3: @@ -758,7 +745,29 @@ cdef class Doc: return self[start] def print_tree(self, light=False, flat=False): - """Returns the parse trees in the JSON (Dict) format.""" + """Returns the parse trees in JSON (dict) format. + + light (bool): Don't include lemmas or entities. + flat (bool): Don't include arcs or modifiers. + RETURNS (dict): Parse tree as dict. + + EXAMPLE: + >>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.') + >>> trees = doc.print_tree() + >>> trees[1] + {'modifiers': [ + {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', + 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, + {'modifiers': [ + {'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', + 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], + 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', + 'POS_fine': 'NN', 'lemma': 'pizza'}, + {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', + 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], + 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', + 'POS_fine': 'VBD', 'lemma': 'eat'} + """ return parse_tree(self, light=light, flat=flat) diff --git a/spacy/tokens/printers.py b/spacy/tokens/printers.py index f9b1f3972..4bc7099d7 100644 --- a/spacy/tokens/printers.py +++ b/spacy/tokens/printers.py @@ -6,18 +6,14 @@ from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE def merge_ents(doc): - """ - Helper: merge adjacent entities into single tokens; modifies the doc. - """ + """Helper: merge adjacent entities into single tokens; modifies the doc.""" for ent in doc.ents: ent.merge(ent.root.tag_, ent.text, ent.label_) return doc def format_POS(token, light, flat): - """ - Helper: form the POS output for a token. 
- """ + """Helper: form the POS output for a token.""" subtree = dict([ ("word", token.text), ("lemma", token.lemma_), # trigger @@ -37,9 +33,8 @@ def format_POS(token, light, flat): def POS_tree(root, light=False, flat=False): - """ - Helper: generate a POS tree for a root token. The doc must have - merge_ents(doc) ran on it. + """Helper: generate a POS tree for a root token. The doc must have + `merge_ents(doc)` ran on it. """ subtree = format_POS(root, light=light, flat=flat) for c in root.children: @@ -48,21 +43,28 @@ def POS_tree(root, light=False, flat=False): def parse_tree(doc, light=False, flat=False): - """ - Makes a copy of the doc, then construct a syntactic parse tree, similar to + """Makes a copy of the doc, then construct a syntactic parse tree, similar to the one used in displaCy. Generates the POS tree for all sentences in a doc. - Args: - doc: The doc for parsing. + doc (Doc): The doc for parsing. + RETURNS (dict): The parse tree. - Returns: - [parse_trees (Dict)]: - - >>> from spacy.en import English - >>> nlp = English() - >>> doc = nlp('Bob brought Alice the pizza. 
Alice ate the pizza.') - >>> trees = doc.print_tree() - [{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}] + EXAMPLE: + >>> doc = nlp('Bob brought Alice the pizza. 
Alice ate the pizza.') + >>> trees = doc.print_tree() + >>> trees[1] + {'modifiers': [ + {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', + 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, + {'modifiers': [ + {'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', + 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], + 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', + 'POS_fine': 'NN', 'lemma': 'pizza'}, + {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', + 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], + 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', + 'POS_fine': 'VBD', 'lemma': 'eat'} """ doc_clone = Doc(doc.vocab, words=[w.text for w in doc]) doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE], diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index 72fe34f8c..77c98a6a3 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -4,6 +4,503 @@ include ../../_includes/_mixins p A container for accessing linguistic annotations. +p + | A #[code Doc] is a sequence of #[+api("token") #[code Token]] objects. + | Access sentences and named entities, export annotations to numpy arrays, + | losslessly serialize to compressed binary strings. The #[code Doc] object + | holds an array of #[code TokenC] structs. The Python-level #[code Token] + | and #[+api("span") #[code Span]] objects are views of this array, i.e. + | they don't own the data themselves. + ++aside-code("Example"). + # Construction 1 + doc = nlp(u'Some text') + + # Construction 2 + from spacy.tokens import Doc + doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], + spaces=[True, False, False]) + ++h(2, "init") Doc.__init__ + +tag method + +p + | Construct a #[code Doc] object. The most common way to get a #[code Doc] + | object is via the #[code nlp] object. 
+ ++table(["Name", "Type", "Description"]) + +row + +cell #[code vocab] + +cell #[code Vocab] + +cell A storage container for lexical types. + + +row + +cell #[code words] + +cell - + +cell A list of strings to add to the container. + + +row + +cell #[code spaces] + +cell - + +cell + | A list of boolean values indicating whether each word has a + | subsequent space. Must have the same length as #[code words], if + | specified. Defaults to a sequence of #[code True]. + + +footrow + +cell return + +cell #[code Doc] + +cell The newly constructed object. + ++h(2, "getitem") Doc.__getitem__ + +tag method + +p + | Get a #[+api("token") #[code Token]] object at position #[code i], where + | #[code i] is an integer. Negative indexing is supported, and follows the + | usual Python semantics, i.e. #[code doc[-2]] is #[code doc[len(doc) - 2]]. + ++aside-code("Example"). + doc = nlp(u'Give it back! He pleaded.') + assert doc[0].text == 'Give' + assert doc[-1].text == '.' + span = doc[1:3] + assert span.text == 'it back' + ++table(["Name", "Type", "Description"]) + +row + +cell #[code i] + +cell int + +cell The index of the token. + + +footrow + +cell return + +cell #[code Token] + +cell The token at #[code doc[i]]. + +p + | Get a #[+api("span") #[code Span]] object, starting at position + | #[code start] (token index) and ending at position #[code end] (token + | index). + +p + | For instance, #[code doc[2:5]] produces a span consisting of tokens 2, 3 + | and 4. Stepped slices (e.g. #[code doc[start : end : step]]) are not + | supported, as #[code Span] objects must be contiguous (cannot have gaps). + | You can use negative indices and open-ended ranges, which have their + | normal Python semantics. + ++table(["Name", "Type", "Description"]) + +row + +cell #[code start_end] + +cell tuple + +cell The slice of the document to get. + + +footrow + +cell return + +cell #[code Span] + +cell The span at #[code doc[start : end]]. 
+ ++h(2, "iter") Doc.__iter__ + +tag method + +p + | Iterate over #[code Token] objects, from which the annotations can be + | easily accessed. + ++aside-code("Example"). + doc = nlp(u'Give it back! He pleaded.') + for token in doc: + print(token.text, token.tag_) + +p + | This is the main way of accessing #[+api("token") #[code Token]] objects, + | which are the main way annotations are accessed from Python. If + | faster-than-Python speeds are required, you can instead access the + | annotations as a numpy array, or access the underlying C data directly + | from Cython. + ++table(["Name", "Type", "Description"]) + +footrow + +cell yield + +cell #[code Token] + +cell A #[code Token] object. + ++h(2, "len") Doc.__len__ + +tag method + +p Get the number of tokens in the document. + ++aside-code("Example"). + doc = nlp(u'Give it back! He pleaded.') + assert len(doc) == 7 + ++table(["Name", "Type", "Description"]) + +footrow + +cell return + +cell int + +cell The number of tokens in the document. + ++h(2, "similarity") Doc.similarity + +tag method + +tag requires model + +p + | Make a semantic similarity estimate. The default estimate is cosine + | similarity using an average of word vectors. + ++aside-code("Example"). + apples, _, oranges = nlp(u'apples and oranges') + apples_oranges = apples.similarity(oranges) + oranges_apples = oranges.similarity(apples) + assert apples_oranges == oranges_apples + ++table(["Name", "Type", "Description"]) + +row + +cell #[code other] + +cell - + +cell + | The object to compare with. By default, accepts #[code Doc], + | #[code Span], #[code Token] and #[code Lexeme] objects. + + +footrow + +cell return + +cell float + +cell A scalar similarity score. Higher is more similar. + ++h(2, "count_by") Doc.count_by + +tag method + +p + | Count the frequencies of a given attribute. Produces a dict of + | #[code {attr (int): count (ints)}] frequencies, keyed by the values + | of the given attribute ID. + ++aside-code("Example"). 
+ from spacy import attrs + doc = nlp(u'apple apple orange banana') + doc.count_by(attrs.ORTH) + # {12800L: 1, 11880L: 2, 7561L: 1} + doc.to_array([attrs.ORTH]) + # array([[11880], [11880], [7561], [12800]]) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code attr_id] + +cell int + +cell The attribute ID to key the counts. + + +footrow + +cell return + +cell dict + +cell A dictionary mapping attributes to integer counts. + ++h(2, "to_array") Doc.to_array + +tag method + +p + | Export the document annotations to a numpy array of shape #[code N*M] + | where #[code N] is the length of the document and #[code M] is the number + | of attribute IDs to export. The values will be 32-bit integers. + ++aside-code("Example"). + from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA + doc = nlp(text) + # All strings mapped to integers, for easy export to numpy + np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code attr_ids] + +cell ints + +cell A list of attribute ID ints. + + +footrow + +cell return + +cell #[code numpy.ndarray[ndim=2, dtype='int32']] + +cell + | The exported attributes as a 2D numpy array, with one row per + | token and one column per attribute. + ++h(2, "from_array") Doc.from_array + +tag method + +p + | Load attributes from a numpy array. Write to a #[code Doc] object, from + | an #[code (M, N)] array of attributes. + ++aside-code("Example"). + from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA + from spacy.tokens import Doc + doc = nlp(text) + np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]) + doc2 = Doc(doc.vocab) + doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code attrs] + +cell ints + +cell A list of attribute ID ints. + + +row + +cell #[code array] + +cell #[code numpy.ndarray[ndim=2, dtype='int32']] + +cell The attribute values to load. 
+ + +footrow + +cell return + +cell #[code Doc] + +cell Itself. + ++h(2, "to_bytes") Doc.to_bytes + +tag method + +p Serialize, i.e. export the document contents to a binary string. + ++aside-code("Example"). + doc = nlp(u'Give it back! He pleaded.') + doc_bytes = doc.to_bytes() + ++table(["Name", "Type", "Description"]) + +footrow + +cell return + +cell bytes + +cell + | A losslessly serialized copy of the #[code Doc], including all + | annotations. + ++h(2, "from_bytes") Doc.from_bytes + +tag method + +p Deserialize, i.e. import the document contents from a binary string. + ++aside-code("Example"). + from spacy.tokens import Doc + text = u'Give it back! He pleaded.' + doc = nlp(text) + doc_bytes = doc.to_bytes() + doc2 = Doc(doc.vocab).from_bytes(doc_bytes) + assert doc.text == doc2.text + ++table(["Name", "Type", "Description"]) + +row + +cell #[code data] + +cell bytes + +cell The string to load from. + + +footrow + +cell return + +cell #[code Doc] + +cell Itself. + ++h(2, "merge") Doc.merge + +tag method + +p + | Retokenize the document, such that the span at + | #[code doc.text[start_idx : end_idx]] is merged into a single token. If + | #[code start_idx] and #[code end_idx] do not mark start and end token + | boundaries, the document remains unchanged. + ++aside-code("Example"). + doc = nlp(u'Los Angeles start.') + doc.merge(0, len('Los Angeles'), 'NNP', 'Los Angeles', 'GPE') + print([token.text for token in doc]) + # ['Los Angeles', 'start', '.'] + ++table(["Name", "Type", "Description"]) + +row + +cell #[code start_idx] + +cell int + +cell The character index of the start of the slice to merge. + + +row + +cell #[code end_idx] + +cell int + +cell The character index after the end of the slice to merge. + + +row + +cell #[code **attributes] + +cell - + +cell + | Attributes to assign to the merged token. By default, + | attributes are inherited from the syntactic root token of + | the span. 
+ + +footrow + +cell return + +cell #[code Token] + +cell + | The newly merged token, or #[code None] if the start and end + | indices did not fall at token boundaries + ++h(2, "print_tree") Doc.print_tree + +tag method + +tag requires model + +p + | Returns the parse trees in JSON (dict) format. Especially useful for + | web applications. + ++aside-code("Example"). + doc = nlp('Alice ate the pizza.') + trees = doc.print_tree() + # {'modifiers': [ + # {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, + # {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, + # {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'} + # ], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'} + ++table(["Name", "Type", "Description"]) + +row + +cell #[code light] + +cell bool + +cell Don't include lemmas or entities. + + +row + +cell #[code flat] + +cell bool + +cell Don't include arcs or modifiers. + + +footrow + +cell return + +cell dict + +cell Parse tree as dict. + ++h(2, "text") Doc.text + +tag property + +p A unicode representation of the document text. + ++aside-code("Example"). + text = u'Give it back! He pleaded.' + doc = nlp(text) + assert doc.text == text + ++table(["Name", "Type", "Description"]) + +footrow + +cell return + +cell unicode + +cell The original verbatim text of the document. + ++h(2, "text_with_ws") Doc.text_with_ws + +tag property + +p + | An alias of #[code Doc.text], provided for duck-type compatibility with + | #[code Span] and #[code Token]. + ++table(["Name", "Type", "Description"]) + +footrow + +cell return + +cell unicode + +cell The original verbatim text of the document. 
+ ++h(2, "ents") Doc.ents + +tag property + +tag requires model + +p + | Iterate over the entities in the document. Yields named-entity + | #[code Span] objects, if the entity recognizer has been applied to the + | document. + ++aside-code("Example"). + tokens = nlp(u'Mr. Best flew to New York on Saturday morning.') + ents = list(tokens.ents) + assert ents[0].label == 346 + assert ents[0].label_ == 'PERSON' + assert ents[0].text == 'Mr. Best' + ++table(["Name", "Type", "Description"]) + +footrow + +cell yield + +cell #[code Span] + +cell Entities in the document. + ++h(2, "noun_chunks") Doc.noun_chunks + +tag property + +tag requires model + +p + | Iterate over the base noun phrases in the document. Yields base + | noun-phrase #[code Span] objects, if the document has been syntactically + | parsed. A base noun phrase, or "NP chunk", is a noun phrase that does not + | permit other NPs to be nested within it – so no NP-level coordination, no + | prepositional phrases, and no relative clauses. + ++aside-code("Example"). + doc = nlp(u'A phrase with another phrase occurs.') + chunks = list(doc.noun_chunks) + assert chunks[0].text == "A phrase" + assert chunks[1].text == "another phrase" + ++table(["Name", "Type", "Description"]) + +footrow + +cell yield + +cell #[code Span] + +cell Noun chunks in the document. + ++h(2, "sents") Doc.sents + +tag property + +tag requires model + +p + | Iterate over the sentences in the document. Sentence spans have no label. + | To improve accuracy on informal texts, spaCy calculates sentence boundaries + | from the syntactic dependency parse. If the parser is disabled, + | the #[code sents] iterator will be unavailable. + ++aside-code("Example"). + doc = nlp(u"This is a sentence. Here's another...") + sents = list(doc.sents) + assert len(sents) == 2 + assert [s.root.text for s in sents] == ["is", "'s"] + ++table(["Name", "Type", "Description"]) + +footrow + +cell yield + +cell #[code Span] + +cell Sentences in the document. 
+ ++h(2, "has_vector") Doc.has_vector + +tag property + +tag requires model + +p + | A boolean value indicating whether a word vector is associated with the + | object. + ++aside-code("Example"). + apple = nlp(u'apple') + assert apple.has_vector + ++table(["Name", "Type", "Description"]) + +footrow + +cell return + +cell bool + +cell Whether the document has vector data attached. + ++h(2, "vector") Doc.vector + +tag property + +tag requires model + +p + | A real-valued meaning representation. Defaults to an average of the + | token vectors. + ++aside-code("Example"). + apple = nlp(u'apple') + (apple.vector.dtype, apple.vector.shape) + # (dtype('float32'), (300,)) + ++table(["Name", "Type", "Description"]) + +footrow + +cell return + +cell #[code numpy.ndarray[ndim=1, dtype='float32']] + +cell A 1D numpy array representing the document's semantics. + +h(2, "attributes") Attributes +table(["Name", "Type", "Description"]) @@ -59,358 +556,3 @@ p A container for accessing linguistic annotations. +cell | A dictionary that allows customisation of properties of | #[code Span] children. - -+h(2, "init") Doc.__init__ - +tag method - -p Construct a #[code Doc] object. - -+aside("Note") - | The most common way to get a #[code Doc] object is via the #[code nlp] - | object. This method is usually only used for deserialization or preset - | tokenization. - -+table(["Name", "Type", "Description"]) - +row - +cell #[code vocab] - +cell #[code Vocab] - +cell A storage container for lexical types. - - +row - +cell #[code words] - +cell - - +cell A list of strings to add to the container. - - +row - +cell #[code spaces] - +cell - - +cell - | A list of boolean values indicating whether each word has a - | subsequent space. Must have the same length as #[code words], if - | specified. Defaults to a sequence of #[code True]. - - +footrow - +cell return - +cell #[code Doc] - +cell The newly constructed object.
- -+h(2, "getitem") Doc.__getitem__ - +tag method - -p Get a #[code Token] object. - -+aside-code("Example"). - doc = nlp(u'Give it back! He pleaded.') - assert doc[0].text == 'Give' - assert doc[-1].text == '.' - span = doc[1:1] - assert span.text == 'it back' - -+table(["Name", "Type", "Description"]) - +row - +cell #[code i] - +cell int - +cell The index of the token. - - +footrow - +cell return - +cell #[code Token] - +cell The token at #[code doc[i]]. - -p Get a #[code Span] object. - -+table(["Name", "Type", "Description"]) - +row - +cell #[code start_end] - +cell tuple - +cell The slice of the document to get. - - +footrow - +cell return - +cell #[code Span] - +cell The span at #[code doc[start : end]]. - -+h(2, "iter") Doc.__iter__ - +tag method - -p Iterate over #[code Token] objects. - -+table(["Name", "Type", "Description"]) - +footrow - +cell yield - +cell #[code Token] - +cell A #[code Token] object. - -+h(2, "len") Doc.__len__ - +tag method - -p Get the number of tokens in the document. - -+table(["Name", "Type", "Description"]) - +footrow - +cell return - +cell int - +cell The number of tokens in the document. - -+h(2, "similarity") Doc.similarity - +tag method - -p - | Make a semantic similarity estimate. The default estimate is cosine - | similarity using an average of word vectors. - -+table(["Name", "Type", "Description"]) - +row - +cell #[code other] - +cell - - +cell - | The object to compare with. By default, accepts #[code Doc], - | #[code Span], #[code Token] and #[code Lexeme] objects. - - +footrow - +cell return - +cell float - +cell A scalar similarity score. Higher is more similar. - -+h(2, "to_array") Doc.to_array - +tag method - -p - | Export the document annotations to a numpy array of shape #[code N*M] - | where #[code N] is the length of the document and #[code M] is the number - | of attribute IDs to export. The values will be 32-bit integers. - -+aside-code("Example"). 
- from spacy import attrs - doc = nlp(text) - # All strings mapped to integers, for easy export to numpy - np_array = doc.to_array([attrs.LOWER, attrs.POS, - attrs.ENT_TYPE, attrs.IS_ALPHA]) - -+table(["Name", "Type", "Description"]) - +row - +cell #[code attr_ids] - +cell ints - +cell A list of attribute ID ints. - - +footrow - +cell return - +cell #[code numpy.ndarray[ndim=2, dtype='int32']] - +cell - | The exported attributes as a 2D numpy array, with one row per - | token and one column per attribute. - -+h(2, "count_by") Doc.count_by - +tag method - -p Count the frequencies of a given attribute. - -+table(["Name", "Type", "Description"]) - +row - +cell #[code attr_id] - +cell int - +cell The attribute ID - - +footrow - +cell return - +cell dict - +cell A dictionary mapping attributes to integer counts. - -+h(2, "from_array") Doc.from_array - +tag method - -p Load attributes from a numpy array. - -+table(["Name", "Type", "Description"]) - +row - +cell #[code attr_ids] - +cell ints - +cell A list of attribute ID ints. - - +row - +cell #[code values] - +cell #[code numpy.ndarray[ndim=2, dtype='int32']] - +cell The attribute values to load. - - +footrow - +cell return - +cell #[code None] - +cell - - -+h(2, "to_bytes") Doc.to_bytes - +tag method - -p Export the document contents to a binary string. - -+table(["Name", "Type", "Description"]) - +footrow - +cell return - +cell bytes - +cell - | A losslessly serialized copy of the #[code Doc] including all - | annotations. - -+h(2, "from_bytes") Doc.from_bytes - +tag method - -p Import the document contents from a binary string. - -+table(["Name", "Type", "Description"]) - +row - +cell #[code byte_string] - +cell bytes - +cell The string to load from. - - +footrow - +cell return - +cell #[code Doc] - +cell The #[code self] variable. - -+h(2, "merge") Doc.merge - +tag method - -p - | Retokenize the document, such that the span at - | #[code doc.text[start_idx : end_idx]] is merged into a single token. 
If - | #[code start_idx] and #[end_idx] do not mark start and end token - | boundaries, the document remains unchanged. - -+table(["Name", "Type", "Description"]) - +row - +cell #[code start_idx] - +cell int - +cell The character index of the start of the slice to merge. - - +row - +cell #[code end_idx] - +cell int - +cell The character index after the end of the slice to merge. - - +row - +cell #[code **attributes] - +cell - - +cell - | Attributes to assign to the merged token. By default, - | attributes are inherited from the syntactic root token of - | the span. - - +footrow - +cell return - +cell #[code Token] - +cell - | The newly merged token, or None if the start and end - | indices did not fall at token boundaries - -+h(2, "read_bytes") Doc.read_bytes - +tag staticmethod - -p A static method, used to read serialized #[code Doc] objects from a file. - -+aside-code("Example"). - from spacy.tokens.doc import Doc - loc = 'test_serialize.bin' - with open(loc, 'wb') as file_: - file_.write(nlp(u'This is a document.').to_bytes()) - file_.write(nlp(u'This is another.').to_bytes()) - docs = [] - with open(loc, 'rb') as file_: - for byte_string in Doc.read_bytes(file_): - docs.append(Doc(nlp.vocab).from_bytes(byte_string)) - assert len(docs) == 2 - -+table(["Name", "Type", "Description"]) - +row - +cell file - +cell buffer - +cell A binary buffer to read the serialized annotations from. - - +footrow - +cell yield - +cell bytes - +cell Binary strings from with documents can be loaded. - -+h(2, "text") Doc.text - +tag property - -p A unicode representation of the document text. - -+table(["Name", "Type", "Description"]) - +footrow - +cell return - +cell unicode - +cell The original verbatim text of the document. - -+h(2, "text_with_ws") Doc.text_with_ws - +tag property - -p - | An alias of #[code Doc.text], provided for duck-type compatibility with - | #[code Span] and #[code Token]. 
- -+table(["Name", "Type", "Description"]) - +footrow - +cell return - +cell unicode - +cell The original verbatim text of the document. - -+h(2, "sents") Doc.sents - +tag property - -p Iterate over the sentences in the document. - -+table(["Name", "Type", "Description"]) - +footrow - +cell yield - +cell #[code Span] - +cell Sentences in the document. - -+h(2, "ents") Doc.ents - +tag property - -p Iterate over the entities in the document. - -+table(["Name", "Type", "Description"]) - +footrow - +cell yield - +cell #[code Span] - +cell Entities in the document. - -+h(2, "noun_chunks") Doc.noun_chunks - +tag property - -p - | Iterate over the base noun phrases in the document. A base noun phrase, - | or "NP chunk", is a noun phrase that does not permit other NPs to be - | nested within it. - -+table(["Name", "Type", "Description"]) - +footrow - +cell yield - +cell #[code Span] - +cell Noun chunks in the document - -+h(2, "vector") Doc.vector - +tag property - -p - | A real-valued meaning representation. Defaults to an average of the - | token vectors. - -+table(["Name", "Type", "Description"]) - +footrow - +cell return - +cell #[code numpy.ndarray[ndim=1, dtype='float32']] - +cell A 1D numpy array representing the document's semantics. - -+h(2, "has_vector") Doc.has_vector - +tag property - -p - | A boolean value indicating whether a word vector is associated with the - | object. - -+table(["Name", "Type", "Description"]) - +footrow - +cell return - +cell bool - +cell Whether the document has a vector data attached.