Update docstrings and API docs for Doc class

This commit is contained in:
ines 2017-05-18 22:17:09 +02:00
parent 0f513850ab
commit b87066ff10
3 changed files with 684 additions and 531 deletions

View File

@ -63,40 +63,30 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
cdef class Doc: cdef class Doc:
""" """A sequence of Token objects. Access sentences and named entities, export
A sequence of `Token` objects. Access sentences and named entities, annotations to numpy arrays, losslessly serialize to compressed binary strings.
export annotations to numpy arrays, losslessly serialize to compressed The `Doc` object holds an array of `TokenC` structs. The Python-level
binary strings. `Token` and `Span` objects are views of this array, i.e. they don't own
the data themselves.
Aside: Internals EXAMPLE: Construction 1
The `Doc` object holds an array of `TokenC` structs. >>> doc = nlp(u'Some text')
The Python-level `Token` and `Span` objects are views of this
array, i.e. they don't own the data themselves.
Code: Construction 1
doc = nlp.tokenizer(u'Some text')
Code: Construction 2
doc = Doc(nlp.vocab, orths_and_spaces=[(u'Some', True), (u'text', True)])
Construction 2
>>> from spacy.tokens import Doc
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
""" """
def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None): def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
""" """Create a Doc object.
Create a Doc object.
Arguments: vocab (Vocab): A vocabulary object, which must match any models you want
vocab: to use (e.g. tokenizer, parser, entity recognizer).
A Vocabulary object, which must match any models you want to words (list or None): A list of unicode strings to add to the document
use (e.g. tokenizer, parser, entity recognizer). as words. If `None`, defaults to empty list.
spaces (list or None): A list of boolean values, of the same length as
words: words. True means that the word is followed by a space, False means
A list of unicode strings to add to the document as words. If None, it is not. If `None`, defaults to `[True]*len(words)`
defaults to empty list. RETURNS (Doc): The newly constructed object.
spaces:
A list of boolean values, of the same length as words. True
means that the word is followed by a space, False means it is not.
If None, defaults to [True]*len(words)
""" """
self.vocab = vocab self.vocab = vocab
size = 20 size = 20
@ -158,20 +148,22 @@ cdef class Doc:
self.is_parsed = True self.is_parsed = True
def __getitem__(self, object i): def __getitem__(self, object i):
""" """Get a `Token` or `Span` object.
doc[i]
Get the Token object at position i, where i is an integer. EXAMPLE:
>>> doc[i]
Get the `Token` object at position `i`, where `i` is an integer.
Negative indexing is supported, and follows the usual Python Negative indexing is supported, and follows the usual Python
semantics, i.e. doc[-2] is doc[len(doc) - 2]. semantics, i.e. `doc[-2]` is `doc[len(doc) - 2]`.
doc[start : end]]
Get a `Span` object, starting at position `start` >>> doc[start : end]
and ending at position `end`, where `start` and Get a `Span` object, starting at position `start` and ending at
`end` are token indices. For instance, position `end`, where `start` and `end` are token indices. For
`doc[2:5]` produces a span consisting of instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4.
tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`) Stepped slices (e.g. `doc[start : end : step]`) are not supported,
are not supported, as `Span` objects must be contiguous (cannot have gaps). as `Span` objects must be contiguous (cannot have gaps). You can use
You can use negative indices and open-ended ranges, which have their negative indices and open-ended ranges, which have their normal
normal Python semantics. Python semantics.
""" """
if isinstance(i, slice): if isinstance(i, slice):
start, stop = normalize_slice(len(self), i.start, i.stop, i.step) start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
@ -186,14 +178,14 @@ cdef class Doc:
return Token.cinit(self.vocab, &self.c[i], i, self) return Token.cinit(self.vocab, &self.c[i], i, self)
def __iter__(self): def __iter__(self):
""" """Iterate over `Token` objects, from which the annotations can be
for token in doc easily accessed. This is the main way of accessing `Token` objects,
Iterate over `Token` objects, from which the annotations can which are the main way annotations are accessed from Python. If faster-
be easily accessed. This is the main way of accessing Token than-Python speeds are required, you can instead access the annotations
objects, which are the main way annotations are accessed from as a numpy array, or access the underlying C data directly from Cython.
Python. If faster-than-Python speeds are required, you can
instead access the annotations as a numpy array, or access the EXAMPLE:
underlying C data directly from Cython. >>> for token in doc
""" """
cdef int i cdef int i
for i in range(self.length): for i in range(self.length):
@ -203,9 +195,10 @@ cdef class Doc:
yield Token.cinit(self.vocab, &self.c[i], i, self) yield Token.cinit(self.vocab, &self.c[i], i, self)
def __len__(self): def __len__(self):
""" """The number of tokens in the document.
len(doc)
The number of tokens in the document. EXAMPLE:
>>> len(doc)
""" """
return self.length return self.length
@ -228,16 +221,12 @@ cdef class Doc:
return self return self
def similarity(self, other): def similarity(self, other):
""" """Make a semantic similarity estimate. The default estimate is cosine
Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors. similarity using an average of word vectors.
Arguments: other (object): The object to compare with. By default, accepts `Doc`,
other (object): The object to compare with. By default, accepts Doc, `Span`, `Token` and `Lexeme` objects.
Span, Token and Lexeme objects. RETURNS (float): A scalar similarity score. Higher is more similar.
Return:
score (float): A scalar similarity score. Higher is more similar.
""" """
if 'similarity' in self.user_hooks: if 'similarity' in self.user_hooks:
return self.user_hooks['similarity'](self, other) return self.user_hooks['similarity'](self, other)
@ -246,8 +235,10 @@ cdef class Doc:
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
property has_vector: property has_vector:
""" """A boolean value indicating whether a word vector is associated with
A boolean value indicating whether a word vector is associated with the object. the object.
RETURNS (bool): Whether a word vector is associated with the object.
""" """
def __get__(self): def __get__(self):
if 'has_vector' in self.user_hooks: if 'has_vector' in self.user_hooks:
@ -256,10 +247,11 @@ cdef class Doc:
return any(token.has_vector for token in self) return any(token.has_vector for token in self)
property vector: property vector:
""" """A real-valued meaning representation. Defaults to an average of the
A real-valued meaning representation. Defaults to an average of the token vectors. token vectors.
Type: numpy.ndarray[ndim=1, dtype='float32'] RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
representing the document's semantics.
""" """
def __get__(self): def __get__(self):
if 'vector' in self.user_hooks: if 'vector' in self.user_hooks:
@ -275,6 +267,7 @@ cdef class Doc:
self._vector = value self._vector = value
property vector_norm: property vector_norm:
# TODO: docstrings / docs
def __get__(self): def __get__(self):
if 'vector_norm' in self.user_hooks: if 'vector_norm' in self.user_hooks:
return self.user_hooks['vector_norm'](self) return self.user_hooks['vector_norm'](self)
@ -295,34 +288,37 @@ cdef class Doc:
return self.text return self.text
property text: property text:
""" """A unicode representation of the document text.
A unicode representation of the document text.
RETURNS (unicode): The original verbatim text of the document.
""" """
def __get__(self): def __get__(self):
return u''.join(t.text_with_ws for t in self) return u''.join(t.text_with_ws for t in self)
property text_with_ws: property text_with_ws:
""" """An alias of `Doc.text`, provided for duck-type compatibility with
An alias of Doc.text, provided for duck-type compatibility with Span and Token. `Span` and `Token`.
RETURNS (unicode): The original verbatim text of the document.
""" """
def __get__(self): def __get__(self):
return self.text return self.text
property ents: property ents:
""" """Iterate over the entities in the document. Yields named-entity `Span`
Yields named-entity `Span` objects, if the entity recognizer objects, if the entity recognizer has been applied to the document.
has been applied to the document. Iterate over the span to get
individual Token objects, or access the label:
Example: YIELDS (Span): Entities in the document.
from spacy.en import English
nlp = English() EXAMPLE: Iterate over the span to get individual Token objects, or access
tokens = nlp(u'Mr. Best flew to New York on Saturday morning.') the label:
ents = list(tokens.ents)
assert ents[0].label == 346 >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
assert ents[0].label_ == 'PERSON' >>> ents = list(tokens.ents)
assert ents[0].orth_ == 'Best' >>> assert ents[0].label == 346
assert ents[0].text == 'Mr. Best' >>> assert ents[0].label_ == 'PERSON'
>>> assert ents[0].orth_ == 'Best'
>>> assert ents[0].text == 'Mr. Best'
""" """
def __get__(self): def __get__(self):
cdef int i cdef int i
@ -387,12 +383,13 @@ cdef class Doc:
self.c[start].ent_iob = 3 self.c[start].ent_iob = 3
property noun_chunks: property noun_chunks:
""" """Iterate over the base noun phrases in the document. Yields base
Yields base noun-phrase #[code Span] objects, if the document noun-phrase `Span` objects, if the document has been syntactically
has been syntactically parsed. A base noun phrase, or parsed. A base noun phrase, or "NP chunk", is a noun phrase that does
'NP chunk', is a noun phrase that does not permit other NPs to not permit other NPs to be nested within it so no NP-level
be nested within it so no NP-level coordination, no prepositional coordination, no prepositional phrases, and no relative clauses.
phrases, and no relative clauses.
YIELDS (Span): Noun chunks in the document.
""" """
def __get__(self): def __get__(self):
if not self.is_parsed: if not self.is_parsed:
@ -411,17 +408,15 @@ cdef class Doc:
yield span yield span
property sents: property sents:
""" """Iterate over the sentences in the document. Yields sentence `Span`
Yields sentence `Span` objects. Sentence spans have no label. objects. Sentence spans have no label. To improve accuracy on informal
To improve accuracy on informal texts, spaCy calculates sentence texts, spaCy calculates sentence boundaries from the syntactic
boundaries from the syntactic dependency parse. If the parser is disabled, dependency parse. If the parser is disabled, the `sents` iterator will
`sents` iterator will be unavailable. be unavailable.
Example: EXAMPLE:
from spacy.en import English >>> doc = nlp("This is a sentence. Here's another...")
nlp = English() >>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
doc = nlp("This is a sentence. Here's another...")
assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
""" """
def __get__(self): def __get__(self):
if 'sents' in self.user_hooks: if 'sents' in self.user_hooks:
@ -467,24 +462,20 @@ cdef class Doc:
@cython.boundscheck(False) @cython.boundscheck(False)
cpdef np.ndarray to_array(self, object py_attr_ids): cpdef np.ndarray to_array(self, object py_attr_ids):
""" """Given a list of M attribute IDs, export the tokens to a numpy
Given a list of M attribute IDs, export the tokens to a numpy `ndarray` of shape `(N, M)`, where `N` is the length of the document.
`ndarray` of shape (N, M), where `N` is the length The values will be 32-bit integers.
of the document. The values will be 32-bit integers.
Example: attr_ids (list[int]): A list of attribute ID ints.
from spacy import attrs RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
doc = nlp(text) per word, and one column per attribute indicated in the input
# All strings mapped to integers, for easy export to numpy `attr_ids`.
np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA])
Arguments: EXAMPLE:
attr_ids (list[int]): A list of attribute ID ints. >>> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
>>> doc = nlp(text)
Returns: >>> # All strings mapped to integers, for easy export to numpy
feat_array (numpy.ndarray[long, ndim=2]): >>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
A feature matrix, with one row per word, and one column per attribute
indicated in the input attr_ids.
""" """
cdef int i, j cdef int i, j
cdef attr_id_t feature cdef attr_id_t feature
@ -499,27 +490,20 @@ cdef class Doc:
return output return output
def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
""" """Count the frequencies of a given attribute. Produces a dict of
Produce a dict of {attribute (int): count (ints)} frequencies, keyed `{attribute (int): count (ints)}` frequencies, keyed by the values of
by the values of the given attribute ID. the given attribute ID.
Example: attr_id (int): The attribute ID to key the counts.
from spacy.en import English RETURNS (dict): A dictionary mapping attributes to integer counts.
from spacy import attrs
nlp = English()
tokens = nlp(u'apple apple orange banana')
tokens.count_by(attrs.ORTH)
# {12800L: 1, 11880L: 2, 7561L: 1}
tokens.to_array([attrs.ORTH])
# array([[11880],
# [11880],
# [ 7561],
# [12800]])
Arguments: EXAMPLE:
attr_id >>> from spacy import attrs
int >>> doc = nlp(u'apple apple orange banana')
The attribute ID to key the counts. >>> doc.count_by(attrs.ORTH)
{12800L: 1, 11880L: 2, 7561L: 1}
>>> doc.to_array([attrs.ORTH])
array([[11880], [11880], [7561], [12800]])
""" """
cdef int i cdef int i
cdef attr_t attr cdef attr_t attr
@ -567,8 +551,12 @@ cdef class Doc:
self.c[i] = parsed[i] self.c[i] = parsed[i]
def from_array(self, attrs, int[:, :] array): def from_array(self, attrs, int[:, :] array):
""" """Load attributes from a numpy array. Write to a `Doc` object, from an
Write to a `Doc` object, from an `(M, N)` array of attributes. `(M, N)` array of attributes.
attrs (ints): A list of attribute ID ints.
array (numpy.ndarray[ndim=2, dtype='int32']): The attribute values to load.
RETURNS (Doc): Itself.
""" """
cdef int i, col cdef int i, col
cdef attr_id_t attr_id cdef attr_id_t attr_id
@ -597,8 +585,10 @@ cdef class Doc:
return self return self
def to_bytes(self): def to_bytes(self):
""" """Serialize, i.e. export the document contents to a binary string.
Serialize, producing a byte string.
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
all annotations.
""" """
return dill.dumps( return dill.dumps(
(self.text, (self.text,
@ -611,8 +601,10 @@ cdef class Doc:
protocol=-1) protocol=-1)
def from_bytes(self, data): def from_bytes(self, data):
""" """Deserialize, i.e. import the document contents from a binary string.
Deserialize, loading from bytes.
data (bytes): The string to load from.
RETURNS (Doc): Itself.
""" """
if self.length != 0: if self.length != 0:
raise ValueError("Cannot load into non-empty Doc") raise ValueError("Cannot load into non-empty Doc")
@ -640,21 +632,16 @@ cdef class Doc:
return self return self
def merge(self, int start_idx, int end_idx, *args, **attributes): def merge(self, int start_idx, int end_idx, *args, **attributes):
""" """Retokenize the document, such that the span at `doc.text[start_idx : end_idx]`
Retokenize the document, such that the span at doc.text[start_idx : end_idx] is merged into a single token. If `start_idx` and `end_idx` do not mark
is merged into a single token. If start_idx and end_idx do not mark start start and end token boundaries, the document remains unchanged.
and end token boundaries, the document remains unchanged.
Arguments: start_idx (int): The character index of the start of the slice to merge.
start_idx (int): The character index of the start of the slice to merge. end_idx (int): The character index after the end of the slice to merge.
end_idx (int): The character index after the end of the slice to merge. **attributes: Attributes to assign to the merged token. By default,
**attributes: attributes are inherited from the syntactic root token of the span.
Attributes to assign to the merged token. By default, attributes RETURNS (Token): The newly merged token, or `None` if the start and end
are inherited from the syntactic root token of the span. indices did not fall at token boundaries.
Returns:
token (Token):
The newly merged token, or None if the start and end indices did
not fall at token boundaries.
""" """
cdef unicode tag, lemma, ent_type cdef unicode tag, lemma, ent_type
if len(args) == 3: if len(args) == 3:
@ -758,7 +745,29 @@ cdef class Doc:
return self[start] return self[start]
def print_tree(self, light=False, flat=False): def print_tree(self, light=False, flat=False):
"""Returns the parse trees in the JSON (Dict) format.""" """Returns the parse trees in JSON (dict) format.
light (bool): Don't include lemmas or entities.
flat (bool): Don't include arcs or modifiers.
RETURNS (dict): Parse tree as dict.
EXAMPLE:
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
>>> trees = doc.print_tree()
>>> trees[1]
{'modifiers': [
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
{'modifiers': [
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
'POS_fine': 'NN', 'lemma': 'pizza'},
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
'POS_fine': 'VBD', 'lemma': 'eat'}
"""
return parse_tree(self, light=light, flat=flat) return parse_tree(self, light=light, flat=flat)

View File

@ -6,18 +6,14 @@ from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE
def merge_ents(doc): def merge_ents(doc):
""" """Helper: merge adjacent entities into single tokens; modifies the doc."""
Helper: merge adjacent entities into single tokens; modifies the doc.
"""
for ent in doc.ents: for ent in doc.ents:
ent.merge(ent.root.tag_, ent.text, ent.label_) ent.merge(ent.root.tag_, ent.text, ent.label_)
return doc return doc
def format_POS(token, light, flat): def format_POS(token, light, flat):
""" """Helper: form the POS output for a token."""
Helper: form the POS output for a token.
"""
subtree = dict([ subtree = dict([
("word", token.text), ("word", token.text),
("lemma", token.lemma_), # trigger ("lemma", token.lemma_), # trigger
@ -37,9 +33,8 @@ def format_POS(token, light, flat):
def POS_tree(root, light=False, flat=False): def POS_tree(root, light=False, flat=False):
""" """Helper: generate a POS tree for a root token. The doc must have
Helper: generate a POS tree for a root token. The doc must have `merge_ents(doc)` run on it.
merge_ents(doc) ran on it.
""" """
subtree = format_POS(root, light=light, flat=flat) subtree = format_POS(root, light=light, flat=flat)
for c in root.children: for c in root.children:
@ -48,21 +43,28 @@ def POS_tree(root, light=False, flat=False):
def parse_tree(doc, light=False, flat=False): def parse_tree(doc, light=False, flat=False):
""" """Makes a copy of the doc, then constructs a syntactic parse tree, similar to
Makes a copy of the doc, then construct a syntactic parse tree, similar to
the one used in displaCy. Generates the POS tree for all sentences in a doc. the one used in displaCy. Generates the POS tree for all sentences in a doc.
Args: doc (Doc): The doc for parsing.
doc: The doc for parsing. RETURNS (dict): The parse tree.
Returns: EXAMPLE:
[parse_trees (Dict)]: >>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
>>> trees = doc.print_tree()
>>> from spacy.en import English >>> trees[1]
>>> nlp = English() {'modifiers': [
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.') {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
>>> trees = doc.print_tree() 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
[{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}] {'modifiers': [
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
'POS_fine': 'NN', 'lemma': 'pizza'},
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
'POS_fine': 'VBD', 'lemma': 'eat'}
""" """
doc_clone = Doc(doc.vocab, words=[w.text for w in doc]) doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE], doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE],

View File

@ -4,6 +4,503 @@ include ../../_includes/_mixins
p A container for accessing linguistic annotations. p A container for accessing linguistic annotations.
p
| A #[code Doc] is a sequence of #[+api("token") #[code Token]] objects.
| Access sentences and named entities, export annotations to numpy arrays,
| losslessly serialize to compressed binary strings. The #[code Doc] object
| holds an array of #[code TokenC] structs. The Python-level #[code Token]
| and #[+api("span") #[code Span]] objects are views of this array, i.e.
| they don't own the data themselves.
+aside-code("Example").
# Construction 1
doc = nlp(u'Some text')
# Construction 2
from spacy.tokens import Doc
doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
spaces=[True, False, False])
+h(2, "init") Doc.__init__
+tag method
p
| Construct a #[code Doc] object. The most common way to get a #[code Doc]
| object is via the #[code nlp] object.
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code words]
+cell -
+cell A list of strings to add to the container.
+row
+cell #[code spaces]
+cell -
+cell
| A list of boolean values indicating whether each word has a
| subsequent space. Must have the same length as #[code words], if
| specified. Defaults to a sequence of #[code True].
+footrow
+cell return
+cell #[code Doc]
+cell The newly constructed object.
+h(2, "getitem") Doc.__getitem__
+tag method
p
| Get a #[+api("token") #[code Token]] object at position #[code i], where
| #[code i] is an integer. Negative indexing is supported, and follows the
| usual Python semantics, i.e. #[code doc[-2]] is #[code doc[len(doc) - 2]].
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
assert doc[0].text == 'Give'
assert doc[-1].text == '.'
span = doc[1:3]
assert span.text == 'it back'
+table(["Name", "Type", "Description"])
+row
+cell #[code i]
+cell int
+cell The index of the token.
+footrow
+cell return
+cell #[code Token]
+cell The token at #[code doc[i]].
p
| Get a #[+api("span") #[code Span]] object, starting at position
| #[code start] (token index) and ending at position #[code end] (token
| index).
p
| For instance, #[code doc[2:5]] produces a span consisting of tokens 2, 3
| and 4. Stepped slices (e.g. #[code doc[start : end : step]]) are not
| supported, as #[code Span] objects must be contiguous (cannot have gaps).
| You can use negative indices and open-ended ranges, which have their
| normal Python semantics.
+table(["Name", "Type", "Description"])
+row
+cell #[code start_end]
+cell tuple
+cell The slice of the document to get.
+footrow
+cell return
+cell #[code Span]
+cell The span at #[code doc[start : end]].
+h(2, "iter") Doc.__iter__
+tag method
p
| Iterate over #[code Token] objects, from which the annotations can be
| easily accessed.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
for token in doc:
print(token.text, token.tag_)
p
| This is the main way of accessing #[+api("token") #[code Token]] objects,
| which are the main way annotations are accessed from Python. If
| faster-than-Python speeds are required, you can instead access the
| annotations as a numpy array, or access the underlying C data directly
| from Cython.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Token]
+cell A #[code Token] object.
+h(2, "len") Doc.__len__
+tag method
p Get the number of tokens in the document.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
assert len(doc) == 7
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell int
+cell The number of tokens in the document.
+h(2, "similarity") Doc.similarity
+tag method
+tag requires model
p
| Make a semantic similarity estimate. The default estimate is cosine
| similarity using an average of word vectors.
+aside-code("Example").
apples, _, oranges = nlp(u'apples and oranges')
apples_oranges = apples.similarity(oranges)
oranges_apples = oranges.similarity(apples)
assert apples_oranges == oranges_apples
+table(["Name", "Type", "Description"])
+row
+cell #[code other]
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell return
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "count_by") Doc.count_by
+tag method
p
| Count the frequencies of a given attribute. Produces a dict of
| #[code {attr (int): count (ints)}] frequencies, keyed by the values
| of the given attribute ID.
+aside-code("Example").
from spacy import attrs
doc = nlp(u'apple apple orange banana')
doc.count_by(attrs.ORTH)
# {12800L: 1, 11880L: 2, 7561L: 1}
doc.to_array([attrs.ORTH])
# array([[11880], [11880], [7561], [12800]])
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_id]
+cell int
+cell The attribute ID
+footrow
+cell return
+cell dict
+cell A dictionary mapping attributes to integer counts.
+h(2, "to_array") Doc.to_array
+tag method
p
| Export the document annotations to a numpy array of shape #[code N*M]
| where #[code N] is the length of the document and #[code M] is the number
| of attribute IDs to export. The values will be 32-bit integers.
+aside-code("Example").
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
doc = nlp(text)
# All strings mapped to integers, for easy export to numpy
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_ids]
+cell ints
+cell A list of attribute ID ints.
+footrow
+cell return
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+cell
| The exported attributes as a 2D numpy array, with one row per
| token and one column per attribute.
+h(2, "from_array") Doc.from_array
+tag method
p
| Load attributes from a numpy array. Write to a #[code Doc] object, from
| an #[code (M, N)] array of attributes.
+aside-code("Example").
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
from spacy.tokens import Doc
doc = nlp(text)
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
doc2 = Doc(doc.vocab)
doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)
+table(["Name", "Type", "Description"])
+row
+cell #[code attrs]
+cell ints
+cell A list of attribute ID ints.
+row
+cell #[code array]
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+cell The attribute values to load.
+footrow
+cell return
+cell #[code Doc]
+cell Itself.
+h(2, "to_bytes") Doc.to_bytes
+tag method
p Serialize, i.e. export the document contents to a binary string.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
doc_bytes = doc.to_bytes()
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell bytes
+cell
| A losslessly serialized copy of the #[code Doc], including all
| annotations.
+h(2, "from_bytes") Doc.from_bytes
+tag method
p Deserialize, i.e. import the document contents from a binary string.
+aside-code("Example").
from spacy.tokens import Doc
text = u'Give it back! He pleaded.'
doc = nlp(text)
bytes = doc.to_bytes()
doc2 = Doc(doc.vocab).from_bytes(bytes)
assert doc.text == doc2.text
+table(["Name", "Type", "Description"])
+row
+cell #[code data]
+cell bytes
+cell The string to load from.
+footrow
+cell return
+cell #[code Doc]
+cell Itself.
+h(2, "merge") Doc.merge
+tag method
p
| Retokenize the document, such that the span at
| #[code doc.text[start_idx : end_idx]] is merged into a single token. If
| #[code start_idx] and #[code end_idx] do not mark start and end token
| boundaries, the document remains unchanged.
+aside-code("Example").
doc = nlp(u'Los Angeles start.')
doc.merge(0, len('Los Angeles'), 'NNP', 'Los Angeles', 'GPE')
print([token.text for token in doc])
# ['Los Angeles', 'start', '.']
+table(["Name", "Type", "Description"])
+row
+cell #[code start_idx]
+cell int
+cell The character index of the start of the slice to merge.
+row
+cell #[code end_idx]
+cell int
+cell The character index after the end of the slice to merge.
+row
+cell #[code **attributes]
+cell -
+cell
| Attributes to assign to the merged token. By default,
| attributes are inherited from the syntactic root token of
| the span.
+footrow
+cell return
+cell #[code Token]
+cell
| The newly merged token, or #[code None] if the start and end
| indices did not fall at token boundaries.
+h(2, "print_tree") Doc.print_tree
+tag method
+tag requires model
p
| Returns the parse trees in JSON (dict) format. Especially useful for
| web applications.
+aside-code("Example").
doc = nlp('Alice ate the pizza.')
trees = doc.print_tree()
# {'modifiers': [
# {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
# {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'},
# {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}
# ], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}
+table(["Name", "Type", "Description"])
+row
+cell #[code light]
+cell bool
+cell Don't include lemmas or entities.
+row
+cell #[code flat]
+cell bool
+cell Don't include arcs or modifiers.
+footrow
+cell return
+cell dict
+cell Parse tree as dict.
+h(2, "text") Doc.text
+tag property
p A unicode representation of the document text.
+aside-code("Example").
text = u'Give it back! He pleaded.'
doc = nlp(text)
assert doc.text == text
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell unicode
+cell The original verbatim text of the document.
+h(2, "text_with_ws") Doc.text_with_ws
+tag property
p
| An alias of #[code Doc.text], provided for duck-type compatibility with
| #[code Span] and #[code Token].
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell unicode
+cell The original verbatim text of the document.
+h(2, "ents") Doc.ents
+tag property
+tag requires model
p
| Iterate over the entities in the document. Yields named-entity
| #[code Span] objects, if the entity recognizer has been applied to the
| document.
+aside-code("Example").
tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
ents = list(tokens.ents)
assert ents[0].label == 346
assert ents[0].label_ == 'PERSON'
assert ents[0].text == 'Mr. Best'
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Span]
+cell Entities in the document.
+h(2, "noun_chunks") Doc.noun_chunks
+tag property
+tag requires model
p
| Iterate over the base noun phrases in the document. Yields base
| noun-phrase #[code Span] objects, if the document has been syntactically
| parsed. A base noun phrase, or "NP chunk", is a noun phrase that does not
| permit other NPs to be nested within it – so no NP-level coordination, no
| prepositional phrases, and no relative clauses.
+aside-code("Example").
doc = nlp(u'A phrase with another phrase occurs.')
chunks = list(doc.noun_chunks)
assert chunks[0].text == "A phrase"
assert chunks[1].text == "another phrase"
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Span]
+cell Noun chunks in the document.
+h(2, "sents") Doc.sents
+tag property
+tag requires model
p
| Iterate over the sentences in the document. Sentence spans have no label.
| To improve accuracy on informal texts, spaCy calculates sentence boundaries
| from the syntactic dependency parse. If the parser is disabled,
| the #[code sents] iterator will be unavailable.
+aside-code("Example").
doc = nlp(u"This is a sentence. Here's another...")
sents = list(doc.sents)
assert len(sents) == 2
assert [s.root.text for s in sents] == ["is", "'s"]
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Span]
+cell Sentences in the document.
+h(2, "has_vector") Doc.has_vector
+tag property
+tag requires model
p
| A boolean value indicating whether a word vector is associated with the
| object.
+aside-code("Example").
apple = nlp(u'apple')
assert apple.has_vector
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell bool
+cell Whether the document has vector data attached.
+h(2, "vector") Doc.vector
+tag property
+tag requires model
p
| A real-valued meaning representation. Defaults to an average of the
| token vectors.
+aside-code("Example").
apple = nlp(u'apple')
(apple.vector.dtype, apple.vector.shape)
# (dtype('float32'), (300,))
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the document's semantics.
+h(2, "attributes") Attributes +h(2, "attributes") Attributes
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
@ -59,358 +556,3 @@ p A container for accessing linguistic annotations.
+cell +cell
| A dictionary that allows customisation of properties of | A dictionary that allows customisation of properties of
| #[code Span] children. | #[code Span] children.
+h(2, "init") Doc.__init__
+tag method
p Construct a #[code Doc] object.
+aside("Note")
| The most common way to get a #[code Doc] object is via the #[code nlp]
| object. This method is usually only used for deserialization or preset
| tokenization.
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code words]
+cell -
+cell A list of strings to add to the container.
+row
+cell #[code spaces]
+cell -
+cell
| A list of boolean values indicating whether each word has a
| subsequent space. Must have the same length as #[code words], if
| specified. Defaults to a sequence of #[code True].
+footrow
+cell return
+cell #[code Doc]
+cell The newly constructed object.
+h(2, "getitem") Doc.__getitem__
+tag method
p Get a #[code Token] object.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
assert doc[0].text == 'Give'
assert doc[-1].text == '.'
span = doc[1:3]
assert span.text == 'it back'
+table(["Name", "Type", "Description"])
+row
+cell #[code i]
+cell int
+cell The index of the token.
+footrow
+cell return
+cell #[code Token]
+cell The token at #[code doc[i]].
p Get a #[code Span] object.
+table(["Name", "Type", "Description"])
+row
+cell #[code start_end]
+cell tuple
+cell The slice of the document to get.
+footrow
+cell return
+cell #[code Span]
+cell The span at #[code doc[start : end]].
+h(2, "iter") Doc.__iter__
+tag method
p Iterate over #[code Token] objects.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Token]
+cell A #[code Token] object.
+h(2, "len") Doc.__len__
+tag method
p Get the number of tokens in the document.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell int
+cell The number of tokens in the document.
+h(2, "similarity") Doc.similarity
+tag method
p
| Make a semantic similarity estimate. The default estimate is cosine
| similarity using an average of word vectors.
+table(["Name", "Type", "Description"])
+row
+cell #[code other]
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell return
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "to_array") Doc.to_array
+tag method
p
| Export the document annotations to a numpy array of shape #[code N*M]
| where #[code N] is the length of the document and #[code M] is the number
| of attribute IDs to export. The values will be 32-bit integers.
+aside-code("Example").
from spacy import attrs
doc = nlp(text)
# All strings mapped to integers, for easy export to numpy
np_array = doc.to_array([attrs.LOWER, attrs.POS,
attrs.ENT_TYPE, attrs.IS_ALPHA])
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_ids]
+cell ints
+cell A list of attribute ID ints.
+footrow
+cell return
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+cell
| The exported attributes as a 2D numpy array, with one row per
| token and one column per attribute.
+h(2, "count_by") Doc.count_by
+tag method
p Count the frequencies of a given attribute.
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_id]
+cell int
+cell The attribute ID
+footrow
+cell return
+cell dict
+cell A dictionary mapping attributes to integer counts.
+h(2, "from_array") Doc.from_array
+tag method
p Load attributes from a numpy array.
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_ids]
+cell ints
+cell A list of attribute ID ints.
+row
+cell #[code values]
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+cell The attribute values to load.
+footrow
+cell return
+cell #[code None]
+cell -
+h(2, "to_bytes") Doc.to_bytes
+tag method
p Export the document contents to a binary string.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell bytes
+cell
| A losslessly serialized copy of the #[code Doc] including all
| annotations.
+h(2, "from_bytes") Doc.from_bytes
+tag method
p Import the document contents from a binary string.
+table(["Name", "Type", "Description"])
+row
+cell #[code byte_string]
+cell bytes
+cell The string to load from.
+footrow
+cell return
+cell #[code Doc]
+cell The #[code self] variable.
+h(2, "merge") Doc.merge
+tag method
p
| Retokenize the document, such that the span at
| #[code doc.text[start_idx : end_idx]] is merged into a single token. If
| #[code start_idx] and #[code end_idx] do not mark start and end token
| boundaries, the document remains unchanged.
+table(["Name", "Type", "Description"])
+row
+cell #[code start_idx]
+cell int
+cell The character index of the start of the slice to merge.
+row
+cell #[code end_idx]
+cell int
+cell The character index after the end of the slice to merge.
+row
+cell #[code **attributes]
+cell -
+cell
| Attributes to assign to the merged token. By default,
| attributes are inherited from the syntactic root token of
| the span.
+footrow
+cell return
+cell #[code Token]
+cell
| The newly merged token, or None if the start and end
| indices did not fall at token boundaries.
+h(2, "read_bytes") Doc.read_bytes
+tag staticmethod
p A static method, used to read serialized #[code Doc] objects from a file.
+aside-code("Example").
from spacy.tokens.doc import Doc
loc = 'test_serialize.bin'
with open(loc, 'wb') as file_:
file_.write(nlp(u'This is a document.').to_bytes())
file_.write(nlp(u'This is another.').to_bytes())
docs = []
with open(loc, 'rb') as file_:
for byte_string in Doc.read_bytes(file_):
docs.append(Doc(nlp.vocab).from_bytes(byte_string))
assert len(docs) == 2
+table(["Name", "Type", "Description"])
+row
+cell file
+cell buffer
+cell A binary buffer to read the serialized annotations from.
+footrow
+cell yield
+cell bytes
+cell Binary strings from which documents can be loaded.
+h(2, "text") Doc.text
+tag property
p A unicode representation of the document text.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell unicode
+cell The original verbatim text of the document.
+h(2, "text_with_ws") Doc.text_with_ws
+tag property
p
| An alias of #[code Doc.text], provided for duck-type compatibility with
| #[code Span] and #[code Token].
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell unicode
+cell The original verbatim text of the document.
+h(2, "sents") Doc.sents
+tag property
p Iterate over the sentences in the document.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Span]
+cell Sentences in the document.
+h(2, "ents") Doc.ents
+tag property
p Iterate over the entities in the document.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Span]
+cell Entities in the document.
+h(2, "noun_chunks") Doc.noun_chunks
+tag property
p
| Iterate over the base noun phrases in the document. A base noun phrase,
| or "NP chunk", is a noun phrase that does not permit other NPs to be
| nested within it.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Span]
+cell Noun chunks in the document
+h(2, "vector") Doc.vector
+tag property
p
| A real-valued meaning representation. Defaults to an average of the
| token vectors.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the document's semantics.
+h(2, "has_vector") Doc.has_vector
+tag property
p
| A boolean value indicating whether a word vector is associated with the
| object.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell bool
+cell Whether the document has vector data attached.