Update docstrings and API docs for Doc class

This commit is contained in:
ines 2017-05-18 22:17:09 +02:00
parent 0f513850ab
commit b87066ff10
3 changed files with 684 additions and 531 deletions

View File

@@ -63,40 +63,30 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
cdef class Doc:
"""A sequence of Token objects. Access sentences and named entities, export
annotations to numpy arrays, losslessly serialize to compressed binary strings.
The `Doc` object holds an array of `TokenC` structs. The Python-level
`Token` and `Span` objects are views of this array, i.e. they don't own
the data themselves.
EXAMPLE: Construction 1
>>> doc = nlp(u'Some text')
Construction 2
>>> from spacy.tokens import Doc
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
"""
def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
"""Create a Doc object.
vocab (Vocab): A vocabulary object, which must match any models you want
to use (e.g. tokenizer, parser, entity recognizer).
words (list or None): A list of unicode strings to add to the document
as words. If `None`, defaults to an empty list.
spaces (list or None): A list of boolean values, of the same length as
words. True means that the word is followed by a space, False means
it is not. If `None`, defaults to `[True]*len(words)`.
RETURNS (Doc): The newly constructed object.
"""
self.vocab = vocab
size = 20
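A minimal sketch of the two construction routes described above, assuming spaCy with an installed English model (`spacy.load('en')` and the sample strings are illustrative, not part of this commit):

import spacy
from spacy.tokens import Doc

nlp = spacy.load('en')   # assumption: an English model is installed
doc = nlp(u'Some text')  # construction 1: run the full pipeline

# Construction 2: build a Doc directly from words and trailing-space flags.
doc2 = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
           spaces=[True, False, False])
assert doc2.text == u'hello world!'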
@@ -158,20 +148,22 @@ cdef class Doc:
self.is_parsed = True
def __getitem__(self, object i):
"""Get a `Token` or `Span` object.
EXAMPLE:
>>> doc[i]
Get the `Token` object at position `i`, where `i` is an integer.
Negative indexing is supported, and follows the usual Python
semantics, i.e. `doc[-2]` is `doc[len(doc) - 2]`.
>>> doc[start : end]
Get a `Span` object, starting at position `start` and ending at
position `end`, where `start` and `end` are token indices. For
instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4.
Stepped slices (e.g. `doc[start : end : step]`) are not supported,
as `Span` objects must be contiguous (cannot have gaps). You can use
negative indices and open-ended ranges, which have their normal
Python semantics.
"""
if isinstance(i, slice):
start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
@@ -186,14 +178,14 @@ cdef class Doc:
return Token.cinit(self.vocab, &self.c[i], i, self)
def __iter__(self):
"""Iterate over `Token` objects, from which the annotations can be
easily accessed. This is the main way of accessing `Token` objects,
which are the main way annotations are accessed from Python. If
faster-than-Python speeds are required, you can instead access the
annotations as a numpy array, or access the underlying C data directly
from Cython.
EXAMPLE:
>>> for token in doc:
"""
cdef int i
for i in range(self.length):
@@ -203,9 +195,10 @@ cdef class Doc:
yield Token.cinit(self.vocab, &self.c[i], i, self)
def __len__(self):
"""The number of tokens in the document.
EXAMPLE:
>>> len(doc)
"""
return self.length
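A minimal sketch of indexing, slicing, iteration and `len()` together, assuming an installed English model; the sample sentence is the one used in the API docs further down:

import spacy

nlp = spacy.load('en')  # assumption: an English model is installed
doc = nlp(u'Give it back! He pleaded.')
assert doc[0].text == u'Give'        # integer index -> Token
assert doc[-1].text == u'.'          # negative indices, usual Python semantics
assert doc[1:3].text == u'it back'   # slice -> contiguous Span
assert len(doc) == 7
for token in doc:                    # iteration yields Token objects
    print(token.text, token.tag_)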
@@ -228,16 +221,12 @@ cdef class Doc:
return self
def similarity(self, other):
"""Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors.
other (object): The object to compare with. By default, accepts `Doc`,
`Span`, `Token` and `Lexeme` objects.
RETURNS (float): A scalar similarity score. Higher is more similar.
"""
if 'similarity' in self.user_hooks:
return self.user_hooks['similarity'](self, other)
@@ -246,8 +235,10 @@ cdef class Doc:
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
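A sketch of the similarity estimate, assuming a model with word vectors is installed; the symmetry assertion mirrors the example in the API docs below:

import spacy

nlp = spacy.load('en')  # assumption: a model with word vectors
doc1 = nlp(u'apples and oranges')
doc2 = nlp(u'bananas and pears')
score = doc1.similarity(doc2)           # cosine over averaged word vectors
assert score == doc2.similarity(doc1)   # the estimate is symmetric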
property has_vector:
"""A boolean value indicating whether a word vector is associated with
the object.
RETURNS (bool): Whether a word vector is associated with the object.
"""
def __get__(self):
if 'has_vector' in self.user_hooks:
@@ -256,10 +247,11 @@ cdef class Doc:
return any(token.has_vector for token in self)
property vector:
"""A real-valued meaning representation. Defaults to an average of the
token vectors.
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
representing the document's semantics.
"""
def __get__(self):
if 'vector' in self.user_hooks:
@@ -275,6 +267,7 @@ cdef class Doc:
self._vector = value
property vector_norm:
# TODO: docstrings / docs
def __get__(self):
if 'vector_norm' in self.user_hooks:
return self.user_hooks['vector_norm'](self)
@@ -295,34 +288,37 @@ cdef class Doc:
return self.text
property text:
"""A unicode representation of the document text.
RETURNS (unicode): The original verbatim text of the document.
"""
def __get__(self):
return u''.join(t.text_with_ws for t in self)
property text_with_ws:
"""An alias of `Doc.text`, provided for duck-type compatibility with
`Span` and `Token`.
RETURNS (unicode): The original verbatim text of the document.
"""
def __get__(self):
return self.text
property ents:
"""Iterate over the entities in the document. Yields named-entity `Span`
objects, if the entity recognizer has been applied to the document.
YIELDS (Span): Entities in the document.
EXAMPLE: Iterate over the span to get individual Token objects, or access
the label:
>>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
>>> ents = list(tokens.ents)
>>> assert ents[0].label == 346
>>> assert ents[0].label_ == 'PERSON'
>>> assert ents[0].text == 'Mr. Best'
"""
def __get__(self):
cdef int i
@@ -387,12 +383,13 @@ cdef class Doc:
self.c[start].ent_iob = 3
property noun_chunks:
"""Iterate over the base noun phrases in the document. Yields base
noun-phrase `Span` objects, if the document has been syntactically
parsed. A base noun phrase, or "NP chunk", is a noun phrase that does
not permit other NPs to be nested within it, so no NP-level
coordination, no prepositional phrases, and no relative clauses.
YIELDS (Span): Noun chunks in the document.
"""
def __get__(self):
if not self.is_parsed:
@@ -411,17 +408,15 @@ cdef class Doc:
yield span
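A sketch of noun chunk iteration, assuming a model with a syntactic parser; it mirrors the example in the API docs below:

import spacy

nlp = spacy.load('en')  # assumption: a model with a parser is installed
doc = nlp(u'A phrase with another phrase occurs.')
chunks = list(doc.noun_chunks)
assert chunks[0].text == u'A phrase'
assert chunks[1].text == u'another phrase'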
property sents:
"""Iterate over the sentences in the document. Yields sentence `Span`
objects. Sentence spans have no label. To improve accuracy on informal
texts, spaCy calculates sentence boundaries from the syntactic
dependency parse. If the parser is disabled, the `sents` iterator will
be unavailable.
EXAMPLE:
>>> doc = nlp(u"This is a sentence. Here's another...")
>>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
"""
def __get__(self):
if 'sents' in self.user_hooks:
@@ -467,24 +462,20 @@ cdef class Doc:
@cython.boundscheck(False)
cpdef np.ndarray to_array(self, object py_attr_ids):
"""Given a list of M attribute IDs, export the tokens to a numpy
`ndarray` of shape `(N, M)`, where `N` is the length of the document.
The values will be 32-bit integers.
attr_ids (list[int]): A list of attribute ID ints.
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
per word, and one column per attribute indicated in the input
`attr_ids`.
EXAMPLE:
>>> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
>>> doc = nlp(text)
>>> # All strings mapped to integers, for easy export to numpy
>>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
"""
cdef int i, j
cdef attr_id_t feature
@@ -499,27 +490,20 @@ cdef class Doc:
return output
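A runnable variant of the `to_array` example, with a concrete sentence standing in for the `text` placeholder (the attribute choice here is illustrative):

import spacy
from spacy.attrs import ORTH, LOWER, IS_ALPHA

nlp = spacy.load('en')  # assumption: an English model is installed
doc = nlp(u'apple apple orange banana')
np_array = doc.to_array([ORTH, LOWER, IS_ALPHA])
assert np_array.shape == (len(doc), 3)  # one row per token, one column per attribute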
def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
"""Count the frequencies of a given attribute. Produces a dict of
`{attribute (int): count (ints)}` frequencies, keyed by the values of
the given attribute ID.
attr_id (int): The attribute ID to key the counts.
RETURNS (dict): A dictionary mapping attributes to integer counts.
EXAMPLE:
>>> from spacy import attrs
>>> doc = nlp(u'apple apple orange banana')
>>> doc.count_by(attrs.ORTH)
{12800L: 1, 11880L: 2, 7561L: 1}
>>> doc.to_array([attrs.ORTH])
array([[11880], [11880], [7561], [12800]])
"""
cdef int i
cdef attr_t attr
@@ -567,8 +551,12 @@ cdef class Doc:
self.c[i] = parsed[i]
def from_array(self, attrs, int[:, :] array):
"""Load attributes from a numpy array. Write to a `Doc` object, from an
`(M, N)` array of attributes.
attrs (ints): A list of attribute ID ints.
array (numpy.ndarray[ndim=2, dtype='int32']): The attribute values to load.
RETURNS (Doc): Itself.
"""
cdef int i, col
cdef attr_id_t attr_id
@@ -597,8 +585,10 @@ cdef class Doc:
return self
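A round-trip sketch pairing `to_array` with `from_array`; giving `doc2` explicit `words` (so it has tokens to write into) is an assumption, mirroring the API docs example further down:

import spacy
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
from spacy.tokens import Doc

nlp = spacy.load('en')  # assumption: an English model is installed
doc = nlp(u'Give it back! He pleaded.')
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
doc2 = Doc(doc.vocab, words=[t.text for t in doc])  # same tokens, no annotations
doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)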
def to_bytes(self):
"""Serialize, i.e. export the document contents to a binary string.
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
all annotations.
"""
return dill.dumps(
(self.text,
@@ -611,8 +601,10 @@ cdef class Doc:
protocol=-1)
def from_bytes(self, data):
"""Deserialize, i.e. import the document contents from a binary string.
data (bytes): The string to load from.
RETURNS (Doc): Itself.
"""
if self.length != 0:
raise ValueError("Cannot load into non-empty Doc")
@@ -640,21 +632,16 @@ cdef class Doc:
return self
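A serialization round trip; note that `from_bytes()` must be called on an empty `Doc`, per the length check above:

import spacy
from spacy.tokens import Doc

nlp = spacy.load('en')  # assumption: an English model is installed
doc = nlp(u'Give it back! He pleaded.')
data = doc.to_bytes()                   # lossless binary export
doc2 = Doc(doc.vocab).from_bytes(data)  # load into a fresh, empty Doc
assert doc2.text == doc.text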
def merge(self, int start_idx, int end_idx, *args, **attributes):
"""Retokenize the document, such that the span at `doc.text[start_idx : end_idx]`
is merged into a single token. If `start_idx` and `end_idx` do not mark
start and end token boundaries, the document remains unchanged.
start_idx (int): The character index of the start of the slice to merge.
end_idx (int): The character index after the end of the slice to merge.
**attributes: Attributes to assign to the merged token. By default,
attributes are inherited from the syntactic root token of the span.
RETURNS (Token): The newly merged token, or `None` if the start and end
indices did not fall at token boundaries.
"""
cdef unicode tag, lemma, ent_type
if len(args) == 3:
@@ -758,7 +745,29 @@ cdef class Doc:
return self[start]
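A merge sketch using the positional `(tag, lemma, ent_type)` form handled above; the sentence is the one from the API docs example below:

import spacy

nlp = spacy.load('en')  # assumption: an English model is installed
doc = nlp(u'Los Angeles start.')
doc.merge(0, len('Los Angeles'), 'NNP', 'Los Angeles', 'GPE')
assert [t.text for t in doc] == [u'Los Angeles', u'start', u'.']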
def print_tree(self, light=False, flat=False):
"""Returns the parse trees in JSON (dict) format.
light (bool): Don't include lemmas or entities.
flat (bool): Don't include arcs or modifiers.
RETURNS (dict): Parse tree as dict.
EXAMPLE:
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
>>> trees = doc.print_tree()
>>> trees[1]
{'modifiers': [
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
{'modifiers': [
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
'POS_fine': 'NN', 'lemma': 'pizza'},
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
'POS_fine': 'VBD', 'lemma': 'eat'}
"""
return parse_tree(self, light=light, flat=flat)

View File

@@ -6,18 +6,14 @@ from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE
def merge_ents(doc):
"""Helper: merge adjacent entities into single tokens; modifies the doc."""
for ent in doc.ents:
ent.merge(ent.root.tag_, ent.text, ent.label_)
return doc
def format_POS(token, light, flat):
"""Helper: form the POS output for a token."""
subtree = dict([
("word", token.text),
("lemma", token.lemma_), # trigger
@@ -37,9 +33,8 @@ def format_POS(token, light, flat):
def POS_tree(root, light=False, flat=False):
"""Helper: generate a POS tree for a root token. The doc must have had
`merge_ents(doc)` run on it.
"""
subtree = format_POS(root, light=light, flat=flat)
for c in root.children:
@@ -48,21 +43,28 @@ def POS_tree(root, light=False, flat=False):
def parse_tree(doc, light=False, flat=False):
"""Makes a copy of the doc, then constructs a syntactic parse tree similar
to the one used in displaCy. Generates the POS tree for all sentences in
a doc.
doc (Doc): The doc for parsing.
RETURNS (dict): The parse tree.
EXAMPLE:
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
>>> trees = doc.print_tree()
>>> trees[1]
>>> trees[1]
{'modifiers': [
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
{'modifiers': [
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
'POS_fine': 'NN', 'lemma': 'pizza'},
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
'POS_fine': 'VBD', 'lemma': 'eat'}
"""
doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE],

View File

@@ -4,6 +4,503 @@ include ../../_includes/_mixins
p A container for accessing linguistic annotations.
p
| A #[code Doc] is a sequence of #[+api("token") #[code Token]] objects.
| Access sentences and named entities, export annotations to numpy arrays,
| losslessly serialize to compressed binary strings. The #[code Doc] object
| holds an array of #[code TokenC] structs. The Python-level #[code Token]
| and #[+api("span") #[code Span]] objects are views of this array, i.e.
| they don't own the data themselves.
+aside-code("Example").
# Construction 1
doc = nlp(u'Some text')
# Construction 2
from spacy.tokens import Doc
doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
spaces=[True, False, False])
+h(2, "init") Doc.__init__
+tag method
p
| Construct a #[code Doc] object. The most common way to get a #[code Doc]
| object is via the #[code nlp] object.
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code words]
+cell -
+cell A list of strings to add to the container.
+row
+cell #[code spaces]
+cell -
+cell
| A list of boolean values indicating whether each word has a
| subsequent space. Must have the same length as #[code words], if
| specified. Defaults to a sequence of #[code True].
+footrow
+cell return
+cell #[code Doc]
+cell The newly constructed object.
+h(2, "getitem") Doc.__getitem__
+tag method
p
| Get a #[+api("token") #[code Token]] object at position #[code i], where
| #[code i] is an integer. Negative indexing is supported, and follows the
| usual Python semantics, i.e. #[code doc[-2]] is #[code doc[len(doc) - 2]].
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
assert doc[0].text == 'Give'
assert doc[-1].text == '.'
span = doc[1:3]
assert span.text == 'it back'
+table(["Name", "Type", "Description"])
+row
+cell #[code i]
+cell int
+cell The index of the token.
+footrow
+cell return
+cell #[code Token]
+cell The token at #[code doc[i]].
p
| Get a #[+api("span") #[code Span]] object, starting at position
| #[code start] (token index) and ending at position #[code end] (token
| index).
p
| For instance, #[code doc[2:5]] produces a span consisting of tokens 2, 3
| and 4. Stepped slices (e.g. #[code doc[start : end : step]]) are not
| supported, as #[code Span] objects must be contiguous (cannot have gaps).
| You can use negative indices and open-ended ranges, which have their
| normal Python semantics.
+table(["Name", "Type", "Description"])
+row
+cell #[code start_end]
+cell tuple
+cell The slice of the document to get.
+footrow
+cell return
+cell #[code Span]
+cell The span at #[code doc[start : end]].
+h(2, "iter") Doc.__iter__
+tag method
p
| Iterate over #[code Token] objects, from which the annotations can be
| easily accessed.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
for token in doc:
print(token.text, token.tag_)
p
| This is the main way of accessing #[+api("token") #[code Token]] objects,
| which are the main way annotations are accessed from Python. If
| faster-than-Python speeds are required, you can instead access the
| annotations as a numpy array, or access the underlying C data directly
| from Cython.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Token]
+cell A #[code Token] object.
+h(2, "len") Doc.__len__
+tag method
p Get the number of tokens in the document.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
assert len(doc) == 7
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell int
+cell The number of tokens in the document.
+h(2, "similarity") Doc.similarity
+tag method
+tag requires model
p
| Make a semantic similarity estimate. The default estimate is cosine
| similarity using an average of word vectors.
+aside-code("Example").
apples, _, oranges = nlp(u'apples and oranges')
apples_oranges = apples.similarity(oranges)
oranges_apples = oranges.similarity(apples)
assert apples_oranges == oranges_apples
+table(["Name", "Type", "Description"])
+row
+cell #[code other]
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell return
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "count_by") Doc.count_by
+tag method
p
| Count the frequencies of a given attribute. Produces a dict of
| #[code {attr (int): count (ints)}] frequencies, keyed by the values
| of the given attribute ID.
+aside-code("Example").
from spacy import attrs
doc = nlp(u'apple apple orange banana')
doc.count_by(attrs.ORTH)
# {12800L: 1, 11880L: 2, 7561L: 1}
doc.to_array([attrs.ORTH])
# array([[11880], [11880], [7561], [12800]])
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_id]
+cell int
+cell The attribute ID
+footrow
+cell return
+cell dict
+cell A dictionary mapping attributes to integer counts.
+h(2, "to_array") Doc.to_array
+tag method
p
| Export the document annotations to a numpy array of shape #[code N*M]
| where #[code N] is the length of the document and #[code M] is the number
| of attribute IDs to export. The values will be 32-bit integers.
+aside-code("Example").
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
doc = nlp(u'Give it back! He pleaded.')
# All strings mapped to integers, for easy export to numpy
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_ids]
+cell ints
+cell A list of attribute ID ints.
+footrow
+cell return
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+cell
| The exported attributes as a 2D numpy array, with one row per
| token and one column per attribute.
+h(2, "from_array") Doc.from_array
+tag method
p
| Load attributes from a numpy array. Write to a #[code Doc] object, from
| an #[code (M, N)] array of attributes.
+aside-code("Example").
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
from spacy.tokens import Doc
doc = nlp(u'Give it back! He pleaded.')
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
doc2 = Doc(doc.vocab, words=[t.text for t in doc])
doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)
+table(["Name", "Type", "Description"])
+row
+cell #[code attrs]
+cell ints
+cell A list of attribute ID ints.
+row
+cell #[code array]
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+cell The attribute values to load.
+footrow
+cell return
+cell #[code Doc]
+cell Itself.
+h(2, "to_bytes") Doc.to_bytes
+tag method
p Serialize, i.e. export the document contents to a binary string.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
doc_bytes = doc.to_bytes()
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell bytes
+cell
| A losslessly serialized copy of the #[code Doc], including all
| annotations.
+h(2, "from_bytes") Doc.from_bytes
+tag method
p Deserialize, i.e. import the document contents from a binary string.
+aside-code("Example").
from spacy.tokens import Doc
text = u'Give it back! He pleaded.'
doc = nlp(text)
doc_bytes = doc.to_bytes()
doc2 = Doc(doc.vocab).from_bytes(doc_bytes)
assert doc.text == doc2.text
+table(["Name", "Type", "Description"])
+row
+cell #[code data]
+cell bytes
+cell The string to load from.
+footrow
+cell return
+cell #[code Doc]
+cell Itself.
+h(2, "merge") Doc.merge
+tag method
p
| Retokenize the document, such that the span at
| #[code doc.text[start_idx : end_idx]] is merged into a single token. If
| #[code start_idx] and #[code end_idx] do not mark start and end token
| boundaries, the document remains unchanged.
+aside-code("Example").
doc = nlp(u'Los Angeles start.')
doc.merge(0, len('Los Angeles'), 'NNP', 'Los Angeles', 'GPE')
print([token.text for token in doc])
# ['Los Angeles', 'start', '.']
+table(["Name", "Type", "Description"])
+row
+cell #[code start_idx]
+cell int
+cell The character index of the start of the slice to merge.
+row
+cell #[code end_idx]
+cell int
+cell The character index after the end of the slice to merge.
+row
+cell #[code **attributes]
+cell -
+cell
| Attributes to assign to the merged token. By default,
| attributes are inherited from the syntactic root token of
| the span.
+footrow
+cell return
+cell #[code Token]
+cell
| The newly merged token, or #[code None] if the start and end
| indices did not fall at token boundaries.
+h(2, "print_tree") Doc.print_tree
+tag method
+tag requires model
p
| Returns the parse trees in JSON (dict) format. Especially useful for
| web applications.
+aside-code("Example").
doc = nlp(u'Alice ate the pizza.')
trees = doc.print_tree()
# {'modifiers': [
# {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
# {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'},
# {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}
# ], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}
+table(["Name", "Type", "Description"])
+row
+cell #[code light]
+cell bool
+cell Don't include lemmas or entities.
+row
+cell #[code flat]
+cell bool
+cell Don't include arcs or modifiers.
+footrow
+cell return
+cell dict
+cell Parse tree as dict.
+h(2, "text") Doc.text
+tag property
p A unicode representation of the document text.
+aside-code("Example").
text = u'Give it back! He pleaded.'
doc = nlp(text)
assert doc.text == text
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell unicode
+cell The original verbatim text of the document.
+h(2, "text_with_ws") Doc.text_with_ws
+tag property
p
| An alias of #[code Doc.text], provided for duck-type compatibility with
| #[code Span] and #[code Token].
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell unicode
+cell The original verbatim text of the document.
+h(2, "ents") Doc.ents
+tag property
+tag requires model
p
| Iterate over the entities in the document. Yields named-entity
| #[code Span] objects, if the entity recognizer has been applied to the
| document.
+aside-code("Example").
tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
ents = list(tokens.ents)
assert ents[0].label == 346
assert ents[0].label_ == 'PERSON'
assert ents[0].text == 'Mr. Best'
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Span]
+cell Entities in the document.
+h(2, "noun_chunks") Doc.noun_chunks
+tag property
+tag requires model
p
| Iterate over the base noun phrases in the document. Yields base
| noun-phrase #[code Span] objects, if the document has been syntactically
| parsed. A base noun phrase, or "NP chunk", is a noun phrase that does not
| permit other NPs to be nested within it so no NP-level coordination, no
| prepositional phrases, and no relative clauses.
+aside-code("Example").
doc = nlp(u'A phrase with another phrase occurs.')
chunks = list(doc.noun_chunks)
assert chunks[0].text == "A phrase"
assert chunks[1].text == "another phrase"
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Span]
+cell Noun chunks in the document.
+h(2, "sents") Doc.sents
+tag property
+tag requires model
p
| Iterate over the sentences in the document. Sentence spans have no label.
| To improve accuracy on informal texts, spaCy calculates sentence boundaries
| from the syntactic dependency parse. If the parser is disabled,
| the #[code sents] iterator will be unavailable.
+aside-code("Example").
doc = nlp(u"This is a sentence. Here's another...")
sents = list(doc.sents)
assert len(sents) == 2
assert [s.root.text for s in sents] == ["is", "'s"]
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Span]
+cell Sentences in the document.
+h(2, "has_vector") Doc.has_vector
+tag property
+tag requires model
p
| A boolean value indicating whether a word vector is associated with the
| object.
+aside-code("Example").
apple = nlp(u'apple')
assert apple.has_vector
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell bool
+cell Whether the document has vector data attached.
+h(2, "vector") Doc.vector
+tag property
+tag requires model
p
| A real-valued meaning representation. Defaults to an average of the
| token vectors.
+aside-code("Example").
apple = nlp(u'apple')
(apple.vector.dtype, apple.vector.shape)
# (dtype('float32'), (300,))
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the document's semantics.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
@@ -59,358 +556,3 @@ p A container for accessing linguistic annotations.
+cell
| A dictionary that allows customisation of properties of
| #[code Span] children.