mirror of https://github.com/explosion/spaCy.git (synced 2025-01-11 17:56:30 +03:00)

Update docstrings and API docs for Doc class

commit b87066ff10
parent 0f513850ab
spacy/tokens/doc.pyx

@@ -63,40 +63,30 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:

cdef class Doc:
    """A sequence of Token objects. Access sentences and named entities, export
    annotations to numpy arrays, losslessly serialize to compressed binary strings.

    The `Doc` object holds an array of `TokenC` structs. The Python-level
    `Token` and `Span` objects are views of this array, i.e. they don't own
    the data themselves.

    EXAMPLE: Construction 1
        >>> doc = nlp(u'Some text')

        Construction 2
        >>> from spacy.tokens import Doc
        >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
    """
    def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
        """Create a Doc object.

        vocab (Vocab): A vocabulary object, which must match any models you want
            to use (e.g. tokenizer, parser, entity recognizer).
        words (list or None): A list of unicode strings to add to the document
            as words. If `None`, defaults to empty list.
        spaces (list or None): A list of boolean values, of the same length as
            words. True means that the word is followed by a space, False means
            it is not. If `None`, defaults to `[True]*len(words)`.
        RETURNS (Doc): The newly constructed object.
        """
        self.vocab = vocab
        size = 20
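
A minimal usage sketch for the two construction routes documented in the hunk above. It assumes spaCy and an English model are installed; `spacy.load('en')` is the v1-era loading call and is an assumption about the reader's setup:

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.load('en')                 # assumption: English model installed
    doc1 = nlp(u'Some text')               # Construction 1: run the full pipeline
    doc2 = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
               spaces=[True, False, False])  # Construction 2: build from words
    assert doc2.text == u'hello world!'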
@@ -158,20 +148,22 @@ cdef class Doc:
        self.is_parsed = True

    def __getitem__(self, object i):
        """Get a `Token` or `Span` object.

        EXAMPLE:
            >>> doc[i]
            Get the `Token` object at position `i`, where `i` is an integer.
            Negative indexing is supported, and follows the usual Python
            semantics, i.e. `doc[-2]` is `doc[len(doc) - 2]`.

            >>> doc[start : end]
            Get a `Span` object, starting at position `start` and ending at
            position `end`, where `start` and `end` are token indices. For
            instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4.
            Stepped slices (e.g. `doc[start : end : step]`) are not supported,
            as `Span` objects must be contiguous (cannot have gaps). You can use
            negative indices and open-ended ranges, which have their normal
            Python semantics.
        """
        if isinstance(i, slice):
            start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
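
A sketch of the indexing semantics described above, assuming spaCy's default English tokenization of the example sentence:

    import spacy

    nlp = spacy.load('en')                 # assumption: English model installed
    doc = nlp(u'Give it back! He pleaded.')
    assert doc[0].text == 'Give'           # integer index -> Token
    assert doc[-2].text == doc[len(doc) - 2].text   # negative indexing
    span = doc[1:3]                        # slice -> contiguous Span
    assert span.text == 'it back'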
@@ -186,14 +178,14 @@ cdef class Doc:
            return Token.cinit(self.vocab, &self.c[i], i, self)

    def __iter__(self):
        """Iterate over `Token` objects, from which the annotations can be
        easily accessed. This is the main way of accessing `Token` objects,
        which are the main way annotations are accessed from Python. If faster-
        than-Python speeds are required, you can instead access the annotations
        as a numpy array, or access the underlying C data directly from Cython.

        EXAMPLE:
            >>> for token in doc
        """
        cdef int i
        for i in range(self.length):
@@ -203,9 +195,10 @@ cdef class Doc:
            yield Token.cinit(self.vocab, &self.c[i], i, self)

    def __len__(self):
        """The number of tokens in the document.

        EXAMPLE:
            >>> len(doc)
        """
        return self.length
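
Iteration and `len()` from the two hunks above, in one small sketch (same assumptions as before):

    import spacy

    nlp = spacy.load('en')                 # assumption: English model installed
    doc = nlp(u'Give it back! He pleaded.')
    assert len(doc) == 7                   # number of tokens
    words = [token.text for token in doc]  # __iter__ yields Token objects
    assert words[0] == 'Give' and words[-1] == '.'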
@@ -228,16 +221,12 @@ cdef class Doc:
        return self

    def similarity(self, other):
        """Make a semantic similarity estimate. The default estimate is cosine
        similarity using an average of word vectors.

        other (object): The object to compare with. By default, accepts `Doc`,
            `Span`, `Token` and `Lexeme` objects.
        RETURNS (float): A scalar similarity score. Higher is more similar.
        """
        if 'similarity' in self.user_hooks:
            return self.user_hooks['similarity'](self, other)
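
A short sketch of the similarity estimate above. It requires a model with word vectors; since the default estimate is cosine similarity of averaged vectors, the comparison is symmetric:

    import spacy

    nlp = spacy.load('en')                 # assumption: model with word vectors
    doc1 = nlp(u'I like apples')
    doc2 = nlp(u'I like oranges')
    score = doc1.similarity(doc2)          # float, higher is more similar
    assert abs(score - doc2.similarity(doc1)) < 1e-6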
@@ -246,8 +235,10 @@ cdef class Doc:
            return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)

    property has_vector:
        """A boolean value indicating whether a word vector is associated with
        the object.

        RETURNS (bool): Whether a word vector is associated with the object.
        """
        def __get__(self):
            if 'has_vector' in self.user_hooks:
@@ -256,10 +247,11 @@ cdef class Doc:
            return any(token.has_vector for token in self)

    property vector:
        """A real-valued meaning representation. Defaults to an average of the
        token vectors.

        RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
            representing the document's semantics.
        """
        def __get__(self):
            if 'vector' in self.user_hooks:
@@ -275,6 +267,7 @@ cdef class Doc:
            self._vector = value

    property vector_norm:
        # TODO: docstrings / docs
        def __get__(self):
            if 'vector_norm' in self.user_hooks:
                return self.user_hooks['vector_norm'](self)
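
The three vector-related properties above, sketched together. The 300-dimension shape is typical of the bundled English vectors but is an assumption about the loaded model:

    import spacy

    nlp = spacy.load('en')                 # assumption: model with word vectors
    doc = nlp(u'apples and oranges')
    if doc.has_vector:                     # only meaningful with vectors loaded
        assert doc.vector.dtype == 'float32'   # 1D average of token vectors
        assert doc.vector.shape == (300,)      # assumption: 300-d vectors
        assert doc.vector_norm > 0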
@@ -295,34 +288,37 @@ cdef class Doc:
        return self.text

    property text:
        """A unicode representation of the document text.

        RETURNS (unicode): The original verbatim text of the document.
        """
        def __get__(self):
            return u''.join(t.text_with_ws for t in self)

    property text_with_ws:
        """An alias of `Doc.text`, provided for duck-type compatibility with
        `Span` and `Token`.

        RETURNS (unicode): The original verbatim text of the document.
        """
        def __get__(self):
            return self.text

    property ents:
        """Iterate over the entities in the document. Yields named-entity `Span`
        objects, if the entity recognizer has been applied to the document.

        YIELDS (Span): Entities in the document.

        EXAMPLE: Iterate over the span to get individual Token objects, or access
            the label:

            >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
            >>> ents = list(tokens.ents)
            >>> assert ents[0].label == 346
            >>> assert ents[0].label_ == 'PERSON'
            >>> assert ents[0].orth_ == 'Best'
            >>> assert ents[0].text == 'Mr. Best'
        """
        def __get__(self):
            cdef int i
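
The entity example from the docstring above, written out as a runnable sketch. The integer label (346) is vocabulary-dependent, so only the string label and text are checked here:

    import spacy

    nlp = spacy.load('en')                 # assumption: model with an entity recognizer
    doc = nlp(u'Mr. Best flew to New York on Saturday morning.')
    ents = list(doc.ents)                  # named-entity Span objects
    assert ents[0].label_ == 'PERSON'
    assert ents[0].text == 'Mr. Best'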
@@ -387,12 +383,13 @@ cdef class Doc:
        self.c[start].ent_iob = 3

    property noun_chunks:
        """Iterate over the base noun phrases in the document. Yields base
        noun-phrase `Span` objects, if the document has been syntactically
        parsed. A base noun phrase, or "NP chunk", is a noun phrase that does
        not permit other NPs to be nested within it – so no NP-level
        coordination, no prepositional phrases, and no relative clauses.

        YIELDS (Span): Noun chunks in the document.
        """
        def __get__(self):
            if not self.is_parsed:
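
A sketch of noun-chunk iteration as documented above (requires the syntactic parser; the expected chunks assume default English parsing of the example sentence):

    import spacy

    nlp = spacy.load('en')                 # assumption: model with a parser
    doc = nlp(u'A phrase with another phrase occurs.')
    chunks = [chunk.text for chunk in doc.noun_chunks]
    assert chunks[:2] == ['A phrase', 'another phrase']   # base NPs, no nesting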
@@ -411,17 +408,15 @@ cdef class Doc:
                yield span

    property sents:
        """Iterate over the sentences in the document. Yields sentence `Span`
        objects. Sentence spans have no label. To improve accuracy on informal
        texts, spaCy calculates sentence boundaries from the syntactic
        dependency parse. If the parser is disabled, the `sents` iterator will
        be unavailable.

        EXAMPLE:
            >>> doc = nlp("This is a sentence. Here's another...")
            >>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
        """
        def __get__(self):
            if 'sents' in self.user_hooks:
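
Sentence iteration from the docstring above, as a self-contained sketch:

    import spacy

    nlp = spacy.load('en')                 # assumption: model with a parser
    doc = nlp(u"This is a sentence. Here's another...")
    sents = list(doc.sents)                # sentence Span objects, no label
    assert len(sents) == 2
    assert [s.root.text for s in sents] == ['is', "'s"]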
@@ -467,24 +462,20 @@ cdef class Doc:

    @cython.boundscheck(False)
    cpdef np.ndarray to_array(self, object py_attr_ids):
        """Given a list of M attribute IDs, export the tokens to a numpy
        `ndarray` of shape `(N, M)`, where `N` is the length of the document.
        The values will be 32-bit integers.

        attr_ids (list[int]): A list of attribute ID ints.
        RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
            per word, and one column per attribute indicated in the input
            `attr_ids`.

        EXAMPLE:
            >>> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
            >>> doc = nlp(text)
            >>> # All strings mapped to integers, for easy export to numpy
            >>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
        """
        cdef int i, j
        cdef attr_id_t feature
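
The export described above, as a small end-to-end sketch:

    import spacy
    from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA

    nlp = spacy.load('en')                 # assumption: English model installed
    doc = nlp(u'Give it back! He pleaded.')
    np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
    assert np_array.shape == (len(doc), 4)  # one row per token, one column per attribute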
@@ -499,27 +490,20 @@ cdef class Doc:
        return output

    def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
        """Count the frequencies of a given attribute. Produces a dict of
        `{attribute (int): count (ints)}` frequencies, keyed by the values of
        the given attribute ID.

        attr_id (int): The attribute ID to key the counts.
        RETURNS (dict): A dictionary mapping attributes to integer counts.

        EXAMPLE:
            >>> from spacy import attrs
            >>> doc = nlp(u'apple apple orange banana')
            >>> doc.count_by(attrs.ORTH)
            {12800L: 1, 11880L: 2, 7561L: 1}
            >>> doc.to_array([attrs.ORTH])
            array([[11880], [11880], [7561], [12800]])
        """
        cdef int i
        cdef attr_t attr
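
A runnable version of the `count_by` example. The exact integer keys are vocabulary-dependent, so the sketch checks frequencies rather than string IDs:

    import spacy
    from spacy.attrs import ORTH

    nlp = spacy.load('en')                 # assumption: English model installed
    doc = nlp(u'apple apple orange banana')
    counts = doc.count_by(ORTH)            # {orth_id: frequency}
    assert sorted(counts.values()) == [1, 1, 2]   # 'apple' occurs twice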
@@ -567,8 +551,12 @@ cdef class Doc:
            self.c[i] = parsed[i]

    def from_array(self, attrs, int[:, :] array):
        """Load attributes from a numpy array. Write to a `Doc` object, from an
        `(M, N)` array of attributes.

        attrs (ints): A list of attribute ID ints.
        array (numpy.ndarray[ndim=2, dtype='int32']): The attribute values to load.
        RETURNS (Doc): Itself.
        """
        cdef int i, col
        cdef attr_id_t attr_id
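
A round trip through `to_array`/`from_array`, using the same attribute set this commit's `parse_tree` helper loads (HEAD, TAG, DEP, ENT_IOB, ENT_TYPE):

    import spacy
    from spacy.attrs import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE
    from spacy.tokens import Doc

    nlp = spacy.load('en')                 # assumption: English model installed
    doc = nlp(u'Give it back! He pleaded.')
    np_array = doc.to_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE])
    doc2 = Doc(doc.vocab, words=[t.text for t in doc])
    doc2.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE], np_array)
    assert doc2[0].tag_ == doc[0].tag_     # annotations restored from the array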
@@ -597,8 +585,10 @@ cdef class Doc:
        return self

    def to_bytes(self):
        """Serialize, i.e. export the document contents to a binary string.

        RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
            all annotations.
        """
        return dill.dumps(
            (self.text,
@@ -611,8 +601,10 @@ cdef class Doc:
            protocol=-1)

    def from_bytes(self, data):
        """Deserialize, i.e. import the document contents from a binary string.

        data (bytes): The string to load from.
        RETURNS (Doc): Itself.
        """
        if self.length != 0:
            raise ValueError("Cannot load into non-empty Doc")
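
A serialization round trip matching the two methods above (and the API-docs example further down):

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.load('en')                 # assumption: English model installed
    doc = nlp(u'Give it back! He pleaded.')
    data = doc.to_bytes()                  # losslessly serialized Doc
    doc2 = Doc(doc.vocab).from_bytes(data) # load into an empty Doc
    assert doc2.text == doc.text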
@@ -640,21 +632,16 @@ cdef class Doc:
        return self

    def merge(self, int start_idx, int end_idx, *args, **attributes):
        """Retokenize the document, such that the span at `doc.text[start_idx : end_idx]`
        is merged into a single token. If `start_idx` and `end_idx` do not mark
        start and end token boundaries, the document remains unchanged.

        start_idx (int): The character index of the start of the slice to merge.
        end_idx (int): The character index after the end of the slice to merge.
        **attributes: Attributes to assign to the merged token. By default,
            attributes are inherited from the syntactic root token of the span.
        RETURNS (Token): The newly merged token, or `None` if the start and end
            indices did not fall at token boundaries.
        """
        cdef unicode tag, lemma, ent_type
        if len(args) == 3:
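
The merge behaviour described above, using this version's character-index interface with `tag`, `lemma` and `ent_type` passed positionally, as in the API docs:

    import spacy

    nlp = spacy.load('en')                 # assumption: English model installed
    doc = nlp(u'Los Angeles start.')
    doc.merge(0, len('Los Angeles'), 'NNP', 'Los Angeles', 'GPE')
    assert [t.text for t in doc] == ['Los Angeles', 'start', '.']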
@@ -758,7 +745,29 @@ cdef class Doc:
        return self[start]

    def print_tree(self, light=False, flat=False):
        """Returns the parse trees in JSON (dict) format.

        light (bool): Don't include lemmas or entities.
        flat (bool): Don't include arcs or modifiers.
        RETURNS (dict): Parse tree as dict.

        EXAMPLE:
            >>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
            >>> trees = doc.print_tree()
            >>> trees[1]
            {'modifiers': [
                {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
                 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
                {'modifiers': [
                    {'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
                     'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
                 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
                 'POS_fine': 'NN', 'lemma': 'pizza'},
                {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
                 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
             'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
             'POS_fine': 'VBD', 'lemma': 'eat'}
        """
        return parse_tree(self, light=light, flat=flat)
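
A sketch of handling `print_tree` output. The exact dictionary contents depend on the model, so only the documented keys and the root arc are inspected:

    import spacy

    nlp = spacy.load('en')                 # assumption: model with a parser
    doc = nlp(u'Alice ate the pizza.')
    trees = doc.print_tree(light=False, flat=False)   # one dict per sentence
    root = trees[0]
    assert root['arc'] == 'ROOT'
    assert {'word', 'lemma', 'NE', 'POS_coarse', 'POS_fine', 'modifiers'} <= set(root.keys())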
spacy/tokens/printers.py

@@ -6,18 +6,14 @@ from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE


def merge_ents(doc):
    """Helper: merge adjacent entities into single tokens; modifies the doc."""
    for ent in doc.ents:
        ent.merge(ent.root.tag_, ent.text, ent.label_)
    return doc


def format_POS(token, light, flat):
    """Helper: form the POS output for a token."""
    subtree = dict([
        ("word", token.text),
        ("lemma", token.lemma_), # trigger
@@ -37,9 +33,8 @@ def format_POS(token, light, flat):


def POS_tree(root, light=False, flat=False):
    """Helper: generate a POS tree for a root token. The doc must have
    `merge_ents(doc)` run on it.
    """
    subtree = format_POS(root, light=light, flat=flat)
    for c in root.children:
@@ -48,21 +43,28 @@ def POS_tree(root, light=False, flat=False):


def parse_tree(doc, light=False, flat=False):
    """Make a copy of the doc and construct a syntactic parse tree, similar to
    the one used in displaCy. Generates the POS tree for all sentences in a doc.

    doc (Doc): The doc for parsing.
    RETURNS (dict): The parse tree.

    EXAMPLE:
        >>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
        >>> trees = doc.print_tree()
        >>> trees[1]
        {'modifiers': [
            {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
             'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
            {'modifiers': [
                {'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
                 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
             'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
             'POS_fine': 'NN', 'lemma': 'pizza'},
            {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
             'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
         'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
         'POS_fine': 'VBD', 'lemma': 'eat'}
    """
    doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
    doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE],
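
The helpers above can also be used directly. A sketch, assuming the module lives at `spacy.tokens.printers` (the import path is inferred from the relative imports in the hunk and is an assumption):

    import spacy
    from spacy.tokens.printers import merge_ents, parse_tree  # assumption: module path

    nlp = spacy.load('en')                 # assumption: model with parser and NER
    doc = nlp(u'Bob brought Alice the pizza.')
    merge_ents(doc)                        # merge adjacent entity tokens in place
    trees = parse_tree(doc)                # same structure as doc.print_tree()
    assert trees[0]['word'] == 'brought'   # sentence root of the first tree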
website/docs/api/doc.jade

@@ -4,6 +4,503 @@ include ../../_includes/_mixins

p A container for accessing linguistic annotations.

p
    | A #[code Doc] is a sequence of #[+api("token") #[code Token]] objects.
    | Access sentences and named entities, export annotations to numpy arrays,
    | losslessly serialize to compressed binary strings. The #[code Doc] object
    | holds an array of #[code TokenC] structs. The Python-level #[code Token]
    | and #[+api("span") #[code Span]] objects are views of this array, i.e.
    | they don't own the data themselves.

+aside-code("Example").
    # Construction 1
    doc = nlp(u'Some text')

    # Construction 2
    from spacy.tokens import Doc
    doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
              spaces=[True, False, False])

+h(2, "init") Doc.__init__
    +tag method

p
    | Construct a #[code Doc] object. The most common way to get a #[code Doc]
    | object is via the #[code nlp] object.

+table(["Name", "Type", "Description"])
    +row
        +cell #[code vocab]
        +cell #[code Vocab]
        +cell A storage container for lexical types.

    +row
        +cell #[code words]
        +cell -
        +cell A list of strings to add to the container.

    +row
        +cell #[code spaces]
        +cell -
        +cell
            | A list of boolean values indicating whether each word has a
            | subsequent space. Must have the same length as #[code words], if
            | specified. Defaults to a sequence of #[code True].

    +footrow
        +cell return
        +cell #[code Doc]
        +cell The newly constructed object.
+h(2, "getitem") Doc.__getitem__
    +tag method

p
    | Get a #[+api("token") #[code Token]] object at position #[code i], where
    | #[code i] is an integer. Negative indexing is supported, and follows the
    | usual Python semantics, i.e. #[code doc[-2]] is #[code doc[len(doc) - 2]].

+aside-code("Example").
    doc = nlp(u'Give it back! He pleaded.')
    assert doc[0].text == 'Give'
    assert doc[-1].text == '.'
    span = doc[1:3]
    assert span.text == 'it back'

+table(["Name", "Type", "Description"])
    +row
        +cell #[code i]
        +cell int
        +cell The index of the token.

    +footrow
        +cell return
        +cell #[code Token]
        +cell The token at #[code doc[i]].

p
    | Get a #[+api("span") #[code Span]] object, starting at position
    | #[code start] (token index) and ending at position #[code end] (token
    | index).

p
    | For instance, #[code doc[2:5]] produces a span consisting of tokens 2, 3
    | and 4. Stepped slices (e.g. #[code doc[start : end : step]]) are not
    | supported, as #[code Span] objects must be contiguous (cannot have gaps).
    | You can use negative indices and open-ended ranges, which have their
    | normal Python semantics.

+table(["Name", "Type", "Description"])
    +row
        +cell #[code start_end]
        +cell tuple
        +cell The slice of the document to get.

    +footrow
        +cell return
        +cell #[code Span]
        +cell The span at #[code doc[start : end]].

+h(2, "iter") Doc.__iter__
    +tag method

p
    | Iterate over #[code Token] objects, from which the annotations can be
    | easily accessed.

+aside-code("Example").
    doc = nlp(u'Give it back! He pleaded.')
    for token in doc:
        print(token.text, token.tag_)

p
    | This is the main way of accessing #[+api("token") #[code Token]] objects,
    | which are the main way annotations are accessed from Python. If
    | faster-than-Python speeds are required, you can instead access the
    | annotations as a numpy array, or access the underlying C data directly
    | from Cython.

+table(["Name", "Type", "Description"])
    +footrow
        +cell yield
        +cell #[code Token]
        +cell A #[code Token] object.

+h(2, "len") Doc.__len__
    +tag method

p Get the number of tokens in the document.

+aside-code("Example").
    doc = nlp(u'Give it back! He pleaded.')
    assert len(doc) == 7

+table(["Name", "Type", "Description"])
    +footrow
        +cell return
        +cell int
        +cell The number of tokens in the document.

+h(2, "similarity") Doc.similarity
    +tag method
    +tag requires model

p
    | Make a semantic similarity estimate. The default estimate is cosine
    | similarity using an average of word vectors.

+aside-code("Example").
    apples, _, oranges = nlp(u'apples and oranges')
    apples_oranges = apples.similarity(oranges)
    oranges_apples = oranges.similarity(apples)
    assert apples_oranges == oranges_apples

+table(["Name", "Type", "Description"])
    +row
        +cell #[code other]
        +cell -
        +cell
            | The object to compare with. By default, accepts #[code Doc],
            | #[code Span], #[code Token] and #[code Lexeme] objects.

    +footrow
        +cell return
        +cell float
        +cell A scalar similarity score. Higher is more similar.

+h(2, "count_by") Doc.count_by
    +tag method

p
    | Count the frequencies of a given attribute. Produces a dict of
    | #[code {attr (int): count (ints)}] frequencies, keyed by the values
    | of the given attribute ID.

+aside-code("Example").
    from spacy import attrs
    doc = nlp(u'apple apple orange banana')
    doc.count_by(attrs.ORTH)
    # {12800L: 1, 11880L: 2, 7561L: 1}
    doc.to_array([attrs.ORTH])
    # array([[11880], [11880], [7561], [12800]])

+table(["Name", "Type", "Description"])
    +row
        +cell #[code attr_id]
        +cell int
        +cell The attribute ID

    +footrow
        +cell return
        +cell dict
        +cell A dictionary mapping attributes to integer counts.
+h(2, "to_array") Doc.to_array
    +tag method

p
    | Export the document annotations to a numpy array of shape #[code N*M]
    | where #[code N] is the length of the document and #[code M] is the number
    | of attribute IDs to export. The values will be 32-bit integers.

+aside-code("Example").
    from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
    doc = nlp(text)
    # All strings mapped to integers, for easy export to numpy
    np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])

+table(["Name", "Type", "Description"])
    +row
        +cell #[code attr_ids]
        +cell ints
        +cell A list of attribute ID ints.

    +footrow
        +cell return
        +cell #[code numpy.ndarray[ndim=2, dtype='int32']]
        +cell
            | The exported attributes as a 2D numpy array, with one row per
            | token and one column per attribute.

+h(2, "from_array") Doc.from_array
    +tag method

p
    | Load attributes from a numpy array. Write to a #[code Doc] object, from
    | an #[code (M, N)] array of attributes.

+aside-code("Example").
    from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
    from spacy.tokens import Doc
    doc = nlp(text)
    np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
    doc2 = Doc(doc.vocab)
    doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)

+table(["Name", "Type", "Description"])
    +row
        +cell #[code attrs]
        +cell ints
        +cell A list of attribute ID ints.

    +row
        +cell #[code array]
        +cell #[code numpy.ndarray[ndim=2, dtype='int32']]
        +cell The attribute values to load.

    +footrow
        +cell return
        +cell #[code Doc]
        +cell Itself.

+h(2, "to_bytes") Doc.to_bytes
    +tag method

p Serialize, i.e. export the document contents to a binary string.

+aside-code("Example").
    doc = nlp(u'Give it back! He pleaded.')
    doc_bytes = doc.to_bytes()

+table(["Name", "Type", "Description"])
    +footrow
        +cell return
        +cell bytes
        +cell
            | A losslessly serialized copy of the #[code Doc], including all
            | annotations.

+h(2, "from_bytes") Doc.from_bytes
    +tag method

p Deserialize, i.e. import the document contents from a binary string.

+aside-code("Example").
    from spacy.tokens import Doc
    text = u'Give it back! He pleaded.'
    doc = nlp(text)
    doc_bytes = doc.to_bytes()
    doc2 = Doc(doc.vocab).from_bytes(doc_bytes)
    assert doc.text == doc2.text

+table(["Name", "Type", "Description"])
    +row
        +cell #[code data]
        +cell bytes
        +cell The string to load from.

    +footrow
        +cell return
        +cell #[code Doc]
        +cell Itself.
+h(2, "merge") Doc.merge
    +tag method

p
    | Retokenize the document, such that the span at
    | #[code doc.text[start_idx : end_idx]] is merged into a single token. If
    | #[code start_idx] and #[code end_idx] do not mark start and end token
    | boundaries, the document remains unchanged.

+aside-code("Example").
    doc = nlp(u'Los Angeles start.')
    doc.merge(0, len('Los Angeles'), 'NNP', 'Los Angeles', 'GPE')
    print([token.text for token in doc])
    # ['Los Angeles', 'start', '.']

+table(["Name", "Type", "Description"])
    +row
        +cell #[code start_idx]
        +cell int
        +cell The character index of the start of the slice to merge.

    +row
        +cell #[code end_idx]
        +cell int
        +cell The character index after the end of the slice to merge.

    +row
        +cell #[code **attributes]
        +cell -
        +cell
            | Attributes to assign to the merged token. By default,
            | attributes are inherited from the syntactic root token of
            | the span.

    +footrow
        +cell return
        +cell #[code Token]
        +cell
            | The newly merged token, or #[code None] if the start and end
            | indices did not fall at token boundaries.

+h(2, "print_tree") Doc.print_tree
    +tag method
    +tag requires model

p
    | Returns the parse trees in JSON (dict) format. Especially useful for
    | web applications.

+aside-code("Example").
    doc = nlp('Alice ate the pizza.')
    trees = doc.print_tree()
    # {'modifiers': [
    #   {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
    #   {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'},
    #   {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}
    # ], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}

+table(["Name", "Type", "Description"])
    +row
        +cell #[code light]
        +cell bool
        +cell Don't include lemmas or entities.

    +row
        +cell #[code flat]
        +cell bool
        +cell Don't include arcs or modifiers.

    +footrow
        +cell return
        +cell dict
        +cell Parse tree as dict.
+h(2, "text") Doc.text
    +tag property

p A unicode representation of the document text.

+aside-code("Example").
    text = u'Give it back! He pleaded.'
    doc = nlp(text)
    assert doc.text == text

+table(["Name", "Type", "Description"])
    +footrow
        +cell return
        +cell unicode
        +cell The original verbatim text of the document.

+h(2, "text_with_ws") Doc.text_with_ws
    +tag property

p
    | An alias of #[code Doc.text], provided for duck-type compatibility with
    | #[code Span] and #[code Token].

+table(["Name", "Type", "Description"])
    +footrow
        +cell return
        +cell unicode
        +cell The original verbatim text of the document.

+h(2, "ents") Doc.ents
    +tag property
    +tag requires model

p
    | Iterate over the entities in the document. Yields named-entity
    | #[code Span] objects, if the entity recognizer has been applied to the
    | document.

+aside-code("Example").
    tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
    ents = list(tokens.ents)
    assert ents[0].label == 346
    assert ents[0].label_ == 'PERSON'
    assert ents[0].text == 'Mr. Best'

+table(["Name", "Type", "Description"])
    +footrow
        +cell yield
        +cell #[code Span]
        +cell Entities in the document.

+h(2, "noun_chunks") Doc.noun_chunks
    +tag property
    +tag requires model

p
    | Iterate over the base noun phrases in the document. Yields base
    | noun-phrase #[code Span] objects, if the document has been syntactically
    | parsed. A base noun phrase, or "NP chunk", is a noun phrase that does not
    | permit other NPs to be nested within it – so no NP-level coordination, no
    | prepositional phrases, and no relative clauses.

+aside-code("Example").
    doc = nlp(u'A phrase with another phrase occurs.')
    chunks = list(doc.noun_chunks)
    assert chunks[0].text == "A phrase"
    assert chunks[1].text == "another phrase"

+table(["Name", "Type", "Description"])
    +footrow
        +cell yield
        +cell #[code Span]
        +cell Noun chunks in the document.
+h(2, "sents") Doc.sents
    +tag property
    +tag requires model

p
    | Iterate over the sentences in the document. Sentence spans have no label.
    | To improve accuracy on informal texts, spaCy calculates sentence boundaries
    | from the syntactic dependency parse. If the parser is disabled,
    | the #[code sents] iterator will be unavailable.

+aside-code("Example").
    doc = nlp(u"This is a sentence. Here's another...")
    sents = list(doc.sents)
    assert len(sents) == 2
    assert [s.root.text for s in sents] == ["is", "'s"]

+table(["Name", "Type", "Description"])
    +footrow
        +cell yield
        +cell #[code Span]
        +cell Sentences in the document.

+h(2, "has_vector") Doc.has_vector
    +tag property
    +tag requires model

p
    | A boolean value indicating whether a word vector is associated with the
    | object.

+aside-code("Example").
    apple = nlp(u'apple')
    assert apple.has_vector

+table(["Name", "Type", "Description"])
    +footrow
        +cell return
        +cell bool
        +cell Whether the document has vector data attached.

+h(2, "vector") Doc.vector
    +tag property
    +tag requires model

p
    | A real-valued meaning representation. Defaults to an average of the
    | token vectors.

+aside-code("Example").
    apple = nlp(u'apple')
    (apple.vector.dtype, apple.vector.shape)
    # (dtype('float32'), (300,))

+table(["Name", "Type", "Description"])
    +footrow
        +cell return
        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
        +cell A 1D numpy array representing the document's semantics.

+h(2, "attributes") Attributes

+table(["Name", "Type", "Description"])
@@ -59,358 +556,3 @@ p A container for accessing linguistic annotations.
        +cell
            | A dictionary that allows customisation of properties of
            | #[code Span] children.

Removed in this hunk: the previous, example-free versions of the sections now
documented above (Doc.__init__, Doc.__getitem__, Doc.__iter__, Doc.__len__,
Doc.similarity, Doc.to_array, Doc.count_by, Doc.from_array, Doc.to_bytes,
Doc.from_bytes, Doc.merge, Doc.text, Doc.text_with_ws, Doc.sents, Doc.ents,
Doc.noun_chunks, Doc.vector and Doc.has_vector), together with the
Doc.read_bytes section below, which is not re-added:

+h(2, "read_bytes") Doc.read_bytes
    +tag staticmethod

p A static method, used to read serialized #[code Doc] objects from a file.

+aside-code("Example").
    from spacy.tokens.doc import Doc
    loc = 'test_serialize.bin'
    with open(loc, 'wb') as file_:
        file_.write(nlp(u'This is a document.').to_bytes())
        file_.write(nlp(u'This is another.').to_bytes())
    docs = []
    with open(loc, 'rb') as file_:
        for byte_string in Doc.read_bytes(file_):
            docs.append(Doc(nlp.vocab).from_bytes(byte_string))
    assert len(docs) == 2

+table(["Name", "Type", "Description"])
    +row
        +cell file
        +cell buffer
        +cell A binary buffer to read the serialized annotations from.

    +footrow
        +cell yield
        +cell bytes
        +cell Binary strings from which documents can be loaded.