mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 17:06:29 +03:00)

Commit b87066ff10 (parent 0f513850ab)
Update docstrings and API docs for Doc class
@@ -63,40 +63,30 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:

cdef class Doc:
"""
A sequence of `Token` objects. Access sentences and named entities,
export annotations to numpy arrays, losslessly serialize to compressed
binary strings.
"""A sequence of Token objects. Access sentences and named entities, export
annotations to numpy arrays, losslessly serialize to compressed binary strings.
The `Doc` object holds an array of `TokenC` structs. The Python-level
`Token` and `Span` objects are views of this array, i.e. they don't own
the data themselves.

Aside: Internals
The `Doc` object holds an array of `TokenC` structs.
The Python-level `Token` and `Span` objects are views of this
array, i.e. they don't own the data themselves.

Code: Construction 1
doc = nlp.tokenizer(u'Some text')

Code: Construction 2
doc = Doc(nlp.vocab, orths_and_spaces=[(u'Some', True), (u'text', True)])
EXAMPLE: Construction 1
>>> doc = nlp(u'Some text')

Construction 2
>>> from spacy.tokens import Doc
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
"""
def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
"""
Create a Doc object.
"""Create a Doc object.

Arguments:
vocab:
A Vocabulary object, which must match any models you want to
use (e.g. tokenizer, parser, entity recognizer).

words:
A list of unicode strings to add to the document as words. If None,
defaults to empty list.

spaces:
A list of boolean values, of the same length as words. True
means that the word is followed by a space, False means it is not.
If None, defaults to [True]*len(words)
vocab (Vocab): A vocabulary object, which must match any models you want
to use (e.g. tokenizer, parser, entity recognizer).
words (list or None): A list of unicode strings to add to the document
as words. If `None`, defaults to empty list.
spaces (list or None): A list of boolean values, of the same length as
words. True means that the word is followed by a space, False means
it is not. If `None`, defaults to `[True]*len(words)`
RETURNS (Doc): The newly constructed object.
"""
self.vocab = vocab
size = 20
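Aside (editor's illustration, not part of the commit): a minimal sketch of the two construction paths described in the docstring above, assuming the `spacy.en.English` pipeline used elsewhere in these docstrings is installed.

    from spacy.en import English
    from spacy.tokens import Doc

    nlp = English()

    # Construction 1: let the pipeline tokenize (and annotate) raw text.
    doc = nlp(u'Some text')

    # Construction 2: build a Doc directly from words and trailing-space flags.
    doc2 = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
               spaces=[True, False, False])
    assert doc2.text == u'hello world!'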
@@ -158,20 +148,22 @@ cdef class Doc:
self.is_parsed = True

def __getitem__(self, object i):
"""
doc[i]
Get the Token object at position i, where i is an integer.
"""Get a `Token` or `Span` object.

EXAMPLE:
>>> doc[i]
Get the `Token` object at position `i`, where `i` is an integer.
Negative indexing is supported, and follows the usual Python
semantics, i.e. doc[-2] is doc[len(doc) - 2].
doc[start : end]]
Get a `Span` object, starting at position `start`
and ending at position `end`, where `start` and
`end` are token indices. For instance,
`doc[2:5]` produces a span consisting of
tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`)
are not supported, as `Span` objects must be contiguous (cannot have gaps).
You can use negative indices and open-ended ranges, which have their
normal Python semantics.
semantics, i.e. `doc[-2]` is `doc[len(doc) - 2]`.

>>> doc[start : end]
Get a `Span` object, starting at position `start` and ending at
position `end`, where `start` and `end` are token indices. For
instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4.
Stepped slices (e.g. `doc[start : end : step]`) are not supported,
as `Span` objects must be contiguous (cannot have gaps). You can use
negative indices and open-ended ranges, which have their normal
Python semantics.
"""
if isinstance(i, slice):
start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
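Aside (editor's sketch, not part of the commit): indexing and slicing as described above, assuming `nlp` is constructed as in the earlier sketch.

    doc = nlp(u'Give it back! He pleaded.')
    assert doc[0].text == u'Give'    # integer index -> Token
    assert doc[-1].text == u'.'      # negative indices follow Python semantics
    span = doc[1:3]                  # slice -> contiguous Span
    assert span.text == u'it back'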
@ -186,14 +178,14 @@ cdef class Doc:
|
|||
return Token.cinit(self.vocab, &self.c[i], i, self)
|
||||
|
||||
def __iter__(self):
|
||||
"""
|
||||
for token in doc
|
||||
Iterate over `Token` objects, from which the annotations can
|
||||
be easily accessed. This is the main way of accessing Token
|
||||
objects, which are the main way annotations are accessed from
|
||||
Python. If faster-than-Python speeds are required, you can
|
||||
instead access the annotations as a numpy array, or access the
|
||||
underlying C data directly from Cython.
|
||||
"""Iterate over `Token` objects, from which the annotations can be
|
||||
easily accessed. This is the main way of accessing `Token` objects,
|
||||
which are the main way annotations are accessed from Python. If faster-
|
||||
than-Python speeds are required, you can instead access the annotations
|
||||
as a numpy array, or access the underlying C data directly from Cython.
|
||||
|
||||
EXAMPLE:
|
||||
>>> for token in doc
|
||||
"""
|
||||
cdef int i
|
||||
for i in range(self.length):
|
||||
|
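Aside (editor's sketch, not part of the commit): iterating over tokens, mirroring the example added to the website docs below.

    doc = nlp(u'Give it back! He pleaded.')
    for token in doc:
        print(token.text, token.tag_)   # per-token annotations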
@ -203,9 +195,10 @@ cdef class Doc:
|
|||
yield Token.cinit(self.vocab, &self.c[i], i, self)
|
||||
|
||||
def __len__(self):
|
||||
"""
|
||||
len(doc)
|
||||
The number of tokens in the document.
|
||||
"""The number of tokens in the document.
|
||||
|
||||
EXAMPLE:
|
||||
>>> len(doc)
|
||||
"""
|
||||
return self.length
|
||||
|
||||
|
@@ -228,16 +221,12 @@ cdef class Doc:
return self

def similarity(self, other):
"""
Make a semantic similarity estimate. The default estimate is cosine
"""Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors.

Arguments:
other (object): The object to compare with. By default, accepts Doc,
Span, Token and Lexeme objects.

Return:
score (float): A scalar similarity score. Higher is more similar.
other (object): The object to compare with. By default, accepts `Doc`,
`Span`, `Token` and `Lexeme` objects.
RETURNS (float): A scalar similarity score. Higher is more similar.
"""
if 'similarity' in self.user_hooks:
return self.user_hooks['similarity'](self, other)
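Aside (editor's sketch, not part of the commit): a similarity call, assuming a model with word vectors is loaded into `nlp`.

    doc1 = nlp(u'I like apples')
    doc2 = nlp(u'I like oranges')
    score = doc1.similarity(doc2)
    # cosine over averaged vectors is symmetric
    assert score == doc2.similarity(doc1)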
@ -246,8 +235,10 @@ cdef class Doc:
|
|||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||
|
||||
property has_vector:
|
||||
"""
|
||||
A boolean value indicating whether a word vector is associated with the object.
|
||||
"""A boolean value indicating whether a word vector is associated with
|
||||
the object.
|
||||
|
||||
RETURNS (bool): Whether a word vector is associated with the object.
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'has_vector' in self.user_hooks:
|
||||
|
@ -256,10 +247,11 @@ cdef class Doc:
|
|||
return any(token.has_vector for token in self)
|
||||
|
||||
property vector:
|
||||
"""
|
||||
A real-valued meaning representation. Defaults to an average of the token vectors.
|
||||
"""A real-valued meaning representation. Defaults to an average of the
|
||||
token vectors.
|
||||
|
||||
Type: numpy.ndarray[ndim=1, dtype='float32']
|
||||
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
|
||||
representing the document's semantics.
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'vector' in self.user_hooks:
|
||||
|
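Aside (editor's sketch, not part of the commit): the vector properties, mirroring the website docs examples; requires a model with word vectors.

    doc = nlp(u'apple')
    assert doc.has_vector
    assert doc.vector.ndim == 1            # averaged word vector
    assert doc.vector.dtype == 'float32'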
@ -275,6 +267,7 @@ cdef class Doc:
|
|||
self._vector = value
|
||||
|
||||
property vector_norm:
|
||||
# TODO: docstrings / docs
|
||||
def __get__(self):
|
||||
if 'vector_norm' in self.user_hooks:
|
||||
return self.user_hooks['vector_norm'](self)
|
||||
|
@ -295,34 +288,37 @@ cdef class Doc:
|
|||
return self.text
|
||||
|
||||
property text:
|
||||
"""
|
||||
A unicode representation of the document text.
|
||||
"""A unicode representation of the document text.
|
||||
|
||||
RETURNS (unicode): The original verbatim text of the document.
|
||||
"""
|
||||
def __get__(self):
|
||||
return u''.join(t.text_with_ws for t in self)
|
||||
|
||||
property text_with_ws:
|
||||
"""
|
||||
An alias of Doc.text, provided for duck-type compatibility with Span and Token.
|
||||
"""An alias of `Doc.text`, provided for duck-type compatibility with
|
||||
`Span` and `Token`.
|
||||
|
||||
RETURNS (unicode): The original verbatim text of the document.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.text
|
||||
|
||||
property ents:
|
||||
"""
|
||||
Yields named-entity `Span` objects, if the entity recognizer
|
||||
has been applied to the document. Iterate over the span to get
|
||||
individual Token objects, or access the label:
|
||||
"""Iterate over the entities in the document. Yields named-entity `Span`
|
||||
objects, if the entity recognizer has been applied to the document.
|
||||
|
||||
Example:
|
||||
from spacy.en import English
|
||||
nlp = English()
|
||||
tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
|
||||
ents = list(tokens.ents)
|
||||
assert ents[0].label == 346
|
||||
assert ents[0].label_ == 'PERSON'
|
||||
assert ents[0].orth_ == 'Best'
|
||||
assert ents[0].text == 'Mr. Best'
|
||||
YIELDS (Span): Entities in the document.
|
||||
|
||||
EXAMPLE: Iterate over the span to get individual Token objects, or access
|
||||
the label:
|
||||
|
||||
>>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
|
||||
>>> ents = list(tokens.ents)
|
||||
>>> assert ents[0].label == 346
|
||||
>>> assert ents[0].label_ == 'PERSON'
|
||||
>>> assert ents[0].orth_ == 'Best'
|
||||
>>> assert ents[0].text == 'Mr. Best'
|
||||
"""
|
||||
def __get__(self):
|
||||
cdef int i
|
||||
|
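Aside (editor's sketch, not part of the commit): reading entities, mirroring the docstring example; requires the entity recognizer.

    doc = nlp(u'Mr. Best flew to New York on Saturday morning.')
    ents = list(doc.ents)
    assert ents[0].label_ == 'PERSON'
    assert ents[0].text == 'Mr. Best'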
@ -387,12 +383,13 @@ cdef class Doc:
|
|||
self.c[start].ent_iob = 3
|
||||
|
||||
property noun_chunks:
|
||||
"""
|
||||
Yields base noun-phrase #[code Span] objects, if the document
|
||||
has been syntactically parsed. A base noun phrase, or
|
||||
'NP chunk', is a noun phrase that does not permit other NPs to
|
||||
be nested within it – so no NP-level coordination, no prepositional
|
||||
phrases, and no relative clauses.
|
||||
"""Iterate over the base noun phrases in the document. Yields base
|
||||
noun-phrase #[code Span] objects, if the document has been syntactically
|
||||
parsed. A base noun phrase, or "NP chunk", is a noun phrase that does
|
||||
not permit other NPs to be nested within it – so no NP-level
|
||||
coordination, no prepositional phrases, and no relative clauses.
|
||||
|
||||
YIELDS (Span): Noun chunks in the document.
|
||||
"""
|
||||
def __get__(self):
|
||||
if not self.is_parsed:
|
||||
|
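Aside (editor's sketch, not part of the commit): base noun phrases, mirroring the example added to the website docs below; requires the parser.

    doc = nlp(u'A phrase with another phrase occurs.')
    chunks = list(doc.noun_chunks)
    assert chunks[0].text == u'A phrase'
    assert chunks[1].text == u'another phrase'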
@ -411,17 +408,15 @@ cdef class Doc:
|
|||
yield span
|
||||
|
||||
property sents:
|
||||
"""
|
||||
Yields sentence `Span` objects. Sentence spans have no label.
|
||||
To improve accuracy on informal texts, spaCy calculates sentence
|
||||
boundaries from the syntactic dependency parse. If the parser is disabled,
|
||||
`sents` iterator will be unavailable.
|
||||
"""Iterate over the sentences in the document. Yields sentence `Span`
|
||||
objects. Sentence spans have no label. To improve accuracy on informal
|
||||
texts, spaCy calculates sentence boundaries from the syntactic
|
||||
dependency parse. If the parser is disabled, the `sents` iterator will
|
||||
be unavailable.
|
||||
|
||||
Example:
|
||||
from spacy.en import English
|
||||
nlp = English()
|
||||
doc = nlp("This is a sentence. Here's another...")
|
||||
assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
|
||||
EXAMPLE:
|
||||
>>> doc = nlp("This is a sentence. Here's another...")
|
||||
>>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'sents' in self.user_hooks:
|
||||
|
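Aside (editor's sketch, not part of the commit): sentence boundaries from the dependency parse, mirroring the example added to the website docs below; requires the parser.

    doc = nlp(u"This is a sentence. Here's another...")
    sents = list(doc.sents)
    assert len(sents) == 2
    assert [s.root.text for s in sents] == ["is", "'s"]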
@@ -467,24 +462,20 @@ cdef class Doc:
@cython.boundscheck(False)
cpdef np.ndarray to_array(self, object py_attr_ids):
"""
Given a list of M attribute IDs, export the tokens to a numpy
`ndarray` of shape (N, M), where `N` is the length
of the document. The values will be 32-bit integers.
"""Given a list of M attribute IDs, export the tokens to a numpy
`ndarray` of shape `(N, M)`, where `N` is the length of the document.
The values will be 32-bit integers.

Example:
from spacy import attrs
doc = nlp(text)
# All strings mapped to integers, for easy export to numpy
np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA])

Arguments:
attr_ids (list[int]): A list of attribute ID ints.
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
per word, and one column per attribute indicated in the input
`attr_ids`.

Returns:
feat_array (numpy.ndarray[long, ndim=2]):
A feature matrix, with one row per word, and one column per attribute
indicated in the input attr_ids.
EXAMPLE:
>>> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
>>> doc = nlp(text)
>>> # All strings mapped to integers, for easy export to numpy
>>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
"""
cdef int i, j
cdef attr_id_t feature
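Aside (editor's sketch, not part of the commit): exporting attributes to numpy, following the docstring above.

    from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA

    doc = nlp(u'Give it back! He pleaded.')
    np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
    # one row per token, one column per requested attribute
    assert np_array.shape == (len(doc), 4)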
@@ -499,27 +490,20 @@ cdef class Doc:
return output

def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
"""
Produce a dict of {attribute (int): count (ints)} frequencies, keyed
by the values of the given attribute ID.
"""Count the frequencies of a given attribute. Produces a dict of
`{attribute (int): count (ints)}` frequencies, keyed by the values of
the given attribute ID.

Example:
from spacy.en import English
from spacy import attrs
nlp = English()
tokens = nlp(u'apple apple orange banana')
tokens.count_by(attrs.ORTH)
# {12800L: 1, 11880L: 2, 7561L: 1}
tokens.to_array([attrs.ORTH])
# array([[11880],
# [11880],
# [ 7561],
# [12800]])
attr_id (int): The attribute ID to key the counts.
RETURNS (dict): A dictionary mapping attributes to integer counts.

Arguments:
attr_id
int
The attribute ID to key the counts.
EXAMPLE:
>>> from spacy import attrs
>>> doc = nlp(u'apple apple orange banana')
>>> doc.count_by(attrs.ORTH)
{12800L: 1, 11880L: 2, 7561L: 1}
>>> doc.to_array([attrs.ORTH])
array([[11880], [11880], [7561], [12800]])
"""
cdef int i
cdef attr_t attr
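Aside (editor's sketch, not part of the commit): counting token frequencies by attribute ID; reading the count back via the vocabulary's string store is an assumption based on the ORTH example above.

    from spacy.attrs import ORTH

    doc = nlp(u'apple apple orange banana')
    counts = doc.count_by(ORTH)
    # look up the integer ID for 'apple' to read its count
    assert counts[nlp.vocab.strings[u'apple']] == 2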
@@ -567,8 +551,12 @@ cdef class Doc:
self.c[i] = parsed[i]

def from_array(self, attrs, int[:, :] array):
"""
Write to a `Doc` object, from an `(M, N)` array of attributes.
"""Load attributes from a numpy array. Write to a `Doc` object, from an
`(M, N)` array of attributes.

attrs (ints): A list of attribute ID ints.
array (numpy.ndarray[ndim=2, dtype='int32']): The attribute values to load.
RETURNS (Doc): Itself.
"""
cdef int i, col
cdef attr_id_t attr_id
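Aside (editor's sketch, not part of the commit): an array round trip, mirroring the `parse_tree` helper further down in this commit.

    from spacy.attrs import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE
    from spacy.tokens import Doc

    doc = nlp(u'Give it back! He pleaded.')
    attr_ids = [HEAD, TAG, DEP, ENT_IOB, ENT_TYPE]
    doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
    doc_clone.from_array(attr_ids, doc.to_array(attr_ids))
    assert doc_clone[0].tag_ == doc[0].tag_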
@@ -597,8 +585,10 @@ cdef class Doc:
return self

def to_bytes(self):
"""
Serialize, producing a byte string.
"""Serialize, i.e. export the document contents to a binary string.

RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
all annotations.
"""
return dill.dumps(
(self.text,
@@ -611,8 +601,10 @@ cdef class Doc:
protocol=-1)

def from_bytes(self, data):
"""
Deserialize, loading from bytes.
"""Deserialize, i.e. import the document contents from a binary string.

data (bytes): The string to load from.
RETURNS (Doc): Itself.
"""
if self.length != 0:
raise ValueError("Cannot load into non-empty Doc")
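Aside (editor's sketch, not part of the commit): a serialization round trip, mirroring the example added to the website docs below.

    from spacy.tokens import Doc

    doc = nlp(u'Give it back! He pleaded.')
    data = doc.to_bytes()
    doc2 = Doc(nlp.vocab).from_bytes(data)
    assert doc.text == doc2.text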
@@ -640,21 +632,16 @@ cdef class Doc:
return self

def merge(self, int start_idx, int end_idx, *args, **attributes):
"""
Retokenize the document, such that the span at doc.text[start_idx : end_idx]
is merged into a single token. If start_idx and end_idx do not mark start
and end token boundaries, the document remains unchanged.
"""Retokenize the document, such that the span at `doc.text[start_idx : end_idx]`
is merged into a single token. If `start_idx` and `end_idx` do not mark
start and end token boundaries, the document remains unchanged.

Arguments:
start_idx (int): The character index of the start of the slice to merge.
end_idx (int): The character index after the end of the slice to merge.
**attributes:
Attributes to assign to the merged token. By default, attributes
are inherited from the syntactic root token of the span.
Returns:
token (Token):
The newly merged token, or None if the start and end indices did
not fall at token boundaries.
**attributes: Attributes to assign to the merged token. By default,
attributes are inherited from the syntactic root token of the span.
RETURNS (Token): The newly merged token, or `None` if the start and end
indices did not fall at token boundaries.
"""
cdef unicode tag, lemma, ent_type
if len(args) == 3:
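Aside (editor's sketch, not part of the commit): merging a span into one token, mirroring the example added to the website docs below; the three positional arguments are tag, lemma and entity type.

    doc = nlp(u'Los Angeles start.')
    doc.merge(0, len('Los Angeles'), 'NNP', 'Los Angeles', 'GPE')
    assert [t.text for t in doc] == [u'Los Angeles', u'start', u'.']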
@@ -758,7 +745,29 @@ cdef class Doc:
return self[start]

def print_tree(self, light=False, flat=False):
"""Returns the parse trees in the JSON (Dict) format."""
"""Returns the parse trees in JSON (dict) format.

light (bool): Don't include lemmas or entities.
flat (bool): Don't include arcs or modifiers.
RETURNS (dict): Parse tree as dict.

EXAMPLE:
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
>>> trees = doc.print_tree()
>>> trees[1]
{'modifiers': [
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
{'modifiers': [
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
'POS_fine': 'NN', 'lemma': 'pizza'},
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
'POS_fine': 'VBD', 'lemma': 'eat'}
"""
return parse_tree(self, light=light, flat=flat)

@@ -6,18 +6,14 @@ from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE

def merge_ents(doc):
"""
Helper: merge adjacent entities into single tokens; modifies the doc.
"""
"""Helper: merge adjacent entities into single tokens; modifies the doc."""
for ent in doc.ents:
ent.merge(ent.root.tag_, ent.text, ent.label_)
return doc


def format_POS(token, light, flat):
"""
Helper: form the POS output for a token.
"""
"""Helper: form the POS output for a token."""
subtree = dict([
("word", token.text),
("lemma", token.lemma_), # trigger

@@ -37,9 +33,8 @@ def format_POS(token, light, flat):

def POS_tree(root, light=False, flat=False):
"""
Helper: generate a POS tree for a root token. The doc must have
merge_ents(doc) ran on it.
"""Helper: generate a POS tree for a root token. The doc must have
`merge_ents(doc)` run on it.
"""
subtree = format_POS(root, light=light, flat=flat)
for c in root.children:
@@ -48,21 +43,28 @@ def POS_tree(root, light=False, flat=False):

def parse_tree(doc, light=False, flat=False):
"""
Makes a copy of the doc, then construct a syntactic parse tree, similar to
"""Makes a copy of the doc, then constructs a syntactic parse tree, similar to
the one used in displaCy. Generates the POS tree for all sentences in a doc.

Args:
doc: The doc for parsing.
doc (Doc): The doc for parsing.
RETURNS (dict): The parse tree.

Returns:
[parse_trees (Dict)]:

>>> from spacy.en import English
>>> nlp = English()
EXAMPLE:
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
>>> trees = doc.print_tree()
|
||||
[{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}]
|
||||
>>> trees[1]
{'modifiers': [
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
{'modifiers': [
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
'POS_fine': 'NN', 'lemma': 'pizza'},
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
'POS_fine': 'VBD', 'lemma': 'eat'}
"""
doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE],
|
|
|
@ -4,6 +4,503 @@ include ../../_includes/_mixins
|
|||
|
||||
p A container for accessing linguistic annotations.
|
||||
|
||||
p
|
||||
| A #[code Doc] is a sequence of #[+api("token") #[code Token]] objects.
|
||||
| Access sentences and named entities, export annotations to numpy arrays,
|
||||
| losslessly serialize to compressed binary strings. The #[code Doc] object
|
||||
| holds an array of #[code TokenC] structs. The Python-level #[code Token]
|
||||
| and #[+api("span") #[code Span]] objects are views of this array, i.e.
|
||||
| they don't own the data themselves.
|
||||
|
||||
+aside-code("Example").
|
||||
# Construction 1
|
||||
doc = nlp(u'Some text')
|
||||
|
||||
# Construction 2
|
||||
from spacy.tokens import Doc
|
||||
doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
|
||||
spaces=[True, False, False])
|
||||
|
||||
+h(2, "init") Doc.__init__
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Construct a #[code Doc] object. The most common way to get a #[code Doc]
|
||||
| object is via the #[code nlp] object.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell A storage container for lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code words]
|
||||
+cell -
|
||||
+cell A list of strings to add to the container.
|
||||
|
||||
+row
|
||||
+cell #[code spaces]
|
||||
+cell -
|
||||
+cell
|
||||
| A list of boolean values indicating whether each word has a
|
||||
| subsequent space. Must have the same length as #[code words], if
|
||||
| specified. Defaults to a sequence of #[code True].
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Doc]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "getitem") Doc.__getitem__
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Get a #[+api("token") #[code Token]] object at position #[code i], where
|
||||
| #[code i] is an integer. Negative indexing is supported, and follows the
|
||||
| usual Python semantics, i.e. #[code doc[-2]] is #[code doc[len(doc) - 2]].
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
assert doc[0].text == 'Give'
|
||||
assert doc[-1].text == '.'
|
||||
span = doc[1:3]
|
||||
assert span.text == 'it back'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code i]
|
||||
+cell int
|
||||
+cell The index of the token.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Token]
|
||||
+cell The token at #[code doc[i]].
|
||||
|
||||
p
|
||||
| Get a #[+api("span") #[code Span]] object, starting at position
|
||||
| #[code start] (token index) and ending at position #[code end] (token
|
||||
| index).
|
||||
|
||||
p
|
||||
| For instance, #[code doc[2:5]] produces a span consisting of tokens 2, 3
|
||||
| and 4. Stepped slices (e.g. #[code doc[start : end : step]]) are not
|
||||
| supported, as #[code Span] objects must be contiguous (cannot have gaps).
|
||||
| You can use negative indices and open-ended ranges, which have their
|
||||
| normal Python semantics.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code start_end]
|
||||
+cell tuple
|
||||
+cell The slice of the document to get.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Span]
|
||||
+cell The span at #[code doc[start : end]].
|
||||
|
||||
+h(2, "iter") Doc.__iter__
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Iterate over #[code Token] objects, from which the annotations can be
|
||||
| easily accessed.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
for token in doc:
|
||||
print(token.text, token.tag_)
|
||||
|
||||
p
|
||||
| This is the main way of accessing #[+api("token") #[code Token]] objects,
|
||||
| which are the main way annotations are accessed from Python. If
|
||||
| faster-than-Python speeds are required, you can instead access the
|
||||
| annotations as a numpy array, or access the underlying C data directly
|
||||
| from Cython.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Token]
|
||||
+cell A #[code Token] object.
|
||||
|
||||
+h(2, "len") Doc.__len__
|
||||
+tag method
|
||||
|
||||
p Get the number of tokens in the document.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
assert len(doc) == 7
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell int
|
||||
+cell The number of tokens in the document.
|
||||
|
||||
+h(2, "similarity") Doc.similarity
|
||||
+tag method
|
||||
+tag requires model
|
||||
|
||||
p
|
||||
| Make a semantic similarity estimate. The default estimate is cosine
|
||||
| similarity using an average of word vectors.
|
||||
|
||||
+aside-code("Example").
|
||||
apples, _, oranges = nlp(u'apples and oranges')
|
||||
apples_oranges = apples.similarity(oranges)
|
||||
oranges_apples = oranges.similarity(apples)
|
||||
assert apples_oranges == oranges_apples
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code other]
|
||||
+cell -
|
||||
+cell
|
||||
| The object to compare with. By default, accepts #[code Doc],
|
||||
| #[code Span], #[code Token] and #[code Lexeme] objects.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell float
|
||||
+cell A scalar similarity score. Higher is more similar.
|
||||
|
||||
+h(2, "count_by") Doc.count_by
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Count the frequencies of a given attribute. Produces a dict of
|
||||
| #[code {attr (int): count (ints)}] frequencies, keyed by the values
|
||||
| of the given attribute ID.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy import attrs
|
||||
doc = nlp(u'apple apple orange banana')
doc.count_by(attrs.ORTH)
# {12800L: 1, 11880L: 2, 7561L: 1}
doc.to_array([attrs.ORTH])
# array([[11880], [11880], [7561], [12800]])
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code attr_id]
|
||||
+cell int
|
||||
+cell The attribute ID
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell dict
|
||||
+cell A dictionary mapping attributes to integer counts.
|
||||
|
||||
+h(2, "to_array") Doc.to_array
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Export the document annotations to a numpy array of shape #[code N*M]
|
||||
| where #[code N] is the length of the document and #[code M] is the number
|
||||
| of attribute IDs to export. The values will be 32-bit integers.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
|
||||
doc = nlp(text)
|
||||
# All strings mapped to integers, for easy export to numpy
|
||||
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code attr_ids]
|
||||
+cell ints
|
||||
+cell A list of attribute ID ints.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
|
||||
+cell
|
||||
| The exported attributes as a 2D numpy array, with one row per
|
||||
| token and one column per attribute.
|
||||
|
||||
+h(2, "from_array") Doc.from_array
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Load attributes from a numpy array. Write to a #[code Doc] object, from
|
||||
| an #[code (M, N)] array of attributes.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
|
||||
from spacy.tokens import Doc
|
||||
doc = nlp(text)
|
||||
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
|
||||
doc2 = Doc(doc.vocab)
|
||||
doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code attrs]
|
||||
+cell ints
|
||||
+cell A list of attribute ID ints.
|
||||
|
||||
+row
|
||||
+cell #[code array]
|
||||
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
|
||||
+cell The attribute values to load.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Doc]
|
||||
+cell Itself.
|
||||
|
||||
+h(2, "to_bytes") Doc.to_bytes
|
||||
+tag method
|
||||
|
||||
p Serialize, i.e. export the document contents to a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
doc_bytes = doc.to_bytes()
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell bytes
|
||||
+cell
|
||||
| A losslessly serialized copy of the #[code Doc], including all
|
||||
| annotations.
|
||||
|
||||
+h(2, "from_bytes") Doc.from_bytes
|
||||
+tag method
|
||||
|
||||
p Deserialize, i.e. import the document contents from a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens import Doc
|
||||
text = u'Give it back! He pleaded.'
|
||||
doc = nlp(text)
|
||||
bytes = doc.to_bytes()
|
||||
doc2 = Doc(doc.vocab).from_bytes(bytes)
|
||||
assert doc.text == doc2.text
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code data]
|
||||
+cell bytes
|
||||
+cell The string to load from.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Doc]
|
||||
+cell Itself.
|
||||
|
||||
+h(2, "merge") Doc.merge
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Retokenize the document, such that the span at
|
||||
| #[code doc.text[start_idx : end_idx]] is merged into a single token. If
|
||||
| #[code start_idx] and #[code end_idx] do not mark start and end token
|
||||
| boundaries, the document remains unchanged.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Los Angeles start.')
|
||||
doc.merge(0, len('Los Angeles'), 'NNP', 'Los Angeles', 'GPE')
|
||||
print([token.text for token in doc])
|
||||
# ['Los Angeles', 'start', '.']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code start_idx]
|
||||
+cell int
|
||||
+cell The character index of the start of the slice to merge.
|
||||
|
||||
+row
|
||||
+cell #[code end_idx]
|
||||
+cell int
|
||||
+cell The character index after the end of the slice to merge.
|
||||
|
||||
+row
|
||||
+cell #[code **attributes]
|
||||
+cell -
|
||||
+cell
|
||||
| Attributes to assign to the merged token. By default,
|
||||
| attributes are inherited from the syntactic root token of
|
||||
| the span.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Token]
|
||||
+cell
|
||||
| The newly merged token, or #[code None] if the start and end
|
||||
| indices did not fall at token boundaries
|
||||
|
||||
+h(2, "print_tree") Doc.print_tree
|
||||
+tag method
|
||||
+tag requires model
|
||||
|
||||
p
|
||||
| Returns the parse trees in JSON (dict) format. Especially useful for
|
||||
| web applications.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp('Alice ate the pizza.')
|
||||
trees = doc.print_tree()
|
||||
# {'modifiers': [
|
||||
# {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
|
||||
# {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'},
|
||||
# {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}
|
||||
# ], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code light]
|
||||
+cell bool
|
||||
+cell Don't include lemmas or entities.
|
||||
|
||||
+row
|
||||
+cell #[code flat]
|
||||
+cell bool
|
||||
+cell Don't include arcs or modifiers.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell dict
|
||||
+cell Parse tree as dict.
|
||||
|
||||
+h(2, "text") Doc.text
|
||||
+tag property
|
||||
|
||||
p A unicode representation of the document text.
|
||||
|
||||
+aside-code("Example").
|
||||
text = u'Give it back! He pleaded.'
|
||||
doc = nlp(text)
|
||||
assert doc.text == text
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell unicode
|
||||
+cell The original verbatim text of the document.
|
||||
|
||||
+h(2, "text_with_ws") Doc.text_with_ws
|
||||
+tag property
|
||||
|
||||
p
|
||||
| An alias of #[code Doc.text], provided for duck-type compatibility with
|
||||
| #[code Span] and #[code Token].
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell unicode
|
||||
+cell The original verbatim text of the document.
|
||||
|
||||
+h(2, "ents") Doc.ents
|
||||
+tag property
|
||||
+tag requires model
|
||||
|
||||
p
|
||||
| Iterate over the entities in the document. Yields named-entity
|
||||
| #[code Span] objects, if the entity recognizer has been applied to the
|
||||
| document.
|
||||
|
||||
+aside-code("Example").
|
||||
tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
|
||||
ents = list(tokens.ents)
|
||||
assert ents[0].label == 346
|
||||
assert ents[0].label_ == 'PERSON'
|
||||
assert ents[0].text == 'Mr. Best'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Span]
|
||||
+cell Entities in the document.
|
||||
|
||||
+h(2, "noun_chunks") Doc.noun_chunks
|
||||
+tag property
|
||||
+tag requires model
|
||||
|
||||
p
|
||||
| Iterate over the base noun phrases in the document. Yields base
|
||||
| noun-phrase #[code Span] objects, if the document has been syntactically
|
||||
| parsed. A base noun phrase, or "NP chunk", is a noun phrase that does not
|
||||
| permit other NPs to be nested within it – so no NP-level coordination, no
|
||||
| prepositional phrases, and no relative clauses.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'A phrase with another phrase occurs.')
|
||||
chunks = list(doc.noun_chunks)
|
||||
assert chunks[0].text == "A phrase"
|
||||
assert chunks[1].text == "another phrase"
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Span]
|
||||
+cell Noun chunks in the document.
|
||||
|
||||
+h(2, "sents") Doc.sents
|
||||
+tag property
|
||||
+tag requires model
|
||||
|
||||
p
|
||||
| Iterate over the sentences in the document. Sentence spans have no label.
|
||||
| To improve accuracy on informal texts, spaCy calculates sentence boundaries
|
||||
| from the syntactic dependency parse. If the parser is disabled,
|
||||
| the #[code sents] iterator will be unavailable.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u"This is a sentence. Here's another...")
|
||||
sents = list(doc.sents)
|
||||
assert len(sents) == 2
|
||||
assert [s.root.text for s in sents] == ["is", "'s"]
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Span]
|
||||
+cell Sentences in the document.
|
||||
|
||||
+h(2, "has_vector") Doc.has_vector
|
||||
+tag property
|
||||
+tag requires model
|
||||
|
||||
p
|
||||
| A boolean value indicating whether a word vector is associated with the
|
||||
| object.
|
||||
|
||||
+aside-code("Example").
|
||||
apple = nlp(u'apple')
|
||||
assert apple.has_vector
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell bool
|
||||
+cell Whether the document has vector data attached.
|
||||
|
||||
+h(2, "vector") Doc.vector
|
||||
+tag property
|
||||
+tag requires model
|
||||
|
||||
p
|
||||
| A real-valued meaning representation. Defaults to an average of the
|
||||
| token vectors.
|
||||
|
||||
+aside-code("Example").
|
||||
apple = nlp(u'apple')
|
||||
(apple.vector.dtype, apple.vector.shape)
|
||||
# (dtype('float32'), (300,))
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A 1D numpy array representing the document's semantics.
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
|
@ -59,358 +556,3 @@ p A container for accessing linguistic annotations.
|
|||
+cell
|
||||
| A dictionary that allows customisation of properties of
|
||||
| #[code Span] children.
|
||||
|
||||
+h(2, "init") Doc.__init__
|
||||
+tag method
|
||||
|
||||
p Construct a #[code Doc] object.
|
||||
|
||||
+aside("Note")
|
||||
| The most common way to get a #[code Doc] object is via the #[code nlp]
|
||||
| object. This method is usually only used for deserialization or preset
|
||||
| tokenization.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell A storage container for lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code words]
|
||||
+cell -
|
||||
+cell A list of strings to add to the container.
|
||||
|
||||
+row
|
||||
+cell #[code spaces]
|
||||
+cell -
|
||||
+cell
|
||||
| A list of boolean values indicating whether each word has a
|
||||
| subsequent space. Must have the same length as #[code words], if
|
||||
| specified. Defaults to a sequence of #[code True].
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Doc]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "getitem") Doc.__getitem__
|
||||
+tag method
|
||||
|
||||
p Get a #[code Token] object.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
assert doc[0].text == 'Give'
|
||||
assert doc[-1].text == '.'
|
||||
span = doc[1:1]
|
||||
assert span.text == 'it back'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code i]
|
||||
+cell int
|
||||
+cell The index of the token.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Token]
|
||||
+cell The token at #[code doc[i]].
|
||||
|
||||
p Get a #[code Span] object.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code start_end]
|
||||
+cell tuple
|
||||
+cell The slice of the document to get.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Span]
|
||||
+cell The span at #[code doc[start : end]].
|
||||
|
||||
+h(2, "iter") Doc.__iter__
|
||||
+tag method
|
||||
|
||||
p Iterate over #[code Token] objects.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Token]
|
||||
+cell A #[code Token] object.
|
||||
|
||||
+h(2, "len") Doc.__len__
|
||||
+tag method
|
||||
|
||||
p Get the number of tokens in the document.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell int
|
||||
+cell The number of tokens in the document.
|
||||
|
||||
+h(2, "similarity") Doc.similarity
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Make a semantic similarity estimate. The default estimate is cosine
|
||||
| similarity using an average of word vectors.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code other]
|
||||
+cell -
|
||||
+cell
|
||||
| The object to compare with. By default, accepts #[code Doc],
|
||||
| #[code Span], #[code Token] and #[code Lexeme] objects.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell float
|
||||
+cell A scalar similarity score. Higher is more similar.
|
||||
|
||||
+h(2, "to_array") Doc.to_array
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Export the document annotations to a numpy array of shape #[code N*M]
|
||||
| where #[code N] is the length of the document and #[code M] is the number
|
||||
| of attribute IDs to export. The values will be 32-bit integers.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy import attrs
|
||||
doc = nlp(text)
|
||||
# All strings mapped to integers, for easy export to numpy
|
||||
np_array = doc.to_array([attrs.LOWER, attrs.POS,
|
||||
attrs.ENT_TYPE, attrs.IS_ALPHA])
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code attr_ids]
|
||||
+cell ints
|
||||
+cell A list of attribute ID ints.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
|
||||
+cell
|
||||
| The exported attributes as a 2D numpy array, with one row per
|
||||
| token and one column per attribute.
|
||||
|
||||
+h(2, "count_by") Doc.count_by
|
||||
+tag method
|
||||
|
||||
p Count the frequencies of a given attribute.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code attr_id]
|
||||
+cell int
|
||||
+cell The attribute ID
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell dict
|
||||
+cell A dictionary mapping attributes to integer counts.
|
||||
|
||||
+h(2, "from_array") Doc.from_array
|
||||
+tag method
|
||||
|
||||
p Load attributes from a numpy array.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code attr_ids]
|
||||
+cell ints
|
||||
+cell A list of attribute ID ints.
|
||||
|
||||
+row
|
||||
+cell #[code values]
|
||||
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
|
||||
+cell The attribute values to load.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
|
||||
+h(2, "to_bytes") Doc.to_bytes
|
||||
+tag method
|
||||
|
||||
p Export the document contents to a binary string.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell bytes
|
||||
+cell
|
||||
| A losslessly serialized copy of the #[code Doc] including all
|
||||
| annotations.
|
||||
|
||||
+h(2, "from_bytes") Doc.from_bytes
|
||||
+tag method
|
||||
|
||||
p Import the document contents from a binary string.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code byte_string]
|
||||
+cell bytes
|
||||
+cell The string to load from.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Doc]
|
||||
+cell The #[code self] variable.
|
||||
|
||||
+h(2, "merge") Doc.merge
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Retokenize the document, such that the span at
|
||||
| #[code doc.text[start_idx : end_idx]] is merged into a single token. If
|
||||
| #[code start_idx] and #[end_idx] do not mark start and end token
|
||||
| boundaries, the document remains unchanged.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code start_idx]
|
||||
+cell int
|
||||
+cell The character index of the start of the slice to merge.
|
||||
|
||||
+row
|
||||
+cell #[code end_idx]
|
||||
+cell int
|
||||
+cell The character index after the end of the slice to merge.
|
||||
|
||||
+row
|
||||
+cell #[code **attributes]
|
||||
+cell -
|
||||
+cell
|
||||
| Attributes to assign to the merged token. By default,
|
||||
| attributes are inherited from the syntactic root token of
|
||||
| the span.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Token]
|
||||
+cell
|
||||
| The newly merged token, or None if the start and end
|
||||
| indices did not fall at token boundaries
|
||||
|
||||
+h(2, "read_bytes") Doc.read_bytes
|
||||
+tag staticmethod
|
||||
|
||||
p A static method, used to read serialized #[code Doc] objects from a file.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens.doc import Doc
|
||||
loc = 'test_serialize.bin'
|
||||
with open(loc, 'wb') as file_:
|
||||
file_.write(nlp(u'This is a document.').to_bytes())
|
||||
file_.write(nlp(u'This is another.').to_bytes())
|
||||
docs = []
|
||||
with open(loc, 'rb') as file_:
|
||||
for byte_string in Doc.read_bytes(file_):
|
||||
docs.append(Doc(nlp.vocab).from_bytes(byte_string))
|
||||
assert len(docs) == 2
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell file
|
||||
+cell buffer
|
||||
+cell A binary buffer to read the serialized annotations from.
|
||||
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell bytes
|
||||
+cell Binary strings from which documents can be loaded.
|
||||
|
||||
+h(2, "text") Doc.text
|
||||
+tag property
|
||||
|
||||
p A unicode representation of the document text.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell unicode
|
||||
+cell The original verbatim text of the document.
|
||||
|
||||
+h(2, "text_with_ws") Doc.text_with_ws
|
||||
+tag property
|
||||
|
||||
p
|
||||
| An alias of #[code Doc.text], provided for duck-type compatibility with
|
||||
| #[code Span] and #[code Token].
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell unicode
|
||||
+cell The original verbatim text of the document.
|
||||
|
||||
+h(2, "sents") Doc.sents
|
||||
+tag property
|
||||
|
||||
p Iterate over the sentences in the document.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Span]
|
||||
+cell Sentences in the document.
|
||||
|
||||
+h(2, "ents") Doc.ents
|
||||
+tag property
|
||||
|
||||
p Iterate over the entities in the document.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Span]
|
||||
+cell Entities in the document.
|
||||
|
||||
+h(2, "noun_chunks") Doc.noun_chunks
|
||||
+tag property
|
||||
|
||||
p
|
||||
| Iterate over the base noun phrases in the document. A base noun phrase,
|
||||
| or "NP chunk", is a noun phrase that does not permit other NPs to be
|
||||
| nested within it.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Span]
|
||||
+cell Noun chunks in the document
|
||||
|
||||
+h(2, "vector") Doc.vector
|
||||
+tag property
|
||||
|
||||
p
|
||||
| A real-valued meaning representation. Defaults to an average of the
|
||||
| token vectors.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A 1D numpy array representing the document's semantics.
|
||||
|
||||
+h(2, "has_vector") Doc.has_vector
|
||||
+tag property
|
||||
|
||||
p
|
||||
| A boolean value indicating whether a word vector is associated with the
|
||||
| object.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell bool
|
||||
+cell Whether the document has a vector data attached.
|
||||