Tidy up and document Doc, Token and Span
This commit is contained in:
parent 1a559d4c95 · commit 6a0483b7aa
diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py
@@ -2,4 +2,4 @@ from .doc import Doc
 from .token import Token
 from .span import Span
 
-__all__ = [Doc, Token, Span]
+__all__ = ['Doc', 'Token', 'Span']
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
@@ -23,9 +23,9 @@ from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
 from ..attrs import intify_attrs, IDS
 from ..attrs cimport attr_id_t
-from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
-from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
-from ..attrs cimport SENT_START
+from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
+from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
+from ..attrs cimport ENT_TYPE, SENT_START
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
 from ..util import normalize_slice
 from ..compat import is_config, copy_reg, pickle
@@ -78,17 +78,18 @@ def _get_chunker(lang):
 
 cdef class Doc:
     """A sequence of Token objects. Access sentences and named entities, export
-    annotations to numpy arrays, losslessly serialize to compressed binary strings.
-    The `Doc` object holds an array of `TokenC` structs. The Python-level
-    `Token` and `Span` objects are views of this array, i.e. they don't own
-    the data themselves.
+    annotations to numpy arrays, losslessly serialize to compressed binary
+    strings. The `Doc` object holds an array of `TokenC` structs. The
+    Python-level `Token` and `Span` objects are views of this array, i.e.
+    they don't own the data themselves.
 
     EXAMPLE: Construction 1
         >>> doc = nlp(u'Some text')
 
         Construction 2
         >>> from spacy.tokens import Doc
-        >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
+        >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
+                      spaces=[True, False, False])
     """
     @classmethod
     def set_extension(cls, name, default=None, method=None,
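The constructor documented above supports two paths. A minimal runnable sketch (the model name `en_core_web_sm` is an assumption, any installed pipeline package works; later sketches reuse this `nlp` object):

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.load('en_core_web_sm')    # Construction 1: run a pipeline
    doc1 = nlp(u'Some text')

    # Construction 2: build a Doc directly from words and trailing-space flags
    doc2 = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
               spaces=[True, False, False])
    assert doc2.text == u'hello world!'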
@@ -109,15 +110,14 @@ cdef class Doc:
                  orths_and_spaces=None):
         """Create a Doc object.
 
-        vocab (Vocab): A vocabulary object, which must match any models you want
-            to use (e.g. tokenizer, parser, entity recognizer).
+        vocab (Vocab): A vocabulary object, which must match any models you
+            want to use (e.g. tokenizer, parser, entity recognizer).
         words (list or None): A list of unicode strings to add to the document
             as words. If `None`, defaults to empty list.
         spaces (list or None): A list of boolean values, of the same length as
             words. True means that the word is followed by a space, False means
             it is not. If `None`, defaults to `[True]*len(words)`
         user_data (dict or None): Optional extra data to attach to the Doc.
 
         RETURNS (Doc): The newly constructed object.
         """
         self.vocab = vocab
@@ -153,10 +153,10 @@ cdef class Doc:
                 spaces = [True] * len(words)
             elif len(spaces) != len(words):
                 raise ValueError(
-                    "Arguments 'words' and 'spaces' should be sequences of the "
-                    "same length, or 'spaces' should be left default at None. "
-                    "spaces should be a sequence of booleans, with True meaning "
-                    "that the word owns a ' ' character following it.")
+                    "Arguments 'words' and 'spaces' should be sequences of "
+                    "the same length, or 'spaces' should be left default at "
+                    "None. spaces should be a sequence of booleans, with True "
+                    "meaning that the word owns a ' ' character following it.")
             orths_and_spaces = zip(words, spaces)
         if orths_and_spaces is not None:
             for orth_space in orths_and_spaces:
@@ -166,7 +166,8 @@ cdef class Doc:
                 elif isinstance(orth_space, bytes):
                     raise ValueError(
                         "orths_and_spaces expects either List(unicode) or "
-                        "List((unicode, bool)). Got bytes instance: %s" % (str(orth_space)))
+                        "List((unicode, bool)). "
+                        "Got bytes instance: %s" % (str(orth_space)))
                 else:
                     orth, has_space = orth_space
                 # Note that we pass self.mem here --- we have ownership, if LexemeC
@@ -186,7 +187,8 @@ cdef class Doc:
     def __getitem__(self, object i):
         """Get a `Token` or `Span` object.
 
-        i (int or tuple) The index of the token, or the slice of the document to get.
+        i (int or tuple) The index of the token, or the slice of the document
+            to get.
         RETURNS (Token or Span): The token at `doc[i]]`, or the span at
             `doc[start : end]`.
 
@@ -199,11 +201,11 @@ cdef class Doc:
             >>> doc[start : end]]
             Get a `Span` object, starting at position `start` and ending at
             position `end`, where `start` and `end` are token indices. For
-            instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4.
-            Stepped slices (e.g. `doc[start : end : step]`) are not supported,
-            as `Span` objects must be contiguous (cannot have gaps). You can use
-            negative indices and open-ended ranges, which have their normal
-            Python semantics.
+            instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and
+            4. Stepped slices (e.g. `doc[start : end : step]`) are not
+            supported, as `Span` objects must be contiguous (cannot have gaps).
+            You can use negative indices and open-ended ranges, which have
+            their normal Python semantics.
         """
         if isinstance(i, slice):
             start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
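A short sketch of the indexing semantics documented above: integer indices yield `Token` objects, contiguous slices yield `Span` objects (token positions here assume the default English tokenizer):

    doc = nlp(u'Give it back! He pleaded.')
    token = doc[0]                   # Token: u'Give'
    span = doc[2:5]                  # Span over tokens 2, 3 and 4
    assert span.text == u'back! He'
    last = doc[-2]                   # negative indices behave as in Python lists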
@@ -262,8 +264,10 @@ cdef class Doc:
         doc (Doc): The parent document.
         start (int): The index of the first character of the span.
         end (int): The index of the first character after the span.
-        label (uint64 or string): A label to attach to the Span, e.g. for named entities.
-        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
+        label (uint64 or string): A label to attach to the Span, e.g. for
+            named entities.
+        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
+            the span.
         RETURNS (Span): The newly constructed object.
         """
         if not isinstance(label, int):
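For contrast with `doc[start : end]`, `char_span` takes character offsets into `doc.text`; it returns `None` when the offsets don't line up with token boundaries. A sketch:

    doc = nlp(u'I like New York')
    span = doc.char_span(7, 15, label=u'GPE')   # doc.text[7:15] == u'New York'
    assert span.text == u'New York'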
@@ -377,13 +381,14 @@ cdef class Doc:
         return self.text
 
     property ents:
-        """Iterate over the entities in the document. Yields named-entity `Span`
-        objects, if the entity recognizer has been applied to the document.
+        """Iterate over the entities in the document. Yields named-entity
+        `Span` objects, if the entity recognizer has been applied to the
+        document.
 
         YIELDS (Span): Entities in the document.
 
-        EXAMPLE: Iterate over the span to get individual Token objects, or access
-            the label:
+        EXAMPLE: Iterate over the span to get individual Token objects,
+            or access the label:
 
             >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
             >>> ents = list(tokens.ents)
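Continuing the docstring's example, hedged on whatever statistical model is loaded (entity predictions vary by model):

    tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
    for ent in tokens.ents:
        print(ent.text, ent.label_)   # e.g. u'Best' PERSON, u'New York' GPE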
@@ -456,10 +461,11 @@ cdef class Doc:
 
     property noun_chunks:
         """Iterate over the base noun phrases in the document. Yields base
-        noun-phrase #[code Span] objects, if the document has been syntactically
-        parsed. A base noun phrase, or "NP chunk", is a noun phrase that does
-        not permit other NPs to be nested within it – so no NP-level
-        coordination, no prepositional phrases, and no relative clauses.
+        noun-phrase #[code Span] objects, if the document has been
+        syntactically parsed. A base noun phrase, or "NP chunk", is a noun
+        phrase that does not permit other NPs to be nested within it – so no
+        NP-level coordination, no prepositional phrases, and no relative
+        clauses.
 
         YIELDS (Span): Noun chunks in the document.
         """
@@ -467,12 +473,14 @@ cdef class Doc:
         if not self.is_parsed:
             raise ValueError(
                 "noun_chunks requires the dependency parse, which "
-                "requires data to be installed. For more info, see the "
+                "requires a statistical model to be installed and loaded. "
+                "For more info, see the "
                 "documentation: \n%s\n" % about.__docs_models__)
-        # Accumulate the result before beginning to iterate over it. This prevents
-        # the tokenisation from being changed out from under us during the iteration.
-        # The tricky thing here is that Span accepts its tokenisation changing,
-        # so it's okay once we have the Span objects. See Issue #375
+        # Accumulate the result before beginning to iterate over it. This
+        # prevents the tokenisation from being changed out from under us
+        # during the iteration. The tricky thing here is that Span accepts
+        # its tokenisation changing, so it's okay once we have the Span
+        # objects. See Issue #375.
         spans = []
         for start, end, label in self.noun_chunks_iterator(self):
             spans.append(Span(self, start, end, label=label))
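A sketch of the property in use, assuming a pipeline whose parser has run (chunk boundaries depend on the model):

    doc = nlp(u'The quick brown fox jumps over the lazy dog.')
    print([chunk.text for chunk in doc.noun_chunks])
    # with an English parser, roughly: [u'The quick brown fox', u'the lazy dog']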
@@ -497,8 +505,9 @@ cdef class Doc:
 
         if not self.is_parsed:
             raise ValueError(
-                "sentence boundary detection requires the dependency parse, which "
-                "requires data to be installed. For more info, see the "
+                "Sentence boundary detection requires the dependency "
+                "parse, which requires a statistical model to be "
+                "installed and loaded. For more info, see the "
                 "documentation: \n%s\n" % about.__docs_models__)
         cdef int i
         start = 0
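Sentence iteration, which the error message above guards, sketched under the same assumption of a parsed document:

    doc = nlp(u'This is a sentence. Here is another one.')
    for sent in doc.sents:
        print(sent.text)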
@@ -537,12 +546,11 @@ cdef class Doc:
     @cython.boundscheck(False)
     cpdef np.ndarray to_array(self, object py_attr_ids):
         """Export given token attributes to a numpy `ndarray`.
-
-        If `attr_ids` is a sequence of M attributes, the output array will
-        be of shape `(N, M)`, where N is the length of the `Doc`
-        (in tokens). If `attr_ids` is a single attribute, the output shape will
-        be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA)
-        or string name (e.g. 'LEMMA' or 'lemma').
+        If `attr_ids` is a sequence of M attributes, the output array will be
+        of shape `(N, M)`, where N is the length of the `Doc` (in tokens). If
+        `attr_ids` is a single attribute, the output shape will be (N,). You
+        can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) or
+        string name (e.g. 'LEMMA' or 'lemma').
 
         attr_ids (list[]): A list of attributes (int IDs or string names).
         RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
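A sketch of the export semantics the docstring describes; per the docstring, integer IDs and string names are interchangeable:

    from spacy.attrs import LOWER, POS

    doc = nlp(u'Give it back!')
    arr = doc.to_array([LOWER, POS])   # shape (len(doc), 2)
    assert arr.shape == (len(doc), 2)
    lowers = doc.to_array('LOWER')     # single attribute, shape (len(doc),)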
@@ -641,13 +649,12 @@ cdef class Doc:
     def from_array(self, attrs, array):
         if SENT_START in attrs and HEAD in attrs:
             raise ValueError(
-                "Conflicting attributes specified in doc.from_array():\n"
+                "Conflicting attributes specified in doc.from_array(): "
                 "(HEAD, SENT_START)\n"
-                "The HEAD attribute currently sets sentence boundaries implicitly,\n"
-                "based on the tree structure. This means the HEAD attribute would "
-                "potentially override the sentence boundaries set by SENT_START.\n"
-                "See https://github.com/spacy-io/spaCy/issues/235 for details and "
-                "workarounds, and to propose solutions.")
+                "The HEAD attribute currently sets sentence boundaries "
+                "implicitly, based on the tree structure. This means the HEAD "
+                "attribute would potentially override the sentence boundaries "
+                "set by SENT_START.")
         cdef int i, col
         cdef attr_id_t attr_id
         cdef TokenC* tokens = self.c
@@ -675,18 +682,14 @@ cdef class Doc:
         return self
 
     def get_lca_matrix(self):
-        '''
-        Calculates the lowest common ancestor matrix
-        for a given Spacy doc.
-        Returns LCA matrix containing the integer index
-        of the ancestor, or -1 if no common ancestor is
-        found (ex if span excludes a necessary ancestor).
-        Apologies about the recursion, but the
-        impact on performance is negligible given
-        the natural limitations on the depth of a typical human sentence.
-        '''
+        """Calculates the lowest common ancestor matrix for a given `Doc`.
+        Returns LCA matrix containing the integer index of the ancestor, or -1
+        if no common ancestor is found (ex if span excludes a necessary
+        ancestor). Apologies about the recursion, but the impact on
+        performance is negligible given the natural limitations on the depth
+        of a typical human sentence.
+        """
         # Efficiency notes:
-        #
         # We can easily improve the performance here by iterating in Cython.
         # To loop over the tokens in Cython, the easiest way is:
         # for token in doc.c[:doc.c.length]:
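The website docs added later in this commit include an example for the `Span` variant; the `Doc` method behaves analogously. A sketch:

    doc = nlp(u'I like New York in Autumn')
    matrix = doc.get_lca_matrix()
    # matrix[i][j] holds the token index of the lowest common ancestor of
    # tokens i and j; matrix[i][i] == i, and -1 means no common ancestor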
@@ -719,7 +722,6 @@ cdef class Doc:
                 token_k = self[k]
                 lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix)
                 lca_matrix[k][j] = lca_matrix[j][k]
-
         return lca_matrix
 
     def to_disk(self, path, **exclude):
@@ -819,14 +821,15 @@ cdef class Doc:
         return self
 
     def merge(self, int start_idx, int end_idx, *args, **attributes):
-        """Retokenize the document, such that the span at `doc.text[start_idx : end_idx]`
-        is merged into a single token. If `start_idx` and `end_idx `do not mark
-        start and end token boundaries, the document remains unchanged.
+        """Retokenize the document, such that the span at
+        `doc.text[start_idx : end_idx]` is merged into a single token. If
+        `start_idx` and `end_idx `do not mark start and end token boundaries,
+        the document remains unchanged.
 
-        start_idx (int): The character index of the start of the slice to merge.
-        end_idx (int): The character index after the end of the slice to merge.
+        start_idx (int): Character index of the start of the slice to merge.
+        end_idx (int): Character index after the end of the slice to merge.
         **attributes: Attributes to assign to the merged token. By default,
-            attributes are inherited from the syntactic root token of the span.
+            attributes are inherited from the syntactic root of the span.
         RETURNS (Token): The newly merged token, or `None` if the start and end
             indices did not fall at token boundaries.
         """
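A hedged sketch of the keyword form of `merge` (the arguments are character offsets into `doc.text`, not token indices):

    doc = nlp(u'I like New York in Autumn')
    start = doc.text.index(u'New')
    end = start + len(u'New York')
    doc.merge(start, end, lemma=u'New York', ent_type=u'GPE')
    assert doc[2].text == u'New York'   # one token now covers the span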
@@ -847,10 +850,10 @@ cdef class Doc:
             attributes[ENT_TYPE] = attributes['ent_type']
         elif args:
             raise ValueError(
-                "Doc.merge received %d non-keyword arguments. "
-                "Expected either 3 arguments (deprecated), or 0 (use keyword arguments). "
+                "Doc.merge received %d non-keyword arguments. Expected either "
+                "3 arguments (deprecated), or 0 (use keyword arguments). "
                 "Arguments supplied:\n%s\n"
-                "Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
+                "Keyword arguments: %s\n" % (len(args), repr(args), repr(attributes)))
 
         # More deprecated attribute handling =/
         if 'label' in attributes:
@@ -882,8 +885,9 @@ cdef class Doc:
                 Token.set_struct_attr(token, attr_name, attr_value)
         # Begin by setting all the head indices to absolute token positions
         # This is easier to work with for now than the offsets
-        # Before thinking of something simpler, beware the case where a dependency
-        # bridges over the entity. Here the alignment of the tokens changes.
+        # Before thinking of something simpler, beware the case where a
+        # dependency bridges over the entity. Here the alignment of the
+        # tokens changes.
         span_root = span.root.i
         token.dep = span.root.dep
         # We update token.lex after keeping span root and dep, since
@@ -932,8 +936,9 @@ cdef class Doc:
             >>> trees = doc.print_tree()
             >>> trees[1]
             {'modifiers': [
-                {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
-                'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
+                {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice',
+                'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP',
+                'lemma': 'Alice'},
                 {'modifiers': [
                     {'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
                     'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
@@ -1018,4 +1023,3 @@ def unpickle_doc(vocab, hooks_and_data, bytes_data):
 
-
 copy_reg.pickle(Doc, pickle_doc, unpickle_doc)
 
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
@@ -35,15 +35,16 @@ cdef class Span:
     def has_extension(cls, name):
         return name in Underscore.span_extensions
 
-    def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None,
-                  vector_norm=None):
+    def __cinit__(self, Doc doc, int start, int end, attr_t label=0,
+                  vector=None, vector_norm=None):
         """Create a `Span` object from the slice `doc[start : end]`.
 
         doc (Doc): The parent document.
         start (int): The index of the first token of the span.
         end (int): The index of the first token after the span.
         label (uint64): A label to attach to the Span, e.g. for named entities.
-        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
+        vector (ndarray[ndim=1, dtype='float32']): A meaning representation
+            of the span.
         RETURNS (Span): The newly constructed object.
         """
         if not (0 <= start <= end <= len(doc)):
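Direct `Span` construction, equivalent to slicing the parent `Doc`. A sketch:

    from spacy.tokens import Span

    doc = nlp(u'Give it back! He pleaded.')
    span = Span(doc, 1, 4)               # the same tokens as doc[1:4]
    assert span.text == doc[1:4].text == u'it back!'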
@@ -162,7 +163,8 @@ cdef class Span:
             attributes are inherited from the syntactic root token of the span.
         RETURNS (Token): The newly merged token.
         """
-        return self.doc.merge(self.start_char, self.end_char, *args, **attributes)
+        return self.doc.merge(self.start_char, self.end_char, *args,
+                              **attributes)
 
     def similarity(self, other):
         """Make a semantic similarity estimate. The default estimate is cosine
@@ -179,24 +181,19 @@ cdef class Span:
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
 
     def get_lca_matrix(self):
-        '''
-        Calculates the lowest common ancestor matrix
-        for a given Spacy span.
-        Returns LCA matrix containing the integer index
-        of the ancestor, or -1 if no common ancestor is
-        found (ex if span excludes a necessary ancestor).
-        Apologies about the recursion, but the
-        impact on performance is negligible given
-        the natural limitations on the depth of a typical human sentence.
-        '''
-
+        """Calculates the lowest common ancestor matrix for a given `Span`.
+        Returns LCA matrix containing the integer index of the ancestor, or -1
+        if no common ancestor is found (ex if span excludes a necessary
+        ancestor). Apologies about the recursion, but the impact on
+        performance is negligible given the natural limitations on the depth
+        of a typical human sentence.
+        """
         def __pairwise_lca(token_j, token_k, lca_matrix, margins):
             offset = margins[0]
             token_k_head = token_k.head if token_k.head.i in range(*margins) else token_k
             token_j_head = token_j.head if token_j.head.i in range(*margins) else token_j
             token_j_i = token_j.i - offset
             token_k_i = token_k.i - offset
 
             if lca_matrix[token_j_i][token_k_i] != -2:
                 return lca_matrix[token_j_i][token_k_i]
             elif token_j == token_k:
@@ -209,23 +206,19 @@ cdef class Span:
                 lca_index = -1
             else:
                 lca_index = __pairwise_lca(token_j_head, token_k_head, lca_matrix, margins)
 
             lca_matrix[token_j_i][token_k_i] = lca_index
             lca_matrix[token_k_i][token_j_i] = lca_index
 
             return lca_index
 
         lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32)
         lca_matrix.fill(-2)
         margins = [self.start, self.end]
 
         for j in range(len(self)):
             token_j = self[j]
             for k in range(len(self)):
                 token_k = self[k]
                 lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix, margins)
                 lca_matrix[k][j] = lca_matrix[j][k]
 
         return lca_matrix
 
     cpdef np.ndarray to_array(self, object py_attr_ids):
@@ -349,7 +342,8 @@ cdef class Span:
         """The text content of the span with a trailing whitespace character if
         the last token has one.
 
-        RETURNS (unicode): The text content of the span (with trailing whitespace).
+        RETURNS (unicode): The text content of the span (with trailing
+            whitespace).
         """
         def __get__(self):
             return u''.join([t.text_with_ws for t in self])
@@ -358,7 +352,8 @@ cdef class Span:
         """Yields base noun-phrase `Span` objects, if the document has been
         syntactically parsed. A base noun phrase, or "NP chunk", is a noun
         phrase that does not permit other NPs to be nested within it – so no
-        NP-level coordination, no prepositional phrases, and no relative clauses.
+        NP-level coordination, no prepositional phrases, and no relative
+        clauses.
 
         YIELDS (Span): Base noun-phrase `Span` objects
         """
@@ -366,7 +361,8 @@ cdef class Span:
         if not self.doc.is_parsed:
             raise ValueError(
                 "noun_chunks requires the dependency parse, which "
-                "requires data to be installed. For more info, see the "
+                "requires a statistical model to be installed and loaded. "
+                "For more info, see the "
                 "documentation: \n%s\n" % about.__docs_models__)
         # Accumulate the result before beginning to iterate over it. This prevents
         # the tokenisation from being changed out from under us during the iteration.
@@ -385,9 +381,9 @@ cdef class Span:
 
         RETURNS (Token): The root token.
 
-        EXAMPLE: The root token has the shortest path to the root of the sentence
-            (or is the root itself). If multiple words are equally high in the
-            tree, the first word is taken. For example:
+        EXAMPLE: The root token has the shortest path to the root of the
+            sentence (or is the root itself). If multiple words are equally
+            high in the tree, the first word is taken. For example:
 
             >>> toks = nlp(u'I like New York in Autumn.')
 
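Completing the docstring's example, hedged on the English parser's analysis:

    toks = nlp(u'I like New York in Autumn.')
    new_york = toks[2:4]                  # u'New York'
    assert new_york.root.text == u'York'  # u'York' heads u'New' in the parse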
@@ -437,11 +433,11 @@ cdef class Span:
             if self.doc.c[i].head == 0:
                 return self.doc[i]
         # If we don't have a sentence root, we do something that's not so
-        # algorithmically clever, but I think should be quite fast, especially
-        # for short spans.
+        # algorithmically clever, but I think should be quite fast,
+        # especially for short spans.
         # For each word, we count the path length, and arg min this measure.
-        # We could use better tree logic to save steps here...But I think this
-        # should be okay.
+        # We could use better tree logic to save steps here...But I
+        # think this should be okay.
         cdef int current_best = self.doc.length
         cdef int root = -1
         for i in range(self.start, self.end):
@@ -463,7 +459,7 @@ cdef class Span:
         YIELDS (Token):A left-child of a token of the span.
         """
         def __get__(self):
-            for token in reversed(self): # Reverse, so we get the tokens in order
+            for token in reversed(self): # Reverse, so we get tokens in order
                 for left in token.lefts:
                     if left.i < self.start:
                         yield left
@@ -493,7 +489,7 @@ cdef class Span:
                 yield from word.subtree
 
     property ent_id:
-        """An (integer) entity ID. Usually assigned by patterns in the `Matcher`.
+        """An (integer) entity ID.
 
         RETURNS (uint64): The entity ID.
         """
@@ -503,8 +499,8 @@ cdef class Span:
         def __set__(self, hash_t key):
             # TODO
             raise NotImplementedError(
-                "Can't yet set ent_id from Span. Vote for this feature on the issue "
-                "tracker: http://github.com/explosion/spaCy/issues")
+                "Can't yet set ent_id from Span. Vote for this feature on "
+                "the issue tracker: http://github.com/explosion/spaCy/issues")
 
     property ent_id_:
         """A (string) entity ID. Usually assigned by patterns in the `Matcher`.
@@ -517,13 +513,16 @@ cdef class Span:
         def __set__(self, hash_t key):
             # TODO
             raise NotImplementedError(
-                "Can't yet set ent_id_ from Span. Vote for this feature on the issue "
-                "tracker: http://github.com/explosion/spaCy/issues")
+                "Can't yet set ent_id_ from Span. Vote for this feature on the "
+                "issue tracker: http://github.com/explosion/spaCy/issues")
 
     property orth_:
-        # TODO: docstring
+        """Verbatim text content (identical to Span.text). Exists mostly for
+        consistency with other attributes.
+
+        RETURNS (unicode): The span's text."""
         def __get__(self):
-            return ''.join([t.string for t in self]).strip()
+            return ''.join([t.orth_ for t in self]).strip()
 
     property lemma_:
         """The span's lemma.
@@ -534,19 +533,19 @@ cdef class Span:
             return ' '.join([t.lemma_ for t in self]).strip()
 
     property upper_:
-        # TODO: docstring
+        """Deprecated. Use Span.text.upper() instead."""
         def __get__(self):
-            return ''.join([t.string.upper() for t in self]).strip()
+            return ''.join([t.text_with_ws.upper() for t in self]).strip()
 
     property lower_:
-        # TODO: docstring
+        """Deprecated. Use Span.text.lower() instead."""
         def __get__(self):
-            return ''.join([t.string.lower() for t in self]).strip()
+            return ''.join([t.text_with_ws.lower() for t in self]).strip()
 
     property string:
-        # TODO: docstring
+        """Deprecated: Use Span.text instead."""
         def __get__(self):
-            return ''.join([t.string for t in self])
+            return ''.join([t.text_with_ws for t in self])
 
     property label_:
         """The span's label.
@@ -570,7 +569,8 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
         n += 1
         if n >= sent_length:
             raise RuntimeError(
-                "Array bounds exceeded while searching for root word. This likely "
-                "means the parse tree is in an invalid state. Please report this "
-                "issue here: http://github.com/explosion/spaCy/issues")
+                "Array bounds exceeded while searching for root word. This "
+                "likely means the parse tree is in an invalid state. Please "
+                "report this issue here: "
+                "http://github.com/explosion/spaCy/issues")
     return n
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
@@ -14,17 +14,18 @@ from ..typedefs cimport hash_t
 from ..lexeme cimport Lexeme
 from .. import parts_of_speech
 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
-from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV
-from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
-from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
-from ..attrs cimport LEMMA, POS, TAG, DEP
+from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
+from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL
+from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
+from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
 from ..compat import is_config
 from .. import about
 from .underscore import Underscore
 
 
 cdef class Token:
-    """An individual token – i.e. a word, punctuation symbol, whitespace, etc."""
+    """An individual token – i.e. a word, punctuation symbol, whitespace,
+    etc."""
     @classmethod
     def set_extension(cls, name, default=None, method=None,
                       getter=None, setter=None):
@@ -171,10 +172,11 @@ cdef class Token:
         return self.orth_
 
     property text_with_ws:
-        """The text content of the token with a trailing whitespace character if
-        it has one.
+        """The text content of the token with a trailing whitespace character
+        if it has one.
 
-        RETURNS (unicode): The text content of the span (with trailing whitespace).
+        RETURNS (unicode): The text content of the span (with trailing
+            whitespace).
         """
         def __get__(self):
             cdef unicode orth = self.vocab.strings[self.c.lex.orth]
@@ -306,9 +308,8 @@ cdef class Token:
         def __set__(self, value):
             if self.doc.is_parsed:
                 raise ValueError(
-                    'Refusing to write to token.sent_start if its document is parsed, '
-                    'because this may cause inconsistent state. '
-                    'See https://github.com/spacy-io/spaCy/issues/235 for workarounds.')
+                    "Refusing to write to token.sent_start if its document "
+                    "is parsed, because this may cause inconsistent state.")
             if value is None:
                 self.c.sent_start = 0
             elif value is True:
@@ -316,13 +317,12 @@ cdef class Token:
             elif value is False:
                 self.c.sent_start = -1
             else:
-                raise ValueError("Invalid value for token.sent_start -- must be one of "
-                                 "None, True, False")
+                raise ValueError("Invalid value for token.sent_start. Must be "
+                                 "one of: None, True, False")
 
     property lefts:
         def __get__(self):
-            """
-            The leftward immediate children of the word, in the syntactic
+            """The leftward immediate children of the word, in the syntactic
             dependency parse.
             """
             cdef int nr_iter = 0
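As the guard above implies, `sent_start` may only be written before a document is parsed. A sketch on a freshly built, unparsed `Doc`:

    from spacy.tokens import Doc

    doc = Doc(nlp.vocab, words=[u'Hello', u'world', u'.', u'Bye', u'now'])
    doc[3].sent_start = True   # allowed: no dependency parse has been set yet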
@@ -334,13 +334,12 @@ cdef class Token:
                 nr_iter += 1
                 # This is ugly, but it's a way to guard out infinite loops
                 if nr_iter >= 10000000:
-                    raise RuntimeError(
-                        "Possibly infinite loop encountered while looking for token.lefts")
+                    raise RuntimeError("Possibly infinite loop encountered "
+                                       "while looking for token.lefts")
 
     property rights:
         def __get__(self):
-            """
-            The rightward immediate children of the word, in the syntactic
+            """The rightward immediate children of the word, in the syntactic
             dependency parse.
             """
             cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
@@ -352,27 +351,26 @@ cdef class Token:
                 ptr -= 1
                 nr_iter += 1
                 if nr_iter >= 10000000:
-                    raise RuntimeError(
-                        "Possibly infinite loop encountered while looking for token.rights")
+                    raise RuntimeError("Possibly infinite loop encountered "
+                                       "while looking for token.rights")
             tokens.reverse()
             for t in tokens:
                 yield t
 
     property children:
-        """
-        A sequence of the token's immediate syntactic children.
+        """A sequence of the token's immediate syntactic children.
 
-        Yields: Token A child token such that child.head==self
+        YIELDS (Token): A child token such that child.head==self
         """
         def __get__(self):
             yield from self.lefts
             yield from self.rights
 
     property subtree:
-        """
-        A sequence of all the token's syntactic descendents.
+        """A sequence of all the token's syntactic descendents.
 
-        Yields: Token A descendent token such that self.is_ancestor(descendent)
+        YIELDS (Token): A descendent token such that
+            `self.is_ancestor(descendent)`.
         """
         def __get__(self):
             for word in self.lefts:
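Navigating the parse with these properties, sketched with a loaded parser (the exact tree depends on the model):

    doc = nlp(u'I like New York in Autumn.')
    like = doc[1]
    print([t.text for t in like.lefts])     # e.g. [u'I']
    print([t.text for t in like.children])  # immediate dependents of u'like'
    print([t.text for t in like.subtree])   # u'like' and all its descendents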
@@ -456,13 +454,15 @@ cdef class Token:
             if self.c.head > 0: # left dependent
                 old_head.c.l_kids -= 1
                 if self.c.l_edge == old_head.c.l_edge:
-                    # the token dominates the left edge so the left edge of the head
-                    # may change when the token is reattached
-                    # it may not change if the new head is a descendant of the current head
+                    # the token dominates the left edge so the left edge of
+                    # the head may change when the token is reattached, it may
+                    # not change if the new head is a descendant of the current
+                    # head
 
                     new_edge = self.c.l_edge
-                    # the new l_edge is the left-most l_edge on any of the other dependents
-                    # where the l_edge is left of the head, otherwise it is the head
+                    # the new l_edge is the left-most l_edge on any of the
+                    # other dependents where the l_edge is left of the head,
+                    # otherwise it is the head
                     if not is_desc:
                         new_edge = old_head.i
                         for child in old_head.children:
@@ -472,14 +472,15 @@ cdef class Token:
                             new_edge = child.c.l_edge
                     old_head.c.l_edge = new_edge
 
-                    # walk up the tree from old_head and assign new l_edge to ancestors
-                    # until an ancestor already has an l_edge that's further left
+                    # walk up the tree from old_head and assign new l_edge to
+                    # ancestors until an ancestor already has an l_edge that's
+                    # further left
                     for anc in old_head.ancestors:
                         if anc.c.l_edge <= new_edge:
                             break
                         anc.c.l_edge = new_edge
 
             elif self.c.head < 0: # right dependent
                 old_head.c.r_kids -= 1
                 # do the same thing as for l_edge
                 if self.c.r_edge == old_head.c.r_edge:
@@ -500,7 +501,7 @@ cdef class Token:
                         anc.c.r_edge = new_edge
 
             # update number of deps of new head
             if rel_newhead_i > 0: # left dependent
                 new_head.c.l_kids += 1
                 # walk up the tree from new head and set l_edge to self.l_edge
                 # until you hit a token with an l_edge further to the left
@@ -511,7 +512,7 @@ cdef class Token:
                             break
                         anc.c.l_edge = self.c.l_edge
 
             elif rel_newhead_i < 0: # right dependent
                 new_head.c.r_kids += 1
                 # do the same as for l_edge
                 if self.c.r_edge > new_head.c.r_edge:
@@ -572,8 +573,8 @@ cdef class Token:
 
     property ent_iob_:
         """IOB code of named entity tag. "B" means the token begins an entity,
-        "I" means it is inside an entity, "O" means it is outside an entity, and
-        "" means no entity tag is set.
+        "I" means it is inside an entity, "O" means it is outside an entity,
+        and "" means no entity tag is set.
 
         RETURNS (unicode): IOB code of named entity tag.
         """
@@ -582,8 +583,7 @@ cdef class Token:
             return iob_strings[self.c.ent_iob]
 
     property ent_id:
-        """ID of the entity the token is an instance of, if any. Usually
-        assigned by patterns in the Matcher.
+        """ID of the entity the token is an instance of, if any.
 
         RETURNS (uint64): ID of the entity.
         """
@@ -594,8 +594,7 @@ cdef class Token:
             self.c.ent_id = key
 
     property ent_id_:
-        """ID of the entity the token is an instance of, if any. Usually
-        assigned by patterns in the Matcher.
+        """ID of the entity the token is an instance of, if any.
 
         RETURNS (unicode): ID of the entity.
         """
@@ -606,34 +605,70 @@ cdef class Token:
             self.c.ent_id = self.vocab.strings.add(name)
 
     property whitespace_:
+        """Trailing space character if present.
+
+        RETURNS (unicode): The whitespace character.
+        """
         def __get__(self):
             return ' ' if self.c.spacy else ''
 
     property orth_:
+        """Verbatim text content (identical to `Token.text`). Existst mostly
+        for consistency with the other attributes.
+
+        RETURNS (unicode): The token text.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.lex.orth]
 
     property lower_:
+        """Lowercase form of the token text. Equivalent to
+        `Token.text.lower()`.
+
+        RETURNS (unicode): The lowercase token text.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.lex.lower]
 
     property norm_:
+        """The token's norm, i.e. a normalised form of the token text.
+        Usually set in the language's tokenizer exceptions or norm exceptions.
+
+        RETURNS (unicode): The norm.
+        """
        def __get__(self):
             return self.vocab.strings[self.c.lex.norm]
 
     property shape_:
+        """Transform of the tokens's string, to show orthographic features.
+        For example, "Xxxx" or "dd".
+
+        RETURNS (unicode): The token shape.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.lex.shape]
 
     property prefix_:
+        """A length-N substring from the start of the token. Defaults to `N=1`.
+
+        RETURNS (unicode): The token's prefix.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.lex.prefix]
 
     property suffix_:
+        """A length-N substring from the end of the token. Defaults to `N=3`.
+
+        RETURNS (unicode): The token's suffix.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.lex.suffix]
 
     property lang_:
+        """Language of the parent document's vocabulary, e.g. 'en'.
+
+        RETURNS (unicode): The language code.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.lex.lang]
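The newly documented string views, sketched on a single token; the shape, prefix and suffix values follow the rules stated in the docstrings (runs of the same character class are capped at four in the shape):

    token = nlp(u'Apples')[0]
    assert token.orth_ == u'Apples'
    assert token.lower_ == u'apples'
    assert token.shape_ == u'Xxxxx'
    assert token.prefix_ == u'A'     # N=1 characters from the start
    assert token.suffix_ == u'les'   # N=3 characters from the end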
@@ -648,65 +683,152 @@ cdef class Token:
             self.c.lemma = self.vocab.strings.add(lemma_)
 
     property pos_:
+        """Coarse-grained part-of-speech.
+
+        RETURNS (unicode): The part-of-speech tag.
+        """
         def __get__(self):
             return parts_of_speech.NAMES[self.c.pos]
 
     property tag_:
+        """Fine-grained part-of-speech.
+
+        RETURNS (unicode): The part-of-speech tag.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.tag]
         def __set__(self, tag):
             self.tag = self.vocab.strings.add(tag)
 
     property dep_:
+        """Syntactic dependency relation.
+
+        RETURNS (unicode): The dependency label.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.dep]
         def __set__(self, unicode label):
             self.c.dep = self.vocab.strings.add(label)
 
     property is_oov:
+        """Is the token out-of-vocabulary?
+
+        RETURNS (bool): Whether the token is out-of-vocabulary.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)
 
     property is_stop:
+        """Is the token part of a "stop list"? (defined by the language data)
+
+        RETURNS (bool): Whether the token is a stop word.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_STOP)
 
     property is_alpha:
+        """Does the token consist of alphabetic characters? Equivalent to
+        `token.text.isalpha()`.
+
+        RETURNS (bool): Whether the token consists of alpha characters.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ALPHA)
 
     property is_ascii:
+        """Does the token consist of ASCII characters? Equivalent to
+        `[any(ord(c) >= 128 for c in token.text)]`.
+
+        RETURNS (bool): Whether the token consists of ASCII characters.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ASCII)
 
     property is_digit:
+        """Does the token consist of digits? Equivalent to
+        `token.text.isdigit()`.
+
+        RETURNS (bool): Whether the token consists of digits.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_DIGIT)
 
     property is_lower:
+        """Is the token in lowercase? Equivalent to `token.text.islower()`.
+
+        RETURNS (bool): Whether the token is in lowercase.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LOWER)
 
+    property is_upper:
+        """Is the token in uppercase? Equivalent to `token.text.isupper()`.
+
+        RETURNS (bool): Whether the token is in uppercase.
+        """
+        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_UPPER)
+
     property is_title:
+        """Is the token in titlecase? Equivalent to `token.text.istitle()`.
+
+        RETURNS (bool): Whether the token is in titlecase.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_TITLE)
 
     property is_punct:
+        """Is the token punctuation?
+
+        RETURNS (bool): Whether the token is punctuation.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
 
     property is_space:
+        """Does the token consist of whitespace characters? Equivalent to
+        `token.text.isspace()`.
+
+        RETURNS (bool): Whether the token consists of whitespace characters.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
 
     property is_bracket:
+        """Is the token a bracket?
+
+        RETURNS (bool): Whether the token is a bracket.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
 
     property is_quote:
+        """Is the token a quotation mark?
+
+        RETURNS (bool): Whether the token is a quotation mark.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
 
     property is_left_punct:
+        """Is the token a left punctuation mark, e.g. "("?
+
+        RETURNS (bool): Whether the token is a left punctuation mark.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
 
     property is_right_punct:
+        """Is the token a left punctuation mark, e.g. "("?
+
+        RETURNS (bool): Whether the token is a left punctuation mark.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
 
     property like_url:
+        """Does the token resemble a URL?
+
+        RETURNS (bool): Whether the token resembles a URL.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL)
 
     property like_num:
+        """Does the token represent a number? e.g. "10.9", "10", "ten", etc.
+
+        RETURNS (bool): Whether the token resembles a number.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_NUM)
 
     property like_email:
+        """Does the token resemble an email address?
+
+        RETURNS (bool): Whether the token resembles an email address.
+        """
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)
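And the boolean flags, which all delegate to the underlying lexeme. A sketch:

    doc = nlp(u'Hello world!')
    hello, world, excl = doc[0], doc[1], doc[2]
    assert hello.is_title and hello.is_alpha
    assert world.is_lower and not world.is_punct
    assert excl.is_punct and not excl.is_alpha
    assert nlp(u'10.9')[0].like_num   # u'10.9' resembles a number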
diff --git a/website/api/span.jade b/website/api/span.jade
@@ -248,6 +248,28 @@ p
         +cell float
         +cell A scalar similarity score. Higher is more similar.
 
++h(2, "get_lca_matrix") Span.get_lca_matrix
+    +tag method
+
+p
+    | Calculates the lowest common ancestor matrix for a given #[code Span].
+    | Returns LCA matrix containing the integer index of the ancestor, or
+    | #[code -1] if no common ancestor is found, e.g. if span excludes a
+    | necessary ancestor.
+
++aside-code("Example").
+    doc = nlp(u'I like New York in Autumn')
+    span = doc[1:4]
+    matrix = span.get_lca_matrix()
+    # array([[0, 0, 0], [0, 1, 2], [0, 2, 2]], dtype=int32)
+
++table(["Name", "Type", "Description"])
+    +row("foot")
+        +cell returns
+        +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
+        +cell The lowest common ancestor matrix of the #[code Span].
+
+
 +h(2, "to_array") Span.to_array
     +tag method
     +tag-new(2)
@@ -495,6 +517,18 @@ p
             | The text content of the span with a trailing whitespace character
             | if the last token has one.
 
+    +row
+        +cell #[code orth]
+        +cell int
+        +cell ID of the verbatim text content.
+
+    +row
+        +cell #[code orth_]
+        +cell unicode
+        +cell
+            | Verbatim text content (identical to #[code Span.text]). Existst
+            | mostly for consistency with the other attributes.
+
     +row
         +cell #[code label]
         +cell int
diff --git a/website/api/token.jade b/website/api/token.jade
@@ -489,15 +489,35 @@ p The L2 norm of the token's vector representation.
         +cell unicode
         +cell Base form of the token, with no inflectional suffixes.
 
+    +row
+        +cell #[code norm]
+        +cell int
+        +cell
+            | The token's norm, i.e. a normalised form of the token text.
+            | Usually set in the language's
+            | #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or
+            | #[+a("/usage/adding-languages#norm-exceptions") norm exceptions].
+
+    +row
+        +cell #[code norm_]
+        +cell unicode
+        +cell
+            | The token's norm, i.e. a normalised form of the token text.
+            | Usually set in the language's
+            | #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or
+            | #[+a("/usage/adding-languages#norm-exceptions") norm exceptions].
+
     +row
         +cell #[code lower]
         +cell int
-        +cell Lower-case form of the token.
+        +cell Lowercase form of the token.
 
     +row
         +cell #[code lower_]
         +cell unicode
-        +cell Lower-case form of the token.
+        +cell
+            | Lowercase form of the token text. Equivalent to
+            | #[code Token.text.lower()].
 
     +row
         +cell #[code shape]
@@ -537,7 +557,9 @@ p The L2 norm of the token's vector representation.
     +row
         +cell #[code suffix_]
         +cell unicode
-        +cell Length-N substring from the end of the token. Defaults to #[code N=3].
+        +cell
+            | Length-N substring from the end of the token. Defaults to
+            | #[code N=3].
 
     +row
         +cell #[code is_alpha]
@@ -672,6 +694,7 @@ p The L2 norm of the token's vector representation.
         +cell #[code lang]
         +cell int
         +cell Language of the parent document's vocabulary.
 
     +row
         +cell #[code lang_]
         +cell unicode