Tidy up and document Doc, Token and Span

This commit is contained in:
ines 2017-10-27 15:41:45 +02:00
parent 1a559d4c95
commit 6a0483b7aa
6 changed files with 356 additions and 173 deletions

View File

@ -2,4 +2,4 @@ from .doc import Doc
from .token import Token from .token import Token
from .span import Span from .span import Span
__all__ = [Doc, Token, Span] __all__ = ['Doc', 'Token', 'Span']

View File

@ -23,9 +23,9 @@ from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t from ..typedefs cimport attr_t, flags_t
from ..attrs import intify_attrs, IDS from ..attrs import intify_attrs, IDS
from ..attrs cimport attr_id_t from ..attrs cimport attr_id_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
from ..attrs cimport SENT_START from ..attrs cimport ENT_TYPE, SENT_START
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
from ..util import normalize_slice from ..util import normalize_slice
from ..compat import is_config, copy_reg, pickle from ..compat import is_config, copy_reg, pickle
@ -78,17 +78,18 @@ def _get_chunker(lang):
cdef class Doc: cdef class Doc:
"""A sequence of Token objects. Access sentences and named entities, export """A sequence of Token objects. Access sentences and named entities, export
annotations to numpy arrays, losslessly serialize to compressed binary strings. annotations to numpy arrays, losslessly serialize to compressed binary
The `Doc` object holds an array of `TokenC` structs. The Python-level strings. The `Doc` object holds an array of `TokenC` structs. The
`Token` and `Span` objects are views of this array, i.e. they don't own Python-level `Token` and `Span` objects are views of this array, i.e.
the data themselves. they don't own the data themselves.
EXAMPLE: Construction 1 EXAMPLE: Construction 1
>>> doc = nlp(u'Some text') >>> doc = nlp(u'Some text')
Construction 2 Construction 2
>>> from spacy.tokens import Doc >>> from spacy.tokens import Doc
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False]) >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
spaces=[True, False, False])
""" """
@classmethod @classmethod
def set_extension(cls, name, default=None, method=None, def set_extension(cls, name, default=None, method=None,
@ -109,15 +110,14 @@ cdef class Doc:
orths_and_spaces=None): orths_and_spaces=None):
"""Create a Doc object. """Create a Doc object.
vocab (Vocab): A vocabulary object, which must match any models you want vocab (Vocab): A vocabulary object, which must match any models you
to use (e.g. tokenizer, parser, entity recognizer). want to use (e.g. tokenizer, parser, entity recognizer).
words (list or None): A list of unicode strings to add to the document words (list or None): A list of unicode strings to add to the document
as words. If `None`, defaults to empty list. as words. If `None`, defaults to empty list.
spaces (list or None): A list of boolean values, of the same length as spaces (list or None): A list of boolean values, of the same length as
words. True means that the word is followed by a space, False means words. True means that the word is followed by a space, False means
it is not. If `None`, defaults to `[True]*len(words)` it is not. If `None`, defaults to `[True]*len(words)`
user_data (dict or None): Optional extra data to attach to the Doc. user_data (dict or None): Optional extra data to attach to the Doc.
RETURNS (Doc): The newly constructed object. RETURNS (Doc): The newly constructed object.
""" """
self.vocab = vocab self.vocab = vocab
@ -153,10 +153,10 @@ cdef class Doc:
spaces = [True] * len(words) spaces = [True] * len(words)
elif len(spaces) != len(words): elif len(spaces) != len(words):
raise ValueError( raise ValueError(
"Arguments 'words' and 'spaces' should be sequences of the " "Arguments 'words' and 'spaces' should be sequences of "
"same length, or 'spaces' should be left default at None. " "the same length, or 'spaces' should be left default at "
"spaces should be a sequence of booleans, with True meaning " "None. spaces should be a sequence of booleans, with True "
"that the word owns a ' ' character following it.") "meaning that the word owns a ' ' character following it.")
orths_and_spaces = zip(words, spaces) orths_and_spaces = zip(words, spaces)
if orths_and_spaces is not None: if orths_and_spaces is not None:
for orth_space in orths_and_spaces: for orth_space in orths_and_spaces:
@ -166,7 +166,8 @@ cdef class Doc:
elif isinstance(orth_space, bytes): elif isinstance(orth_space, bytes):
raise ValueError( raise ValueError(
"orths_and_spaces expects either List(unicode) or " "orths_and_spaces expects either List(unicode) or "
"List((unicode, bool)). Got bytes instance: %s" % (str(orth_space))) "List((unicode, bool)). "
"Got bytes instance: %s" % (str(orth_space)))
else: else:
orth, has_space = orth_space orth, has_space = orth_space
# Note that we pass self.mem here --- we have ownership, if LexemeC # Note that we pass self.mem here --- we have ownership, if LexemeC
@ -186,7 +187,8 @@ cdef class Doc:
def __getitem__(self, object i): def __getitem__(self, object i):
"""Get a `Token` or `Span` object. """Get a `Token` or `Span` object.
i (int or tuple) The index of the token, or the slice of the document to get. i (int or tuple) The index of the token, or the slice of the document
to get.
RETURNS (Token or Span): The token at `doc[i]`, or the span at RETURNS (Token or Span): The token at `doc[i]`, or the span at
`doc[start : end]`. `doc[start : end]`.
@ -199,11 +201,11 @@ cdef class Doc:
>>> doc[start : end] >>> doc[start : end]
Get a `Span` object, starting at position `start` and ending at Get a `Span` object, starting at position `start` and ending at
position `end`, where `start` and `end` are token indices. For position `end`, where `start` and `end` are token indices. For
instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4. instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and
Stepped slices (e.g. `doc[start : end : step]`) are not supported, 4. Stepped slices (e.g. `doc[start : end : step]`) are not
as `Span` objects must be contiguous (cannot have gaps). You can use supported, as `Span` objects must be contiguous (cannot have gaps).
negative indices and open-ended ranges, which have their normal You can use negative indices and open-ended ranges, which have
Python semantics. their normal Python semantics.
""" """
if isinstance(i, slice): if isinstance(i, slice):
start, stop = normalize_slice(len(self), i.start, i.stop, i.step) start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
@ -262,8 +264,10 @@ cdef class Doc:
doc (Doc): The parent document. doc (Doc): The parent document.
start (int): The index of the first character of the span. start (int): The index of the first character of the span.
end (int): The index of the first character after the span. end (int): The index of the first character after the span.
label (uint64 or string): A label to attach to the Span, e.g. for named entities. label (uint64 or string): A label to attach to the Span, e.g. for
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. named entities.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
the span.
RETURNS (Span): The newly constructed object. RETURNS (Span): The newly constructed object.
""" """
if not isinstance(label, int): if not isinstance(label, int):
@ -377,13 +381,14 @@ cdef class Doc:
return self.text return self.text
property ents: property ents:
"""Iterate over the entities in the document. Yields named-entity `Span` """Iterate over the entities in the document. Yields named-entity
objects, if the entity recognizer has been applied to the document. `Span` objects, if the entity recognizer has been applied to the
document.
YIELDS (Span): Entities in the document. YIELDS (Span): Entities in the document.
EXAMPLE: Iterate over the span to get individual Token objects, or access EXAMPLE: Iterate over the span to get individual Token objects,
the label: or access the label:
>>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.') >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
>>> ents = list(tokens.ents) >>> ents = list(tokens.ents)
@ -456,10 +461,11 @@ cdef class Doc:
property noun_chunks: property noun_chunks:
"""Iterate over the base noun phrases in the document. Yields base """Iterate over the base noun phrases in the document. Yields base
noun-phrase `Span` objects, if the document has been syntactically noun-phrase `Span` objects, if the document has been
parsed. A base noun phrase, or "NP chunk", is a noun phrase that does syntactically parsed. A base noun phrase, or "NP chunk", is a noun
not permit other NPs to be nested within it so no NP-level phrase that does not permit other NPs to be nested within it so no
coordination, no prepositional phrases, and no relative clauses. NP-level coordination, no prepositional phrases, and no relative
clauses.
YIELDS (Span): Noun chunks in the document. YIELDS (Span): Noun chunks in the document.
""" """
@ -467,12 +473,14 @@ cdef class Doc:
if not self.is_parsed: if not self.is_parsed:
raise ValueError( raise ValueError(
"noun_chunks requires the dependency parse, which " "noun_chunks requires the dependency parse, which "
"requires data to be installed. For more info, see the " "requires a statistical model to be installed and loaded. "
"For more info, see the "
"documentation: \n%s\n" % about.__docs_models__) "documentation: \n%s\n" % about.__docs_models__)
# Accumulate the result before beginning to iterate over it. This prevents # Accumulate the result before beginning to iterate over it. This
# the tokenisation from being changed out from under us during the iteration. # prevents the tokenisation from being changed out from under us
# The tricky thing here is that Span accepts its tokenisation changing, # during the iteration. The tricky thing here is that Span accepts
# so it's okay once we have the Span objects. See Issue #375 # its tokenisation changing, so it's okay once we have the Span
# objects. See Issue #375.
spans = [] spans = []
for start, end, label in self.noun_chunks_iterator(self): for start, end, label in self.noun_chunks_iterator(self):
spans.append(Span(self, start, end, label=label)) spans.append(Span(self, start, end, label=label))
@ -497,8 +505,9 @@ cdef class Doc:
if not self.is_parsed: if not self.is_parsed:
raise ValueError( raise ValueError(
"sentence boundary detection requires the dependency parse, which " "Sentence boundary detection requires the dependency "
"requires data to be installed. For more info, see the " "parse, which requires a statistical model to be "
"installed and loaded. For more info, see the "
"documentation: \n%s\n" % about.__docs_models__) "documentation: \n%s\n" % about.__docs_models__)
cdef int i cdef int i
start = 0 start = 0
@ -537,12 +546,11 @@ cdef class Doc:
@cython.boundscheck(False) @cython.boundscheck(False)
cpdef np.ndarray to_array(self, object py_attr_ids): cpdef np.ndarray to_array(self, object py_attr_ids):
"""Export given token attributes to a numpy `ndarray`. """Export given token attributes to a numpy `ndarray`.
If `attr_ids` is a sequence of M attributes, the output array will be
If `attr_ids` is a sequence of M attributes, the output array will of shape `(N, M)`, where N is the length of the `Doc` (in tokens). If
be of shape `(N, M)`, where N is the length of the `Doc` `attr_ids` is a single attribute, the output shape will be (N,). You
(in tokens). If `attr_ids` is a single attribute, the output shape will can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) or
be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) string name (e.g. 'LEMMA' or 'lemma').
or string name (e.g. 'LEMMA' or 'lemma').
attr_ids (list[]): A list of attributes (int IDs or string names). attr_ids (list[]): A list of attributes (int IDs or string names).
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
@ -641,13 +649,12 @@ cdef class Doc:
def from_array(self, attrs, array): def from_array(self, attrs, array):
if SENT_START in attrs and HEAD in attrs: if SENT_START in attrs and HEAD in attrs:
raise ValueError( raise ValueError(
"Conflicting attributes specified in doc.from_array():\n" "Conflicting attributes specified in doc.from_array(): "
"(HEAD, SENT_START)\n" "(HEAD, SENT_START)\n"
"The HEAD attribute currently sets sentence boundaries implicitly,\n" "The HEAD attribute currently sets sentence boundaries "
"based on the tree structure. This means the HEAD attribute would " "implicitly, based on the tree structure. This means the HEAD "
"potentially override the sentence boundaries set by SENT_START.\n" "attribute would potentially override the sentence boundaries "
"See https://github.com/spacy-io/spaCy/issues/235 for details and " "set by SENT_START.")
"workarounds, and to propose solutions.")
cdef int i, col cdef int i, col
cdef attr_id_t attr_id cdef attr_id_t attr_id
cdef TokenC* tokens = self.c cdef TokenC* tokens = self.c
@ -675,18 +682,14 @@ cdef class Doc:
return self return self
def get_lca_matrix(self): def get_lca_matrix(self):
''' """Calculates the lowest common ancestor matrix for a given `Doc`.
Calculates the lowest common ancestor matrix Returns LCA matrix containing the integer index of the ancestor, or -1
for a given Spacy doc. if no common ancestor is found (ex if span excludes a necessary
Returns LCA matrix containing the integer index ancestor). Apologies about the recursion, but the impact on
of the ancestor, or -1 if no common ancestor is performance is negligible given the natural limitations on the depth
found (ex if span excludes a necessary ancestor). of a typical human sentence.
Apologies about the recursion, but the """
impact on performance is negligible given
the natural limitations on the depth of a typical human sentence.
'''
# Efficiency notes: # Efficiency notes:
#
# We can easily improve the performance here by iterating in Cython. # We can easily improve the performance here by iterating in Cython.
# To loop over the tokens in Cython, the easiest way is: # To loop over the tokens in Cython, the easiest way is:
# for token in doc.c[:doc.c.length]: # for token in doc.c[:doc.c.length]:
@ -719,7 +722,6 @@ cdef class Doc:
token_k = self[k] token_k = self[k]
lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix) lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix)
lca_matrix[k][j] = lca_matrix[j][k] lca_matrix[k][j] = lca_matrix[j][k]
return lca_matrix return lca_matrix
def to_disk(self, path, **exclude): def to_disk(self, path, **exclude):
@ -819,14 +821,15 @@ cdef class Doc:
return self return self
def merge(self, int start_idx, int end_idx, *args, **attributes): def merge(self, int start_idx, int end_idx, *args, **attributes):
"""Retokenize the document, such that the span at `doc.text[start_idx : end_idx]` """Retokenize the document, such that the span at
is merged into a single token. If `start_idx` and `end_idx` do not mark `doc.text[start_idx : end_idx]` is merged into a single token. If
start and end token boundaries, the document remains unchanged. `start_idx` and `end_idx` do not mark start and end token boundaries,
the document remains unchanged.
start_idx (int): The character index of the start of the slice to merge. start_idx (int): Character index of the start of the slice to merge.
end_idx (int): The character index after the end of the slice to merge. end_idx (int): Character index after the end of the slice to merge.
**attributes: Attributes to assign to the merged token. By default, **attributes: Attributes to assign to the merged token. By default,
attributes are inherited from the syntactic root token of the span. attributes are inherited from the syntactic root of the span.
RETURNS (Token): The newly merged token, or `None` if the start and end RETURNS (Token): The newly merged token, or `None` if the start and end
indices did not fall at token boundaries. indices did not fall at token boundaries.
""" """
@ -847,8 +850,8 @@ cdef class Doc:
attributes[ENT_TYPE] = attributes['ent_type'] attributes[ENT_TYPE] = attributes['ent_type']
elif args: elif args:
raise ValueError( raise ValueError(
"Doc.merge received %d non-keyword arguments. " "Doc.merge received %d non-keyword arguments. Expected either "
"Expected either 3 arguments (deprecated), or 0 (use keyword arguments). " "3 arguments (deprecated), or 0 (use keyword arguments). "
"Arguments supplied:\n%s\n" "Arguments supplied:\n%s\n"
"Keyword arguments: %s\n" % (len(args), repr(args), repr(attributes))) "Keyword arguments: %s\n" % (len(args), repr(args), repr(attributes)))
@ -882,8 +885,9 @@ cdef class Doc:
Token.set_struct_attr(token, attr_name, attr_value) Token.set_struct_attr(token, attr_name, attr_value)
# Begin by setting all the head indices to absolute token positions # Begin by setting all the head indices to absolute token positions
# This is easier to work with for now than the offsets # This is easier to work with for now than the offsets
# Before thinking of something simpler, beware the case where a dependency # Before thinking of something simpler, beware the case where a
# bridges over the entity. Here the alignment of the tokens changes. # dependency bridges over the entity. Here the alignment of the
# tokens changes.
span_root = span.root.i span_root = span.root.i
token.dep = span.root.dep token.dep = span.root.dep
# We update token.lex after keeping span root and dep, since # We update token.lex after keeping span root and dep, since
@ -932,8 +936,9 @@ cdef class Doc:
>>> trees = doc.print_tree() >>> trees = doc.print_tree()
>>> trees[1] >>> trees[1]
{'modifiers': [ {'modifiers': [
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice',
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP',
'lemma': 'Alice'},
{'modifiers': [ {'modifiers': [
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', {'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
@ -1018,4 +1023,3 @@ def unpickle_doc(vocab, hooks_and_data, bytes_data):
copy_reg.pickle(Doc, pickle_doc, unpickle_doc) copy_reg.pickle(Doc, pickle_doc, unpickle_doc)

View File

@ -35,15 +35,16 @@ cdef class Span:
def has_extension(cls, name): def has_extension(cls, name):
return name in Underscore.span_extensions return name in Underscore.span_extensions
def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None, def __cinit__(self, Doc doc, int start, int end, attr_t label=0,
vector_norm=None): vector=None, vector_norm=None):
"""Create a `Span` object from the slice `doc[start : end]`. """Create a `Span` object from the slice `doc[start : end]`.
doc (Doc): The parent document. doc (Doc): The parent document.
start (int): The index of the first token of the span. start (int): The index of the first token of the span.
end (int): The index of the first token after the span. end (int): The index of the first token after the span.
label (uint64): A label to attach to the Span, e.g. for named entities. label (uint64): A label to attach to the Span, e.g. for named entities.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. vector (ndarray[ndim=1, dtype='float32']): A meaning representation
of the span.
RETURNS (Span): The newly constructed object. RETURNS (Span): The newly constructed object.
""" """
if not (0 <= start <= end <= len(doc)): if not (0 <= start <= end <= len(doc)):
@ -162,7 +163,8 @@ cdef class Span:
attributes are inherited from the syntactic root token of the span. attributes are inherited from the syntactic root token of the span.
RETURNS (Token): The newly merged token. RETURNS (Token): The newly merged token.
""" """
return self.doc.merge(self.start_char, self.end_char, *args, **attributes) return self.doc.merge(self.start_char, self.end_char, *args,
**attributes)
def similarity(self, other): def similarity(self, other):
"""Make a semantic similarity estimate. The default estimate is cosine """Make a semantic similarity estimate. The default estimate is cosine
@ -179,24 +181,19 @@ cdef class Span:
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
def get_lca_matrix(self): def get_lca_matrix(self):
''' """Calculates the lowest common ancestor matrix for a given `Span`.
Calculates the lowest common ancestor matrix Returns LCA matrix containing the integer index of the ancestor, or -1
for a given Spacy span. if no common ancestor is found (ex if span excludes a necessary
Returns LCA matrix containing the integer index ancestor). Apologies about the recursion, but the impact on
of the ancestor, or -1 if no common ancestor is performance is negligible given the natural limitations on the depth
found (ex if span excludes a necessary ancestor). of a typical human sentence.
Apologies about the recursion, but the """
impact on performance is negligible given
the natural limitations on the depth of a typical human sentence.
'''
def __pairwise_lca(token_j, token_k, lca_matrix, margins): def __pairwise_lca(token_j, token_k, lca_matrix, margins):
offset = margins[0] offset = margins[0]
token_k_head = token_k.head if token_k.head.i in range(*margins) else token_k token_k_head = token_k.head if token_k.head.i in range(*margins) else token_k
token_j_head = token_j.head if token_j.head.i in range(*margins) else token_j token_j_head = token_j.head if token_j.head.i in range(*margins) else token_j
token_j_i = token_j.i - offset token_j_i = token_j.i - offset
token_k_i = token_k.i - offset token_k_i = token_k.i - offset
if lca_matrix[token_j_i][token_k_i] != -2: if lca_matrix[token_j_i][token_k_i] != -2:
return lca_matrix[token_j_i][token_k_i] return lca_matrix[token_j_i][token_k_i]
elif token_j == token_k: elif token_j == token_k:
@ -209,23 +206,19 @@ cdef class Span:
lca_index = -1 lca_index = -1
else: else:
lca_index = __pairwise_lca(token_j_head, token_k_head, lca_matrix, margins) lca_index = __pairwise_lca(token_j_head, token_k_head, lca_matrix, margins)
lca_matrix[token_j_i][token_k_i] = lca_index lca_matrix[token_j_i][token_k_i] = lca_index
lca_matrix[token_k_i][token_j_i] = lca_index lca_matrix[token_k_i][token_j_i] = lca_index
return lca_index return lca_index
lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32) lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32)
lca_matrix.fill(-2) lca_matrix.fill(-2)
margins = [self.start, self.end] margins = [self.start, self.end]
for j in range(len(self)): for j in range(len(self)):
token_j = self[j] token_j = self[j]
for k in range(len(self)): for k in range(len(self)):
token_k = self[k] token_k = self[k]
lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix, margins) lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix, margins)
lca_matrix[k][j] = lca_matrix[j][k] lca_matrix[k][j] = lca_matrix[j][k]
return lca_matrix return lca_matrix
cpdef np.ndarray to_array(self, object py_attr_ids): cpdef np.ndarray to_array(self, object py_attr_ids):
@ -349,7 +342,8 @@ cdef class Span:
"""The text content of the span with a trailing whitespace character if """The text content of the span with a trailing whitespace character if
the last token has one. the last token has one.
RETURNS (unicode): The text content of the span (with trailing whitespace). RETURNS (unicode): The text content of the span (with trailing
whitespace).
""" """
def __get__(self): def __get__(self):
return u''.join([t.text_with_ws for t in self]) return u''.join([t.text_with_ws for t in self])
@ -358,7 +352,8 @@ cdef class Span:
"""Yields base noun-phrase `Span` objects, if the document has been """Yields base noun-phrase `Span` objects, if the document has been
syntactically parsed. A base noun phrase, or "NP chunk", is a noun syntactically parsed. A base noun phrase, or "NP chunk", is a noun
phrase that does not permit other NPs to be nested within it so no phrase that does not permit other NPs to be nested within it so no
NP-level coordination, no prepositional phrases, and no relative clauses. NP-level coordination, no prepositional phrases, and no relative
clauses.
YIELDS (Span): Base noun-phrase `Span` objects YIELDS (Span): Base noun-phrase `Span` objects
""" """
@ -366,7 +361,8 @@ cdef class Span:
if not self.doc.is_parsed: if not self.doc.is_parsed:
raise ValueError( raise ValueError(
"noun_chunks requires the dependency parse, which " "noun_chunks requires the dependency parse, which "
"requires data to be installed. For more info, see the " "requires a statistical model to be installed and loaded. "
"For more info, see the "
"documentation: \n%s\n" % about.__docs_models__) "documentation: \n%s\n" % about.__docs_models__)
# Accumulate the result before beginning to iterate over it. This prevents # Accumulate the result before beginning to iterate over it. This prevents
# the tokenisation from being changed out from under us during the iteration. # the tokenisation from being changed out from under us during the iteration.
@ -385,9 +381,9 @@ cdef class Span:
RETURNS (Token): The root token. RETURNS (Token): The root token.
EXAMPLE: The root token has the shortest path to the root of the sentence EXAMPLE: The root token has the shortest path to the root of the
(or is the root itself). If multiple words are equally high in the sentence (or is the root itself). If multiple words are equally
tree, the first word is taken. For example: high in the tree, the first word is taken. For example:
>>> toks = nlp(u'I like New York in Autumn.') >>> toks = nlp(u'I like New York in Autumn.')
@ -437,11 +433,11 @@ cdef class Span:
if self.doc.c[i].head == 0: if self.doc.c[i].head == 0:
return self.doc[i] return self.doc[i]
# If we don't have a sentence root, we do something that's not so # If we don't have a sentence root, we do something that's not so
# algorithmically clever, but I think should be quite fast, especially # algorithmically clever, but I think should be quite fast,
# for short spans. # especially for short spans.
# For each word, we count the path length, and arg min this measure. # For each word, we count the path length, and arg min this measure.
# We could use better tree logic to save steps here...But I think this # We could use better tree logic to save steps here...But I
# should be okay. # think this should be okay.
cdef int current_best = self.doc.length cdef int current_best = self.doc.length
cdef int root = -1 cdef int root = -1
for i in range(self.start, self.end): for i in range(self.start, self.end):
@ -463,7 +459,7 @@ cdef class Span:
YIELDS (Token): A left-child of a token of the span. YIELDS (Token): A left-child of a token of the span.
""" """
def __get__(self): def __get__(self):
for token in reversed(self): # Reverse, so we get the tokens in order for token in reversed(self): # Reverse, so we get tokens in order
for left in token.lefts: for left in token.lefts:
if left.i < self.start: if left.i < self.start:
yield left yield left
@ -493,7 +489,7 @@ cdef class Span:
yield from word.subtree yield from word.subtree
property ent_id: property ent_id:
"""An (integer) entity ID. Usually assigned by patterns in the `Matcher`. """An (integer) entity ID.
RETURNS (uint64): The entity ID. RETURNS (uint64): The entity ID.
""" """
@ -503,8 +499,8 @@ cdef class Span:
def __set__(self, hash_t key): def __set__(self, hash_t key):
# TODO # TODO
raise NotImplementedError( raise NotImplementedError(
"Can't yet set ent_id from Span. Vote for this feature on the issue " "Can't yet set ent_id from Span. Vote for this feature on "
"tracker: http://github.com/explosion/spaCy/issues") "the issue tracker: http://github.com/explosion/spaCy/issues")
property ent_id_: property ent_id_:
"""A (string) entity ID. Usually assigned by patterns in the `Matcher`. """A (string) entity ID. Usually assigned by patterns in the `Matcher`.
@ -517,13 +513,16 @@ cdef class Span:
def __set__(self, hash_t key): def __set__(self, hash_t key):
# TODO # TODO
raise NotImplementedError( raise NotImplementedError(
"Can't yet set ent_id_ from Span. Vote for this feature on the issue " "Can't yet set ent_id_ from Span. Vote for this feature on the "
"tracker: http://github.com/explosion/spaCy/issues") "issue tracker: http://github.com/explosion/spaCy/issues")
property orth_: property orth_:
# TODO: docstring """Verbatim text content (identical to Span.text). Exists mostly for
consistency with other attributes.
RETURNS (unicode): The span's text."""
def __get__(self): def __get__(self):
return ''.join([t.string for t in self]).strip() return ''.join([t.orth_ for t in self]).strip()
property lemma_: property lemma_:
"""The span's lemma. """The span's lemma.
@ -534,19 +533,19 @@ cdef class Span:
return ' '.join([t.lemma_ for t in self]).strip() return ' '.join([t.lemma_ for t in self]).strip()
property upper_: property upper_:
# TODO: docstring """Deprecated. Use Span.text.upper() instead."""
def __get__(self): def __get__(self):
return ''.join([t.string.upper() for t in self]).strip() return ''.join([t.text_with_ws.upper() for t in self]).strip()
property lower_: property lower_:
# TODO: docstring """Deprecated. Use Span.text.lower() instead."""
def __get__(self): def __get__(self):
return ''.join([t.string.lower() for t in self]).strip() return ''.join([t.text_with_ws.lower() for t in self]).strip()
property string: property string:
# TODO: docstring """Deprecated: Use Span.text instead."""
def __get__(self): def __get__(self):
return ''.join([t.string for t in self]) return ''.join([t.text_with_ws for t in self])
property label_: property label_:
"""The span's label. """The span's label.
@ -570,7 +569,8 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
n += 1 n += 1
if n >= sent_length: if n >= sent_length:
raise RuntimeError( raise RuntimeError(
"Array bounds exceeded while searching for root word. This likely " "Array bounds exceeded while searching for root word. This "
"means the parse tree is in an invalid state. Please report this " "likely means the parse tree is in an invalid state. Please "
"issue here: http://github.com/explosion/spaCy/issues") "report this issue here: "
"http://github.com/explosion/spaCy/issues")
return n return n

View File

@ -14,17 +14,18 @@ from ..typedefs cimport hash_t
from ..lexeme cimport Lexeme from ..lexeme cimport Lexeme
from .. import parts_of_speech from .. import parts_of_speech
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
from ..attrs cimport LEMMA, POS, TAG, DEP from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
from ..compat import is_config from ..compat import is_config
from .. import about from .. import about
from .underscore import Underscore from .underscore import Underscore
cdef class Token: cdef class Token:
"""An individual token i.e. a word, punctuation symbol, whitespace, etc.""" """An individual token i.e. a word, punctuation symbol, whitespace,
etc."""
@classmethod @classmethod
def set_extension(cls, name, default=None, method=None, def set_extension(cls, name, default=None, method=None,
getter=None, setter=None): getter=None, setter=None):
@ -171,10 +172,11 @@ cdef class Token:
return self.orth_ return self.orth_
property text_with_ws: property text_with_ws:
"""The text content of the token with a trailing whitespace character if """The text content of the token with a trailing whitespace character
it has one. if it has one.
RETURNS (unicode): The text content of the span (with trailing whitespace). RETURNS (unicode): The text content of the span (with trailing
whitespace).
""" """
def __get__(self): def __get__(self):
cdef unicode orth = self.vocab.strings[self.c.lex.orth] cdef unicode orth = self.vocab.strings[self.c.lex.orth]
@ -306,9 +308,8 @@ cdef class Token:
def __set__(self, value): def __set__(self, value):
if self.doc.is_parsed: if self.doc.is_parsed:
raise ValueError( raise ValueError(
'Refusing to write to token.sent_start if its document is parsed, ' "Refusing to write to token.sent_start if its document "
'because this may cause inconsistent state. ' "is parsed, because this may cause inconsistent state.")
'See https://github.com/spacy-io/spaCy/issues/235 for workarounds.')
if value is None: if value is None:
self.c.sent_start = 0 self.c.sent_start = 0
elif value is True: elif value is True:
@ -316,13 +317,12 @@ cdef class Token:
elif value is False: elif value is False:
self.c.sent_start = -1 self.c.sent_start = -1
else: else:
raise ValueError("Invalid value for token.sent_start -- must be one of " raise ValueError("Invalid value for token.sent_start. Must be "
"None, True, False") "one of: None, True, False")
property lefts: property lefts:
def __get__(self): def __get__(self):
""" """The leftward immediate children of the word, in the syntactic
The leftward immediate children of the word, in the syntactic
dependency parse. dependency parse.
""" """
cdef int nr_iter = 0 cdef int nr_iter = 0
@ -334,13 +334,12 @@ cdef class Token:
nr_iter += 1 nr_iter += 1
# This is ugly, but it's a way to guard out infinite loops # This is ugly, but it's a way to guard out infinite loops
if nr_iter >= 10000000: if nr_iter >= 10000000:
raise RuntimeError( raise RuntimeError("Possibly infinite loop encountered "
"Possibly infinite loop encountered while looking for token.lefts") "while looking for token.lefts")
property rights: property rights:
def __get__(self): def __get__(self):
""" """The rightward immediate children of the word, in the syntactic
The rightward immediate children of the word, in the syntactic
dependency parse. dependency parse.
""" """
cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i) cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
@ -352,27 +351,26 @@ cdef class Token:
ptr -= 1 ptr -= 1
nr_iter += 1 nr_iter += 1
if nr_iter >= 10000000: if nr_iter >= 10000000:
raise RuntimeError( raise RuntimeError("Possibly infinite loop encountered "
"Possibly infinite loop encountered while looking for token.rights") "while looking for token.rights")
tokens.reverse() tokens.reverse()
for t in tokens: for t in tokens:
yield t yield t
property children: property children:
""" """A sequence of the token's immediate syntactic children.
A sequence of the token's immediate syntactic children.
Yields: Token A child token such that child.head==self YIELDS (Token): A child token such that child.head==self
""" """
def __get__(self): def __get__(self):
yield from self.lefts yield from self.lefts
yield from self.rights yield from self.rights
property subtree: property subtree:
""" """A sequence of all the token's syntactic descendents.
A sequence of all the token's syntactic descendents.
Yields: Token A descendent token such that self.is_ancestor(descendent) YIELDS (Token): A descendent token such that
`self.is_ancestor(descendent)`.
""" """
def __get__(self): def __get__(self):
for word in self.lefts: for word in self.lefts:
@ -456,13 +454,15 @@ cdef class Token:
if self.c.head > 0: # left dependent if self.c.head > 0: # left dependent
old_head.c.l_kids -= 1 old_head.c.l_kids -= 1
if self.c.l_edge == old_head.c.l_edge: if self.c.l_edge == old_head.c.l_edge:
# the token dominates the left edge so the left edge of the head # the token dominates the left edge so the left edge of
# may change when the token is reattached # the head may change when the token is reattached, it may
# it may not change if the new head is a descendant of the current head # not change if the new head is a descendant of the current
# head
new_edge = self.c.l_edge new_edge = self.c.l_edge
# the new l_edge is the left-most l_edge on any of the other dependents # the new l_edge is the left-most l_edge on any of the
# where the l_edge is left of the head, otherwise it is the head # other dependents where the l_edge is left of the head,
# otherwise it is the head
if not is_desc: if not is_desc:
new_edge = old_head.i new_edge = old_head.i
for child in old_head.children: for child in old_head.children:
@ -472,8 +472,9 @@ cdef class Token:
new_edge = child.c.l_edge new_edge = child.c.l_edge
old_head.c.l_edge = new_edge old_head.c.l_edge = new_edge
# walk up the tree from old_head and assign new l_edge to ancestors # walk up the tree from old_head and assign new l_edge to
# until an ancestor already has an l_edge that's further left # ancestors until an ancestor already has an l_edge that's
# further left
for anc in old_head.ancestors: for anc in old_head.ancestors:
if anc.c.l_edge <= new_edge: if anc.c.l_edge <= new_edge:
break break
@ -572,8 +573,8 @@ cdef class Token:
property ent_iob_: property ent_iob_:
"""IOB code of named entity tag. "B" means the token begins an entity, """IOB code of named entity tag. "B" means the token begins an entity,
"I" means it is inside an entity, "O" means it is outside an entity, and "I" means it is inside an entity, "O" means it is outside an entity,
"" means no entity tag is set. and "" means no entity tag is set.
RETURNS (unicode): IOB code of named entity tag. RETURNS (unicode): IOB code of named entity tag.
""" """
@ -582,8 +583,7 @@ cdef class Token:
return iob_strings[self.c.ent_iob] return iob_strings[self.c.ent_iob]
property ent_id: property ent_id:
"""ID of the entity the token is an instance of, if any. Usually """ID of the entity the token is an instance of, if any.
assigned by patterns in the Matcher.
RETURNS (uint64): ID of the entity. RETURNS (uint64): ID of the entity.
""" """
@ -594,8 +594,7 @@ cdef class Token:
self.c.ent_id = key self.c.ent_id = key
property ent_id_: property ent_id_:
"""ID of the entity the token is an instance of, if any. Usually """ID of the entity the token is an instance of, if any.
assigned by patterns in the Matcher.
RETURNS (unicode): ID of the entity. RETURNS (unicode): ID of the entity.
""" """
@ -606,34 +605,70 @@ cdef class Token:
self.c.ent_id = self.vocab.strings.add(name) self.c.ent_id = self.vocab.strings.add(name)
property whitespace_: property whitespace_:
"""Trailing space character if present.
RETURNS (unicode): The whitespace character.
"""
def __get__(self): def __get__(self):
return ' ' if self.c.spacy else '' return ' ' if self.c.spacy else ''
property orth_:
"""Verbatim text content (identical to `Token.text`). Exists mostly
for consistency with the other attributes.
RETURNS (unicode): The token text.
"""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.lex.orth] return self.vocab.strings[self.c.lex.orth]
property lower_: property lower_:
"""Lowercase form of the token text. Equivalent to
`Token.text.lower()`.
RETURNS (unicode): The lowercase token text.
"""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.lex.lower] return self.vocab.strings[self.c.lex.lower]
property norm_: property norm_:
"""The token's norm, i.e. a normalised form of the token text.
Usually set in the language's tokenizer exceptions or norm exceptions.
RETURNS (unicode): The norm.
"""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.lex.norm] return self.vocab.strings[self.c.lex.norm]
property shape_:
"""Transform of the token's string, to show orthographic features.
For example, "Xxxx" or "dd".
RETURNS (unicode): The token shape.
"""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.lex.shape] return self.vocab.strings[self.c.lex.shape]
property prefix_: property prefix_:
"""A length-N substring from the start of the token. Defaults to `N=1`.
RETURNS (unicode): The token's prefix.
"""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.lex.prefix] return self.vocab.strings[self.c.lex.prefix]
property suffix_: property suffix_:
"""A length-N substring from the end of the token. Defaults to `N=3`.
RETURNS (unicode): The token's suffix.
"""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.lex.suffix] return self.vocab.strings[self.c.lex.suffix]
property lang_: property lang_:
"""Language of the parent document's vocabulary, e.g. 'en'.
RETURNS (unicode): The language code.
"""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.lex.lang] return self.vocab.strings[self.c.lex.lang]
@ -648,65 +683,152 @@ cdef class Token:
self.c.lemma = self.vocab.strings.add(lemma_) self.c.lemma = self.vocab.strings.add(lemma_)
property pos_: property pos_:
"""Coarse-grained part-of-speech.
RETURNS (unicode): The part-of-speech tag.
"""
def __get__(self): def __get__(self):
return parts_of_speech.NAMES[self.c.pos] return parts_of_speech.NAMES[self.c.pos]
property tag_: property tag_:
"""Fine-grained part-of-speech.
RETURNS (unicode): The part-of-speech tag.
"""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.tag] return self.vocab.strings[self.c.tag]
def __set__(self, tag): def __set__(self, tag):
self.tag = self.vocab.strings.add(tag) self.tag = self.vocab.strings.add(tag)
property dep_: property dep_:
"""Syntactic dependency relation.
RETURNS (unicode): The dependency label.
"""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.dep] return self.vocab.strings[self.c.dep]
def __set__(self, unicode label): def __set__(self, unicode label):
self.c.dep = self.vocab.strings.add(label) self.c.dep = self.vocab.strings.add(label)
property is_oov: property is_oov:
"""Is the token out-of-vocabulary?
RETURNS (bool): Whether the token is out-of-vocabulary.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV) def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)
property is_stop: property is_stop:
"""Is the token part of a "stop list"? (defined by the language data)
RETURNS (bool): Whether the token is a stop word.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_STOP) def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_STOP)
property is_alpha: property is_alpha:
"""Does the token consist of alphabetic characters? Equivalent to
`token.text.isalpha()`.
RETURNS (bool): Whether the token consists of alpha characters.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ALPHA) def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ALPHA)
property is_ascii: property is_ascii:
"""Does the token consist of ASCII characters? Equivalent to
`all(ord(c) < 128 for c in token.text)`.
RETURNS (bool): Whether the token consists of ASCII characters.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ASCII) def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ASCII)
property is_digit: property is_digit:
"""Does the token consist of digits? Equivalent to
`token.text.isdigit()`.
RETURNS (bool): Whether the token consists of digits.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_DIGIT) def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_DIGIT)
property is_lower: property is_lower:
"""Is the token in lowercase? Equivalent to `token.text.islower()`.
RETURNS (bool): Whether the token is in lowercase.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LOWER) def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LOWER)
property is_upper:
"""Is the token in uppercase? Equivalent to `token.text.isupper()`.
RETURNS (bool): Whether the token is in uppercase.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_UPPER)
property is_title: property is_title:
"""Is the token in titlecase? Equivalent to `token.text.istitle()`.
RETURNS (bool): Whether the token is in titlecase.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_TITLE) def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_TITLE)
property is_punct: property is_punct:
"""Is the token punctuation?
RETURNS (bool): Whether the token is punctuation.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT) def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
property is_space: property is_space:
"""Does the token consist of whitespace characters? Equivalent to
`token.text.isspace()`.
RETURNS (bool): Whether the token consists of whitespace characters.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE) def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
property is_bracket: property is_bracket:
"""Is the token a bracket?
RETURNS (bool): Whether the token is a bracket.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET) def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
property is_quote: property is_quote:
"""Is the token a quotation mark?
RETURNS (bool): Whether the token is a quotation mark.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE) def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
property is_left_punct: property is_left_punct:
"""Is the token a left punctuation mark, e.g. "("?
RETURNS (bool): Whether the token is a left punctuation mark.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT) def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
property is_right_punct:
"""Is the token a right punctuation mark, e.g. ")"?
RETURNS (bool): Whether the token is a right punctuation mark.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT) def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
property like_url: property like_url:
"""Does the token resemble a URL?
RETURNS (bool): Whether the token resembles a URL.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL) def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL)
property like_num: property like_num:
"""Does the token represent a number? e.g. "10.9", "10", "ten", etc.
RETURNS (bool): Whether the token resembles a number.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_NUM) def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_NUM)
property like_email: property like_email:
"""Does the token resemble an email address?
RETURNS (bool): Whether the token resembles an email address.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL) def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)

View File

@ -248,6 +248,28 @@ p
+cell float +cell float
+cell A scalar similarity score. Higher is more similar. +cell A scalar similarity score. Higher is more similar.
+h(2, "get_lca_matrix") Span.get_lca_matrix
+tag method
p
| Calculates the lowest common ancestor matrix for a given #[code Span].
| Returns LCA matrix containing the integer index of the ancestor, or
| #[code -1] if no common ancestor is found, e.g. if span excludes a
| necessary ancestor.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn')
span = doc[1:4]
matrix = span.get_lca_matrix()
# array([[0, 0, 0], [0, 1, 2], [0, 2, 2]], dtype=int32)
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
+cell The lowest common ancestor matrix of the #[code Span].
+h(2, "to_array") Span.to_array +h(2, "to_array") Span.to_array
+tag method +tag method
+tag-new(2) +tag-new(2)
@ -495,6 +517,18 @@ p
| The text content of the span with a trailing whitespace character | The text content of the span with a trailing whitespace character
| if the last token has one. | if the last token has one.
+row
+cell #[code orth]
+cell int
+cell ID of the verbatim text content.
+row
+cell #[code orth_]
+cell unicode
+cell
| Verbatim text content (identical to #[code Span.text]). Exists
| mostly for consistency with the other attributes.
+row +row
+cell #[code label] +cell #[code label]
+cell int +cell int

View File

@ -489,15 +489,35 @@ p The L2 norm of the token's vector representation.
+cell unicode +cell unicode
+cell Base form of the token, with no inflectional suffixes. +cell Base form of the token, with no inflectional suffixes.
+row
+cell #[code norm]
+cell int
+cell
| The token's norm, i.e. a normalised form of the token text.
| Usually set in the language's
| #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or
| #[+a("/usage/adding-languages#norm-exceptions") norm exceptions].
+row
+cell #[code norm_]
+cell unicode
+cell
| The token's norm, i.e. a normalised form of the token text.
| Usually set in the language's
| #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or
| #[+a("/usage/adding-languages#norm-exceptions") norm exceptions].
+row +row
+cell #[code lower] +cell #[code lower]
+cell int +cell int
+cell Lower-case form of the token. +cell Lowercase form of the token.
+row +row
+cell #[code lower_] +cell #[code lower_]
+cell unicode +cell unicode
+cell Lower-case form of the token. +cell
| Lowercase form of the token text. Equivalent to
| #[code Token.text.lower()].
+row +row
+cell #[code shape] +cell #[code shape]
@ -537,7 +557,9 @@ p The L2 norm of the token's vector representation.
+row +row
+cell #[code suffix_] +cell #[code suffix_]
+cell unicode +cell unicode
+cell Length-N substring from the end of the token. Defaults to #[code N=3]. +cell
| Length-N substring from the end of the token. Defaults to
| #[code N=3].
+row +row
+cell #[code is_alpha] +cell #[code is_alpha]
@ -672,6 +694,7 @@ p The L2 norm of the token's vector representation.
+cell #[code lang] +cell #[code lang]
+cell int +cell int
+cell Language of the parent document's vocabulary. +cell Language of the parent document's vocabulary.
+row +row
+cell #[code lang_] +cell #[code lang_]
+cell unicode +cell unicode