Tidy up Doc, Token and Span and add missing docs

ines 2017-10-27 17:07:26 +02:00
parent a6135336f5
commit 544a407b93
7 changed files with 384 additions and 237 deletions

View File

@ -326,7 +326,8 @@ cdef class Doc:
if self._vector is not None:
return self._vector
elif not len(self):
self._vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
self._vector = numpy.zeros((self.vocab.vectors_length,),
dtype='f')
return self._vector
elif self.has_vector:
vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
@ -338,7 +339,8 @@ cdef class Doc:
self._vector = self.tensor.mean(axis=0)
return self._vector
else:
return numpy.zeros((self.vocab.vectors_length,), dtype='float32')
return numpy.zeros((self.vocab.vectors_length,),
dtype='float32')
def __set__(self, value):
self._vector = value
@ -424,7 +426,8 @@ cdef class Doc:
def __set__(self, ents):
# TODO:
# 1. Allow negative matches
# 2. Ensure pre-set NERs are not over-written during statistical prediction
# 2. Ensure pre-set NERs are not over-written during statistical
# prediction
# 3. Test basic data-driven ORTH gazetteer
# 4. Test more nuanced date and currency regex
cdef int i
@ -433,7 +436,7 @@ cdef class Doc:
# At this point we don't know whether the NER has run over the
# Doc. If the ent_iob is missing, leave it missing.
if self.c[i].ent_iob != 0:
self.c[i].ent_iob = 2 # Means O. Non-O are set from ents.
self.c[i].ent_iob = 2 # Means O. Non-O are set from ents.
cdef attr_t ent_type
cdef int start, end
for ent_info in ents:
@ -574,18 +577,19 @@ cdef class Doc:
# Allow strings, e.g. 'lemma' or 'LEMMA'
py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, 'upper') else id_)
for id_ in py_attr_ids]
# Make an array from the attributes --- otherwise our inner loop is Python
# dict iteration.
# Make an array from the attributes --- otherwise our inner loop is
# Python dict iteration.
attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
output = numpy.ndarray(shape=(self.length, len(attr_ids)),
dtype=numpy.uint64)
for i in range(self.length):
for j, feature in enumerate(attr_ids):
output[i, j] = get_token_attr(&self.c[i], feature)
# Handle 1d case
return output if len(attr_ids) >= 2 else output.reshape((self.length,))
def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
def count_by(self, attr_id_t attr_id, exclude=None,
PreshCounter counts=None):
"""Count the frequencies of a given attribute. Produces a dict of
`{attribute (int): count (ints)}` frequencies, keyed by the values of
the given attribute ID.
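For orientation, a quick usage sketch of `Doc.to_array` and `Doc.count_by`
(a sketch assuming the small English model is installed):

    import spacy
    from spacy.attrs import LOWER, POS

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'Give it back! He pleaded.')
    # Attribute IDs and string names ('lower', 'LOWER') are both accepted.
    arr = doc.to_array([LOWER, POS])
    assert arr.shape == (len(doc), 2)
    # A single attribute takes the 1d reshape branch above.
    assert doc.to_array([LOWER]).shape == (len(doc),)
    # count_by returns {attribute ID (int): count (int)}.
    pos_counts = doc.count_by(POS)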
@ -708,7 +712,8 @@ cdef class Doc:
elif (token_j.head == token_j) and (token_k.head == token_k):
lca_index = -1
else:
lca_index = __pairwise_lca(token_j.head, token_k.head, lca_matrix)
lca_index = __pairwise_lca(token_j.head, token_k.head,
lca_matrix)
lca_matrix[token_j.i][token_k.i] = lca_index
lca_matrix[token_k.i][token_j.i] = lca_index
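For context, `__pairwise_lca` backs the public `Doc.get_lca_matrix()`. A
minimal sketch of the result, assuming a parsed `doc` (pairs with no common
ancestor are marked -1):

    lca = doc.get_lca_matrix()
    assert lca.shape == (len(doc), len(doc))
    # A token should be its own lowest common ancestor with itself.
    assert lca[0][0] == 0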
@ -728,7 +733,7 @@ cdef class Doc:
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
it doesn't exist. Paths may be either strings or Path-like objects.
"""
with path.open('wb') as file_:
file_.write(self.to_bytes(**exclude))
@ -751,7 +756,7 @@ cdef class Doc:
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
all annotations.
"""
array_head = [LENGTH,SPACY,TAG,LEMMA,HEAD,DEP,ENT_IOB,ENT_TYPE]
array_head = [LENGTH, SPACY, TAG, LEMMA, HEAD, DEP, ENT_IOB, ENT_TYPE]
# Msgpack doesn't distinguish between lists and tuples, which is
# vexing for user data. As a best guess, we *know* that within
# keys, we must have tuples. In values we just have to hope
@ -794,7 +799,8 @@ cdef class Doc:
# keys, we must have tuples. In values we just have to hope
# users don't mind getting a list instead of a tuple.
if 'user_data' not in exclude and 'user_data_keys' in msg:
user_data_keys = msgpack.loads(msg['user_data_keys'], use_list=False)
user_data_keys = msgpack.loads(msg['user_data_keys'],
use_list=False)
user_data_values = msgpack.loads(msg['user_data_values'])
for key, value in zip(user_data_keys, user_data_values):
self.user_data[key] = value
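A round-trip sketch of the serialization above; per the msgpack caveat,
tuple values in `user_data` come back as lists, while keys are restored as
tuples:

    from spacy.tokens import Doc

    data = doc.to_bytes()
    doc2 = Doc(doc.vocab).from_bytes(data)
    assert doc2.text == doc.text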
@ -853,7 +859,8 @@ cdef class Doc:
"Doc.merge received %d non-keyword arguments. Expected either "
"3 arguments (deprecated), or 0 (use keyword arguments). "
"Arguments supplied:\n%s\n"
"Keyword arguments: %s\n" % (len(args), repr(args), repr(attributes)))
"Keyword arguments: %s\n" % (len(args), repr(args),
repr(attributes)))
# More deprecated attribute handling =/
if 'label' in attributes:
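To illustrate the keyword-argument form the message above recommends (start
and end are character offsets into the text; a sketch assuming a loaded
pipeline):

    doc = nlp(u'Los Angeles starts.')
    doc.merge(0, len(u'Los Angeles'), tag=u'NNP', lemma=u'Los Angeles',
              ent_type=u'GPE')
    assert doc[0].text == u'Los Angeles'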

View File

@ -128,14 +128,17 @@ cdef class Span:
@property
def _(self):
"""User space for adding custom attribute extensions."""
return Underscore(Underscore.span_extensions, self,
start=self.start_char, end=self.end_char)
def as_doc(self):
'''Create a Doc object view of the Span's data.
This is mostly useful for C-typed interfaces.
'''
# TODO: fix
"""Create a `Doc` object view of the Span's data. This is mostly
useful for C-typed interfaces.
RETURNS (Doc): The `Doc` view of the span.
"""
cdef Doc doc = Doc(self.doc.vocab)
doc.length = self.end-self.start
doc.c = &self.doc.c[self.start]
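A usage sketch for `Span.as_doc()`; note the TODO above, and that the
returned `Doc` is a view sharing the parent's token data rather than a copy:

    doc = nlp(u'I like New York in Autumn.')
    span_doc = doc[2:4].as_doc()
    assert span_doc[0].text == u'New'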
@ -259,10 +262,7 @@ cdef class Span:
self.end = end + 1
property sent:
"""The sentence span that this span is a part of.
RETURNS (Span): The sentence span that the span is a part of.
"""
"""RETURNS (Span): The sentence span that the span is a part of."""
def __get__(self):
if 'sent' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['sent'](self)
@ -275,13 +275,10 @@ cdef class Span:
n += 1
if n >= self.doc.length:
raise RuntimeError
return self.doc[root.l_edge : root.r_edge + 1]
return self.doc[root.l_edge:root.r_edge + 1]
property has_vector:
"""A boolean value indicating whether a word vector is associated with
the object.
RETURNS (bool): Whether a word vector is associated with the object.
"""RETURNS (bool): Whether a word vector is associated with the object.
"""
def __get__(self):
if 'has_vector' in self.doc.user_span_hooks:
@ -303,10 +300,7 @@ cdef class Span:
return self._vector
property vector_norm:
"""The L2 norm of the document's vector representation.
RETURNS (float): The L2 norm of the vector representation.
"""
"""RETURNS (float): The L2 norm of the vector representation."""
def __get__(self):
if 'vector_norm' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['vector'](self)
@ -320,7 +314,9 @@ cdef class Span:
return self._vector_norm
property sentiment:
# TODO: docstring
"""RETURNS (float): A scalar value indicating the positivity or
negativity of the span.
"""
def __get__(self):
if 'sentiment' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['sentiment'](self)
@ -328,10 +324,7 @@ cdef class Span:
return sum([token.sentiment for token in self]) / len(self)
property text:
"""A unicode representation of the span text.
RETURNS (unicode): The original verbatim text of the span.
"""
"""RETURNS (unicode): The original verbatim text of the span."""
def __get__(self):
text = self.text_with_ws
if self[-1].whitespace_:
@ -364,10 +357,11 @@ cdef class Span:
"requires a statistical model to be installed and loaded. "
"For more info, see the "
"documentation: \n%s\n" % about.__docs_models__)
# Accumulate the result before beginning to iterate over it. This prevents
# the tokenisation from being changed out from under us during the iteration.
# The tricky thing here is that Span accepts its tokenisation changing,
# so it's okay once we have the Span objects. See Issue #375
# Accumulate the result before beginning to iterate over it. This
# prevents the tokenisation from being changed out from under us
# during the iteration. The tricky thing here is that Span accepts
# its tokenisation changing, so it's okay once we have the Span
# objects. See Issue #375
spans = []
cdef attr_t label
for start, end, label in self.doc.noun_chunks_iterator(self):
@ -459,7 +453,7 @@ cdef class Span:
YIELDS (Token): A left-child of a token of the span.
"""
def __get__(self):
for token in reversed(self): # Reverse, so we get tokens in order
for token in reversed(self): # Reverse, so we get tokens in order
for left in token.lefts:
if left.i < self.start:
yield left
@ -476,6 +470,20 @@ cdef class Span:
if right.i >= self.end:
yield right
property n_lefts:
"""RETURNS (int): The number of leftward immediate children of the
span, in the syntactic dependency parse.
"""
# TODO: implement
raise NotImplementedError()
property n_rights:
"""RETURNS (int): The number of rightward immediate children of the
span, in the syntactic dependency parse.
"""
# TODO: implement
raise NotImplementedError()
property subtree:
"""Tokens that descend from tokens in the span, but fall outside it.
@ -489,29 +497,21 @@ cdef class Span:
yield from word.subtree
property ent_id:
"""An (integer) entity ID.
RETURNS (uint64): The entity ID.
"""
"""RETURNS (uint64): The entity ID."""
def __get__(self):
return self.root.ent_id
def __set__(self, hash_t key):
# TODO
raise NotImplementedError(
"Can't yet set ent_id from Span. Vote for this feature on "
"the issue tracker: http://github.com/explosion/spaCy/issues")
property ent_id_:
"""A (string) entity ID. Usually assigned by patterns in the `Matcher`.
RETURNS (unicode): The entity ID.
"""
"""RETURNS (unicode): The (string) entity ID."""
def __get__(self):
return self.root.ent_id_
def __set__(self, hash_t key):
# TODO
raise NotImplementedError(
"Can't yet set ent_id_ from Span. Vote for this feature on the "
"issue tracker: http://github.com/explosion/spaCy/issues")
@ -525,10 +525,7 @@ cdef class Span:
return ''.join([t.orth_ for t in self]).strip()
property lemma_:
"""The span's lemma.
RETURNS (unicode): The span's lemma.
"""
"""RETURNS (unicode): The span's lemma."""
def __get__(self):
return ' '.join([t.lemma_ for t in self]).strip()
@ -543,15 +540,12 @@ cdef class Span:
return ''.join([t.text_with_ws.lower() for t in self]).strip()
property string:
"""Deprecated: Use Span.text instead."""
"""Deprecated: Use Span.text_with_ws instead."""
def __get__(self):
return ''.join([t.text_with_ws for t in self])
property label_:
"""The span's label.
RETURNS (unicode): The span's label.
"""
"""RETURNS (unicode): The span's label."""
def __get__(self):
return self.doc.vocab.strings[self.label]
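A short sketch tying together the string-typed span properties documented
above (entity output depends on the loaded model):

    doc = nlp(u'San Francisco considers banning delivery robots')
    span = doc.ents[0]
    assert span.text == u'San Francisco'
    assert span.label_ == u'GPE'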

View File

@ -145,37 +145,32 @@ cdef class Token:
return self.doc.user_token_hooks['similarity'](self)
if self.vector_norm == 0 or other.vector_norm == 0:
return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
return (numpy.dot(self.vector, other.vector) /
(self.vector_norm * other.vector_norm))
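The expression above is cosine similarity; a calling sketch, assuming a
model with word vectors is loaded:

    doc = nlp(u'apple orange')
    sim = doc[0].similarity(doc[1])
    assert -1.0 <= sim <= 1.0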
property lex_id:
"""ID of the token's lexical type.
RETURNS (int): ID of the token's lexical type."""
"""RETURNS (int): Sequential ID of the token's lexical type."""
def __get__(self):
return self.c.lex.id
property rank:
# TODO: add docstring
"""RETURNS (int): Sequential ID of the token's lexical type, used to
index into tables, e.g. for word vectors."""
def __get__(self):
return self.c.lex.id
property string:
"""Deprecated: Use Token.text_with_ws instead."""
def __get__(self):
return self.text_with_ws
property text:
"""A unicode representation of the token text.
RETURNS (unicode): The original verbatim text of the token.
"""
"""RETURNS (unicode): The original verbatim text of the token."""
def __get__(self):
return self.orth_
property text_with_ws:
"""The text content of the token with a trailing whitespace character
if it has one.
RETURNS (unicode): The text content of the span (with trailing
"""RETURNS (unicode): The text content of the span (with trailing
whitespace).
"""
def __get__(self):
@ -186,74 +181,104 @@ cdef class Token:
return orth
property prob:
"""RETURNS (float): Smoothed log probability estimate of token type."""
def __get__(self):
return self.c.lex.prob
property sentiment:
"""RETURNS (float): A scalar value indicating the positivity or
negativity of the token."""
def __get__(self):
if 'sentiment' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['sentiment'](self)
return self.c.lex.sentiment
property lang:
"""RETURNS (uint64): ID of the language of the parent document's
vocabulary.
"""
def __get__(self):
return self.c.lex.lang
property idx:
"""RETURNS (int): The character offset of the token within the parent
document.
"""
def __get__(self):
return self.c.idx
property cluster:
"""RETURNS (int): Brown cluster ID."""
def __get__(self):
return self.c.lex.cluster
property orth:
"""RETURNS (uint64): ID of the verbatim text content."""
def __get__(self):
return self.c.lex.orth
property lower:
"""RETURNS (uint64): ID of the lowercase token text."""
def __get__(self):
return self.c.lex.lower
property norm:
"""RETURNS (uint64): ID of the token's norm, i.e. a normalised form of
the token text. Usually set in the language's tokenizer exceptions
or norm exceptions.
"""
def __get__(self):
return self.c.lex.norm
property shape:
"""RETURNS (uint64): ID of the token's shape, a transform of the
token's string, to show orthographic features (e.g. "Xxxx", "dd").
"""
def __get__(self):
return self.c.lex.shape
property prefix:
"""RETURNS (uint64): ID of a length-N substring from the start of the
token. Defaults to `N=1`.
"""
def __get__(self):
return self.c.lex.prefix
property suffix:
"""RETURNS (uint64): ID of a length-N substring from the end of the
token. Defaults to `N=3`.
"""
def __get__(self):
return self.c.lex.suffix
property lemma:
"""Base form of the word, with no inflectional suffixes.
RETURNS (uint64): Token lemma.
"""RETURNS (uint64): ID of the base form of the word, with no
inflectional suffixes.
"""
def __get__(self):
return self.c.lemma
def __set__(self, attr_t lemma):
self.c.lemma = lemma
property pos:
"""RETURNS (uint64): ID of coarse-grained part-of-speech tag."""
def __get__(self):
return self.c.pos
property tag:
"""RETURNS (uint64): ID of fine-grained part-of-speech tag."""
def __get__(self):
return self.c.tag
def __set__(self, attr_t tag):
self.vocab.morphology.assign_tag(self.c, tag)
property dep:
"""RETURNS (uint64): ID of syntactic dependency label."""
def __get__(self):
return self.c.dep
def __set__(self, attr_t label):
self.c.dep = label
@ -294,14 +319,21 @@ cdef class Token:
return numpy.sqrt((vector ** 2).sum())
property n_lefts:
"""RETURNS (int): The number of leftward immediate children of the
word, in the syntactic dependency parse.
"""
def __get__(self):
return self.c.l_kids
property n_rights:
"""RETURNS (int): The number of rightward immediate children of the
word, in the syntactic dependency parse.
"""
def __get__(self):
return self.c.r_kids
property sent_start:
# TODO: fix and document
def __get__(self):
return self.c.sent_start
@ -321,10 +353,12 @@ cdef class Token:
"one of: None, True, False")
property lefts:
"""The leftward immediate children of the word, in the syntactic
dependency parse.
YIELDS (Token): A left-child of the token.
"""
def __get__(self):
"""The leftward immediate children of the word, in the syntactic
dependency parse.
"""
cdef int nr_iter = 0
cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge)
while ptr < self.c:
@ -338,10 +372,12 @@ cdef class Token:
"while looking for token.lefts")
property rights:
"""The rightward immediate children of the word, in the syntactic
dependency parse.
YIELDS (Token): A right-child of the token.
"""
def __get__(self):
"""The rightward immediate children of the word, in the syntactic
dependency parse.
"""
cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
tokens = []
cdef int nr_iter = 0
@ -420,18 +456,17 @@ cdef class Token:
"""
if self.doc is not descendant.doc:
return False
return any( ancestor.i == self.i for ancestor in descendant.ancestors )
return any(ancestor.i == self.i for ancestor in descendant.ancestors)
property head:
"""The syntactic parent, or "governor", of this token.
RETURNS (Token): The token head.
RETURNS (Token): The token predicted by the parser to be the head of
the current token.
"""
def __get__(self):
"""The token predicted by the parser to be the head of the current
token.
"""
return self.doc[self.i + self.c.head]
def __set__(self, Token new_head):
# this function sets the head of self to new_head
# and updates the counters for left/right dependents
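A small sketch of reading and re-setting `Token.head` (the initial
attachment depends on the loaded model):

    doc = nlp(u'I like New York')
    assert doc[2].head.text == u'York'  # 'New' attaches to 'York'
    doc[2].head = doc[1]                # reattach; l_kids/r_kids update
    assert doc[2].head.text == u'like'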
@ -451,7 +486,7 @@ cdef class Token:
cdef Token anc, child
# update number of deps of old head
if self.c.head > 0: # left dependent
if self.c.head > 0: # left dependent
old_head.c.l_kids -= 1
if self.c.l_edge == old_head.c.l_edge:
# the token dominates the left edge so the left edge of
@ -543,12 +578,10 @@ cdef class Token:
yield from word.conjuncts
property ent_type:
"""Named entity type.
RETURNS (uint64): Named entity type.
"""
"""RETURNS (uint64): Named entity type."""
def __get__(self):
return self.c.ent_type
def __set__(self, ent_type):
self.c.ent_type = ent_type
@ -562,12 +595,10 @@ cdef class Token:
return self.c.ent_iob
property ent_type_:
"""Named entity type.
RETURNS (unicode): Named entity type.
"""
"""RETURNS (unicode): Named entity type."""
def __get__(self):
return self.vocab.strings[self.c.ent_type]
def __set__(self, ent_type):
self.c.ent_type = self.vocab.strings.add(ent_type)
@ -583,9 +614,8 @@ cdef class Token:
return iob_strings[self.c.ent_iob]
property ent_id:
"""ID of the entity the token is an instance of, if any.
RETURNS (uint64): ID of the entity.
"""RETURNS (uint64): ID of the entity the token is an instance of,
if any.
"""
def __get__(self):
return self.c.ent_id
@ -594,9 +624,8 @@ cdef class Token:
self.c.ent_id = key
property ent_id_:
"""ID of the entity the token is an instance of, if any.
RETURNS (unicode): ID of the entity.
"""RETURNS (unicode): ID of the entity the token is an instance of,
if any.
"""
def __get__(self):
return self.vocab.strings[self.c.ent_id]
@ -605,230 +634,192 @@ cdef class Token:
self.c.ent_id = self.vocab.strings.add(name)
property whitespace_:
"""Trailing space character if present.
RETURNS (unicode): The whitespace character.
"""RETURNS (unicode): The trailing whitespace character, if present.
"""
def __get__(self):
return ' ' if self.c.spacy else ''
property orth_:
"""Verbatim text content (identical to `Token.text`). Existst mostly
for consistency with the other attributes.
RETURNS (unicode): The token text.
"""RETURNS (unicode): Verbatim text content (identical to
`Token.text`). Exists mostly for consistency with the other
attributes.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.orth]
property lower_:
"""Lowercase form of the token text. Equivalent to
`Token.text.lower()`.
RETURNS (unicode): The lowercase token text.
"""RETURNS (unicode): The lowercase token text. Equivalent to
`Token.text.lower()`.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.lower]
property norm_:
"""The token's norm, i.e. a normalised form of the token text.
Usually set in the language's tokenizer exceptions or norm exceptions.
RETURNS (unicode): The norm.
"""RETURNS (unicode): The token's norm, i.e. a normalised form of the
token text. Usually set in the language's tokenizer exceptions or
norm exceptions.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.norm]
property shape_:
"""Transform of the tokens's string, to show orthographic features.
For example, "Xxxx" or "dd".
RETURNS (unicode): The token shape.
"""RETURNS (unicode): Transform of the tokens's string, to show
orthographic features. For example, "Xxxx" or "dd".
"""
def __get__(self):
return self.vocab.strings[self.c.lex.shape]
property prefix_:
"""A length-N substring from the start of the token. Defaults to `N=1`.
RETURNS (unicode): The token's prefix.
"""RETURNS (unicode): A length-N substring from the start of the token.
Defaults to `N=1`.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.prefix]
property suffix_:
"""A length-N substring from the end of the token. Defaults to `N=3`.
RETURNS (unicode): The token's suffix.
"""RETURNS (unicode): A length-N substring from the end of the token.
Defaults to `N=3`.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.suffix]
property lang_:
"""Language of the parent document's vocabulary, e.g. 'en'.
RETURNS (unicode): The language code.
"""RETURNS (unicode): Language of the parent document's vocabulary,
e.g. 'en'.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.lang]
property lemma_:
"""Base form of the word, with no inflectional suffixes.
RETURNS (unicode): Token lemma.
"""RETURNS (unicode): The token lemma, i.e. the base form of the word,
with no inflectional suffixes.
"""
def __get__(self):
return self.vocab.strings[self.c.lemma]
def __set__(self, unicode lemma_):
self.c.lemma = self.vocab.strings.add(lemma_)
property pos_:
"""Coarse-grained part-of-speech.
RETURNS (unicode): The part-of-speech tag.
"""
"""RETURNS (unicode): Coarse-grained part-of-speech tag."""
def __get__(self):
return parts_of_speech.NAMES[self.c.pos]
property tag_:
"""Fine-grained part-of-speech.
RETURNS (unicode): The part-of-speech tag.
"""
"""RETURNS (unicode): Fine-grained part-of-speech tag."""
def __get__(self):
return self.vocab.strings[self.c.tag]
def __set__(self, tag):
self.tag = self.vocab.strings.add(tag)
property dep_:
"""Syntactic dependency relation.
RETURNS (unicode): The dependency label.
"""
"""RETURNS (unicode): The syntactic dependency label."""
def __get__(self):
return self.vocab.strings[self.c.dep]
def __set__(self, unicode label):
self.c.dep = self.vocab.strings.add(label)
property is_oov:
"""Is the token out-of-vocabulary?
RETURNS (bool): Whether the token is out-of-vocabulary.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)
"""RETURNS (bool): Whether the token is out-of-vocabulary."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_OOV)
property is_stop:
"""Is the token part of a "stop list"? (defined by the language data)
RETURNS (bool): Whether the token is a stop word.
"""RETURNS (bool): Whether the token is a stop word, i.e. part of a
"stop list" defined by the language data.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_STOP)
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_STOP)
property is_alpha:
"""Does the token consist of alphabetic characters? Equivalent to
`token.text.isalpha()`.
RETURNS (bool): Whether the token consists of alpha characters.
"""RETURNS (bool): Whether the token consists of alpha characters.
Equivalent to `token.text.isalpha()`.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ALPHA)
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_ALPHA)
property is_ascii:
"""Does the token consist of ASCII characters? Equivalent to
`[any(ord(c) >= 128 for c in token.text)]`.
RETURNS (bool): Whether the token consists of ASCII characters.
"""RETURNS (bool): Whether the token consists of ASCII characters.
Equivalent to `all(ord(c) < 128 for c in token.text)`.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ASCII)
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_ASCII)
property is_digit:
"""Does the token consist of digits? Equivalent to
`token.text.isdigit()`.
RETURNS (bool): Whether the token consists of digits.
"""RETURNS (bool): Whether the token consists of digits. Equivalent to
`token.text.isdigit()`.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_DIGIT)
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_DIGIT)
property is_lower:
"""Is the token in lowercase? Equivalent to `token.text.islower()`.
RETURNS (bool): Whether the token is in lowercase.
"""RETURNS (bool): Whether the token is in lowercase. Equivalent to
`token.text.islower()`.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LOWER)
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_LOWER)
property is_upper:
"""Is the token in uppercase? Equivalent to `token.text.isupper()`.
RETURNS (bool): Whether the token is in uppercase.
"""RETURNS (bool): Whether the token is in uppercase. Equivalent to
`token.text.isupper()`.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_UPPER)
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_UPPER)
property is_title:
"""Is the token in titlecase? Equivalent to `token.text.istitle()`.
RETURNS (bool): Whether the token is in titlecase.
"""RETURNS (bool): Whether the token is in titlecase. Equivalent to
`token.text.istitle()`.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_TITLE)
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_TITLE)
property is_punct:
"""Is the token punctuation?
RETURNS (bool): Whether the token is punctuation.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
"""RETURNS (bool): Whether the token is punctuation."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
property is_space:
"""Does the token consist of whitespace characters? Equivalent to
`token.text.isspace()`.
RETURNS (bool): Whether the token consists of whitespace characters.
"""RETURNS (bool): Whether the token consists of whitespace characters.
Equivalent to `token.text.isspace()`.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
property is_bracket:
"""Is the token a bracket?
RETURNS (bool): Whether the token is a bracket.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
"""RETURNS (bool): Whether the token is a bracket."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
property is_quote:
"""Is the token a quotation mark?
RETURNS (bool): Whether the token is a quotation mark.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
"""RETURNS (bool): Whether the token is a quotation mark."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
property is_left_punct:
"""Is the token a left punctuation mark, e.g. "("?
RETURNS (bool): Whether the token is a left punctuation mark.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
"""RETURNS (bool): Whether the token is a left punctuation mark."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
property is_right_punct:
"""Is the token a left punctuation mark, e.g. "("?
RETURNS (bool): Whether the token is a left punctuation mark.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
"""RETURNS (bool): Whether the token is a left punctuation mark."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
property like_url:
"""Does the token resemble a URL?
RETURNS (bool): Whether the token resembles a URL.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL)
"""RETURNS (bool): Whether the token resembles a URL."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, LIKE_URL)
property like_num:
"""Does the token represent a number? e.g. "10.9", "10", "ten", etc.
RETURNS (bool): Whether the token resembles a number.
"""RETURNS (bool): Whether the token resembles a number, e.g. "10.9",
"10", "ten", etc.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_NUM)
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, LIKE_NUM)
property like_email:
"""Does the token resemble an email address?
RETURNS (bool): Whether the token resembles an email address.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)
"""RETURNS (bool): Whether the token resembles an email address."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)

View File

@ -784,3 +784,10 @@ p
+cell
| A dictionary that allows customisation of properties of
| #[code Span] children.
+row
+cell #[code _]
+cell #[code Underscore]
+cell
| User space for adding custom
| #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions].

View File

@ -369,7 +369,7 @@ p
+tag property
+tag-model("parse")
p Tokens that are to the left of the span, whose head is within the span.
p Tokens that are to the left of the span, whose heads are within the span.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
@ -386,7 +386,7 @@ p Tokens that are to the left of the span, whose head is within the span.
+tag property
+tag-model("parse")
p Tokens that are to the right of the span, whose head is within the span.
p Tokens that are to the right of the span, whose heads are within the span.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
@ -399,6 +399,42 @@ p Tokens that are to the right of the span, whose head is within the span.
+cell #[code Token]
+cell A right-child of a token of the span.
+h(2, "n_lefts") Span.n_lefts
+tag property
+tag-model("parse")
p
| The number of tokens that are to the left of the span, whose heads are
| within the span.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
assert doc[3:7].n_lefts == 1
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell int
+cell The number of left-child tokens.
+h(2, "n_rights") Span.n_rights
+tag property
+tag-model("parse")
p
| The number of tokens that are to the right of the span, whose heads are
| within the span.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
assert doc[2:4].n_rights == 1
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell int
+cell The number of right-child tokens.
+h(2, "subtree") Span.subtree
+tag property
+tag-model("parse")
@ -553,3 +589,17 @@ p
+cell #[code ent_id_]
+cell unicode
+cell The string ID of the named entity the token is an instance of.
+row
+cell #[code sentiment]
+cell float
+cell
| A scalar value indicating the positivity or negativity of the
| span.
+row
+cell #[code _]
+cell #[code Underscore]
+cell
| User space for adding custom
| #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions].

View File

@ -302,6 +302,80 @@ p A sequence of the token's immediate syntactic children.
+cell #[code Token]
+cell A child token such that #[code child.head==self].
+h(2, "lefts") Token.lefts
+tag property
+tag-model("parse")
p
| The leftward immediate children of the word, in the syntactic dependency
| parse.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
lefts = [t.text for t in doc[3].lefts]
assert lefts == [u'New']
+table(["Name", "Type", "Description"])
+row("foot")
+cell yields
+cell #[code Token]
+cell A left-child of the token.
+h(2, "rights") Token.rights
+tag property
+tag-model("parse")
p
| The rightward immediate children of the word, in the syntactic
| dependency parse.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
rights = [t.text for t in doc[3].rights]
assert rights == [u'in']
+table(["Name", "Type", "Description"])
+row("foot")
+cell yields
+cell #[code Token]
+cell A right-child of the token.
+h(2, "n_lefts") Token.n_lefts
+tag property
+tag-model("parse")
p
| The number of leftward immediate children of the word, in the syntactic
| dependency parse.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
assert doc[3].n_lefts == 1
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell int
+cell The number of left-child tokens.
+h(2, "n_rights") Token.n_rights
+tag property
+tag-model("parse")
p
| The number of rightward immediate children of the word, in the syntactic
| dependency parse.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
assert doc[3].n_rights == 1
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell int
+cell The number of right-child tokens.
+h(2, "subtree") Token.subtree
+tag property
+tag-model("parse")
@ -713,9 +787,30 @@ p The L2 norm of the token's vector representation.
+row
+cell #[code sentiment]
+cell float
+cell A scalar value indicating the positivity or negativity of the token.
+cell
| A scalar value indicating the positivity or negativity of the
| token.
+row
+cell #[code lex_id]
+cell int
+cell ID of the token's lexical type.
+cell Sequential ID of the token's lexical type.
+row
+cell #[code rank]
+cell int
+cell
| Sequential ID of the token's lexical type, used to index into
| tables, e.g. for word vectors.
+row
+cell #[code cluster]
+cell int
+cell Brown cluster ID.
+row
+cell #[code _]
+cell #[code Underscore]
+cell
| User space for adding custom
| #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions].

View File

@ -111,11 +111,13 @@ p
p
| A few more convenience attributes are provided for iterating around the
| local tree from the token. The #[code .lefts] and #[code .rights]
| attributes provide sequences of syntactic children that occur before and
| after the token. Both sequences are in sentences order. There are also
| two integer-typed attributes, #[code .n_rights] and #[code .n_lefts],
| that give the number of left and right children.
| local tree from the token. The #[+api("token#lefts") #[code Token.lefts]]
| and #[+api("token#rights") #[code Token.rights]] attributes provide
| sequences of syntactic children that occur before and after the token.
| Both sequences are in sentence order. There are also two integer-typed
| attributes, #[+api("token#n_rights") #[code Token.n_rights]] and
| #[+api("token#n_lefts") #[code Token.n_lefts]], that give the number of
| left and right children.
+code.
doc = nlp(u'bright red apples on the tree')
@ -126,10 +128,11 @@ p
p
| You can get a whole phrase by its syntactic head using the
| #[code .subtree] attribute. This returns an ordered sequence of tokens.
| You can walk up the tree with the #[code .ancestors] attribute, and
| check dominance with the #[+api("token#is_ancestor") #[code .is_ancestor()]]
| method.
| #[+api("token#subtree") #[code Token.subtree]] attribute. This returns an
| ordered sequence of tokens. You can walk up the tree with the
| #[+api("token#ancestors") #[code Token.ancestors]] attribute, and
| check dominance with
| #[+api("token#is_ancestor") #[code Token.is_ancestor()]].
+aside("Projective vs. non-projective")
| For the #[+a("/models/en") default English model], the