diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 7c276e3c2..7a2e95e4b 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -326,7 +326,8 @@ cdef class Doc: if self._vector is not None: return self._vector elif not len(self): - self._vector = numpy.zeros((self.vocab.vectors_length,), dtype='f') + self._vector = numpy.zeros((self.vocab.vectors_length,), + dtype='f') return self._vector elif self.has_vector: vector = numpy.zeros((self.vocab.vectors_length,), dtype='f') @@ -338,7 +339,8 @@ cdef class Doc: self._vector = self.tensor.mean(axis=0) return self._vector else: - return numpy.zeros((self.vocab.vectors_length,), dtype='float32') + return numpy.zeros((self.vocab.vectors_length,), + dtype='float32') def __set__(self, value): self._vector = value @@ -424,7 +426,8 @@ cdef class Doc: def __set__(self, ents): # TODO: # 1. Allow negative matches - # 2. Ensure pre-set NERs are not over-written during statistical prediction + # 2. Ensure pre-set NERs are not over-written during statistical + # prediction # 3. Test basic data-driven ORTH gazetteer # 4. Test more nuanced date and currency regex cdef int i @@ -433,7 +436,7 @@ cdef class Doc: # At this point we don't know whether the NER has run over the # Doc. If the ent_iob is missing, leave it missing. if self.c[i].ent_iob != 0: - self.c[i].ent_iob = 2 # Means O. Non-O are set from ents. + self.c[i].ent_iob = 2 # Means O. Non-O are set from ents. cdef attr_t ent_type cdef int start, end for ent_info in ents: @@ -574,18 +577,19 @@ cdef class Doc: # Allow strings, e.g. 'lemma' or 'LEMMA' py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, 'upper') else id_) for id_ in py_attr_ids] - # Make an array from the attributes --- otherwise our inner loop is Python - # dict iteration. + # Make an array from the attributes --- otherwise our inner loop is + # Python dict iteration. attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64) - output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64) + output = numpy.ndarray(shape=(self.length, len(attr_ids)), + dtype=numpy.uint64) for i in range(self.length): for j, feature in enumerate(attr_ids): output[i, j] = get_token_attr(&self.c[i], feature) # Handle 1d case return output if len(attr_ids) >= 2 else output.reshape((self.length,)) - - def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): + def count_by(self, attr_id_t attr_id, exclude=None, + PreshCounter counts=None): """Count the frequencies of a given attribute. Produces a dict of `{attribute (int): count (ints)}` frequencies, keyed by the values of the given attribute ID. @@ -708,7 +712,8 @@ cdef class Doc: elif (token_j.head == token_j) and (token_k.head == token_k): lca_index = -1 else: - lca_index = __pairwise_lca(token_j.head, token_k.head, lca_matrix) + lca_index = __pairwise_lca(token_j.head, token_k.head, + lca_matrix) lca_matrix[token_j.i][token_k.i] = lca_index lca_matrix[token_k.i][token_j.i] = lca_index @@ -728,7 +733,7 @@ cdef class Doc: """Save the current state to a directory. path (unicode or Path): A path to a directory, which will be created if - it doesn't exist. Paths may be either strings or `Path`-like objects. + it doesn't exist. Paths may be either strings or Path-like objects. """ with path.open('wb') as file_: file_.write(self.to_bytes(**exclude)) @@ -751,7 +756,7 @@ cdef class Doc: RETURNS (bytes): A losslessly serialized copy of the `Doc`, including all annotations. 
""" - array_head = [LENGTH,SPACY,TAG,LEMMA,HEAD,DEP,ENT_IOB,ENT_TYPE] + array_head = [LENGTH, SPACY, TAG, LEMMA, HEAD, DEP, ENT_IOB, ENT_TYPE] # Msgpack doesn't distinguish between lists and tuples, which is # vexing for user data. As a best guess, we *know* that within # keys, we must have tuples. In values we just have to hope @@ -794,7 +799,8 @@ cdef class Doc: # keys, we must have tuples. In values we just have to hope # users don't mind getting a list instead of a tuple. if 'user_data' not in exclude and 'user_data_keys' in msg: - user_data_keys = msgpack.loads(msg['user_data_keys'], use_list=False) + user_data_keys = msgpack.loads(msg['user_data_keys'], + use_list=False) user_data_values = msgpack.loads(msg['user_data_values']) for key, value in zip(user_data_keys, user_data_values): self.user_data[key] = value @@ -853,7 +859,8 @@ cdef class Doc: "Doc.merge received %d non-keyword arguments. Expected either " "3 arguments (deprecated), or 0 (use keyword arguments). " "Arguments supplied:\n%s\n" - "Keyword arguments: %s\n" % (len(args), repr(args), repr(attributes))) + "Keyword arguments: %s\n" % (len(args), repr(args), + repr(attributes))) # More deprecated attribute handling =/ if 'label' in attributes: diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 3b2d14b2b..6f760bfbc 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -128,14 +128,17 @@ cdef class Span: @property def _(self): + """User space for adding custom attribute extensions.""" return Underscore(Underscore.span_extensions, self, start=self.start_char, end=self.end_char) def as_doc(self): - '''Create a Doc object view of the Span's data. + # TODO: fix + """Create a `Doc` object view of the Span's data. This is mostly + useful for C-typed interfaces. - This is mostly useful for C-typed interfaces. - ''' + RETURNS (Doc): The `Doc` view of the span. + """ cdef Doc doc = Doc(self.doc.vocab) doc.length = self.end-self.start doc.c = &self.doc.c[self.start] @@ -259,10 +262,7 @@ cdef class Span: self.end = end + 1 property sent: - """The sentence span that this span is a part of. - - RETURNS (Span): The sentence span that the span is a part of. - """ + """RETURNS (Span): The sentence span that the span is a part of.""" def __get__(self): if 'sent' in self.doc.user_span_hooks: return self.doc.user_span_hooks['sent'](self) @@ -275,13 +275,10 @@ cdef class Span: n += 1 if n >= self.doc.length: raise RuntimeError - return self.doc[root.l_edge : root.r_edge + 1] + return self.doc[root.l_edge:root.r_edge + 1] property has_vector: - """A boolean value indicating whether a word vector is associated with - the object. - - RETURNS (bool): Whether a word vector is associated with the object. + """RETURNS (bool): Whether a word vector is associated with the object. """ def __get__(self): if 'has_vector' in self.doc.user_span_hooks: @@ -303,10 +300,7 @@ cdef class Span: return self._vector property vector_norm: - """The L2 norm of the document's vector representation. - - RETURNS (float): The L2 norm of the vector representation. - """ + """RETURNS (float): The L2 norm of the vector representation.""" def __get__(self): if 'vector_norm' in self.doc.user_span_hooks: return self.doc.user_span_hooks['vector'](self) @@ -320,7 +314,9 @@ cdef class Span: return self._vector_norm property sentiment: - # TODO: docstring + """RETURNS (float): A scalar value indicating the positivity or + negativity of the span. 
+ """ def __get__(self): if 'sentiment' in self.doc.user_span_hooks: return self.doc.user_span_hooks['sentiment'](self) @@ -328,10 +324,7 @@ cdef class Span: return sum([token.sentiment for token in self]) / len(self) property text: - """A unicode representation of the span text. - - RETURNS (unicode): The original verbatim text of the span. - """ + """RETURNS (unicode): The original verbatim text of the span.""" def __get__(self): text = self.text_with_ws if self[-1].whitespace_: @@ -364,10 +357,11 @@ cdef class Span: "requires a statistical model to be installed and loaded. " "For more info, see the " "documentation: \n%s\n" % about.__docs_models__) - # Accumulate the result before beginning to iterate over it. This prevents - # the tokenisation from being changed out from under us during the iteration. - # The tricky thing here is that Span accepts its tokenisation changing, - # so it's okay once we have the Span objects. See Issue #375 + # Accumulate the result before beginning to iterate over it. This + # prevents the tokenisation from being changed out from under us + # during the iteration. The tricky thing here is that Span accepts + # its tokenisation changing, so it's okay once we have the Span + # objects. See Issue #375 spans = [] cdef attr_t label for start, end, label in self.doc.noun_chunks_iterator(self): @@ -459,7 +453,7 @@ cdef class Span: YIELDS (Token):A left-child of a token of the span. """ def __get__(self): - for token in reversed(self): # Reverse, so we get tokens in order + for token in reversed(self): # Reverse, so we get tokens in order for left in token.lefts: if left.i < self.start: yield left @@ -476,6 +470,20 @@ cdef class Span: if right.i >= self.end: yield right + property n_lefts: + """RETURNS (int): The number of leftward immediate children of the + span, in the syntactic dependency parse. + """ + # TODO: implement + raise NotImplementedError() + + property n_rights: + """RETURNS (int): The number of rightward immediate children of the + span, in the syntactic dependency parse. + """ + # TODO: implement + raise NotImplementedError() + property subtree: """Tokens that descend from tokens in the span, but fall outside it. @@ -489,29 +497,21 @@ cdef class Span: yield from word.subtree property ent_id: - """An (integer) entity ID. - - RETURNS (uint64): The entity ID. - """ + """RETURNS (uint64): The entity ID.""" def __get__(self): return self.root.ent_id def __set__(self, hash_t key): - # TODO raise NotImplementedError( "Can't yet set ent_id from Span. Vote for this feature on " "the issue tracker: http://github.com/explosion/spaCy/issues") property ent_id_: - """A (string) entity ID. Usually assigned by patterns in the `Matcher`. - - RETURNS (unicode): The entity ID. - """ + """RETURNS (unicode): The (string) entity ID.""" def __get__(self): return self.root.ent_id_ def __set__(self, hash_t key): - # TODO raise NotImplementedError( "Can't yet set ent_id_ from Span. Vote for this feature on the " "issue tracker: http://github.com/explosion/spaCy/issues") @@ -525,10 +525,7 @@ cdef class Span: return ''.join([t.orth_ for t in self]).strip() property lemma_: - """The span's lemma. - - RETURNS (unicode): The span's lemma. 
- """ + """RETURNS (unicode): The span's lemma.""" def __get__(self): return ' '.join([t.lemma_ for t in self]).strip() @@ -543,15 +540,12 @@ cdef class Span: return ''.join([t.text_with_ws.lower() for t in self]).strip() property string: - """Deprecated: Use Span.text instead.""" + """Deprecated: Use Span.text_with_ws instead.""" def __get__(self): return ''.join([t.text_with_ws for t in self]) property label_: - """The span's label. - - RETURNS (unicode): The span's label. - """ + """RETURNS (unicode): The span's label.""" def __get__(self): return self.doc.vocab.strings[self.label] diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 04aa3f582..fa07d0e9e 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -145,37 +145,32 @@ cdef class Token: return self.doc.user_token_hooks['similarity'](self) if self.vector_norm == 0 or other.vector_norm == 0: return 0.0 - return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) + return (numpy.dot(self.vector, other.vector) / + (self.vector_norm * other.vector_norm)) property lex_id: - """ID of the token's lexical type. - - RETURNS (int): ID of the token's lexical type.""" + """RETURNS (int): Sequential ID of the token's lexical type.""" def __get__(self): return self.c.lex.id property rank: - # TODO: add docstring + """RETURNS (int): Sequential ID of the token's lexical type, used to + index into tables, e.g. for word vectors.""" def __get__(self): return self.c.lex.id property string: + """Deprecated: Use Token.text_with_ws instead.""" def __get__(self): return self.text_with_ws property text: - """A unicode representation of the token text. - - RETURNS (unicode): The original verbatim text of the token. - """ + """RETURNS (unicode): The original verbatim text of the token.""" def __get__(self): return self.orth_ property text_with_ws: - """The text content of the token with a trailing whitespace character - if it has one. - - RETURNS (unicode): The text content of the span (with trailing + """RETURNS (unicode): The text content of the span (with trailing whitespace). """ def __get__(self): @@ -186,74 +181,104 @@ cdef class Token: return orth property prob: + """RETURNS (float): Smoothed log probability estimate of token type.""" def __get__(self): return self.c.lex.prob property sentiment: + """RETURNS (float): A scalar value indicating the positivity or + negativity of the token.""" def __get__(self): if 'sentiment' in self.doc.user_token_hooks: return self.doc.user_token_hooks['sentiment'](self) return self.c.lex.sentiment property lang: + """RETURNS (uint64): ID of the language of the parent document's + vocabulary. + """ def __get__(self): return self.c.lex.lang property idx: + """RETURNS (int): The character offset of the token within the parent + document. + """ def __get__(self): return self.c.idx property cluster: + """RETURNS (int): Brown cluster ID.""" def __get__(self): return self.c.lex.cluster property orth: + """RETURNS (uint64): ID of the verbatim text content.""" def __get__(self): return self.c.lex.orth property lower: + """RETURNS (uint64): ID of the lowercase token text.""" def __get__(self): return self.c.lex.lower property norm: + """RETURNS (uint64): ID of the token's norm, i.e. a normalised form of + the token text. Usually set in the language's tokenizer exceptions + or norm exceptions. 
+ """ def __get__(self): return self.c.lex.norm property shape: + """RETURNS (uint64): ID of the token's shape, a transform of the + tokens's string, to show orthographic features (e.g. "Xxxx", "dd"). + """ def __get__(self): return self.c.lex.shape property prefix: + """RETURNS (uint64): ID of a length-N substring from the start of the + token. Defaults to `N=1`. + """ def __get__(self): return self.c.lex.prefix property suffix: + """RETURNS (uint64): ID of a length-N substring from the end of the + token. Defaults to `N=3`. + """ def __get__(self): return self.c.lex.suffix property lemma: - """Base form of the word, with no inflectional suffixes. - - RETURNS (uint64): Token lemma. + """RETURNS (uint64): ID of the base form of the word, with no + inflectional suffixes. """ def __get__(self): return self.c.lemma + def __set__(self, attr_t lemma): self.c.lemma = lemma property pos: + """RETURNS (uint64): ID of coarse-grained part-of-speech tag.""" def __get__(self): return self.c.pos property tag: + """RETURNS (uint64): ID of fine-grained part-of-speech tag.""" def __get__(self): return self.c.tag + def __set__(self, attr_t tag): self.vocab.morphology.assign_tag(self.c, tag) property dep: + """RETURNS (uint64): ID of syntactic dependency label.""" def __get__(self): return self.c.dep + def __set__(self, attr_t label): self.c.dep = label @@ -294,14 +319,21 @@ cdef class Token: return numpy.sqrt((vector ** 2).sum()) property n_lefts: + """RETURNS (int): The number of leftward immediate children of the + word, in the syntactic dependency parse. + """ def __get__(self): return self.c.l_kids property n_rights: + """RETURNS (int): The number of rightward immediate children of the + word, in the syntactic dependency parse. + """ def __get__(self): return self.c.r_kids property sent_start: + # TODO: fix and document def __get__(self): return self.c.sent_start @@ -321,10 +353,12 @@ cdef class Token: "one of: None, True, False") property lefts: + """The leftward immediate children of the word, in the syntactic + dependency parse. + + YIELDS (Token): A left-child of the token. + """ def __get__(self): - """The leftward immediate children of the word, in the syntactic - dependency parse. - """ cdef int nr_iter = 0 cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge) while ptr < self.c: @@ -338,10 +372,12 @@ cdef class Token: "while looking for token.lefts") property rights: + """The rightward immediate children of the word, in the syntactic + dependency parse. + + YIELDS (Token): A right-child of the token. + """ def __get__(self): - """The rightward immediate children of the word, in the syntactic - dependency parse. - """ cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i) tokens = [] cdef int nr_iter = 0 @@ -420,18 +456,17 @@ cdef class Token: """ if self.doc is not descendant.doc: return False - return any( ancestor.i == self.i for ancestor in descendant.ancestors ) + return any(ancestor.i == self.i for ancestor in descendant.ancestors) property head: """The syntactic parent, or "governor", of this token. - RETURNS (Token): The token head. + RETURNS (Token): The token predicted by the parser to be the head of + the current token. """ def __get__(self): - """The token predicted by the parser to be the head of the current - token. 
- """ return self.doc[self.i + self.c.head] + def __set__(self, Token new_head): # this function sets the head of self to new_head # and updates the counters for left/right dependents @@ -451,7 +486,7 @@ cdef class Token: cdef Token anc, child # update number of deps of old head - if self.c.head > 0: # left dependent + if self.c.head > 0: # left dependent old_head.c.l_kids -= 1 if self.c.l_edge == old_head.c.l_edge: # the token dominates the left edge so the left edge of @@ -543,12 +578,10 @@ cdef class Token: yield from word.conjuncts property ent_type: - """Named entity type. - - RETURNS (uint64): Named entity type. - """ + """RETURNS (uint64): Named entity type.""" def __get__(self): return self.c.ent_type + def __set__(self, ent_type): self.c.ent_type = ent_type @@ -562,12 +595,10 @@ cdef class Token: return self.c.ent_iob property ent_type_: - """Named entity type. - - RETURNS (unicode): Named entity type. - """ + """RETURNS (unicode): Named entity type.""" def __get__(self): return self.vocab.strings[self.c.ent_type] + def __set__(self, ent_type): self.c.ent_type = self.vocab.strings.add(ent_type) @@ -583,9 +614,8 @@ cdef class Token: return iob_strings[self.c.ent_iob] property ent_id: - """ID of the entity the token is an instance of, if any. - - RETURNS (uint64): ID of the entity. + """RETURNS (uint64): ID of the entity the token is an instance of, + if any. """ def __get__(self): return self.c.ent_id @@ -594,9 +624,8 @@ cdef class Token: self.c.ent_id = key property ent_id_: - """ID of the entity the token is an instance of, if any. - - RETURNS (unicode): ID of the entity. + """RETURNS (unicode): ID of the entity the token is an instance of, + if any. """ def __get__(self): return self.vocab.strings[self.c.ent_id] @@ -605,230 +634,192 @@ cdef class Token: self.c.ent_id = self.vocab.strings.add(name) property whitespace_: - """Trailing space character if present. - - RETURNS (unicode): The whitespace character. + """RETURNS (unicode): The trailing whitespace character, if present. """ def __get__(self): return ' ' if self.c.spacy else '' property orth_: - """Verbatim text content (identical to `Token.text`). Existst mostly - for consistency with the other attributes. - - RETURNS (unicode): The token text. + """RETURNS (unicode): Verbatim text content (identical to + `Token.text`). Existst mostly for consistency with the other + attributes. """ def __get__(self): return self.vocab.strings[self.c.lex.orth] property lower_: - """Lowercase form of the token text. Equivalent to - `Token.text.lower()`. - - RETURNS (unicode): The lowercase token text. + """RETURNS (unicode): The lowercase token text. Equivalent to + `Token.text.lower()`. """ def __get__(self): return self.vocab.strings[self.c.lex.lower] property norm_: - """The token's norm, i.e. a normalised form of the token text. - Usually set in the language's tokenizer exceptions or norm exceptions. - - RETURNS (unicode): The norm. + """RETURNS (unicode): The token's norm, i.e. a normalised form of the + token text. Usually set in the language's tokenizer exceptions or + norm exceptions. """ def __get__(self): return self.vocab.strings[self.c.lex.norm] property shape_: - """Transform of the tokens's string, to show orthographic features. - For example, "Xxxx" or "dd". - - RETURNS (unicode): The token shape. + """RETURNS (unicode): Transform of the tokens's string, to show + orthographic features. For example, "Xxxx" or "dd". 
""" def __get__(self): return self.vocab.strings[self.c.lex.shape] property prefix_: - """A length-N substring from the start of the token. Defaults to `N=1`. - - RETURNS (unicode): The token's prefix. + """RETURNS (unicode): A length-N substring from the start of the token. + Defaults to `N=1`. """ def __get__(self): return self.vocab.strings[self.c.lex.prefix] property suffix_: - """A length-N substring from the end of the token. Defaults to `N=3`. - - RETURNS (unicode): The token's suffix. + """RETURNS (unicode): A length-N substring from the end of the token. + Defaults to `N=3`. """ def __get__(self): return self.vocab.strings[self.c.lex.suffix] property lang_: - """Language of the parent document's vocabulary, e.g. 'en'. - - RETURNS (unicode): The language code. + """RETURNS (unicode): Language of the parent document's vocabulary, + e.g. 'en'. """ def __get__(self): return self.vocab.strings[self.c.lex.lang] property lemma_: - """Base form of the word, with no inflectional suffixes. - - RETURNS (unicode): Token lemma. + """RETURNS (unicode): The token lemma, i.e. the base form of the word, + with no inflectional suffixes. """ def __get__(self): return self.vocab.strings[self.c.lemma] + def __set__(self, unicode lemma_): self.c.lemma = self.vocab.strings.add(lemma_) property pos_: - """Coarse-grained part-of-speech. - - RETURNS (unicode): The part-of-speech tag. - """ + """RETURNS (unicode): Coarse-grained part-of-speech tag.""" def __get__(self): return parts_of_speech.NAMES[self.c.pos] property tag_: - """Fine-grained part-of-speech. - - RETURNS (unicode): The part-of-speech tag. - """ + """RETURNS (unicode): Fine-grained part-of-speech tag.""" def __get__(self): return self.vocab.strings[self.c.tag] + def __set__(self, tag): self.tag = self.vocab.strings.add(tag) property dep_: - """Syntactic dependency relation. - - RETURNS (unicode): The dependency label. - """ + """RETURNS (unicode): The syntactic dependency label.""" def __get__(self): return self.vocab.strings[self.c.dep] + def __set__(self, unicode label): self.c.dep = self.vocab.strings.add(label) property is_oov: - """Is the token out-of-vocabulary? - - RETURNS (bool): Whether the token is out-of-vocabulary. - """ - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV) + """RETURNS (bool): Whether the token is out-of-vocabulary.""" + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_OOV) property is_stop: - """Is the token part of a "stop list"? (defined by the language data) - - RETURNS (bool): Whether the token is a stop word. + """RETURNS (bool): Whether the token is a stop word, i.e. part of a + "stop list" defined by the language data. """ - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_STOP) + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_STOP) property is_alpha: - """Does the token consist of alphabetic characters? Equivalent to - `token.text.isalpha()`. - - RETURNS (bool): Whether the token consists of alpha characters. + """RETURNS (bool): Whether the token consists of alpha characters. + Equivalent to `token.text.isalpha()`. """ - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ALPHA) + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_ALPHA) property is_ascii: - """Does the token consist of ASCII characters? Equivalent to - `[any(ord(c) >= 128 for c in token.text)]`. - - RETURNS (bool): Whether the token consists of ASCII characters. + """RETURNS (bool): Whether the token consists of ASCII characters. 
+ Equivalent to `all(ord(c) < 128 for c in token.text)`. """ - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ASCII) + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_ASCII) property is_digit: - """Does the token consist of digits? Equivalent to - `token.text.isdigit()`. - - RETURNS (bool): Whether the token consists of digits. + """RETURNS (bool): Whether the token consists of digits. Equivalent to + `token.text.isdigit()`. """ - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_DIGIT) + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_DIGIT) property is_lower: - """Is the token in lowercase? Equivalent to `token.text.islower()`. - - RETURNS (bool): Whether the token is in lowercase. + """RETURNS (bool): Whether the token is in lowercase. Equivalent to + `token.text.islower()`. """ - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LOWER) + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_LOWER) property is_upper: - """Is the token in uppercase? Equivalent to `token.text.isupper()`. - - RETURNS (bool): Whether the token is in uppercase. + """RETURNS (bool): Whether the token is in uppercase. Equivalent to + `token.text.isupper()`. """ - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_UPPER) + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_UPPER) property is_title: - """Is the token in titlecase? Equivalent to `token.text.istitle()`. - - RETURNS (bool): Whether the token is in titlecase. + """RETURNS (bool): Whether the token is in titlecase. Equivalent to + `token.text.istitle()`. """ - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_TITLE) + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_TITLE) property is_punct: - """Is the token punctuation? - - RETURNS (bool): Whether the token is punctuation. - """ - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT) + """RETURNS (bool): Whether the token is punctuation.""" + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_PUNCT) property is_space: - """Does the token consist of whitespace characters? Equivalent to - `token.text.isspace()`. - - RETURNS (bool): Whether the token consists of whitespace characters. + """RETURNS (bool): Whether the token consists of whitespace characters. + Equivalent to `token.text.isspace()`. """ - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE) + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_SPACE) property is_bracket: - """Is the token a bracket? - - RETURNS (bool): Whether the token is a bracket. - """ - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET) + """RETURNS (bool): Whether the token is a bracket.""" + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_BRACKET) property is_quote: - """Is the token a quotation mark? - - RETURNS (bool): Whether the token is a quotation mark. - """ - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE) + """RETURNS (bool): Whether the token is a quotation mark.""" + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_QUOTE) property is_left_punct: - """Is the token a left punctuation mark, e.g. "("? - - RETURNS (bool): Whether the token is a left punctuation mark.
- """ - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT) + """RETURNS (bool): Whether the token is a left punctuation mark.""" + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT) property is_right_punct: - """Is the token a left punctuation mark, e.g. "("? - - RETURNS (bool): Whether the token is a left punctuation mark. - """ - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT) + """RETURNS (bool): Whether the token is a left punctuation mark.""" + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT) property like_url: - """Does the token resemble a URL? - - RETURNS (bool): Whether the token resembles a URL. - """ - def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL) + """RETURNS (bool): Whether the token resembles a URL.""" + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, LIKE_URL) property like_num: - """Does the token represent a number? e.g. "10.9", "10", "ten", etc. - - RETURNS (bool): Whether the token resembles a number. + """RETURNS (bool): Whether the token resembles a number, e.g. "10.9", + "10", "ten", etc. """ - def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_NUM) + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, LIKE_NUM) property like_email: - """Does the token resemble an email address? - - RETURNS (bool): Whether the token resembles an email address. - """ - def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL) + """RETURNS (bool): Whether the token resembles an email address.""" + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL) diff --git a/website/api/doc.jade b/website/api/doc.jade index f2c73de9f..ac91ad427 100644 --- a/website/api/doc.jade +++ b/website/api/doc.jade @@ -784,3 +784,10 @@ p +cell | A dictionary that allows customisation of properties of | #[code Span] children. + + +row + +cell #[code _] + +cell #[code Underscore] + +cell + | User space for adding custom + | #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions]. diff --git a/website/api/span.jade b/website/api/span.jade index f00cb936f..266518076 100644 --- a/website/api/span.jade +++ b/website/api/span.jade @@ -369,7 +369,7 @@ p +tag property +tag-model("parse") -p Tokens that are to the left of the span, whose head is within the span. +p Tokens that are to the left of the span, whose heads are within the span. +aside-code("Example"). doc = nlp(u'I like New York in Autumn.') @@ -386,7 +386,7 @@ p Tokens that are to the left of the span, whose head is within the span. +tag property +tag-model("parse") -p Tokens that are to the right of the span, whose head is within the span. +p Tokens that are to the right of the span, whose heads are within the span. +aside-code("Example"). doc = nlp(u'I like New York in Autumn.') @@ -399,6 +399,42 @@ p Tokens that are to the right of the span, whose head is within the span. +cell #[code Token] +cell A right-child of a token of the span. ++h(2, "n_lefts") Span.n_lefts + +tag property + +tag-model("parse") + +p + | The number of tokens that are to the left of the span, whose heads are + | within the span. + ++aside-code("Example"). + doc = nlp(u'I like New York in Autumn.') + assert doc[3:7].n_lefts == 1 + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell int + +cell The number of left-child tokens. 
+ ++h(2, "n_rights") Span.n_rights + +tag property + +tag-model("parse") + +p + | The number of tokens that are to the right of the span, whose heads are + | within the span. + ++aside-code("Example"). + doc = nlp(u'I like New York in Autumn.') + assert doc[2:4].n_rights == 1 + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell int + +cell The number of right-child tokens. + +h(2, "subtree") Span.subtree +tag property +tag-model("parse") @@ -553,3 +589,17 @@ p +cell #[code ent_id_] +cell unicode +cell The string ID of the named entity the token is an instance of. + + +row + +cell #[code sentiment] + +cell float + +cell + | A scalar value indicating the positivity or negativity of the + | span. + + +row + +cell #[code _] + +cell #[code Underscore] + +cell + | User space for adding custom + | #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions]. diff --git a/website/api/token.jade b/website/api/token.jade index 3ce11d07a..e375e987d 100644 --- a/website/api/token.jade +++ b/website/api/token.jade @@ -302,6 +302,80 @@ p A sequence of the token's immediate syntactic children. +cell #[code Token] +cell A child token such that #[code child.head==self]. ++h(2, "lefts") Token.lefts + +tag property + +tag-model("parse") + +p + | The leftward immediate children of the word, in the syntactic dependency + | parse. + ++aside-code("Example"). + doc = nlp(u'I like New York in Autumn.') + lefts = [t.text for t in doc[3].lefts] + assert lefts == [u'New'] + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell yields + +cell #[code Token] + +cell A left-child of the token. + ++h(2, "rights") Token.rights + +tag property + +tag-model("parse") + +p + | The rightward immediate children of the word, in the syntactic + | dependency parse. + ++aside-code("Example"). + doc = nlp(u'I like New York in Autumn.') + rights = [t.text for t in doc[3].rights] + assert rights == [u'in'] + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell yields + +cell #[code Token] + +cell A right-child of the token. + ++h(2, "n_lefts") Token.n_lefts + +tag property + +tag-model("parse") + +p + | The number of leftward immediate children of the word, in the syntactic + | dependency parse. + ++aside-code("Example"). + doc = nlp(u'I like New York in Autumn.') + assert doc[3].n_lefts == 1 + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell int + +cell The number of left-child tokens. + ++h(2, "n_rights") Token.n_rights + +tag property + +tag-model("parse") + +p + | The number of rightward immediate children of the word, in the syntactic + | dependency parse. + ++aside-code("Example"). + doc = nlp(u'I like New York in Autumn.') + assert doc[3].n_rights == 1 + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell int + +cell The number of right-child tokens. + +h(2, "subtree") Token.subtree +tag property +tag-model("parse") @@ -713,9 +787,30 @@ p The L2 norm of the token's vector representation. +row +cell #[code sentiment] +cell float - +cell A scalar value indicating the positivity or negativity of the token. + +cell + | A scalar value indicating the positivity or negativity of the + | token. +row +cell #[code lex_id] +cell int - +cell ID of the token's lexical type. + +cell Sequential ID of the token's lexical type. + + +row + +cell #[code rank] + +cell int + +cell + | Sequential ID of the token's lexical type, used to index into + | tagles, e.g. for word vectors. 
+ + +row + +cell #[code cluster] + +cell int + +cell Brown cluster ID. + + +row + +cell #[code _] + +cell #[code Underscore] + +cell + | User space for adding custom + | #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions]. diff --git a/website/usage/_linguistic-features/_dependency-parse.jade b/website/usage/_linguistic-features/_dependency-parse.jade index 85d9179df..0fcdd4713 100644 --- a/website/usage/_linguistic-features/_dependency-parse.jade +++ b/website/usage/_linguistic-features/_dependency-parse.jade @@ -111,11 +111,13 @@ p p | A few more convenience attributes are provided for iterating around the - | local tree from the token. The #[code .lefts] and #[code .rights] - | attributes provide sequences of syntactic children that occur before and - | after the token. Both sequences are in sentences order. There are also - | two integer-typed attributes, #[code .n_rights] and #[code .n_lefts], - | that give the number of left and right children. + | local tree from the token. The #[+api("token#lefts") #[code Token.lefts]] + | and #[+api("token#rights") #[code Token.rights]] attributes provide + | sequences of syntactic children that occur before and after the token. + | Both sequences are in sentence order. There are also two integer-typed + | attributes, #[+api("token#n_rights") #[code Token.n_rights]] and + | #[+api("token#n_lefts") #[code Token.n_lefts]], that give the number of + | left and right children. +code. doc = nlp(u'bright red apples on the tree') @@ -126,10 +128,11 @@ p p | You can get a whole phrase by its syntactic head using the - | #[code .subtree] attribute. This returns an ordered sequence of tokens. - | You can walk up the tree with the #[code .ancestors] attribute, and - | check dominance with the #[+api("token#is_ancestor") #[code .is_ancestor()]] - | method. + | #[+api("token#subtree") #[code Token.subtree]] attribute. This returns an + | ordered sequence of tokens. You can walk up the tree with the + | #[+api("token#ancestors") #[code Token.ancestors]] attribute, and + | check dominance with + | #[+api("token#is_ancestor") #[code Token.is_ancestor()]]. +aside("Projective vs. non-projective") | For the #[+a("/models/en") default English model], the
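Taken together, the tree-navigation attributes documented in this patch (`Token.lefts`, `Token.rights`, `Token.n_lefts`, `Token.n_rights`, `Token.subtree`, `Token.ancestors` and `Token.is_ancestor()`) compose as in the sketch below. It reuses the `bright red apples on the tree` example from the usage docs above; loading the model via `spacy.load('en')` is an assumption (any spaCy 2.x English model with a dependency parser will do), and the exact parse, and hence the printed values, depends on the model:

    import spacy

    nlp = spacy.load('en')  # assumption: an English model with a parser is installed
    doc = nlp(u'bright red apples on the tree')

    # Token.lefts / Token.rights yield the immediate syntactic children
    # that occur before / after the token, in sentence order.
    print([token.text for token in doc[2].lefts])   # e.g. ['bright', 'red']
    print([token.text for token in doc[2].rights])  # e.g. ['on']

    # Token.n_lefts / Token.n_rights report the same children as counts.
    print(doc[2].n_lefts, doc[2].n_rights)  # e.g. 2 1

    # Token.subtree is an ordered sequence of the token and its descendants;
    # Token.ancestors walks up the tree, and Token.is_ancestor() checks
    # dominance between two tokens of the same Doc.
    print([token.text for token in doc[2].subtree])
    print(doc[2].is_ancestor(doc[0]))  # e.g. True, if 'apples' heads 'bright'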