From 5d5742b773ff28a8f7b0afa9e229a8ab8921403b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 19 Oct 2016 20:54:03 +0200 Subject: [PATCH] Add sentiment field to doc, rename getters_for_tokens and getters_for_spans, add user_hooks field to Doc. --- spacy/tokens/doc.pxd | 7 +++++-- spacy/tokens/doc.pyx | 13 +++++++++++++ spacy/tokens/span.pyx | 20 ++++++++++---------- spacy/tokens/token.pyx | 26 ++++++++++++++++---------- 4 files changed, 44 insertions(+), 22 deletions(-) diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 6b11476a9..6b83bceb8 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -40,8 +40,11 @@ cdef class Doc: cdef public bint is_tagged cdef public bint is_parsed - cdef public dict getters_for_tokens - cdef public dict getters_for_spans + cdef public float sentiment + + cdef public dict user_hooks + cdef public dict user_token_hooks + cdef public dict user_span_hooks cdef public list _py_tokens diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 4870efcb6..cb9b325e5 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -115,6 +115,7 @@ cdef class Doc: self.length = 0 self.is_tagged = False self.is_parsed = False + self.sentiment = 0.0 self.getters_for_tokens = {} self.getters_for_spans = {} self.tensor = numpy.zeros((0,), dtype='float32') @@ -217,16 +218,23 @@ cdef class Doc: return self.__str__() def similarity(self, other): + if 'similarity' in self.user_hooks: + return self.user_hooks['similarity'](self, other) if self.vector_norm == 0 or other.vector_norm == 0: return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) property has_vector: def __get__(self): + if 'has_vector' in self.user_hooks: + return self.user_hooks['has_vector'](self) + return any(token.has_vector for token in self) property vector: def __get__(self): + if 'vector' in self.user_hooks: + return self.user_hooks['vector'](self) if self._vector is None: if len(self): self._vector = 
sum(t.vector for t in self) / len(self) @@ -239,6 +247,8 @@ cdef class Doc: property vector_norm: def __get__(self): + if 'vector_norm' in self.user_hooks: + return self.user_hooks['vector_norm'](self) cdef float value if self._vector_norm is None: self._vector_norm = 1e-20 @@ -376,6 +386,9 @@ cdef class Doc: assert [s.root.orth_ for s in doc.sents] == ["is", "'s"] """ def __get__(self): + if 'sents' in self.user_hooks: + return self.user_hooks['sents'](self) + if not self.is_parsed: raise ValueError( "sentence boundary detection requires the dependency parse, which " diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index dc23481f6..fa081c90a 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -81,8 +81,8 @@ cdef class Span: self.doc.merge(self.start_char, self.end_char, *args, **attributes) def similarity(self, other): - if 'similarity' in self.doc.getters_for_spans: - self.doc.getters_for_spans['similarity'](self, other) + if 'similarity' in self.doc.user_span_hooks: + return self.doc.user_span_hooks['similarity'](self, other) if self.vector_norm == 0.0 or other.vector_norm == 0.0: return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) @@ -104,8 +104,8 @@ cdef class Span: property sent: '''Get the sentence span that this span is a part of.''' def __get__(self): - if 'sent' in self.doc.getters_for_spans: - return self.doc.getters_for_spans['sent'](self) + if 'sent' in self.doc.user_span_hooks: + return self.doc.user_span_hooks['sent'](self) # This should raise if we're not parsed. 
self.doc.sents cdef int n = 0 @@ -119,14 +119,14 @@ cdef class Span: property has_vector: def __get__(self): - if 'has_vector' in self.doc.getters_for_spans: - return self.doc.getters_for_spans['has_vector'](self) + if 'has_vector' in self.doc.user_span_hooks: + return self.doc.user_span_hooks['has_vector'](self) return any(token.has_vector for token in self) property vector: def __get__(self): - if 'vector' in self.doc.getters_for_spans: - return self.doc.getters_for_spans['vector'](self) + if 'vector' in self.doc.user_span_hooks: + return self.doc.user_span_hooks['vector'](self) if self._vector is None: self._vector = sum(t.vector for t in self) / len(self) return self._vector @@ -197,8 +197,8 @@ cdef class Span: """ def __get__(self): self._recalculate_indices() - if 'root' in self.doc.getters_for_spans: - return self.doc.getters_for_spans['root'](self) + if 'root' in self.doc.user_span_hooks: + return self.doc.user_span_hooks['root'](self) # This should probably be called 'head', and the other one called # 'gov'. 
But we went with 'head' elsehwhere, and now we're stuck =/ cdef int i diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 52e393b9b..f15869d59 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -63,8 +63,8 @@ cdef class Token: return self.doc[self.i+i] def similarity(self, other): - if 'similarity' in self.doc.getters_for_tokens: - return self.doc.getters_for_tokens['similarity'](self, other) + if 'similarity' in self.doc.user_token_hooks: + return self.doc.user_token_hooks['similarity'](self, other) if self.vector_norm == 0 or other.vector_norm == 0: return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) @@ -97,6 +97,12 @@ cdef class Token: def __get__(self): return self.c.lex.prob + property sentiment: + def __get__(self): + if 'sentiment' in self.doc.user_token_hooks: + return self.doc.user_token_hooks['sentiment'](self) + return self.c.lex.sentiment + property lang: def __get__(self): return self.c.lex.lang @@ -153,8 +159,8 @@ cdef class Token: property has_vector: def __get__(self): - if 'has_vector' in self.doc.getters_for_tokens: - return self.doc.getters_for_tokens['has_vector'](self) + if 'has_vector' in self.doc.user_token_hooks: + return self.doc.user_token_hooks['has_vector'](self) cdef int i for i in range(self.vocab.vectors_length): if self.c.lex.vector[i] != 0: @@ -164,8 +170,8 @@ cdef class Token: property vector: def __get__(self): - if 'vector' in self.doc.getters_for_tokens: - return self.doc.getters_for_tokens['vector'](self) + if 'vector' in self.doc.user_token_hooks: + return self.doc.user_token_hooks['vector'](self) cdef int length = self.vocab.vectors_length if length == 0: raise ValueError( @@ -186,8 +192,8 @@ cdef class Token: property vector_norm: def __get__(self): - if 'vector_norm' in self.doc.getters_for_tokens: - return self.doc.getters_for_tokens['vector_norm'](self) + if 'vector_norm' in self.doc.user_token_hooks: + return 
self.doc.user_token_hooks['vector_norm'](self) return self.c.lex.l2_norm property n_lefts: @@ -367,8 +373,8 @@ cdef class Token: def __get__(self): """Get a list of conjoined words.""" cdef Token word - if 'conjuncts' in self.doc.getters_for_tokens: - yield from self.doc.getters_for_tokens['conjuncts'](self) + if 'conjuncts' in self.doc.user_token_hooks: + yield from self.doc.user_token_hooks['conjuncts'](self) else: if self.dep_ != 'conj': for word in self.rights: