Add sentiment field to Doc; rename getters_for_tokens and getters_for_spans to user_token_hooks and user_span_hooks; add user_hooks field to Doc.

This commit is contained in:
Matthew Honnibal 2016-10-19 20:54:03 +02:00
parent ed5e178817
commit 5d5742b773
4 changed files with 44 additions and 22 deletions

View File

@ -40,8 +40,11 @@ cdef class Doc:
cdef public bint is_tagged cdef public bint is_tagged
cdef public bint is_parsed cdef public bint is_parsed
cdef public dict getters_for_tokens cdef public float sentiment
cdef public dict getters_for_spans
cdef public dict user_hooks
cdef public dict user_token_hooks
cdef public dict user_span_hooks
cdef public list _py_tokens cdef public list _py_tokens

View File

@ -115,6 +115,7 @@ cdef class Doc:
self.length = 0 self.length = 0
self.is_tagged = False self.is_tagged = False
self.is_parsed = False self.is_parsed = False
self.sentiment = 0.0
self.getters_for_tokens = {} self.getters_for_tokens = {}
self.getters_for_spans = {} self.getters_for_spans = {}
self.tensor = numpy.zeros((0,), dtype='float32') self.tensor = numpy.zeros((0,), dtype='float32')
@ -217,16 +218,23 @@ cdef class Doc:
return self.__str__() return self.__str__()
def similarity(self, other): def similarity(self, other):
if 'similarity' in self.user_hooks:
return self.user_hooks['similarity'](self, other)
if self.vector_norm == 0 or other.vector_norm == 0: if self.vector_norm == 0 or other.vector_norm == 0:
return 0.0 return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
property has_vector: property has_vector:
def __get__(self): def __get__(self):
if 'has_vector' in self.user_hooks:
return self.user_hooks['has_vector'](self)
return any(token.has_vector for token in self) return any(token.has_vector for token in self)
property vector: property vector:
def __get__(self): def __get__(self):
if 'vector' in self.user_hooks:
return self.user_hooks['vector'](self)
if self._vector is None: if self._vector is None:
if len(self): if len(self):
self._vector = sum(t.vector for t in self) / len(self) self._vector = sum(t.vector for t in self) / len(self)
@ -239,6 +247,8 @@ cdef class Doc:
property vector_norm: property vector_norm:
def __get__(self): def __get__(self):
if 'vector_norm' in self.user_hooks:
return self.user_hooks['vector_norm'](self)
cdef float value cdef float value
if self._vector_norm is None: if self._vector_norm is None:
self._vector_norm = 1e-20 self._vector_norm = 1e-20
@ -376,6 +386,9 @@ cdef class Doc:
assert [s.root.orth_ for s in doc.sents] == ["is", "'s"] assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
""" """
def __get__(self): def __get__(self):
if 'sents' in self.user_hooks:
return self.user_hooks['sents'](self)
if not self.is_parsed: if not self.is_parsed:
raise ValueError( raise ValueError(
"sentence boundary detection requires the dependency parse, which " "sentence boundary detection requires the dependency parse, which "

View File

@ -81,8 +81,8 @@ cdef class Span:
self.doc.merge(self.start_char, self.end_char, *args, **attributes) self.doc.merge(self.start_char, self.end_char, *args, **attributes)
def similarity(self, other): def similarity(self, other):
if 'similarity' in self.doc.getters_for_spans: if 'similarity' in self.doc.user_span_hooks:
self.doc.getters_for_spans['similarity'](self, other) self.doc.user_span_hooks['similarity'](self, other)
if self.vector_norm == 0.0 or other.vector_norm == 0.0: if self.vector_norm == 0.0 or other.vector_norm == 0.0:
return 0.0 return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
@ -104,8 +104,8 @@ cdef class Span:
property sent: property sent:
'''Get the sentence span that this span is a part of.''' '''Get the sentence span that this span is a part of.'''
def __get__(self): def __get__(self):
if 'sent' in self.doc.getters_for_spans: if 'sent' in self.doc.user_span_hooks:
return self.doc.getters_for_spans['sent'](self) return self.doc.user_span_hooks['sent'](self)
# This should raise if we're not parsed. # This should raise if we're not parsed.
self.doc.sents self.doc.sents
cdef int n = 0 cdef int n = 0
@ -119,14 +119,14 @@ cdef class Span:
property has_vector: property has_vector:
def __get__(self): def __get__(self):
if 'has_vector' in self.doc.getters_for_spans: if 'has_vector' in self.doc.user_span_hooks:
return self.doc.getters_for_spans['has_vector'](self) return self.doc.user_span_hooks['has_vector'](self)
return any(token.has_vector for token in self) return any(token.has_vector for token in self)
property vector: property vector:
def __get__(self): def __get__(self):
if 'vector' in self.doc.getters_for_spans: if 'vector' in self.doc.user_span_hooks:
return self.doc.getters_for_spans['vector'](self) return self.doc.user_span_hooks['vector'](self)
if self._vector is None: if self._vector is None:
self._vector = sum(t.vector for t in self) / len(self) self._vector = sum(t.vector for t in self) / len(self)
return self._vector return self._vector
@ -197,8 +197,8 @@ cdef class Span:
""" """
def __get__(self): def __get__(self):
self._recalculate_indices() self._recalculate_indices()
if 'root' in self.doc.getters_for_spans: if 'root' in self.doc.user_span_hooks:
return self.doc.getters_for_spans['root'](self) return self.doc.user_span_hooks['root'](self)
# This should probably be called 'head', and the other one called # This should probably be called 'head', and the other one called
# 'gov'. But we went with 'head' elsewhere, and now we're stuck =/ # 'gov'. But we went with 'head' elsewhere, and now we're stuck =/
cdef int i cdef int i

View File

@ -63,8 +63,8 @@ cdef class Token:
return self.doc[self.i+i] return self.doc[self.i+i]
def similarity(self, other): def similarity(self, other):
if 'similarity' in self.doc.getters_for_tokens: if 'similarity' in self.doc.user_token_hooks:
return self.doc.getters_for_tokens['similarity'](self, other) return self.doc.user_token_hooks['similarity'](self)
if self.vector_norm == 0 or other.vector_norm == 0: if self.vector_norm == 0 or other.vector_norm == 0:
return 0.0 return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
@ -97,6 +97,12 @@ cdef class Token:
def __get__(self): def __get__(self):
return self.c.lex.prob return self.c.lex.prob
property sentiment:
def __get__(self):
if 'sentiment' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['sentiment'](self)
return self.c.lex.sentiment
property lang: property lang:
def __get__(self): def __get__(self):
return self.c.lex.lang return self.c.lex.lang
@ -153,8 +159,8 @@ cdef class Token:
property has_vector: property has_vector:
def __get__(self): def __get__(self):
if 'has_vector' in self.doc.getters_for_tokens: if 'has_vector' in self.doc.user_token_hooks:
return self.doc.getters_for_tokens['has_vector'](self) return self.doc.user_token_hooks['has_vector'](self)
cdef int i cdef int i
for i in range(self.vocab.vectors_length): for i in range(self.vocab.vectors_length):
if self.c.lex.vector[i] != 0: if self.c.lex.vector[i] != 0:
@ -164,8 +170,8 @@ cdef class Token:
property vector: property vector:
def __get__(self): def __get__(self):
if 'vector' in self.doc.getters_for_tokens: if 'vector' in self.doc.user_token_hooks:
return self.doc.getters_for_tokens['vector'](self) return self.doc.user_token_hooks['vector'](self)
cdef int length = self.vocab.vectors_length cdef int length = self.vocab.vectors_length
if length == 0: if length == 0:
raise ValueError( raise ValueError(
@ -186,8 +192,8 @@ cdef class Token:
property vector_norm: property vector_norm:
def __get__(self): def __get__(self):
if 'vector_norm' in self.doc.getters_for_tokens: if 'vector_norm' in self.doc.user_token_hooks:
return self.doc.getters_for_tokens['vector_norm'](self) return self.doc.user_token_hooks['vector_norm'](self)
return self.c.lex.l2_norm return self.c.lex.l2_norm
property n_lefts: property n_lefts:
@ -367,8 +373,8 @@ cdef class Token:
def __get__(self): def __get__(self):
"""Get a list of conjoined words.""" """Get a list of conjoined words."""
cdef Token word cdef Token word
if 'conjuncts' in self.doc.getters_for_tokens: if 'conjuncts' in self.doc.user_token_hooks:
yield from self.doc.getters_for_tokens['conjuncts'](self) yield from self.doc.user_token_hooks['conjuncts'](self)
else: else:
if self.dep_ != 'conj': if self.dep_ != 'conj':
for word in self.rights: for word in self.rights: