mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Add sentiment field to doc, rename getters_for_tokens and getters_for_spans, add user_hooks field to Doc.
This commit is contained in:
parent
ed5e178817
commit
5d5742b773
|
@ -40,8 +40,11 @@ cdef class Doc:
|
|||
cdef public bint is_tagged
|
||||
cdef public bint is_parsed
|
||||
|
||||
cdef public dict getters_for_tokens
|
||||
cdef public dict getters_for_spans
|
||||
cdef public float sentiment
|
||||
|
||||
cdef public dict user_hooks
|
||||
cdef public dict user_token_hooks
|
||||
cdef public dict user_span_hooks
|
||||
|
||||
cdef public list _py_tokens
|
||||
|
||||
|
|
|
@ -115,6 +115,7 @@ cdef class Doc:
|
|||
self.length = 0
|
||||
self.is_tagged = False
|
||||
self.is_parsed = False
|
||||
self.sentiment = 0.0
|
||||
self.getters_for_tokens = {}
|
||||
self.getters_for_spans = {}
|
||||
self.tensor = numpy.zeros((0,), dtype='float32')
|
||||
|
@ -217,16 +218,23 @@ cdef class Doc:
|
|||
return self.__str__()
|
||||
|
||||
def similarity(self, other):
    """Return a similarity score between this object and `other`.

    If a 'similarity' entry is registered in `self.user_hooks`, its result
    for `(self, other)` is returned unchanged. Otherwise the score is the
    dot product of the two `vector` attributes divided by the product of
    the two `vector_norm` attributes, or 0.0 when either norm equals zero.
    """
    try:
        custom = self.user_hooks['similarity']
    except KeyError:
        pass
    else:
        return custom(self, other)
    # A zero norm would make the division below undefined.
    has_zero_norm = self.vector_norm == 0 or other.vector_norm == 0
    if has_zero_norm:
        return 0.0
    dot_product = numpy.dot(self.vector, other.vector)
    norm_product = self.vector_norm * other.vector_norm
    return dot_product / norm_product
|
||||
|
||||
property has_vector:
    '''Whether any token yielded by iterating `self` reports a vector.

    A 'has_vector' entry in `self.user_hooks`, when present, is called with
    `self` and its result is returned instead.
    '''
    def __get__(self):
        try:
            custom = self.user_hooks['has_vector']
        except KeyError:
            pass
        else:
            return custom(self)

        for token in self:
            if token.has_vector:
                return True
        return False
|
||||
|
||||
property vector:
|
||||
def __get__(self):
|
||||
if 'vector' in self.user_hooks:
|
||||
return self.user_hooks['vector'](self)
|
||||
if self._vector is None:
|
||||
if len(self):
|
||||
self._vector = sum(t.vector for t in self) / len(self)
|
||||
|
@ -239,6 +247,8 @@ cdef class Doc:
|
|||
|
||||
property vector_norm:
|
||||
def __get__(self):
|
||||
if 'vector_norm' in self.user_hooks:
|
||||
return self.user_hooks['vector_norm'](self)
|
||||
cdef float value
|
||||
if self._vector_norm is None:
|
||||
self._vector_norm = 1e-20
|
||||
|
@ -376,6 +386,9 @@ cdef class Doc:
|
|||
assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'sents' in self.user_hooks:
|
||||
return self.user_hooks['sents'](self)
|
||||
|
||||
if not self.is_parsed:
|
||||
raise ValueError(
|
||||
"sentence boundary detection requires the dependency parse, which "
|
||||
|
|
|
@ -81,8 +81,8 @@ cdef class Span:
|
|||
self.doc.merge(self.start_char, self.end_char, *args, **attributes)
|
||||
|
||||
def similarity(self, other):
    """Return a similarity score between this span and `other`.

    A registered 'similarity' hook on the parent doc takes precedence: its
    result for `(self, other)` is returned unchanged. Previously the hook
    was called but its result was discarded, so the default computation
    always ran.

    Without a hook, the score is the dot product of the two `vector`
    attributes divided by the product of the two `vector_norm` attributes,
    or 0.0 when either norm equals zero.
    """
    # NOTE(review): SOURCE shows both the `getters_for_spans` and the
    # `user_span_hooks` lookups, while the commit message describes the
    # former as renamed. Both are kept in their original order here;
    # confirm which one should survive.
    if 'similarity' in self.doc.getters_for_spans:
        return self.doc.getters_for_spans['similarity'](self, other)
    if 'similarity' in self.doc.user_span_hooks:
        return self.doc.user_span_hooks['similarity'](self, other)
    # A zero norm would make the division below undefined.
    if self.vector_norm == 0.0 or other.vector_norm == 0.0:
        return 0.0
    return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||
|
@ -104,8 +104,8 @@ cdef class Span:
|
|||
property sent:
|
||||
'''Get the sentence span that this span is a part of.'''
|
||||
def __get__(self):
|
||||
if 'sent' in self.doc.getters_for_spans:
|
||||
return self.doc.getters_for_spans['sent'](self)
|
||||
if 'sent' in self.doc.user_span_hooks:
|
||||
return self.doc.user_span_hooks['sent'](self)
|
||||
# This should raise if we're not parsed.
|
||||
self.doc.sents
|
||||
cdef int n = 0
|
||||
|
@ -119,14 +119,14 @@ cdef class Span:
|
|||
|
||||
property has_vector:
    '''Whether any token yielded by iterating this span reports a vector.

    A 'has_vector' hook registered on the parent doc, when present, is
    called with `self` and its result is returned instead.
    '''
    def __get__(self):
        # NOTE(review): SOURCE shows both lookups; the commit message
        # describes `getters_for_spans` as renamed -- confirm which stays.
        try:
            custom = self.doc.getters_for_spans['has_vector']
        except KeyError:
            pass
        else:
            return custom(self)
        try:
            custom = self.doc.user_span_hooks['has_vector']
        except KeyError:
            pass
        else:
            return custom(self)
        for token in self:
            if token.has_vector:
                return True
        return False
|
||||
|
||||
property vector:
    '''The span's vector: the sum of its tokens' vectors divided by `len(self)`.

    The computed value is cached on `self._vector` and reused on later
    reads. A 'vector' hook registered on the parent doc, when present, is
    called with `self` and its result is returned without touching the
    cache.
    '''
    def __get__(self):
        # NOTE(review): SOURCE shows both lookups; the commit message
        # describes `getters_for_spans` as renamed -- confirm which stays.
        try:
            custom = self.doc.getters_for_spans['vector']
        except KeyError:
            pass
        else:
            return custom(self)
        try:
            custom = self.doc.user_span_hooks['vector']
        except KeyError:
            pass
        else:
            return custom(self)
        if self._vector is not None:
            return self._vector
        total = sum(t.vector for t in self)
        self._vector = total / len(self)
        return self._vector
|
||||
|
@ -197,8 +197,8 @@ cdef class Span:
|
|||
"""
|
||||
def __get__(self):
|
||||
self._recalculate_indices()
|
||||
if 'root' in self.doc.getters_for_spans:
|
||||
return self.doc.getters_for_spans['root'](self)
|
||||
if 'root' in self.doc.user_span_hooks:
|
||||
return self.doc.user_span_hooks['root'](self)
|
||||
# This should probably be called 'head', and the other one called
|
||||
# 'gov'. But we went with 'head' elsewhere, and now we're stuck =/
|
||||
cdef int i
|
||||
|
|
|
@ -63,8 +63,8 @@ cdef class Token:
|
|||
return self.doc[self.i+i]
|
||||
|
||||
def similarity(self, other):
    """Return a similarity score between this token and `other`.

    A registered 'similarity' hook on the parent doc takes precedence and
    is called with `(self, other)`. Previously the `user_token_hooks` hook
    received only `self`, so it could not see the object being compared
    against.

    Without a hook, the score is the dot product of the two `vector`
    attributes divided by the product of the two `vector_norm` attributes,
    or 0.0 when either norm equals zero.
    """
    # NOTE(review): SOURCE shows both the `getters_for_tokens` and the
    # `user_token_hooks` lookups, while the commit message describes the
    # former as renamed. Both are kept in their original order here;
    # confirm which one should survive.
    if 'similarity' in self.doc.getters_for_tokens:
        return self.doc.getters_for_tokens['similarity'](self, other)
    if 'similarity' in self.doc.user_token_hooks:
        return self.doc.user_token_hooks['similarity'](self, other)
    # A zero norm would make the division below undefined.
    if self.vector_norm == 0 or other.vector_norm == 0:
        return 0.0
    return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||
|
@ -97,6 +97,12 @@ cdef class Token:
|
|||
def __get__(self):
|
||||
return self.c.lex.prob
|
||||
|
||||
property sentiment:
    '''The token's sentiment value.

    A 'sentiment' entry in the parent doc's `user_token_hooks`, when
    present, is called with `self` and its result is returned; otherwise
    the value read from `self.c.lex.sentiment` is returned.
    '''
    def __get__(self):
        try:
            custom = self.doc.user_token_hooks['sentiment']
        except KeyError:
            pass
        else:
            return custom(self)
        return self.c.lex.sentiment
|
||||
|
||||
# Read-only property exposing the `lang` field of the token's lexeme data.
property lang:
|
||||
def __get__(self):
|
||||
# Direct field read; no user hook is consulted for this attribute.
return self.c.lex.lang
|
||||
|
@ -153,8 +159,8 @@ cdef class Token:
|
|||
|
||||
property has_vector:
|
||||
def __get__(self):
|
||||
if 'has_vector' in self.doc.getters_for_tokens:
|
||||
return self.doc.getters_for_tokens['has_vector'](self)
|
||||
if 'has_vector' in self.doc.user_token_hooks:
|
||||
return self.doc.user_token_hooks['has_vector'](self)
|
||||
cdef int i
|
||||
for i in range(self.vocab.vectors_length):
|
||||
if self.c.lex.vector[i] != 0:
|
||||
|
@ -164,8 +170,8 @@ cdef class Token:
|
|||
|
||||
property vector:
|
||||
def __get__(self):
|
||||
if 'vector' in self.doc.getters_for_tokens:
|
||||
return self.doc.getters_for_tokens['vector'](self)
|
||||
if 'vector' in self.doc.user_token_hooks:
|
||||
return self.doc.user_token_hooks['vector'](self)
|
||||
cdef int length = self.vocab.vectors_length
|
||||
if length == 0:
|
||||
raise ValueError(
|
||||
|
@ -186,8 +192,8 @@ cdef class Token:
|
|||
|
||||
property vector_norm:
    '''The norm associated with the token's vector.

    A 'vector_norm' hook registered on the parent doc, when present, is
    called with `self` and its result is returned; otherwise the value
    read from `self.c.lex.l2_norm` is returned.
    '''
    def __get__(self):
        # NOTE(review): SOURCE shows both lookups; the commit message
        # describes `getters_for_tokens` as renamed -- confirm which stays.
        try:
            custom = self.doc.getters_for_tokens['vector_norm']
        except KeyError:
            pass
        else:
            return custom(self)
        try:
            custom = self.doc.user_token_hooks['vector_norm']
        except KeyError:
            pass
        else:
            return custom(self)
        return self.c.lex.l2_norm
|
||||
|
||||
property n_lefts:
|
||||
|
@ -367,8 +373,8 @@ cdef class Token:
|
|||
def __get__(self):
|
||||
"""Get a list of conjoined words."""
|
||||
cdef Token word
|
||||
if 'conjuncts' in self.doc.getters_for_tokens:
|
||||
yield from self.doc.getters_for_tokens['conjuncts'](self)
|
||||
if 'conjuncts' in self.doc.user_token_hooks:
|
||||
yield from self.doc.user_token_hooks['conjuncts'](self)
|
||||
else:
|
||||
if self.dep_ != 'conj':
|
||||
for word in self.rights:
|
||||
|
|
Loading…
Reference in New Issue
Block a user