mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Add sentiment field to doc, rename getters_for_tokens and getters_for_spans, add user_hooks field to Doc.
This commit is contained in:
parent
ed5e178817
commit
5d5742b773
|
@ -40,8 +40,11 @@ cdef class Doc:
|
||||||
cdef public bint is_tagged
|
cdef public bint is_tagged
|
||||||
cdef public bint is_parsed
|
cdef public bint is_parsed
|
||||||
|
|
||||||
cdef public dict getters_for_tokens
|
cdef public float sentiment
|
||||||
cdef public dict getters_for_spans
|
|
||||||
|
cdef public dict user_hooks
|
||||||
|
cdef public dict user_token_hooks
|
||||||
|
cdef public dict user_span_hooks
|
||||||
|
|
||||||
cdef public list _py_tokens
|
cdef public list _py_tokens
|
||||||
|
|
||||||
|
|
|
@ -115,6 +115,7 @@ cdef class Doc:
|
||||||
self.length = 0
|
self.length = 0
|
||||||
self.is_tagged = False
|
self.is_tagged = False
|
||||||
self.is_parsed = False
|
self.is_parsed = False
|
||||||
|
self.sentiment = 0.0
|
||||||
self.getters_for_tokens = {}
|
self.getters_for_tokens = {}
|
||||||
self.getters_for_spans = {}
|
self.getters_for_spans = {}
|
||||||
self.tensor = numpy.zeros((0,), dtype='float32')
|
self.tensor = numpy.zeros((0,), dtype='float32')
|
||||||
|
@ -217,16 +218,23 @@ cdef class Doc:
|
||||||
return self.__str__()
|
return self.__str__()
|
||||||
|
|
||||||
def similarity(self, other):
|
def similarity(self, other):
|
||||||
|
if 'similarity' in self.user_hooks:
|
||||||
|
return self.user_hooks['similarity'](self, other)
|
||||||
if self.vector_norm == 0 or other.vector_norm == 0:
|
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||||
return 0.0
|
return 0.0
|
||||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||||
|
|
||||||
property has_vector:
|
property has_vector:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
if 'has_vector' in self.user_hooks:
|
||||||
|
return self.user_hooks['has_vector'](self)
|
||||||
|
|
||||||
return any(token.has_vector for token in self)
|
return any(token.has_vector for token in self)
|
||||||
|
|
||||||
property vector:
|
property vector:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
if 'vector' in self.user_hooks:
|
||||||
|
return self.user_hooks['vector'](self)
|
||||||
if self._vector is None:
|
if self._vector is None:
|
||||||
if len(self):
|
if len(self):
|
||||||
self._vector = sum(t.vector for t in self) / len(self)
|
self._vector = sum(t.vector for t in self) / len(self)
|
||||||
|
@ -239,6 +247,8 @@ cdef class Doc:
|
||||||
|
|
||||||
property vector_norm:
|
property vector_norm:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
if 'vector_norm' in self.user_hooks:
|
||||||
|
return self.user_hooks['vector_norm'](self)
|
||||||
cdef float value
|
cdef float value
|
||||||
if self._vector_norm is None:
|
if self._vector_norm is None:
|
||||||
self._vector_norm = 1e-20
|
self._vector_norm = 1e-20
|
||||||
|
@ -376,6 +386,9 @@ cdef class Doc:
|
||||||
assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
|
assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
if 'sents' in self.user_hooks:
|
||||||
|
return self.user_hooks['sents'](self)
|
||||||
|
|
||||||
if not self.is_parsed:
|
if not self.is_parsed:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"sentence boundary detection requires the dependency parse, which "
|
"sentence boundary detection requires the dependency parse, which "
|
||||||
|
|
|
@ -81,8 +81,8 @@ cdef class Span:
|
||||||
self.doc.merge(self.start_char, self.end_char, *args, **attributes)
|
self.doc.merge(self.start_char, self.end_char, *args, **attributes)
|
||||||
|
|
||||||
def similarity(self, other):
|
def similarity(self, other):
|
||||||
if 'similarity' in self.doc.getters_for_spans:
|
if 'similarity' in self.doc.user_span_hooks:
|
||||||
self.doc.getters_for_spans['similarity'](self, other)
|
self.doc.user_span_hooks['similarity'](self, other)
|
||||||
if self.vector_norm == 0.0 or other.vector_norm == 0.0:
|
if self.vector_norm == 0.0 or other.vector_norm == 0.0:
|
||||||
return 0.0
|
return 0.0
|
||||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||||
|
@ -104,8 +104,8 @@ cdef class Span:
|
||||||
property sent:
|
property sent:
|
||||||
'''Get the sentence span that this span is a part of.'''
|
'''Get the sentence span that this span is a part of.'''
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if 'sent' in self.doc.getters_for_spans:
|
if 'sent' in self.doc.user_span_hooks:
|
||||||
return self.doc.getters_for_spans['sent'](self)
|
return self.doc.user_span_hooks['sent'](self)
|
||||||
# This should raise if we're not parsed.
|
# This should raise if we're not parsed.
|
||||||
self.doc.sents
|
self.doc.sents
|
||||||
cdef int n = 0
|
cdef int n = 0
|
||||||
|
@ -119,14 +119,14 @@ cdef class Span:
|
||||||
|
|
||||||
property has_vector:
|
property has_vector:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if 'has_vector' in self.doc.getters_for_spans:
|
if 'has_vector' in self.doc.user_span_hooks:
|
||||||
return self.doc.getters_for_spans['has_vector'](self)
|
return self.doc.user_span_hooks['has_vector'](self)
|
||||||
return any(token.has_vector for token in self)
|
return any(token.has_vector for token in self)
|
||||||
|
|
||||||
property vector:
|
property vector:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if 'vector' in self.doc.getters_for_spans:
|
if 'vector' in self.doc.user_span_hooks:
|
||||||
return self.doc.getters_for_spans['vector'](self)
|
return self.doc.user_span_hooks['vector'](self)
|
||||||
if self._vector is None:
|
if self._vector is None:
|
||||||
self._vector = sum(t.vector for t in self) / len(self)
|
self._vector = sum(t.vector for t in self) / len(self)
|
||||||
return self._vector
|
return self._vector
|
||||||
|
@ -197,8 +197,8 @@ cdef class Span:
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
self._recalculate_indices()
|
self._recalculate_indices()
|
||||||
if 'root' in self.doc.getters_for_spans:
|
if 'root' in self.doc.user_span_hooks:
|
||||||
return self.doc.getters_for_spans['root'](self)
|
return self.doc.user_span_hooks['root'](self)
|
||||||
# This should probably be called 'head', and the other one called
|
# This should probably be called 'head', and the other one called
|
||||||
# 'gov'. But we went with 'head' elsehwhere, and now we're stuck =/
|
# 'gov'. But we went with 'head' elsehwhere, and now we're stuck =/
|
||||||
cdef int i
|
cdef int i
|
||||||
|
|
|
@ -63,8 +63,8 @@ cdef class Token:
|
||||||
return self.doc[self.i+i]
|
return self.doc[self.i+i]
|
||||||
|
|
||||||
def similarity(self, other):
|
def similarity(self, other):
|
||||||
if 'similarity' in self.doc.getters_for_tokens:
|
if 'similarity' in self.doc.user_token_hooks:
|
||||||
return self.doc.getters_for_tokens['similarity'](self, other)
|
return self.doc.user_token_hooks['similarity'](self)
|
||||||
if self.vector_norm == 0 or other.vector_norm == 0:
|
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||||
return 0.0
|
return 0.0
|
||||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||||
|
@ -97,6 +97,12 @@ cdef class Token:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.lex.prob
|
return self.c.lex.prob
|
||||||
|
|
||||||
|
property sentiment:
|
||||||
|
def __get__(self):
|
||||||
|
if 'sentiment' in self.doc.user_token_hooks:
|
||||||
|
return self.doc.user_token_hooks['sentiment'](self)
|
||||||
|
return self.c.lex.sentiment
|
||||||
|
|
||||||
property lang:
|
property lang:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.lex.lang
|
return self.c.lex.lang
|
||||||
|
@ -153,8 +159,8 @@ cdef class Token:
|
||||||
|
|
||||||
property has_vector:
|
property has_vector:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if 'has_vector' in self.doc.getters_for_tokens:
|
if 'has_vector' in self.doc.user_token_hooks:
|
||||||
return self.doc.getters_for_tokens['has_vector'](self)
|
return self.doc.user_token_hooks['has_vector'](self)
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(self.vocab.vectors_length):
|
for i in range(self.vocab.vectors_length):
|
||||||
if self.c.lex.vector[i] != 0:
|
if self.c.lex.vector[i] != 0:
|
||||||
|
@ -164,8 +170,8 @@ cdef class Token:
|
||||||
|
|
||||||
property vector:
|
property vector:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if 'vector' in self.doc.getters_for_tokens:
|
if 'vector' in self.doc.user_token_hooks:
|
||||||
return self.doc.getters_for_tokens['vector'](self)
|
return self.doc.user_token_hooks['vector'](self)
|
||||||
cdef int length = self.vocab.vectors_length
|
cdef int length = self.vocab.vectors_length
|
||||||
if length == 0:
|
if length == 0:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
@ -186,8 +192,8 @@ cdef class Token:
|
||||||
|
|
||||||
property vector_norm:
|
property vector_norm:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if 'vector_norm' in self.doc.getters_for_tokens:
|
if 'vector_norm' in self.doc.user_token_hooks:
|
||||||
return self.doc.getters_for_tokens['vector_norm'](self)
|
return self.doc.user_token_hooks['vector_norm'](self)
|
||||||
return self.c.lex.l2_norm
|
return self.c.lex.l2_norm
|
||||||
|
|
||||||
property n_lefts:
|
property n_lefts:
|
||||||
|
@ -367,8 +373,8 @@ cdef class Token:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
"""Get a list of conjoined words."""
|
"""Get a list of conjoined words."""
|
||||||
cdef Token word
|
cdef Token word
|
||||||
if 'conjuncts' in self.doc.getters_for_tokens:
|
if 'conjuncts' in self.doc.user_token_hooks:
|
||||||
yield from self.doc.getters_for_tokens['conjuncts'](self)
|
yield from self.doc.user_token_hooks['conjuncts'](self)
|
||||||
else:
|
else:
|
||||||
if self.dep_ != 'conj':
|
if self.dep_ != 'conj':
|
||||||
for word in self.rights:
|
for word in self.rights:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user