mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
* Try giving Doc and Span objects vector and vector_norm attributes, and .similarity functions. Turns out to be a bad idea.
This commit is contained in:
parent
191d593e03
commit
77856c4fcd
|
@ -23,6 +23,9 @@ cdef class Doc:
|
|||
cdef readonly Pool mem
|
||||
cdef readonly Vocab vocab
|
||||
|
||||
cdef public object _vector
|
||||
cdef public object _vector_norm
|
||||
|
||||
cdef TokenC* data
|
||||
|
||||
cdef public bint is_tagged
|
||||
|
|
|
@ -6,6 +6,7 @@ import numpy
|
|||
import numpy.linalg
|
||||
import struct
|
||||
cimport numpy as np
|
||||
import math
|
||||
|
||||
from ..lexeme cimport Lexeme
|
||||
from ..lexeme cimport EMPTY_LEXEME
|
||||
|
@ -77,6 +78,7 @@ cdef class Doc:
|
|||
self.is_tagged = False
|
||||
self.is_parsed = False
|
||||
self._py_tokens = []
|
||||
self._vector = None
|
||||
|
||||
def __getitem__(self, object i):
|
||||
"""Get a token.
|
||||
|
@ -133,12 +135,25 @@ cdef class Doc:
|
|||
|
||||
property vector:
|
||||
def __get__(self):
|
||||
return sum(t.vector for t in self if not t.is_stop) / len(self)
|
||||
if self._vector is None:
|
||||
self._vector = sum(t.vector for t in self) / len(self)
|
||||
return self._vector
|
||||
|
||||
def __set__(self, value):
|
||||
self._vector = value
|
||||
|
||||
property vector_norm:
|
||||
def __get__(self):
|
||||
return numpy.linalg.norm(self.vector)
|
||||
cdef float value
|
||||
if self._vector_norm is None:
|
||||
self._vector_norm = 1e-20
|
||||
for value in self.vector:
|
||||
self._vector_norm += value * value
|
||||
self._vector_norm = math.sqrt(self._vector_norm)
|
||||
return self._vector_norm
|
||||
|
||||
def __set__(self, value):
|
||||
self._vector_norm = value
|
||||
|
||||
@property
|
||||
def string(self):
|
||||
|
@ -304,15 +319,14 @@ cdef class Doc:
|
|||
cdef size_t count
|
||||
|
||||
if counts is None:
|
||||
counts = PreshCounter(self.length)
|
||||
counts = PreshCounter()
|
||||
output_dict = True
|
||||
else:
|
||||
output_dict = False
|
||||
# Take this check out of the loop, for a bit of extra speed
|
||||
if exclude is None:
|
||||
for i in range(self.length):
|
||||
attr = get_token_attr(&self.data[i], attr_id)
|
||||
counts.inc(attr, 1)
|
||||
counts.inc(get_token_attr(&self.data[i], attr_id), 1)
|
||||
else:
|
||||
for i in range(self.length):
|
||||
if not exclude(self[i]):
|
||||
|
|
|
@ -7,3 +7,6 @@ cdef class Span:
|
|||
cdef public int start
|
||||
cdef public int end
|
||||
cdef readonly int label
|
||||
|
||||
cdef public _vector
|
||||
cdef public _vector_norm
|
||||
|
|
|
@ -3,6 +3,7 @@ from collections import defaultdict
|
|||
import numpy
|
||||
import numpy.linalg
|
||||
cimport numpy as np
|
||||
import math
|
||||
|
||||
from ..structs cimport TokenC, LexemeC
|
||||
from ..typedefs cimport flags_t, attr_t
|
||||
|
@ -21,6 +22,8 @@ cdef class Span:
|
|||
self.start = start
|
||||
self.end = end
|
||||
self.label = label
|
||||
self._vector = None
|
||||
self._vector_norm = None
|
||||
|
||||
def __richcmp__(self, Span other, int op):
|
||||
# Eq
|
||||
|
@ -60,15 +63,32 @@ cdef class Span:
|
|||
|
||||
property vector:
|
||||
def __get__(self):
|
||||
return sum(t.vector for t in self if not t.is_stop) / len(self)
|
||||
if self._vector is None:
|
||||
self._vector = sum(t.vector for t in self) / len(self)
|
||||
return self._vector
|
||||
|
||||
def __set__(self, value):
|
||||
self._vector = value
|
||||
|
||||
property vector_norm:
|
||||
def __get__(self):
|
||||
return numpy.linalg.norm(self.vector)
|
||||
cdef float value
|
||||
if self._vector_norm is None:
|
||||
self._vector_norm = 1e-20
|
||||
for value in self.vector:
|
||||
self._vector_norm += value * value
|
||||
self._vector_norm = math.sqrt(self._vector_norm)
|
||||
return self._vector_norm
|
||||
|
||||
def __set__(self, value):
|
||||
self._vector_norm = value
|
||||
|
||||
property text:
|
||||
def __get__(self):
|
||||
return u' '.join([t.text for t in self])
|
||||
text = self.text_with_ws
|
||||
if self[-1].whitespace_:
|
||||
text = text[:-1]
|
||||
return text
|
||||
|
||||
property text_with_ws:
|
||||
def __get__(self):
|
||||
|
|
Loading…
Reference in New Issue
Block a user