From 77856c4fcd8510581df36f6db8ccbdf409e29272 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 17 Sep 2015 11:50:11 +1000 Subject: [PATCH] * Try giving Doc and Span objects vector and vector_norm attributes, and .similarity functions. Turns out to be a bad idea. --- spacy/tokens/doc.pxd | 3 +++ spacy/tokens/doc.pyx | 24 +++++++++++++++++++----- spacy/tokens/spans.pxd | 3 +++ spacy/tokens/spans.pyx | 26 +++++++++++++++++++++++--- 4 files changed, 48 insertions(+), 8 deletions(-) diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index a13858175..ce1cfecc0 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -23,6 +23,9 @@ cdef class Doc: cdef readonly Pool mem cdef readonly Vocab vocab + cdef public object _vector + cdef public object _vector_norm + cdef TokenC* data cdef public bint is_tagged diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 0c3c0a2f7..5bdd5b22f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -6,6 +6,7 @@ import numpy import numpy.linalg import struct cimport numpy as np +import math from ..lexeme cimport Lexeme from ..lexeme cimport EMPTY_LEXEME @@ -77,6 +78,7 @@ cdef class Doc: self.is_tagged = False self.is_parsed = False self._py_tokens = [] + self._vector = None def __getitem__(self, object i): """Get a token. 
@@ -133,12 +135,25 @@ cdef class Doc: property vector: def __get__(self): - return sum(t.vector for t in self if not t.is_stop) / len(self) + if self._vector is None: + self._vector = sum(t.vector for t in self) / len(self) + return self._vector + def __set__(self, value): + self._vector = value property vector_norm: def __get__(self): - return numpy.linalg.norm(self.vector) + cdef float value + if self._vector_norm is None: + self._vector_norm = 1e-20 + for value in self.vector: + self._vector_norm += value * value + self._vector_norm = math.sqrt(self._vector_norm) + return self._vector_norm + + def __set__(self, value): + self._vector_norm = value @property def string(self): @@ -304,15 +319,14 @@ cdef class Doc: cdef size_t count if counts is None: - counts = PreshCounter(self.length) + counts = PreshCounter() output_dict = True else: output_dict = False # Take this check out of the loop, for a bit of extra speed if exclude is None: for i in range(self.length): - attr = get_token_attr(&self.data[i], attr_id) - counts.inc(attr, 1) + counts.inc(get_token_attr(&self.data[i], attr_id), 1) else: for i in range(self.length): if not exclude(self[i]): diff --git a/spacy/tokens/spans.pxd b/spacy/tokens/spans.pxd index d9704ad1f..8e9ef69ef 100644 --- a/spacy/tokens/spans.pxd +++ b/spacy/tokens/spans.pxd @@ -7,3 +7,6 @@ cdef class Span: cdef public int start cdef public int end cdef readonly int label + + cdef public _vector + cdef public _vector_norm diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index 12ad6e425..88a72982f 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -3,6 +3,7 @@ from collections import defaultdict import numpy import numpy.linalg cimport numpy as np +import math from ..structs cimport TokenC, LexemeC from ..typedefs cimport flags_t, attr_t @@ -21,6 +22,8 @@ cdef class Span: self.start = start self.end = end self.label = label + self._vector = None + self._vector_norm = None def __richcmp__(self, Span other, int op): # 
Eq @@ -60,15 +63,32 @@ cdef class Span: property vector: def __get__(self): - return sum(t.vector for t in self if not t.is_stop) / len(self) + if self._vector is None: + self._vector = sum(t.vector for t in self) / len(self) + return self._vector + + def __set__(self, value): + self._vector = value property vector_norm: def __get__(self): - return numpy.linalg.norm(self.vector) + cdef float value + if self._vector_norm is None: + self._vector_norm = 1e-20 + for value in self.vector: + self._vector_norm += value * value + self._vector_norm = math.sqrt(self._vector_norm) + return self._vector_norm + + def __set__(self, value): + self._vector_norm = value property text: def __get__(self): - return u' '.join([t.text for t in self]) + text = self.text_with_ws + if self[-1].whitespace_: + text = text[:-1] + return text property text_with_ws: def __get__(self):