mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	* Try giving Doc and Span objects vector and vector_norm attributes, and .similarity functions. Turns out to be bad idea.
This commit is contained in:
		
							parent
							
								
									191d593e03
								
							
						
					
					
						commit
						77856c4fcd
					
				| 
						 | 
				
			
			@ -23,6 +23,9 @@ cdef class Doc:
 | 
			
		|||
    cdef readonly Pool mem
 | 
			
		||||
    cdef readonly Vocab vocab
 | 
			
		||||
 | 
			
		||||
    cdef public object _vector
 | 
			
		||||
    cdef public object _vector_norm
 | 
			
		||||
 | 
			
		||||
    cdef TokenC* data
 | 
			
		||||
 | 
			
		||||
    cdef public bint is_tagged
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -6,6 +6,7 @@ import numpy
 | 
			
		|||
import numpy.linalg
 | 
			
		||||
import struct
 | 
			
		||||
cimport numpy as np
 | 
			
		||||
import math
 | 
			
		||||
 | 
			
		||||
from ..lexeme cimport Lexeme
 | 
			
		||||
from ..lexeme cimport EMPTY_LEXEME
 | 
			
		||||
| 
						 | 
				
			
			@ -77,6 +78,7 @@ cdef class Doc:
 | 
			
		|||
        self.is_tagged = False
 | 
			
		||||
        self.is_parsed = False
 | 
			
		||||
        self._py_tokens = []
 | 
			
		||||
        self._vector = None
 | 
			
		||||
 | 
			
		||||
    def __getitem__(self, object i):
 | 
			
		||||
        """Get a token.
 | 
			
		||||
| 
						 | 
				
			
			@ -133,12 +135,25 @@ cdef class Doc:
 | 
			
		|||
 | 
			
		||||
    property vector:
 | 
			
		||||
        def __get__(self):
 | 
			
		||||
            return sum(t.vector for t in self if not t.is_stop) / len(self)
 | 
			
		||||
            if self._vector is None:
 | 
			
		||||
                self._vector = sum(t.vector for t in self) / len(self)
 | 
			
		||||
            return self._vector
 | 
			
		||||
 | 
			
		||||
        def __set__(self, value):
 | 
			
		||||
            self._vector = value
 | 
			
		||||
 | 
			
		||||
    property vector_norm:
 | 
			
		||||
        def __get__(self):
 | 
			
		||||
            return numpy.linalg.norm(self.vector)
 | 
			
		||||
            cdef float value
 | 
			
		||||
            if self._vector_norm is None:
 | 
			
		||||
                self._vector_norm = 1e-20
 | 
			
		||||
                for value in self.vector:
 | 
			
		||||
                    self._vector_norm += value * value
 | 
			
		||||
                self._vector_norm = math.sqrt(self._vector_norm)
 | 
			
		||||
            return self._vector_norm
 | 
			
		||||
        
 | 
			
		||||
        def __set__(self, value):
 | 
			
		||||
            self._vector_norm = value 
 | 
			
		||||
 | 
			
		||||
    @property
 | 
			
		||||
    def string(self):
 | 
			
		||||
| 
						 | 
				
			
			@ -304,15 +319,14 @@ cdef class Doc:
 | 
			
		|||
        cdef size_t count
 | 
			
		||||
        
 | 
			
		||||
        if counts is None:
 | 
			
		||||
            counts = PreshCounter(self.length)
 | 
			
		||||
            counts = PreshCounter()
 | 
			
		||||
            output_dict = True
 | 
			
		||||
        else:
 | 
			
		||||
            output_dict = False
 | 
			
		||||
        # Take this check out of the loop, for a bit of extra speed
 | 
			
		||||
        if exclude is None:
 | 
			
		||||
            for i in range(self.length):
 | 
			
		||||
                attr = get_token_attr(&self.data[i], attr_id)
 | 
			
		||||
                counts.inc(attr, 1)
 | 
			
		||||
                counts.inc(get_token_attr(&self.data[i], attr_id), 1)
 | 
			
		||||
        else:
 | 
			
		||||
            for i in range(self.length):
 | 
			
		||||
                if not exclude(self[i]):
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -7,3 +7,6 @@ cdef class Span:
 | 
			
		|||
    cdef public int start
 | 
			
		||||
    cdef public int end
 | 
			
		||||
    cdef readonly int label
 | 
			
		||||
 | 
			
		||||
    cdef public _vector
 | 
			
		||||
    cdef public _vector_norm
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -3,6 +3,7 @@ from collections import defaultdict
 | 
			
		|||
import numpy
 | 
			
		||||
import numpy.linalg
 | 
			
		||||
cimport numpy as np
 | 
			
		||||
import math
 | 
			
		||||
 | 
			
		||||
from ..structs cimport TokenC, LexemeC
 | 
			
		||||
from ..typedefs cimport flags_t, attr_t
 | 
			
		||||
| 
						 | 
				
			
			@ -21,6 +22,8 @@ cdef class Span:
 | 
			
		|||
        self.start = start
 | 
			
		||||
        self.end = end
 | 
			
		||||
        self.label = label
 | 
			
		||||
        self._vector = None
 | 
			
		||||
        self._vector_norm = None
 | 
			
		||||
 | 
			
		||||
    def __richcmp__(self, Span other, int op):
 | 
			
		||||
        # Eq
 | 
			
		||||
| 
						 | 
				
			
			@ -60,15 +63,32 @@ cdef class Span:
 | 
			
		|||
 | 
			
		||||
    property vector:
 | 
			
		||||
        def __get__(self):
 | 
			
		||||
            return sum(t.vector for t in self if not t.is_stop) / len(self)
 | 
			
		||||
            if self._vector is None:
 | 
			
		||||
                self._vector = sum(t.vector for t in self) / len(self)
 | 
			
		||||
            return self._vector
 | 
			
		||||
 | 
			
		||||
        def __set__(self, value):
 | 
			
		||||
            self._vector = value
 | 
			
		||||
 | 
			
		||||
    property vector_norm:
 | 
			
		||||
        def __get__(self):
 | 
			
		||||
            return numpy.linalg.norm(self.vector)
 | 
			
		||||
            cdef float value
 | 
			
		||||
            if self._vector_norm is None:
 | 
			
		||||
                self._vector_norm = 1e-20
 | 
			
		||||
                for value in self.vector:
 | 
			
		||||
                    self._vector_norm += value * value
 | 
			
		||||
                self._vector_norm = math.sqrt(self._vector_norm)
 | 
			
		||||
            return self._vector_norm
 | 
			
		||||
    
 | 
			
		||||
        def __set__(self, value):
 | 
			
		||||
            self._vector_norm = value
 | 
			
		||||
 | 
			
		||||
    property text:
 | 
			
		||||
        def __get__(self):
 | 
			
		||||
            return u' '.join([t.text for t in self])
 | 
			
		||||
            text = self.text_with_ws
 | 
			
		||||
            if self[-1].whitespace_:
 | 
			
		||||
                text = text[:-1]
 | 
			
		||||
            return text
 | 
			
		||||
 | 
			
		||||
    property text_with_ws:
 | 
			
		||||
        def __get__(self):
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user