mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.
This commit is contained in:
parent
e13e47e9e5
commit
65dc0d1dfb
|
@ -3,6 +3,13 @@ from cpython.ref cimport Py_INCREF
|
|||
from cymem.cymem cimport Pool
|
||||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
# Compiler crashes on memory view coercion without this. Should report bug.
|
||||
from cython.view cimport array as cvarray
|
||||
cimport numpy as np
|
||||
np.import_array()
|
||||
|
||||
|
||||
|
||||
from libc.string cimport memset
|
||||
|
||||
from .orth cimport word_shape
|
||||
|
@ -35,6 +42,26 @@ cdef class Lexeme:
|
|||
def py_check_flag(self, attr_id_t flag_id):
    """Return True if the flag `flag_id` is set on this lexeme, else False.

    Coerces the result of `Lexeme.check_flag` on the underlying struct
    (`self.c`) to a Python bool.
    """
    return bool(Lexeme.check_flag(self.c, flag_id))
|
||||
|
||||
def similarity(self, other):
    """Return the cosine similarity between this object and `other`.

    `other` may be any object that exposes `vector` and `vector_norm`.

    Returns 0.0 when either norm is zero, rather than dividing by zero
    (which yields NaN/inf for numpy scalars).
    """
    # NOTE(review): relies on a module-level `import numpy`, which is not
    # visible in the portion of this file shown in the diff -- confirm.
    self_norm = self.vector_norm
    other_norm = other.vector_norm
    if self_norm == 0 or other_norm == 0:
        # A zero-length vector has no direction; define similarity as 0.
        return 0.0
    return numpy.dot(self.vector, other.vector) / (self_norm * other_norm)
|
||||
|
||||
property vector_norm:
    """Norm used as the denominator term in `similarity`.

    The value is read from and written to `self.c.l2_norm`; it is not
    recomputed from the vector here.
    """
    def __get__(self):
        return self.c.l2_norm

    def __set__(self, float value):
        self.c.l2_norm = value
|
||||
|
||||
property vector:
    """The lexeme's word vector as a numpy array.

    Built from a float memoryview over `self.c.repvec` spanning
    `self.vocab.repvec_length` elements.
    """
    def __get__(self):
        cdef int n_dims = self.vocab.repvec_length
        vec_view = <float[:n_dims,]>self.c.repvec
        return numpy.asarray(vec_view)
|
||||
|
||||
property repvec:
    """Older name for `vector`, kept for backwards compatibility."""
    def __get__(self):
        # Delegate so both names always return the same value.
        return self.vector
|
||||
|
||||
property orth_:
    """Look up `self.c.orth` in the vocabulary's string table."""
    def __get__(self):
        orth_id = self.c.orth
        return self.vocab.strings[orth_id]
|
||||
|
|
|
@ -3,7 +3,9 @@ from libc.string cimport memcpy, memset
|
|||
from libc.stdint cimport uint32_t
|
||||
|
||||
import numpy
|
||||
import numpy.linalg
|
||||
import struct
|
||||
cimport numpy as np
|
||||
|
||||
from ..lexeme cimport Lexeme
|
||||
from ..lexeme cimport EMPTY_LEXEME
|
||||
|
@ -118,6 +120,22 @@ cdef class Doc:
|
|||
def __str__(self):
    """Return the concatenation of every token's `string` attribute."""
    pieces = [token.string for token in self]
    return u''.join(pieces)
|
||||
|
||||
def similarity(self, other):
    """Return the cosine similarity between this object and `other`.

    `other` may be any object that exposes `vector` and `vector_norm`.

    Returns 0.0 when either norm is zero, rather than dividing by zero
    (which yields NaN/inf for numpy scalars).
    """
    # Read each norm once: both the guard and the division need it.
    self_norm = self.vector_norm
    other_norm = other.vector_norm
    if self_norm == 0 or other_norm == 0:
        # A zero-length vector has no direction; define similarity as 0.
        return 0.0
    return numpy.dot(self.vector, other.vector) / (self_norm * other_norm)
|
||||
|
||||
property repvec:
    """Older name for `vector`, kept for backwards compatibility."""
    def __get__(self):
        # Delegate so both names always return the same value.
        return self.vector
|
||||
|
||||
property vector:
    """Vector for the whole document, as a numpy array.

    Sums the vectors of all non-stop tokens and divides by the total
    number of tokens. When there is nothing to sum (no tokens, or every
    token is a stop word) a zero vector is returned, so callers always
    get an array rather than a ZeroDivisionError or a bare scalar.
    """
    def __get__(self):
        content_vectors = [t.vector for t in self if not t.is_stop]
        if not content_vectors:
            # NOTE(review): assumes Doc exposes `vocab.repvec_length` the
            # same way Token and Lexeme do -- confirm. Only reached on the
            # path that previously failed or returned a scalar.
            return numpy.zeros((self.vocab.repvec_length,), dtype='float32')
        # NOTE(review): the divisor counts stop words even though the sum
        # skips them; kept as-is so existing vector magnitudes do not change.
        return sum(content_vectors) / len(self)
|
||||
|
||||
|
||||
property vector_norm:
    """Norm of `self.vector`, computed with `numpy.linalg.norm` on each access."""
    def __get__(self):
        doc_vector = self.vector
        return numpy.linalg.norm(doc_vector)
|
||||
|
||||
@property
def string(self):
    """Return the concatenation of every token's `string` attribute."""
    pieces = [token.string for token in self]
    return u''.join(pieces)
|
||||
|
|
|
@ -1,5 +1,8 @@
|
|||
from __future__ import unicode_literals
|
||||
from collections import defaultdict
|
||||
import numpy
|
||||
import numpy.linalg
|
||||
cimport numpy as np
|
||||
|
||||
from ..structs cimport TokenC, LexemeC
|
||||
from ..typedefs cimport flags_t, attr_t
|
||||
|
@ -52,6 +55,17 @@ cdef class Span:
|
|||
def merge(self, unicode tag, unicode lemma, unicode ent_type):
    """Ask the parent sequence to merge the character range of this span.

    The range starts at the first token's `idx` and ends at the last
    token's `idx` plus its length. `tag`, `lemma` and `ent_type` are
    passed through to `self._seq.merge` unchanged. Nothing is returned.
    """
    first_token = self[0]
    last_token = self[-1]
    start_char = first_token.idx
    end_char = last_token.idx + len(last_token)
    self._seq.merge(start_char, end_char, tag, lemma, ent_type)
|
||||
|
||||
def similarity(self, other):
    """Return the cosine similarity between this object and `other`.

    `other` may be any object that exposes `vector` and `vector_norm`.

    Returns 0.0 when either norm is zero, rather than dividing by zero
    (which yields NaN/inf for numpy scalars).
    """
    # Read each norm once: both the guard and the division need it.
    self_norm = self.vector_norm
    other_norm = other.vector_norm
    if self_norm == 0 or other_norm == 0:
        # A zero-length vector has no direction; define similarity as 0.
        return 0.0
    return numpy.dot(self.vector, other.vector) / (self_norm * other_norm)
|
||||
|
||||
property vector:
    """Vector for the span, as a numpy array.

    Sums the vectors of all non-stop tokens and divides by the total
    number of tokens. When there is nothing to sum (no tokens, or every
    token is a stop word) a zero vector is returned, so callers always
    get an array rather than a ZeroDivisionError or a bare scalar.
    """
    def __get__(self):
        content_vectors = [t.vector for t in self if not t.is_stop]
        if not content_vectors:
            # NOTE(review): assumes the parent sequence `self._seq` exposes
            # `vocab.repvec_length` -- confirm. Only reached on the path
            # that previously failed or returned a scalar.
            return numpy.zeros((self._seq.vocab.repvec_length,), dtype='float32')
        # NOTE(review): the divisor counts stop words even though the sum
        # skips them; kept as-is so existing vector magnitudes do not change.
        return sum(content_vectors) / len(self)
|
||||
|
||||
property vector_norm:
    """Norm of `self.vector`, computed with `numpy.linalg.norm` on each access."""
    def __get__(self):
        span_vector = self.vector
        return numpy.linalg.norm(span_vector)
|
||||
|
||||
property text:
    """The `text` of each token in the span, joined by single spaces."""
    def __get__(self):
        token_texts = [token.text for token in self]
        return u' '.join(token_texts)
|
||||
|
|
|
@ -49,6 +49,9 @@ cdef class Token:
|
|||
def nbor(self, int i=1):
    """Return the token `i` positions away from this one in `self.doc`.

    The result is `self.doc[self.i + i]`; `i` defaults to 1.
    """
    neighbour_index = self.i + i
    return self.doc[neighbour_index]
|
||||
|
||||
def similarity(self, other):
    """Return the cosine similarity between this object and `other`.

    `other` may be any object that exposes `vector` and `vector_norm`.

    Returns 0.0 when either norm is zero, rather than dividing by zero
    (which yields NaN/inf for numpy scalars).
    """
    self_norm = self.vector_norm
    other_norm = other.vector_norm
    if self_norm == 0 or other_norm == 0:
        # A zero-length vector has no direction; define similarity as 0.
        return 0.0
    return numpy.dot(self.vector, other.vector) / (self_norm * other_norm)
|
||||
|
||||
property lex_id:
    """The `id` field of the lexeme struct this token points to."""
    def __get__(self):
        lexeme_id = self.c.lex.id
        return lexeme_id
|
||||
|
@ -125,12 +128,20 @@ cdef class Token:
|
|||
def __get__(self):
|
||||
return self.c.dep
|
||||
|
||||
property repvec:
|
||||
property vector:
    """The token's word vector as a numpy array.

    Built from a float memoryview over the lexeme's `repvec` pointer
    (`self.c.lex.repvec`) spanning `self.vocab.repvec_length` elements.
    """
    def __get__(self):
        cdef int n_dims = self.vocab.repvec_length
        vec_view = <float[:n_dims,]>self.c.lex.repvec
        return numpy.asarray(vec_view)
|
||||
|
||||
property repvec:
    """Older name for `vector`, kept for backwards compatibility."""
    def __get__(self):
        # Delegate so both names always return the same value.
        return self.vector
|
||||
|
||||
property vector_norm:
    """Norm used as the denominator term in `similarity`.

    Read from the lexeme struct (`self.c.lex.l2_norm`); it is not
    recomputed from the vector here.
    """
    def __get__(self):
        return self.c.lex.l2_norm
|
||||
|
||||
property n_lefts:
|
||||
def __get__(self):
|
||||
cdef int n = 0
|
||||
|
@ -302,6 +313,9 @@ cdef class Token:
|
|||
property is_oov:
    """Whether the IS_OOV flag is set on the token's lexeme."""
    def __get__(self):
        return Lexeme.check_flag(self.c.lex, IS_OOV)
|
||||
|
||||
property is_stop:
    """Whether the IS_STOP flag is set on the token's lexeme."""
    def __get__(self):
        return Lexeme.check_flag(self.c.lex, IS_STOP)
|
||||
|
||||
property is_alpha:
    """Whether the IS_ALPHA flag is set on the token's lexeme."""
    def __get__(self):
        return Lexeme.check_flag(self.c.lex, IS_ALPHA)
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user