Merge branch 'master' of ssh://github.com/honnibal/spaCy

This commit is contained in:
Matthew Honnibal 2015-09-22 12:26:24 +10:00
commit f5c256745b
9 changed files with 45 additions and 16 deletions

9
fabfile.py vendored
View File

@ -61,11 +61,14 @@ def docs():
with lcd(path.join(path.dirname(__file__), 'docs')): with lcd(path.join(path.dirname(__file__), 'docs')):
local('make html') local('make html')
def publish(): def publish(version):
with virtualenv(VENV_DIR): with virtualenv(VENV_DIR):
local('python setup.py register')
local('twine upload dist/*.tar.gz')
local('git push origin master') local('git push origin master')
local('git tag -a %s' % version)
local('git push origin %s' % version)
local('python setup.py sdist')
local('python setup.py register')
local('twine upload dist/%s.tar.gz' % version)
def env(lang="python2.7"): def env(lang="python2.7"):

View File

@ -134,7 +134,7 @@ def run_setup(exts):
headers_workaround.install_headers('numpy') headers_workaround.install_headers('numpy')
VERSION = '0.91' VERSION = '0.93'
def main(modules, is_pypy): def main(modules, is_pypy):
language = "cpp" language = "cpp"
includes = ['.', path.join(sys.prefix, 'include')] includes = ['.', path.join(sys.prefix, 'include')]

View File

@ -43,11 +43,18 @@ cdef class Lexeme:
return True if Lexeme.c_check_flag(self.c, flag_id) else False return True if Lexeme.c_check_flag(self.c, flag_id) else False
def similarity(self, other): def similarity(self, other):
if self.vector_norm == 0 or other.vector_norm == 0:
return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
property has_vector: property has_vector:
def __get__(self): def __get__(self):
return sum(abs(self.c.repvec)) != 0 cdef int i
for i in range(self.vocab.vectors_length):
if self.c.repvec[i] != 0:
return True
else:
return False
property vector_norm: property vector_norm:
def __get__(self): def __get__(self):

View File

@ -127,6 +127,8 @@ cdef class Doc:
return u''.join([t.string for t in self]) return u''.join([t.string for t in self])
def similarity(self, other): def similarity(self, other):
if self.vector_norm == 0 or other.vector_norm == 0:
return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
property repvec: property repvec:

View File

@ -60,6 +60,8 @@ cdef class Span:
self._seq.merge(self[0].idx, self[-1].idx + len(self[-1]), tag, lemma, ent_type) self._seq.merge(self[0].idx, self[-1].idx + len(self[-1]), tag, lemma, ent_type)
def similarity(self, other): def similarity(self, other):
if self.vector_norm == 0.0 or other.vector_norm == 0.0:
return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
property vector: property vector:

View File

@ -50,6 +50,8 @@ cdef class Token:
return self.doc[self.i+i] return self.doc[self.i+i]
def similarity(self, other): def similarity(self, other):
if self.vector_norm == 0 or other.vector_norm == 0:
return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
property lex_id: property lex_id:
@ -130,7 +132,12 @@ cdef class Token:
property has_vector: property has_vector:
def __get__(self): def __get__(self):
return sum(abs(self.c.lex.repvec)) != 0 cdef int i
for i in range(self.vocab.vectors_length):
if self.c.lex.repvec[i] != 0:
return True
else:
return False
property vector: property vector:
def __get__(self): def __get__(self):

View File

@ -1,6 +1,5 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from libc.string cimport memset from libc.string cimport memset
from libc.stdint cimport int32_t from libc.stdint cimport int32_t
@ -117,16 +116,14 @@ cdef class Vocab:
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL: cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
cdef hash_t key cdef hash_t key
#cdef bint is_oov = mem is not self.mem cdef bint is_oov = mem is not self.mem
# TODO
is_oov = False
mem = self.mem
if len(string) < 3: if len(string) < 3:
mem = self.mem mem = self.mem
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1) lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
lex.orth = self.strings[string] lex.orth = self.strings[string]
lex.length = len(string) lex.length = len(string)
lex.id = self.length lex.id = self.length
lex.repvec = <float*>mem.alloc(self.vectors_length, sizeof(float))
if self.get_lex_attr is not None: if self.get_lex_attr is not None:
for attr, func in self.get_lex_attr.items(): for attr, func in self.get_lex_attr.items():
value = func(string) value = func(string)
@ -283,7 +280,7 @@ cdef class Vocab:
vec_len, len(pieces)) vec_len, len(pieces))
orth = self.strings[word_str] orth = self.strings[word_str]
lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth) lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth)
lexeme.repvec = <float*>self.mem.alloc(len(pieces), sizeof(float)) lexeme.repvec = <float*>self.mem.alloc(self.vectors_length, sizeof(float))
for i, val_str in enumerate(pieces): for i, val_str in enumerate(pieces):
lexeme.repvec[i] = float(val_str) lexeme.repvec[i] = float(val_str)

View File

@ -51,3 +51,14 @@ def test_is_properties(EN):
assert addr.is_oov assert addr.is_oov
assert not Hi.is_oov assert not Hi.is_oov
@pytest.mark.models
def test_vectors(EN):
apples, oranges, oov = EN(u'apples oranges ldskbjlsdkbflzdfbl')
assert apples.has_vector
assert oranges.has_vector
assert not oov.has_vector
assert apples.similarity(oranges) > apples.similarity(oov)
assert apples.similarity(oranges) == oranges.similarity(apples)
assert sum(apples.vector) != sum(oranges.vector)
assert apples.vector_norm != oranges.vector_norm

View File

@ -4,15 +4,15 @@ from spacy.en import English
import pytest import pytest
@pytest.mark.vectors @pytest.mark.models
def test_vec(EN): def test_vec(EN):
hype = EN.vocab['hype'] hype = EN.vocab['hype']
assert hype.orth_ == 'hype' assert hype.orth_ == 'hype'
assert 0.08 >= hype.repvec[0] > 0.07 assert 0.08 >= hype.vector[0] > 0.07
@pytest.mark.vectors @pytest.mark.models
def test_capitalized(EN): def test_capitalized(EN):
hype = EN.vocab['Hype'] hype = EN.vocab['Hype']
assert hype.orth_ == 'Hype' assert hype.orth_ == 'Hype'
assert 0.08 >= hype.repvec[0] > 0.07 assert 0.08 >= hype.vector[0] > 0.07