mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
Merge branch 'master' of ssh://github.com/honnibal/spaCy
This commit is contained in:
commit
f5c256745b
9
fabfile.py
vendored
9
fabfile.py
vendored
|
@ -61,11 +61,14 @@ def docs():
|
||||||
with lcd(path.join(path.dirname(__file__), 'docs')):
|
with lcd(path.join(path.dirname(__file__), 'docs')):
|
||||||
local('make html')
|
local('make html')
|
||||||
|
|
||||||
def publish():
|
def publish(version):
|
||||||
with virtualenv(VENV_DIR):
|
with virtualenv(VENV_DIR):
|
||||||
local('python setup.py register')
|
|
||||||
local('twine upload dist/*.tar.gz')
|
|
||||||
local('git push origin master')
|
local('git push origin master')
|
||||||
|
local('git tag -a %s' % version)
|
||||||
|
local('git push origin %s' % version)
|
||||||
|
local('python setup.py sdist')
|
||||||
|
local('python setup.py register')
|
||||||
|
local('twine upload dist/%s.tar.gz' % version)
|
||||||
|
|
||||||
|
|
||||||
def env(lang="python2.7"):
|
def env(lang="python2.7"):
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -134,7 +134,7 @@ def run_setup(exts):
|
||||||
headers_workaround.install_headers('numpy')
|
headers_workaround.install_headers('numpy')
|
||||||
|
|
||||||
|
|
||||||
VERSION = '0.91'
|
VERSION = '0.93'
|
||||||
def main(modules, is_pypy):
|
def main(modules, is_pypy):
|
||||||
language = "cpp"
|
language = "cpp"
|
||||||
includes = ['.', path.join(sys.prefix, 'include')]
|
includes = ['.', path.join(sys.prefix, 'include')]
|
||||||
|
|
|
@ -43,11 +43,18 @@ cdef class Lexeme:
|
||||||
return True if Lexeme.c_check_flag(self.c, flag_id) else False
|
return True if Lexeme.c_check_flag(self.c, flag_id) else False
|
||||||
|
|
||||||
def similarity(self, other):
|
def similarity(self, other):
|
||||||
|
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||||
|
return 0.0
|
||||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||||
|
|
||||||
property has_vector:
|
property has_vector:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return sum(abs(self.c.repvec)) != 0
|
cdef int i
|
||||||
|
for i in range(self.vocab.vectors_length):
|
||||||
|
if self.c.repvec[i] != 0:
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
||||||
property vector_norm:
|
property vector_norm:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
|
|
@ -127,6 +127,8 @@ cdef class Doc:
|
||||||
return u''.join([t.string for t in self])
|
return u''.join([t.string for t in self])
|
||||||
|
|
||||||
def similarity(self, other):
|
def similarity(self, other):
|
||||||
|
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||||
|
return 0.0
|
||||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||||
|
|
||||||
property repvec:
|
property repvec:
|
||||||
|
|
|
@ -60,6 +60,8 @@ cdef class Span:
|
||||||
self._seq.merge(self[0].idx, self[-1].idx + len(self[-1]), tag, lemma, ent_type)
|
self._seq.merge(self[0].idx, self[-1].idx + len(self[-1]), tag, lemma, ent_type)
|
||||||
|
|
||||||
def similarity(self, other):
|
def similarity(self, other):
|
||||||
|
if self.vector_norm == 0.0 or other.vector_norm == 0.0:
|
||||||
|
return 0.0
|
||||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||||
|
|
||||||
property vector:
|
property vector:
|
||||||
|
|
|
@ -50,6 +50,8 @@ cdef class Token:
|
||||||
return self.doc[self.i+i]
|
return self.doc[self.i+i]
|
||||||
|
|
||||||
def similarity(self, other):
|
def similarity(self, other):
|
||||||
|
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||||
|
return 0.0
|
||||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||||
|
|
||||||
property lex_id:
|
property lex_id:
|
||||||
|
@ -130,7 +132,12 @@ cdef class Token:
|
||||||
|
|
||||||
property has_vector:
|
property has_vector:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return sum(abs(self.c.lex.repvec)) != 0
|
cdef int i
|
||||||
|
for i in range(self.vocab.vectors_length):
|
||||||
|
if self.c.lex.repvec[i] != 0:
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
||||||
property vector:
|
property vector:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
|
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
|
||||||
from libc.string cimport memset
|
from libc.string cimport memset
|
||||||
from libc.stdint cimport int32_t
|
from libc.stdint cimport int32_t
|
||||||
|
@ -117,16 +116,14 @@ cdef class Vocab:
|
||||||
|
|
||||||
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
|
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
|
||||||
cdef hash_t key
|
cdef hash_t key
|
||||||
#cdef bint is_oov = mem is not self.mem
|
cdef bint is_oov = mem is not self.mem
|
||||||
# TODO
|
|
||||||
is_oov = False
|
|
||||||
mem = self.mem
|
|
||||||
if len(string) < 3:
|
if len(string) < 3:
|
||||||
mem = self.mem
|
mem = self.mem
|
||||||
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
|
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
|
||||||
lex.orth = self.strings[string]
|
lex.orth = self.strings[string]
|
||||||
lex.length = len(string)
|
lex.length = len(string)
|
||||||
lex.id = self.length
|
lex.id = self.length
|
||||||
|
lex.repvec = <float*>mem.alloc(self.vectors_length, sizeof(float))
|
||||||
if self.get_lex_attr is not None:
|
if self.get_lex_attr is not None:
|
||||||
for attr, func in self.get_lex_attr.items():
|
for attr, func in self.get_lex_attr.items():
|
||||||
value = func(string)
|
value = func(string)
|
||||||
|
@ -283,7 +280,7 @@ cdef class Vocab:
|
||||||
vec_len, len(pieces))
|
vec_len, len(pieces))
|
||||||
orth = self.strings[word_str]
|
orth = self.strings[word_str]
|
||||||
lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth)
|
lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth)
|
||||||
lexeme.repvec = <float*>self.mem.alloc(len(pieces), sizeof(float))
|
lexeme.repvec = <float*>self.mem.alloc(self.vectors_length, sizeof(float))
|
||||||
|
|
||||||
for i, val_str in enumerate(pieces):
|
for i, val_str in enumerate(pieces):
|
||||||
lexeme.repvec[i] = float(val_str)
|
lexeme.repvec[i] = float(val_str)
|
||||||
|
|
|
@ -51,3 +51,14 @@ def test_is_properties(EN):
|
||||||
assert addr.is_oov
|
assert addr.is_oov
|
||||||
assert not Hi.is_oov
|
assert not Hi.is_oov
|
||||||
|
|
||||||
|
@pytest.mark.models
|
||||||
|
def test_vectors(EN):
|
||||||
|
apples, oranges, oov = EN(u'apples oranges ldskbjlsdkbflzdfbl')
|
||||||
|
assert apples.has_vector
|
||||||
|
assert oranges.has_vector
|
||||||
|
assert not oov.has_vector
|
||||||
|
assert apples.similarity(oranges) > apples.similarity(oov)
|
||||||
|
assert apples.similarity(oranges) == oranges.similarity(apples)
|
||||||
|
assert sum(apples.vector) != sum(oranges.vector)
|
||||||
|
assert apples.vector_norm != oranges.vector_norm
|
||||||
|
|
||||||
|
|
|
@ -4,15 +4,15 @@ from spacy.en import English
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
@pytest.mark.vectors
|
@pytest.mark.models
|
||||||
def test_vec(EN):
|
def test_vec(EN):
|
||||||
hype = EN.vocab['hype']
|
hype = EN.vocab['hype']
|
||||||
assert hype.orth_ == 'hype'
|
assert hype.orth_ == 'hype'
|
||||||
assert 0.08 >= hype.repvec[0] > 0.07
|
assert 0.08 >= hype.vector[0] > 0.07
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.vectors
|
@pytest.mark.models
|
||||||
def test_capitalized(EN):
|
def test_capitalized(EN):
|
||||||
hype = EN.vocab['Hype']
|
hype = EN.vocab['Hype']
|
||||||
assert hype.orth_ == 'Hype'
|
assert hype.orth_ == 'Hype'
|
||||||
assert 0.08 >= hype.repvec[0] > 0.07
|
assert 0.08 >= hype.vector[0] > 0.07
|
||||||
|
|
Loading…
Reference in New Issue
Block a user