From 44aecba701600284300438b1c3809cc2c9aeb805 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 22 Sep 2015 01:43:16 +0200 Subject: [PATCH 1/6] * Fix Token.has_vector and Lexeme.has_vector --- spacy/lexeme.pyx | 7 ++++++- spacy/tokens/token.pyx | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index caa126993..66867a648 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -47,7 +47,12 @@ cdef class Lexeme: property has_vector: def __get__(self): - return sum(abs(self.c.repvec)) != 0 + cdef int i + for i in range(self.vocab.vectors_length): + if self.c.repvec[i] != 0: + return True + else: + return False property vector_norm: def __get__(self): diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index b91510678..4f8effa4e 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -130,7 +130,12 @@ cdef class Token: property has_vector: def __get__(self): - return sum(abs(self.c.lex.repvec)) != 0 + cdef int i + for i in range(self.vocab.vectors_length): + if self.c.lex.repvec[i] != 0: + return True + else: + return False property vector: def __get__(self): From f7283a50672611f4da108951d4ce5be659910690 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 22 Sep 2015 02:10:01 +0200 Subject: [PATCH 2/6] * Fix vectors bugs for OOV words --- spacy/lexeme.pyx | 2 ++ spacy/tokens/doc.pyx | 2 ++ spacy/tokens/spans.pyx | 2 ++ spacy/tokens/token.pyx | 2 ++ spacy/vocab.pyx | 9 +++------ 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 66867a648..26acff407 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -43,6 +43,8 @@ cdef class Lexeme: return True if Lexeme.c_check_flag(self.c, flag_id) else False def similarity(self, other): + if self.vector_norm == 0 or other.vector_norm == 0: + return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) property has_vector: diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 19cff3a90..536b3582b 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -127,6 +127,8 @@ cdef class Doc: return u''.join([t.string for t in self]) def similarity(self, other): + if self.vector_norm == 0 or other.vector_norm == 0: + return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) property repvec: diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index 38b9ebcca..cca24cb5b 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -60,6 +60,8 @@ cdef class Span: self._seq.merge(self[0].idx, self[-1].idx + len(self[-1]), tag, lemma, ent_type) def similarity(self, other): + if self.vector_norm == 0.0 or other.vector_norm == 0.0: + return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) property vector: diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 4f8effa4e..25db3f47e 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -50,6 +50,8 @@ cdef class Token: return self.doc[self.i+i] def similarity(self, other): + if self.vector_norm == 0 or other.vector_norm == 0: + return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) property lex_id: diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index bb256b02e..e3ac67bf7 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -1,6 +1,5 @@ from __future__ import unicode_literals - from libc.stdio cimport fopen, fclose, fread, fwrite, FILE from libc.string cimport memset from libc.stdint cimport int32_t @@ -117,16 +116,14 @@ cdef class Vocab: cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL: cdef hash_t key - #cdef bint is_oov = mem is not self.mem - # TODO - is_oov = False - mem = self.mem + cdef bint is_oov = mem is not self.mem if len(string) < 3: mem = self.mem lex = mem.alloc(sizeof(LexemeC), 1) lex.orth = self.strings[string] lex.length = len(string) lex.id = self.length + lex.repvec = mem.alloc(self.vectors_length, sizeof(float)) if self.get_lex_attr is not None: for attr, func in self.get_lex_attr.items(): value = func(string) @@ -283,7 +280,7 @@ cdef class Vocab: vec_len, len(pieces)) orth = self.strings[word_str] lexeme = self.get_by_orth(self.mem, orth) - lexeme.repvec = self.mem.alloc(len(pieces), sizeof(float)) + lexeme.repvec = self.mem.alloc(self.vectors_length, sizeof(float)) for i, val_str in enumerate(pieces): lexeme.repvec[i] = float(val_str) From d9c29de3af190db3a3b3bef64bcff434ebe04b57 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 22 Sep 2015 02:11:31 +0200 Subject: [PATCH 3/6] * Add vectors tests for token API --- tests/tokens/test_token_api.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/tokens/test_token_api.py b/tests/tokens/test_token_api.py index e85feca6b..99c99fc11 100644 --- a/tests/tokens/test_token_api.py +++ b/tests/tokens/test_token_api.py @@ -51,3 +51,14 @@ def test_is_properties(EN): assert addr.is_oov assert not Hi.is_oov +@pytest.mark.models +def test_vectors(EN): + apples, oranges, oov = EN(u'apples oranges ldskbjlsdkbflzdfbl') + assert apples.has_vector + assert oranges.has_vector + assert not oov.has_vector + assert apples.similarity(oranges) > apples.similarity(oov) + assert apples.similarity(oranges) == oranges.similarity(apples) + assert sum(apples.vector) != sum(oranges.vector) + assert apples.vector_norm != oranges.vector_norm + From 361f6fdd744152cc3185f385ee62de32b9a5d300 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 22 Sep 2015 02:22:27 +0200 Subject: [PATCH 4/6] * Inc version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a32967b6d..6c514ff06 100644 --- a/setup.py +++ b/setup.py @@ -134,7 +134,7 @@ def run_setup(exts): headers_workaround.install_headers('numpy') -VERSION = '0.91' +VERSION = '0.93' def main(modules, is_pypy): language = "cpp" includes = ['.', path.join(sys.prefix, 'include')] From 5fa18e4f9dd0f1ae515b4348071f972153ddeb10 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 22 Sep 2015 02:23:11 +0200 Subject: [PATCH 5/6] * Update mark on test_vec --- tests/tokens/test_vec.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/tokens/test_vec.py b/tests/tokens/test_vec.py index 26825f7a9..028328fa6 100644 --- a/tests/tokens/test_vec.py +++ b/tests/tokens/test_vec.py @@ -4,15 +4,15 @@ from spacy.en import English import pytest -@pytest.mark.vectors +@pytest.mark.models def test_vec(EN): hype = EN.vocab['hype'] assert hype.orth_ == 'hype' - assert 0.08 >= hype.repvec[0] > 0.07 + assert 0.08 >= hype.vector[0] > 0.07 -@pytest.mark.vectors +@pytest.mark.models def test_capitalized(EN): hype = EN.vocab['Hype'] assert hype.orth_ == 'Hype' - assert 0.08 >= hype.repvec[0] > 0.07 + assert 0.08 >= hype.vector[0] > 0.07 From 9431441544d6d38e507426f7fb7398055c252e84 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 22 Sep 2015 02:26:10 +0200 Subject: [PATCH 6/6] * Update the publish command, so that it creates a git tag --- fabfile.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fabfile.py b/fabfile.py index a3e2e65f7..953c02e00 100644 --- a/fabfile.py +++ b/fabfile.py @@ -61,11 +61,14 @@ def docs(): with lcd(path.join(path.dirname(__file__), 'docs')): local('make html') -def publish(): +def publish(version): with virtualenv(VENV_DIR): - local('python setup.py register') - local('twine upload dist/*.tar.gz') local('git push origin master') + local('git tag -a %s' % version) + local('git push origin %s' % version) + local('python setup.py sdist') + local('python setup.py register') + local('twine upload dist/%s.tar.gz' % version) def env(lang="python2.7"):