diff --git a/spacy/_ml.py b/spacy/_ml.py
index e9dac11df..fa8e83d48 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -30,9 +30,13 @@ VECTORS_KEY = 'spacy_pretrained_vectors'
 
 
 def cosine(vec1, vec2):
-    norm1 = (vec1**2).sum() ** 0.5
-    norm2 = (vec2**2).sum() ** 0.5
-    return vec1.dot(vec2) / (norm1 * norm2)
+    xp = get_array_module(vec1)
+    norm1 = xp.linalg.norm(vec1)
+    norm2 = xp.linalg.norm(vec2)
+    if norm1 == 0. or norm2 == 0.:
+        return 0
+    else:
+        return vec1.dot(vec2) / (norm1 * norm2)
 
 
 @layerize
diff --git a/spacy/tests/vocab/test_add_vectors.py b/spacy/tests/vocab/test_add_vectors.py
index 0ce95e5e9..3cb0b632c 100644
--- a/spacy/tests/vocab/test_add_vectors.py
+++ b/spacy/tests/vocab/test_add_vectors.py
@@ -2,7 +2,7 @@
 from __future__ import unicode_literals
 
 import numpy
-import pytest
+from numpy.testing import assert_allclose
 
 from ...vocab import Vocab
 from ..._ml import cosine
@@ -18,8 +18,6 @@ def test_vocab_add_vector():
     assert list(cat.vector) == [1., 1., 1.]
     dog = vocab[u'dog']
     assert list(dog.vector) == [2., 2., 2.]
-    for lex in vocab:
-        print(lex.orth_)
 
 
 def test_vocab_prune_vectors():
@@ -27,7 +25,6 @@
     _ = vocab[u'cat']
     _ = vocab[u'dog']
     _ = vocab[u'kitten']
-    print(list(vocab.strings))
     data = numpy.ndarray((5,3), dtype='f')
     data[0] = 1.
     data[1] = 2.
@@ -35,9 +32,9 @@
     vocab.set_vector(u'cat', data[0])
     vocab.set_vector(u'dog', data[1])
     vocab.set_vector(u'kitten', data[2])
-    for lex in vocab:
-        print(lex.orth_)
     remap = vocab.prune_vectors(2)
-    assert remap == {u'kitten': (u'cat', cosine(data[0], data[2]))}
-    #print(remap)
+    assert list(remap.keys()) == [u'kitten']
+    neighbour, similarity = list(remap.values())[0]
+    assert neighbour == u'cat'
+    assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-6)
 
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 6143986fb..cfc81bbe9 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -281,24 +281,28 @@ cdef class Vocab:
         toss = self.vectors.data[nr_row:]
         # Normalize the vectors, so cosine similarity is just dot product.
         # Note we can't modify the ones we're keeping in-place...
-        keep = keep / (xp.linalg.norm(keep, axis=1, keepdims=True)+1e-8)
+        keep = keep / (xp.linalg.norm(keep, axis=1, keepdims=True)+1e-12)
         keep = xp.ascontiguousarray(keep.T)
         neighbours = xp.zeros((toss.shape[0],), dtype='i')
         scores = xp.zeros((toss.shape[0],), dtype='f')
         for i in range(0, toss.shape[0], batch_size):
             batch = toss[i : i+batch_size]
-            batch /= xp.linalg.norm(batch, axis=1, keepdims=True)+1e-8
+            batch /= xp.linalg.norm(batch, axis=1, keepdims=True)+1e-12
             sims = xp.dot(batch, keep)
             matches = sims.argmax(axis=1)
             neighbours[i:i+batch_size] = matches
             scores[i:i+batch_size] = sims.max(axis=1)
-        for lex in self:
+        i2k = {i: key for key, i in self.vectors.key2row.items()}
+        remap = {}
+        for lex in list(self):
             # If we're losing the vector for this word, map it to the nearest
             # vector we're keeping.
             if lex.rank >= nr_row:
-                lex.rank = neighbours[lex.rank-nr_row]
-                self.vectors.add(lex.orth, row=lex.rank)
-                remap[lex.orth_] = (i2k[lex.rank], scores[lex.rank])
+                # neighbours/scores are indexed by offset into the tossed rows
+                offset = lex.rank - nr_row
+                lex.rank = neighbours[offset]
+                self.vectors.add(lex.orth, row=lex.rank)
+                remap[lex.orth_] = (self.strings[i2k[lex.rank]], scores[offset])
         for key, row in self.vectors.key2row.items():
             if row >= nr_row:
                 self.vectors.key2row[key] = neighbours[row-nr_row]
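
Note on the cosine() change: a minimal numpy-only sketch of the patched helper,
for illustration outside the patch (in spacy/_ml.py the array module is chosen
by thinc's get_array_module, so the same code serves both numpy and cupy
arrays). The guard makes the similarity involving a zero vector come out as 0
rather than nan from a divide-by-zero:

    import numpy

    def cosine(vec1, vec2):
        # numpy-only stand-in for the patched spacy._ml.cosine
        norm1 = numpy.linalg.norm(vec1)
        norm2 = numpy.linalg.norm(vec2)
        if norm1 == 0. or norm2 == 0.:
            return 0  # undefined for zero vectors; report no similarity
        return vec1.dot(vec2) / (norm1 * norm2)

    assert cosine(numpy.zeros(3, dtype='f'), numpy.ones(3, dtype='f')) == 0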
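Usage sketch for the new prune_vectors() return value, mirroring the updated
test; the words, vector values, and row count below are illustrative
assumptions, not part of the patch:

    import numpy
    from spacy.vocab import Vocab

    vocab = Vocab()
    data = numpy.zeros((3, 3), dtype='f')
    data[0] = 1.   # cat
    data[1] = 2.   # dog
    data[2] = 1.1  # kitten
    vocab.set_vector(u'cat', data[0])
    vocab.set_vector(u'dog', data[1])
    vocab.set_vector(u'kitten', data[2])

    # Keep the 2 highest-ranked rows; every pruned entry is remapped to its
    # nearest kept neighbour by cosine similarity.
    remap = vocab.prune_vectors(2)
    # remap maps pruned string -> (neighbour string, similarity score),
    # e.g. {u'kitten': (u'cat', <score close to 1.0>)}
    neighbour, similarity = list(remap.values())[0]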