Clip most_similar to range [-1, 1] (fixes #4506) (#4507)

* Clip most_similar to range [-1, 1]

* Add/fix vectors tests

* Fix test
This commit is contained in:
Matthew Honnibal 2019-10-22 20:10:42 +02:00 committed by Ines Montani
parent 74a19aeb1c
commit 9489c5f6b2
2 changed files with 6 additions and 3 deletions

View File

@@ -141,7 +141,6 @@ def test_vectors_most_similar(most_similar_vectors_data):
assert all(row[0] == i for i, row in enumerate(best_rows))
@pytest.mark.xfail
def test_vectors_most_similar_identical():
"""Test that most similar identical vectors are assigned a score of 1.0."""
data = numpy.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
@@ -315,4 +314,4 @@ def test_vocab_prune_vectors():
assert list(remap.keys()) == ["kitten"]
neighbour, similarity = list(remap.values())[0]
assert neighbour == "cat", remap
assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-6)
assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3)

View File

@@ -344,8 +344,12 @@ cdef class Vectors:
sorted_index = xp.arange(scores.shape[0])[:,None][i:i+batch_size],xp.argsort(scores[i:i+batch_size], axis=1)[:,::-1]
scores[i:i+batch_size] = scores[sorted_index]
best_rows[i:i+batch_size] = best_rows[sorted_index]
xp = get_array_module(self.data)
# Round values really close to 1 or -1
scores = xp.around(scores, decimals=4, out=scores)
# Account for numerical error: we want to return scores in the range [-1, 1]
scores = xp.clip(scores, a_min=-1, a_max=1, out=scores)
row2key = {row: key for key, row in self.key2row.items()}
keys = xp.asarray(
[[row2key[row] for row in best_rows[i] if row in row2key]