prevent division by zero in most_similar method (#4488)

2025-11-03 09:27:56 +03:00 · 2019-10-21 12:04:46 +02:00 · 2019-10-21 12:04:46 +02:00 · d5d55312b2
commit d5d55312b2
parent a98d1cd58e
2 changed files with 14 additions and 2 deletions
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@ -15,6 +15,7 @@ from spacy.util import decaying
 import numpy
 import re

+from spacy.vectors import Vectors
 from ..util import get_doc


@ -293,6 +294,13 @@ def test_issue3410():
        list(phrasematcher.pipe(docs, n_threads=4))


+def test_issue3412():
+    data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")  # row 0 is all zeros, i.e. a zero-norm vector
+    vectors = Vectors(data=data)
+    keys, best_rows, scores = vectors.most_similar(numpy.asarray([[9, 8, 7], [0, 0, 0]], dtype="f"))  # second query is all zeros as well
+    assert(best_rows[0] == 2)  # first query is identical to data row 2, so row 2 is the expected best match
+
+
 def test_issue3447():
    sizes = decaying(10.0, 1.0, 0.5)
    size = next(sizes)
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@ -321,14 +321,18 @@ cdef class Vectors:
        """
        xp = get_array_module(self.data)

-        vectors = self.data / xp.linalg.norm(self.data, axis=1, keepdims=True)
+        norms = xp.linalg.norm(self.data, axis=1, keepdims=True)
+        norms[norms == 0] = 1
+        vectors = self.data / norms

        best_rows = xp.zeros((queries.shape[0], n), dtype='i')
        scores = xp.zeros((queries.shape[0], n), dtype='f')
        # Work in batches, to avoid memory problems.
        for i in range(0, queries.shape[0], batch_size):
            batch = queries[i : i+batch_size]
-            batch /= xp.linalg.norm(batch, axis=1, keepdims=True)
+            batch_norms = xp.linalg.norm(batch, axis=1, keepdims=True)
+            batch_norms[batch_norms == 0] = 1
+            batch /= batch_norms
            # batch   e.g. (1024, 300)
            # vectors e.g. (10000, 300)
            # sims    e.g. (1024, 10000)