From 05bdbe28bbe581bb9f7a3d236c2447d46b0b254e Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Mon, 19 Apr 2021 10:30:03 +0200
Subject: [PATCH] Fix vectors data on GPU (#7626)

* ensure vectors data is stored on right device

* ensure the added vector is on the right device

* move vector to numpy before iterating

* move best_rows to numpy before iterating
---
 spacy/vectors.pyx | 17 ++++++++++++-----
 spacy/vocab.pyx   | 12 +++++++-----
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index bcea87e67..7cb3322c2 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -55,7 +55,7 @@ cdef class Vectors:
         """Create a new vector store.
 
         shape (tuple): Size of the table, as (# entries, # columns)
-        data (numpy.ndarray): The vector data.
+        data (numpy.ndarray or cupy.ndarray): The vector data.
         keys (iterable): A sequence of keys, aligned with the data.
         name (str): A name to identify the vectors table.
 
@@ -65,7 +65,8 @@ cdef class Vectors:
         if data is None:
             if shape is None:
                 shape = (0,0)
-            data = numpy.zeros(shape, dtype="f")
+            ops = get_current_ops()
+            data = ops.xp.zeros(shape, dtype="f")
         self.data = data
         self.key2row = {}
         if self.data is not None:
@@ -300,6 +301,8 @@ cdef class Vectors:
         else:
             raise ValueError(Errors.E197.format(row=row, key=key))
         if vector is not None:
+            xp = get_array_module(self.data)
+            vector = xp.asarray(vector)
             self.data[row] = vector
         if self._unset.count(row):
             self._unset.erase(self._unset.find(row))
@@ -321,10 +324,11 @@ cdef class Vectors:
         RETURNS (tuple): The most similar entries as a `(keys, best_rows, scores)`
             tuple.
         """
+        xp = get_array_module(self.data)
         filled = sorted(list({row for row in self.key2row.values()}))
         if len(filled) < n:
             raise ValueError(Errors.E198.format(n=n, n_rows=len(filled)))
-        xp = get_array_module(self.data)
+        filled = xp.asarray(filled)
 
         norms = xp.linalg.norm(self.data[filled], axis=1, keepdims=True)
         norms[norms == 0] = 1
@@ -357,8 +361,10 @@ cdef class Vectors:
         # Account for numerical error we want to return in range -1, 1
         scores = xp.clip(scores, a_min=-1, a_max=1, out=scores)
         row2key = {row: key for key, row in self.key2row.items()}
+
+        numpy_rows = get_current_ops().to_numpy(best_rows)
         keys = xp.asarray(
-            [[row2key[row] for row in best_rows[i] if row in row2key]
+            [[row2key[row] for row in numpy_rows[i] if row in row2key]
                     for i in range(len(queries)) ], dtype="uint64")
         return (keys, best_rows, scores)
 
@@ -459,7 +465,8 @@ cdef class Vectors:
             if hasattr(self.data, "from_bytes"):
                 self.data.from_bytes()
             else:
-                self.data = srsly.msgpack_loads(b)
+                xp = get_array_module(self.data)
+                self.data = xp.asarray(srsly.msgpack_loads(b))
 
         deserializers = {
             "key2row": lambda b: self.key2row.update(srsly.msgpack_loads(b)),
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 1008797b3..ee440898a 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -2,7 +2,7 @@
 from libc.string cimport memcpy
 
 import srsly
-from thinc.api import get_array_module
+from thinc.api import get_array_module, get_current_ops
 import functools
 
 from .lexeme cimport EMPTY_LEXEME, OOV_RANK
@@ -293,7 +293,7 @@ cdef class Vocab:
         among those remaining.
 
         For example, suppose the original table had vectors for the words:
-        ['sat', 'cat', 'feline', 'reclined']. If we prune the vector table to,
+        ['sat', 'cat', 'feline', 'reclined']. If we prune the vector table to
         two rows, we would discard the vectors for 'feline' and 'reclined'.
         These words would then be remapped to the closest remaining vector
         -- so "feline" would have the same vector as "cat", and "reclined"
@@ -314,6 +314,7 @@ cdef class Vocab:
 
         DOCS: https://spacy.io/api/vocab#prune_vectors
         """
+        ops = get_current_ops()
         xp = get_array_module(self.vectors.data)
         # Make sure all vectors are in the vocab
         for orth in self.vectors:
@@ -329,8 +330,9 @@ cdef class Vocab:
         toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
         self.vectors = Vectors(data=keep, keys=keys[:nr_row], name=self.vectors.name)
         syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size)
+        syn_keys = ops.to_numpy(syn_keys)
         remap = {}
-        for i, key in enumerate(keys[nr_row:]):
+        for i, key in enumerate(ops.to_numpy(keys[nr_row:])):
             self.vectors.add(key, row=syn_rows[i][0])
             word = self.strings[key]
             synonym = self.strings[syn_keys[i][0]]
@@ -351,7 +353,7 @@ cdef class Vocab:
             Defaults to the length of `orth`.
         maxn (int): Maximum n-gram length used for Fasttext's ngram computation.
             Defaults to the length of `orth`.
-        RETURNS (numpy.ndarray): A word vector. Size
+        RETURNS (numpy.ndarray or cupy.ndarray): A word vector. Size
             and shape determined by the `vocab.vectors` instance. Usually, a
             numpy ndarray of shape (300,) and dtype float32.
 
@@ -400,7 +402,7 @@ cdef class Vocab:
         by string or int ID.
 
         orth (int / unicode): The word.
-        vector (numpy.ndarray[ndim=1, dtype='float32']): The vector to set.
+        vector (numpy.ndarray or cupy.ndarray[ndim=1, dtype='float32']): The vector to set.
 
         DOCS: https://spacy.io/api/vocab#set_vector
         """