From 113e7981d0c60f1e200eb0177c97b282927f61ac Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 13 May 2020 22:08:28 +0200
Subject: [PATCH] Check that row is within bounds when adding vector (#5430)

Check that row is within bounds for the vector data array when adding a
vector.

Don't add vectors with rank OOV_RANK in `init-model` (change is due to
shift from OOV as 0 to OOV as OOV_RANK).
---
 spacy/cli/init_model.py                   | 2 +-
 spacy/errors.py                           | 2 ++
 spacy/tests/vocab_vectors/test_vectors.py | 3 +++
 spacy/vectors.pyx                         | 6 +++++-
 spacy/vocab.pyx                           | 2 +-
 5 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index 31d627e9b..618266633 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -181,7 +181,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
     if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
         nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
         for lex in nlp.vocab:
-            if lex.rank:
+            if lex.rank and lex.rank != OOV_RANK:
                 nlp.vocab.vectors.add(lex.orth, row=lex.rank)
     else:
         if vectors_loc:
diff --git a/spacy/errors.py b/spacy/errors.py
index 779980490..32ccd3df7 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
+
 def add_codes(err_cls):
     """Add error codes to string messages via class attribute names."""
 
@@ -555,6 +556,7 @@ class Errors(object):
     E195 = ("Matcher can be called on {good} only, got {got}.")
     E196 = ("Refusing to write to token.is_sent_end. Sentence boundaries can "
             "only be fixed with token.is_sent_start.")
+    E197 = ("Row out of bounds, unable to add row {row} for key {key}.")
 
 
 @add_codes
diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py
index 8987b7c89..322ef462a 100644
--- a/spacy/tests/vocab_vectors/test_vectors.py
+++ b/spacy/tests/vocab_vectors/test_vectors.py
@@ -307,6 +307,9 @@ def test_vocab_add_vector():
     dog = vocab["dog"]
     assert list(dog.vector) == [2.0, 2.0, 2.0]
 
+    with pytest.raises(ValueError):
+        vocab.vectors.add(vocab["hamster"].orth, row=1000000)
+
 
 def test_vocab_prune_vectors():
     vocab = Vocab(vectors_name="test_vocab_prune_vectors")
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index f3c20fb7f..2973ddb5b 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -9,6 +9,7 @@ import functools
 import numpy
 from collections import OrderedDict
 import srsly
+import warnings
 from thinc.neural.util import get_array_module
 from thinc.neural._classes.model import Model
 
@@ -303,7 +304,10 @@ cdef class Vectors:
                 raise ValueError(Errors.E060.format(rows=self.data.shape[0],
                                                     cols=self.data.shape[1]))
             row = deref(self._unset.begin())
-        self.key2row[key] = row
+        if row < self.data.shape[0]:
+            self.key2row[key] = row
+        else:
+            raise ValueError(Errors.E197.format(row=row, key=key))
         if vector is not None:
             self.data[row] = vector
             if self._unset.count(row):
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index e31d26f85..ef2e86bcc 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -319,7 +319,7 @@ cdef class Vocab:
         keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64")
         keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]])
         toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
-        self.vectors = Vectors(data=keep, keys=keys, name=self.vectors.name)
+        self.vectors = Vectors(data=keep, keys=keys[:nr_row], name=self.vectors.name)
         syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size)
         remap = {}
         for i, key in enumerate(keys[nr_row:]):
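
For context, a minimal usage sketch (not part of the patch) of the behaviour the new check enforces, assuming a spaCy v2.x build with this change applied. The Vectors calls mirror the added test; the keys and table size below are illustrative only.

    # Minimal sketch: with this change, Vectors.add() rejects an explicit row
    # index that lies outside the vector data array instead of silently
    # recording an unusable key-to-row mapping.
    import numpy
    from spacy.vectors import Vectors

    data = numpy.zeros((3, 5), dtype="f")             # 3 rows of 5-dim vectors
    vectors = Vectors(data=data, keys=["cat", "dog", "rat"])

    vectors.add("mouse", row=2)                       # in bounds: reuses row 2
    try:
        vectors.add("hamster", row=1000000)           # out of bounds for 3 rows
    except ValueError as err:
        print(err)                                    # [E197] Row out of bounds ...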