From 113e7981d0c60f1e200eb0177c97b282927f61ac Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 13 May 2020 22:08:28 +0200
Subject: [PATCH] Check that row is within bounds when adding vector (#5430)

Check that row is within bounds for the vector data array when adding a
vector.

Don't add vectors with rank OOV_RANK in `init-model` (change is due to
shift from OOV as 0 to OOV as OOV_RANK).
---
 spacy/cli/init_model.py                   | 2 +-
 spacy/errors.py                           | 2 ++
 spacy/tests/vocab_vectors/test_vectors.py | 3 +++
 spacy/vectors.pyx                         | 6 +++++-
 spacy/vocab.pyx                           | 2 +-
 5 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index 31d627e9b..618266633 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -181,7 +181,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
     if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
         nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
         for lex in nlp.vocab:
-            if lex.rank:
+            if lex.rank and lex.rank != OOV_RANK:
                 nlp.vocab.vectors.add(lex.orth, row=lex.rank)
     else:
         if vectors_loc:
diff --git a/spacy/errors.py b/spacy/errors.py
index 779980490..32ccd3df7 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
+
 def add_codes(err_cls):
     """Add error codes to string messages via class attribute names."""
 
@@ -555,6 +556,7 @@ class Errors(object):
     E195 = ("Matcher can be called on {good} only, got {got}.")
     E196 = ("Refusing to write to token.is_sent_end. Sentence boundaries can "
             "only be fixed with token.is_sent_start.")
+    E197 = ("Row out of bounds, unable to add row {row} for key {key}.")
 
 
 @add_codes
diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py
index 8987b7c89..322ef462a 100644
--- a/spacy/tests/vocab_vectors/test_vectors.py
+++ b/spacy/tests/vocab_vectors/test_vectors.py
@@ -307,6 +307,9 @@ def test_vocab_add_vector():
     dog = vocab["dog"]
     assert list(dog.vector) == [2.0, 2.0, 2.0]
 
+    with pytest.raises(ValueError):
+        vocab.vectors.add(vocab["hamster"].orth, row=1000000)
+
 
 def test_vocab_prune_vectors():
     vocab = Vocab(vectors_name="test_vocab_prune_vectors")
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index f3c20fb7f..2973ddb5b 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -9,6 +9,7 @@ import functools
 import numpy
 from collections import OrderedDict
 import srsly
+import warnings
 from thinc.neural.util import get_array_module
 from thinc.neural._classes.model import Model
 
@@ -303,7 +304,10 @@ cdef class Vectors:
                 raise ValueError(Errors.E060.format(rows=self.data.shape[0],
                                                     cols=self.data.shape[1]))
             row = deref(self._unset.begin())
-        self.key2row[key] = row
+        if row < self.data.shape[0]:
+            self.key2row[key] = row
+        else:
+            raise ValueError(Errors.E197.format(row=row, key=key))
         if vector is not None:
             self.data[row] = vector
             if self._unset.count(row):
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index e31d26f85..ef2e86bcc 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -319,7 +319,7 @@ cdef class Vocab:
         keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64")
         keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]])
         toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
-        self.vectors = Vectors(data=keep, keys=keys, name=self.vectors.name)
+        self.vectors = Vectors(data=keep, keys=keys[:nr_row], name=self.vectors.name)
         syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size)
         remap = {}
         for i, key in enumerate(keys[nr_row:]):
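
For context, a minimal usage sketch (not part of the patch) of the behaviour the new check enforces, assuming a spaCy v2.x build with this change applied. The Vectors calls mirror the added test; the keys and table size below are illustrative only.

    # Minimal sketch: with this change, Vectors.add() rejects an explicit row
    # index that lies outside the vector data array instead of silently
    # recording an unusable key-to-row mapping.
    import numpy
    from spacy.vectors import Vectors

    data = numpy.zeros((3, 5), dtype="f")             # 3 rows of 5-dim vectors
    vectors = Vectors(data=data, keys=["cat", "dog", "rat"])

    vectors.add("mouse", row=2)                       # in bounds: reuses row 2
    try:
        vectors.add("hamster", row=1000000)           # out of bounds for 3 rows
    except ValueError as err:
        print(err)                                    # [E197] Row out of bounds ...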