Revise and simplify Vectors class

2024-11-10 19:57:17 +03:00 · 2017-10-31 18:25:08 +01:00 · 2017-10-31 18:25:08 +01:00 · 77d8f5de9a
commit 77d8f5de9a
parent cb5217012f
7 changed files with 275 additions and 187 deletions
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@ -208,8 +208,8 @@ def test_doc_api_right_edge(en_tokenizer):

 def test_doc_api_has_vector():
    vocab = Vocab()
-    vocab.clear_vectors(2)
-    vocab.vectors.add('kitten', vector=numpy.asarray([0., 2.], dtype='f'))
+    vocab.reset_vectors(width=2)
+    vocab.set_vector('kitten', vector=numpy.asarray([0., 2.], dtype='f'))
    doc = Doc(vocab, words=['kitten'])
    assert doc.has_vector

--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@ -72,9 +72,9 @@ def test_doc_token_api_is_properties(en_vocab):

 def test_doc_token_api_vectors():
    vocab = Vocab()
-    vocab.clear_vectors(2)
-    vocab.vectors.add('apples', vector=numpy.asarray([0., 2.], dtype='f'))
-    vocab.vectors.add('oranges', vector=numpy.asarray([0., 1.], dtype='f'))
+    vocab.reset_vectors(width=2)
+    vocab.set_vector('apples', vector=numpy.asarray([0., 2.], dtype='f'))
+    vocab.set_vector('oranges', vector=numpy.asarray([0., 1.], dtype='f'))
    doc = Doc(vocab, words=['apples', 'oranges', 'oov'])
    assert doc.has_vector

--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@ -79,9 +79,9 @@ def add_vecs_to_vocab(vocab, vectors):
    """Add list of vector tuples to given vocab. All vectors need to have the
    same length. Format: [("text", [1, 2, 3])]"""
    length = len(vectors[0][1])
-    vocab.clear_vectors(length)
+    vocab.reset_vectors(width=length)
    for word, vec in vectors:
-        vocab.set_vector(word, vec)
+        vocab.set_vector(word, vector=vec)
    return vocab


--- a/spacy/tests/vectors/test_vectors.py
+++ b/spacy/tests/vectors/test_vectors.py
@ -35,20 +35,18 @@ def vocab(en_vocab, vectors):


 def test_init_vectors_with_data(strings, data):
-    v = Vectors(strings, data=data)
+    v = Vectors(data=data)
    assert v.shape == data.shape

-def test_init_vectors_with_width(strings):
-    v = Vectors(strings, width=3)
-    for string in strings:
-        v.add(string)
+def test_init_vectors_with_shape(strings):
+    v = Vectors(shape=(len(strings), 3))
    assert v.shape == (len(strings), 3)


 def test_get_vector(strings, data):
-    v = Vectors(strings, data=data)
-    for string in strings:
-        v.add(string)
+    v = Vectors(data=data)
+    for i, string in enumerate(strings):
+        v.add(string, row=i)
    assert list(v[strings[0]]) == list(data[0])
    assert list(v[strings[0]]) != list(data[1])
    assert list(v[strings[1]]) != list(data[0])
@ -56,9 +54,9 @@ def test_get_vector(strings, data):

 def test_set_vector(strings, data):
    orig = data.copy()
-    v = Vectors(strings, data=data)
-    for string in strings:
-        v.add(string)
+    v = Vectors(data=data)
+    for i, string in enumerate(strings):
+        v.add(string, row=i)
    assert list(v[strings[0]]) == list(orig[0])
    assert list(v[strings[0]]) != list(orig[1])
    v[strings[0]] = data[1]
@ -66,7 +64,6 @@ def test_set_vector(strings, data):
    assert list(v[strings[0]]) != list(orig[0])


-
@pytest.fixture()
 def tokenizer_v(vocab):
    return Tokenizer(vocab, {}, None, None, None)
--- a/spacy/tests/vocab/test_add_vectors.py
+++ b/spacy/tests/vocab/test_add_vectors.py
@ -36,5 +36,5 @@ def test_vocab_prune_vectors():
    remap = vocab.prune_vectors(2)
    assert list(remap.keys()) == [u'kitten']
    neighbour, similarity = remap.values()[0]
-    assert neighbour == u'cat'
+    assert neighbour == u'cat', remap
    assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-6)
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@ -15,6 +15,12 @@ from .compat import basestring_, path2str
 from . import util


+def unpickle_vectors(keys_and_rows, data):
+    vectors = Vectors(data=data)
+    for key, row in keys_and_rows:
+        vectors.add(key, row=row)
+
+
 cdef class Vectors:
    """Store, save and load word vectors.

@ -23,130 +29,35 @@ cdef class Vectors:
    (for GPU vectors). `vectors.key2row` is a dictionary mapping word hashes to
    rows in the vectors.data table.
    
-    Multiple keys can be mapped to the same vector, so len(keys) may be greater
-    (but not smaller) than data.shape[0].
+    Multiple keys can be mapped to the same vector, and not all of the rows in
+    the table need to be assigned --- so len(list(vectors.keys())) may be
+    greater or smaller than vectors.shape[0].
    """
    cdef public object data
-    cdef readonly StringStore strings
    cdef public object key2row
-    cdef public int _i_vec
+    cdef public object _unset

-    def __init__(self, strings, width=0, data=None):
-        """Create a new vector store. To keep the vector table empty, pass
-        `width=0`. You can also create the vector table and add vectors one by
-        one, or set the vector values directly on initialisation.
-
-        strings (StringStore or list): List of strings or StringStore that maps
-            strings to hash values, and vice versa.
-        width (int): Number of dimensions.
+    def __init__(self, *, shape=None, data=None, keys=None):
+        """Create a new vector store.
+        
+        shape (tuple): Size of the table, as (# entries, # columns)
        data (numpy.ndarray): The vector data.
        RETURNS (Vectors): The newly created object.
        """
-        if isinstance(strings, StringStore):
-            self.strings = strings
+        if data is None:
+            if shape is None:
+                shape = (0,0)
+            data = numpy.zeros(shape, dtype='f')
+        self.data = data
+        self.key2row = OrderedDict()
+        if self.data is not None:
+            self._unset = set(range(self.data.shape[0]))
        else:
-            self.strings = StringStore()
-            for string in strings:
-                self.strings.add(string)
-        if data is not None:
-            self.data = numpy.asarray(data, dtype='f')
-        else:
-            self.data = numpy.zeros((len(self.strings), width), dtype='f')
-        self._i_vec = 0
-        self.key2row = {}
-        if data is not None:
-            for i, string in enumerate(self.strings):
-                if i >= self.data.shape[0]:
-                    break
-                self.add(self.strings[string], vector=self.data[i])
-
-    def __reduce__(self):
-        return (Vectors, (self.strings, self.data))
-
-    def __getitem__(self, key):
-        """Get a vector by key. If key is a string, it is hashed to an integer
-        ID using the vectors.strings table. If the integer key is not found in
-        the table, a KeyError is raised.
-
-        key (unicode / int): The key to get the vector for.
-        RETURNS (numpy.ndarray): The vector for the key.
-        """
-        if isinstance(key, basestring):
-            key = self.strings[key]
-        i = self.key2row[key]
-        if i is None:
-            raise KeyError(key)
-        else:
-            return self.data[i]
-
-    def __setitem__(self, key, vector):
-        """Set a vector for the given key. If key is a string, it is hashed
-        to an integer ID using the vectors.strings table.
-
-        key (unicode / int): The key to set the vector for.
-        vector (numpy.ndarray): The vector to set.
-        """
-        if isinstance(key, basestring):
-            key = self.strings.add(key)
-        i = self.key2row[key]
-        self.data[i] = vector
-
-    def __iter__(self):
-        """Yield vectors from the table.
-
-        YIELDS (numpy.ndarray): A vector.
-        """
-        yield from self.data
-
-    def __len__(self):
-        """Return the number of vectors that have been assigned.
-
-        RETURNS (int): The number of vectors in the data.
-        """
-        return self._i_vec
-
-    def __contains__(self, key):
-        """Check whether a key has a vector entry in the table.
-
-        key (unicode / int): The key to check.
-        RETURNS (bool): Whether the key has a vector entry.
-        """
-        if isinstance(key, basestring_):
-            key = self.strings[key]
-        return key in self.key2row
-
-    def add(self, key, *, vector=None, row=None):
-        """Add a key to the table. Keys can be mapped to an existing vector
-        by setting `row`, or a new vector can be added.
-
-        key (unicode / int): The key to add.
-        vector (numpy.ndarray / None): A vector to add for the key.
-        row (int / None): The row-number of a vector to map the key to.
-        """
-        if isinstance(key, basestring_):
-            key = self.strings.add(key)
-        if row is None and key in self.key2row:
-            row = self.key2row[key]
-        elif row is None:
-            row = self._i_vec
-            self._i_vec += 1
-        if row >= self.data.shape[0]:
-            self.data.resize((row*2, self.data.shape[1]))
-
-        self.key2row[key] = row
-        if vector is not None:
-            self.data[row] = vector
-        return row
-
-    def items(self):
-        """Iterate over `(string key, vector)` pairs, in order.
-
-        YIELDS (tuple): A key/vector pair.
-        """
-        for key, row in self.key2row.items():
-            string = self.strings[key]
-            yield string, self.data[row]
-
+            self._unset = set()
+        if keys is not None:
+            for i, key in enumerate(keys):
+                self.add(key, row=i)
+    
    @property
    def shape(self):
        """Get `(rows, dims)` tuples of number of rows and number of dimensions
@ -156,9 +67,179 @@ cdef class Vectors:
        """
        return self.data.shape

-    def most_similar(self, key):
-        # TODO: implement
-        raise NotImplementedError
+    @property
+    def size(self):
+        """Return rows*dims"""
+        return self.data.shape[0] * self.data.shape[1]
+
+    @property
+    def is_full(self):
+        """Returns True if no rows are available for new keys."""
+        return len(self._unset) == 0
+
+    def __reduce__(self):
+        keys_and_rows = self.key2row.items()
+        return (unpickle_vectors, (keys_and_rows, self.data))
+
+    def __getitem__(self, key):
+        """Get a vector by key. If the key is not found, a KeyError is raised.
+
+        key (int): The key to get the vector for.
+        RETURNS (ndarray): The vector for the key.
+        """
+        i = self.key2row[key]
+        if i is None:
+            raise KeyError(key)
+        else:
+            return self.data[i]
+
+    def __setitem__(self, key, vector):
+        """Set a vector for the given key.
+
+        key (int): The key to set the vector for.
+        vector (numpy.ndarray): The vector to set.
+        """
+        i = self.key2row[key]
+        self.data[i] = vector
+        if i in self._unset:
+            self._unset.remove(i)
+
+    def __iter__(self):
+        """Yield keys from the table.
+
+        YIELDS (int): A key in the table.
+        """
+        yield from self.key2row
+
+    def __len__(self):
+        """Return the number of vectors in the table.
+
+        RETURNS (int): The number of vectors in the data.
+        """
+        return self.data.shape[0]
+
+    def __contains__(self, key):
+        """Check whether a key has been mapped to a vector entry in the table.
+
+        key (int): The key to check.
+        RETURNS (bool): Whether the key has a vector entry.
+        """
+        return key in self.key2row
+
+    def resize(self, shape, inplace=False):
+        '''Resize the underlying vectors array. If inplace=True, the memory
+        is reallocated. This may cause other references to the data to become
+        invalid, so only use inplace=True if you're sure that's what you want.
+
+        If the number of vectors is reduced, keys mapped to rows that have been
+        deleted are removed. These removed items are returned as a list of
+        (key, row) tuples.
+        '''
+        if inplace:
+            self.data.resize(shape, refcheck=False)
+        else:
+            xp = get_array_module(self.data)
+            self.data = xp.resize(self.data, shape)
+        filled = {row for row in self.key2row.values()}
+        self._unset = {row for row in range(shape[0]) if row not in filled}
+        removed_items = []
+        for key, row in dict(self.key2row.items()):
+            if row >= shape[0]:
+                self.key2row.pop(key)
+                removed_items.append((key, row))
+        return removed_items
+    
+    def keys(self):
+        '''Iterate over the keys in the table.'''
+        yield from self.key2row.keys()
+    
+    def values(self):
+        '''Iterate over vectors that have been assigned to at least one key.
+
+        Note that some vectors may be unassigned, so the number of vectors
+        returned may be less than the length of the vectors table.'''
+        for row, vector in enumerate(range(self.data.shape[0])):
+            if row not in self._unset:
+                yield vector
+
+    def items(self):
+        """Iterate over `(key, vector)` pairs.
+
+        YIELDS (tuple): A key/vector pair.
+        """
+        for key, row in self.key2row.items():
+            yield key, self.data[row]
+
+    def get_keys(self, rows):
+        xp = get_array_module(self.data)
+        row2key = {row: key for key, row in self.key2row.items()}
+        keys = xp.asarray([row2key[row] for row in rows],
+                           dtype='uint64')
+        return keys
+
+    def get_rows(self, keys):
+        xp = get_array_module(self.data)
+        k2r = self.key2row
+        return xp.asarray([k2r.get(key, -1) for key in keys], dtype='i')
+
+    def add(self, key, *, vector=None, row=None):
+        """Add a key to the table. Keys can be mapped to an existing vector
+        by setting `row`, or a new vector can be added.
+
+        key (int): The key to add.
+        vector (numpy.ndarray / None): A vector to add for the key.
+        row (int / None): The row-number of a vector to map the key to.
+        """
+        if row is None and key in self.key2row:
+            row = self.key2row[key]
+        elif row is None:
+            if self.is_full:
+                raise ValueError("Cannot add new key to vectors -- full")
+            row = min(self._unset)
+
+        self.key2row[key] = row
+        if vector is not None:
+            self.data[row] = vector
+            if row in self._unset:
+                self._unset.remove(row)
+        return row
+    
+    def most_similar(self, queries, *, return_scores=False, return_rows,
+            batch_size=1024):
+        '''For each of the given vectors, find the single entry most similar
+        to it, by cosine.
+        
+        Queries are by vector. Results are returned as an array of keys, or
+        as a tuple of keys followed by rows and/or scores when `return_rows`
+        and/or `return_scores` are set. If `queries` is large, the
+        calculations are performed in chunks, to avoid consuming too much
+        memory. Use `batch_size` to control the speed/memory trade-off.
+        '''
+        xp = get_array_module(self.data)
+        
+        vectors = self.data / xp.linalg.norm(self.data, axis=1, keepdims=True)
+        
+        best_rows = xp.zeros((queries.shape[0],), dtype='i')
+        scores = xp.zeros((queries.shape[0],), dtype='f')
+        # Work in batches, to avoid memory problems.
+        for i in range(0, queries.shape[0], batch_size):
+            batch = queries[i : i+batch_size]
+            batch /= xp.linalg.norm(batch, axis=1, keepdims=True)
+            # batch   e.g. (1024, 300)
+            # vectors e.g. (10000, 300)
+            # sims    e.g. (1024, 10000)
+            sims = xp.dot(batch, vectors.T)
+            best_rows[i:i+batch_size] = sims.argmax(axis=1)
+            scores[i:i+batch_size] = sims.max(axis=1)
+        keys = self.get_keys(best_rows)
+        if return_rows and return_scores:
+            return (keys, best_rows, scores)
+        elif return_rows:
+            return (keys, best_rows)
+        elif return_scores:
+            return (keys, scores)
+        else:
+            return keys

    def from_glove(self, path):
        """Load GloVe vectors from a directory. Assumes binary format,
@ -168,27 +249,33 @@ cdef class Vectors:
        By default GloVe outputs 64-bit vectors.

        path (unicode / Path): The path to load the GloVe vectors from.
+
+        RETURNS: A StringStore object, holding the key-to-string mapping.
        """
        path = util.ensure_path(path)
+        width = None
        for name in path.iterdir():
            if name.parts[-1].startswith('vectors'):
                _, dims, dtype, _2 = name.parts[-1].split('.')
-                self.width = int(dims)
+                width = int(dims)
                break
        else:
            raise IOError("Expected file named e.g. vectors.128.f.bin")
        bin_loc = path / 'vectors.{dims}.{dtype}.bin'.format(dims=dims,
                                                             dtype=dtype)
+        xp = get_array_module(self.data)
+        self.data = None
        with bin_loc.open('rb') as file_:
-            self.data = numpy.fromfile(file_, dtype='float64')
-            self.data = numpy.ascontiguousarray(self.data, dtype='float32')
+            self.data = xp.fromfile(file_, dtype=dtype)
+            if dtype != 'float32':
+                self.data = xp.ascontiguousarray(self.data, dtype='float32')
        n = 0
+        strings = StringStore()
        with (path / 'vocab.txt').open('r') as file_:
-            for line in file_:
-                self.add(line.strip())
-                n += 1
-        if (self.data.size % self.width) == 0:
-            self.data
+            for i, line in enumerate(file_):
+                key = strings.add(line.strip())
+                self.add(key, row=i)
+        return strings

    def to_disk(self, path, **exclude):
        """Save the current state to a directory.
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -55,7 +55,7 @@ cdef class Vocab:
                _ = self[string]
        self.lex_attr_getters = lex_attr_getters
        self.morphology = Morphology(self.strings, tag_map, lemmatizer)
-        self.vectors = Vectors(self.strings, width=0)
+        self.vectors = Vectors()

    property lang:
        def __get__(self):
@ -241,15 +241,19 @@ cdef class Vocab:
    def vectors_length(self):
        return self.vectors.data.shape[1]

-    def clear_vectors(self, width=None):
+    def reset_vectors(self, *, width=None, shape=None):
        """Drop the current vector table. Because all vectors must be the same
        width, you have to call this to change the size of the vectors.
        """
-        if width is None:
-            width = self.vectors.data.shape[1]
-        self.vectors = Vectors(self.strings, width=width)
+        if width is not None and shape is not None:
+            raise ValueError("Only one of width and shape can be specified")
+        elif shape is not None:
+            self.vectors = Vectors(shape=shape)
+        else:
+            width = width if width is not None else self.vectors.data.shape[1]
+            self.vectors = Vectors(shape=(self.vectors.shape[0], width))

-    def prune_vectors(self, nr_row, batch_size=8):
+    def prune_vectors(self, nr_row, batch_size=1024):
        """Reduce the current vector table to `nr_row` unique entries. Words
        mapped to the discarded vectors will be remapped to the closest vector
        among those remaining.
@ -275,37 +279,29 @@ cdef class Vocab:
            two words.
        """
        xp = get_array_module(self.vectors.data)
-        # Work in batches, to avoid memory problems.
-        keep = self.vectors.data[:nr_row]
-        keep_keys = [key for key, row in self.vectors.key2row.items() if row < nr_row]
-        toss = self.vectors.data[nr_row:]
-        # Normalize the vectors, so cosine similarity is just dot product.
-        # Note we can't modify the ones we're keeping in-place...
-        keep = keep / (xp.linalg.norm(keep, axis=1, keepdims=True)+1e-12)
-        keep = xp.ascontiguousarray(keep.T)
-        neighbours = xp.zeros((toss.shape[0],), dtype='i')
-        scores = xp.zeros((toss.shape[0],), dtype='f')
-        for i in range(0, toss.shape[0], batch_size):
-            batch = toss[i : i+batch_size]
-            batch /= xp.linalg.norm(batch, axis=1, keepdims=True)+1e-12
-            sims = xp.dot(batch, keep)
-            matches = sims.argmax(axis=1)
-            neighbours[i:i+batch_size] = matches
-            scores[i:i+batch_size] = sims.max(axis=1)
-        i2k = {i: key for key, i in self.vectors.key2row.items()}
+        # Make prob negative so it sorts by rank ascending
+        # (key2row contains the rank)
+        priority = [(-lex.prob, self.vectors.key2row[lex.orth], lex.orth)
+                    for lex in self if lex.orth in self.vectors.key2row]
+        priority.sort()
+        indices = xp.asarray([i for (prob, i, key) in priority], dtype='i')
+        keys = xp.asarray([key for (prob, i, key) in priority], dtype='uint64')
+        
+        keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]])
+        toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
+
+        self.vectors = Vectors(data=keep, keys=keys)
+
+        syn_keys, syn_rows, scores = self.vectors.most_similar(toss,
+                                        return_rows=True, return_scores=True)
+
        remap = {}
-        for lex in list(self):
-            # If we're losing the vector for this word, map it to the nearest
-            # vector we're keeping.
-            if lex.rank >= nr_row:
-                lex.rank = neighbours[lex.rank-nr_row]
-                self.vectors.add(lex.orth, row=lex.rank)
-                remap[lex.orth_] = (self.strings[i2k[lex.rank]], scores[lex.rank])
-        for key, row in self.vectors.key2row.items():
-            if row >= nr_row:
-                self.vectors.key2row[key] = neighbours[row-nr_row]
-        # Make copy, to encourage the original table to be garbage collected.
-        self.vectors.data = xp.ascontiguousarray(self.vectors.data[:nr_row])
+        for i, key in enumerate(keys[nr_row:]):
+            self.vectors.add(key, row=syn_rows[i])
+            word = self.strings[key]
+            synonym = self.strings[syn_keys[i]]
+            score = scores[i]
+            remap[word] = (synonym, score)
        link_vectors_to_models(self)
        return remap

@ -329,11 +325,19 @@ cdef class Vocab:
        """Set a vector for a word in the vocabulary. Words can be referenced
        by string or int ID.
        """
-        if self.vectors.data.size == 0:
-            self.clear_vectors(vector.shape[0])
-        lex = self[orth]
+        if isinstance(orth, basestring_):
+            orth = self.strings.add(orth)
+        if self.vectors.is_full and orth not in self.vectors:
+            new_rows = max(100, int(self.vectors.shape[0]*1.3))
+            if self.vectors.shape[1] == 0:
+                width = vector.size
+            else:
+                width = self.vectors.shape[1]
+            self.vectors.resize((new_rows, width))
+            print(self.vectors.shape)
+            self.vectors.add(orth, vector=vector)
+        print("Adding", orth, self.vectors.is_full)
        self.vectors.add(orth, vector=vector)
-        lex.rank = self.vectors.key2row[lex.orth]

    def has_vector(self, orth):
        """Check whether a word has a vector. Returns False if no vectors have