Revise and simplify Vectors class

This commit is contained in:
Matthew Honnibal 2017-10-31 18:25:08 +01:00
parent cb5217012f
commit 77d8f5de9a
7 changed files with 275 additions and 187 deletions

View File

@ -208,8 +208,8 @@ def test_doc_api_right_edge(en_tokenizer):
def test_doc_api_has_vector(): def test_doc_api_has_vector():
vocab = Vocab() vocab = Vocab()
vocab.clear_vectors(2) vocab.reset_vectors(width=2)
vocab.vectors.add('kitten', vector=numpy.asarray([0., 2.], dtype='f')) vocab.set_vector('kitten', vector=numpy.asarray([0., 2.], dtype='f'))
doc = Doc(vocab, words=['kitten']) doc = Doc(vocab, words=['kitten'])
assert doc.has_vector assert doc.has_vector

View File

@ -72,9 +72,9 @@ def test_doc_token_api_is_properties(en_vocab):
def test_doc_token_api_vectors(): def test_doc_token_api_vectors():
vocab = Vocab() vocab = Vocab()
vocab.clear_vectors(2) vocab.reset_vectors(width=2)
vocab.vectors.add('apples', vector=numpy.asarray([0., 2.], dtype='f')) vocab.set_vector('apples', vector=numpy.asarray([0., 2.], dtype='f'))
vocab.vectors.add('oranges', vector=numpy.asarray([0., 1.], dtype='f')) vocab.set_vector('oranges', vector=numpy.asarray([0., 1.], dtype='f'))
doc = Doc(vocab, words=['apples', 'oranges', 'oov']) doc = Doc(vocab, words=['apples', 'oranges', 'oov'])
assert doc.has_vector assert doc.has_vector

View File

@ -79,9 +79,9 @@ def add_vecs_to_vocab(vocab, vectors):
"""Add list of vector tuples to given vocab. All vectors need to have the """Add list of vector tuples to given vocab. All vectors need to have the
same length. Format: [("text", [1, 2, 3])]""" same length. Format: [("text", [1, 2, 3])]"""
length = len(vectors[0][1]) length = len(vectors[0][1])
vocab.clear_vectors(length) vocab.reset_vectors(width=length)
for word, vec in vectors: for word, vec in vectors:
vocab.set_vector(word, vec) vocab.set_vector(word, vector=vec)
return vocab return vocab

View File

@ -35,20 +35,18 @@ def vocab(en_vocab, vectors):
def test_init_vectors_with_data(strings, data): def test_init_vectors_with_data(strings, data):
v = Vectors(strings, data=data) v = Vectors(data=data)
assert v.shape == data.shape assert v.shape == data.shape
def test_init_vectors_with_width(strings): def test_init_vectors_with_shape(strings):
v = Vectors(strings, width=3) v = Vectors(shape=(len(strings), 3))
for string in strings:
v.add(string)
assert v.shape == (len(strings), 3) assert v.shape == (len(strings), 3)
def test_get_vector(strings, data): def test_get_vector(strings, data):
v = Vectors(strings, data=data) v = Vectors(data=data)
for string in strings: for i, string in enumerate(strings):
v.add(string) v.add(string, row=i)
assert list(v[strings[0]]) == list(data[0]) assert list(v[strings[0]]) == list(data[0])
assert list(v[strings[0]]) != list(data[1]) assert list(v[strings[0]]) != list(data[1])
assert list(v[strings[1]]) != list(data[0]) assert list(v[strings[1]]) != list(data[0])
@ -56,9 +54,9 @@ def test_get_vector(strings, data):
def test_set_vector(strings, data): def test_set_vector(strings, data):
orig = data.copy() orig = data.copy()
v = Vectors(strings, data=data) v = Vectors(data=data)
for string in strings: for i, string in enumerate(strings):
v.add(string) v.add(string, row=i)
assert list(v[strings[0]]) == list(orig[0]) assert list(v[strings[0]]) == list(orig[0])
assert list(v[strings[0]]) != list(orig[1]) assert list(v[strings[0]]) != list(orig[1])
v[strings[0]] = data[1] v[strings[0]] = data[1]
@ -66,7 +64,6 @@ def test_set_vector(strings, data):
assert list(v[strings[0]]) != list(orig[0]) assert list(v[strings[0]]) != list(orig[0])
@pytest.fixture() @pytest.fixture()
def tokenizer_v(vocab): def tokenizer_v(vocab):
return Tokenizer(vocab, {}, None, None, None) return Tokenizer(vocab, {}, None, None, None)

View File

@ -36,5 +36,5 @@ def test_vocab_prune_vectors():
remap = vocab.prune_vectors(2) remap = vocab.prune_vectors(2)
assert list(remap.keys()) == [u'kitten'] assert list(remap.keys()) == [u'kitten']
neighbour, similarity = remap.values()[0] neighbour, similarity = remap.values()[0]
assert neighbour == u'cat' assert neighbour == u'cat', remap
assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-6) assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-6)

View File

@ -15,6 +15,12 @@ from .compat import basestring_, path2str
from . import util from . import util
def unpickle_vectors(keys_and_rows, data):
    """Rebuild a Vectors table from the state produced by `Vectors.__reduce__`.

    keys_and_rows (iterable): `(key, row)` pairs to map onto the table.
    data (numpy.ndarray): The vector data.
    RETURNS (Vectors): The reconstructed table.
    """
    vectors = Vectors(data=data)
    for key, row in keys_and_rows:
        vectors.add(key, row=row)
    # pickle uses the return value as the restored object; without this
    # return, unpickling produced None.
    return vectors
cdef class Vectors: cdef class Vectors:
"""Store, save and load word vectors. """Store, save and load word vectors.
@ -23,129 +29,34 @@ cdef class Vectors:
(for GPU vectors). `vectors.key2row` is a dictionary mapping word hashes to (for GPU vectors). `vectors.key2row` is a dictionary mapping word hashes to
rows in the vectors.data table. rows in the vectors.data table.
Multiple keys can be mapped to the same vector, so len(keys) may be greater Multiple keys can be mapped to the same vector, and not all of the rows in
(but not smaller) than data.shape[0]. the table need to be assigned --- so len(list(vectors.keys())) may be
greater or smaller than vectors.shape[0].
""" """
cdef public object data cdef public object data
cdef readonly StringStore strings
cdef public object key2row cdef public object key2row
cdef public int _i_vec cdef public object _unset
def __init__(self, strings, width=0, data=None): def __init__(self, *, shape=None, data=None, keys=None):
"""Create a new vector store. To keep the vector table empty, pass """Create a new vector store.
`width=0`. You can also create the vector table and add vectors one by
one, or set the vector values directly on initialisation.
strings (StringStore or list): List of strings or StringStore that maps shape (tuple): Size of the table, as (# entries, # columns)
strings to hash values, and vice versa.
width (int): Number of dimensions.
data (numpy.ndarray): The vector data. data (numpy.ndarray): The vector data.
RETURNS (Vectors): The newly created object. RETURNS (Vectors): The newly created object.
""" """
if isinstance(strings, StringStore): if data is None:
self.strings = strings if shape is None:
shape = (0,0)
data = numpy.zeros(shape, dtype='f')
self.data = data
self.key2row = OrderedDict()
if self.data is not None:
self._unset = set(range(self.data.shape[0]))
else: else:
self.strings = StringStore() self._unset = set()
for string in strings: if keys is not None:
self.strings.add(string) for i, key in enumerate(keys):
if data is not None: self.add(key, row=i)
self.data = numpy.asarray(data, dtype='f')
else:
self.data = numpy.zeros((len(self.strings), width), dtype='f')
self._i_vec = 0
self.key2row = {}
if data is not None:
for i, string in enumerate(self.strings):
if i >= self.data.shape[0]:
break
self.add(self.strings[string], vector=self.data[i])
def __reduce__(self):
return (Vectors, (self.strings, self.data))
def __getitem__(self, key):
"""Get a vector by key. If key is a string, it is hashed to an integer
ID using the vectors.strings table. If the integer key is not found in
the table, a KeyError is raised.
key (unicode / int): The key to get the vector for.
RETURNS (numpy.ndarray): The vector for the key.
"""
if isinstance(key, basestring):
key = self.strings[key]
i = self.key2row[key]
if i is None:
raise KeyError(key)
else:
return self.data[i]
def __setitem__(self, key, vector):
"""Set a vector for the given key. If key is a string, it is hashed
to an integer ID using the vectors.strings table.
key (unicode / int): The key to set the vector for.
vector (numpy.ndarray): The vector to set.
"""
if isinstance(key, basestring):
key = self.strings.add(key)
i = self.key2row[key]
self.data[i] = vector
def __iter__(self):
"""Yield vectors from the table.
YIELDS (numpy.ndarray): A vector.
"""
yield from self.data
def __len__(self):
"""Return the number of vectors that have been assigned.
RETURNS (int): The number of vectors in the data.
"""
return self._i_vec
def __contains__(self, key):
"""Check whether a key has a vector entry in the table.
key (unicode / int): The key to check.
RETURNS (bool): Whether the key has a vector entry.
"""
if isinstance(key, basestring_):
key = self.strings[key]
return key in self.key2row
def add(self, key, *, vector=None, row=None):
"""Add a key to the table. Keys can be mapped to an existing vector
by setting `row`, or a new vector can be added.
key (unicode / int): The key to add.
vector (numpy.ndarray / None): A vector to add for the key.
row (int / None): The row-number of a vector to map the key to.
"""
if isinstance(key, basestring_):
key = self.strings.add(key)
if row is None and key in self.key2row:
row = self.key2row[key]
elif row is None:
row = self._i_vec
self._i_vec += 1
if row >= self.data.shape[0]:
self.data.resize((row*2, self.data.shape[1]))
self.key2row[key] = row
if vector is not None:
self.data[row] = vector
return row
def items(self):
"""Iterate over `(string key, vector)` pairs, in order.
YIELDS (tuple): A key/vector pair.
"""
for key, row in self.key2row.items():
string = self.strings[key]
yield string, self.data[row]
@property @property
def shape(self): def shape(self):
@ -156,9 +67,179 @@ cdef class Vectors:
""" """
return self.data.shape return self.data.shape
def most_similar(self, key): @property
# TODO: implement def size(self):
raise NotImplementedError """Return rows*dims"""
return self.data.shape[0] * self.data.shape[1]
@property
def is_full(self):
    """Whether the set of unassigned rows (`_unset`) is empty.

    RETURNS (bool): True if no rows are left for new keys.
    """
    remaining = len(self._unset)
    return remaining == 0
def __reduce__(self):
    """Describe how to pickle the table.

    RETURNS (tuple): `(unpickle_vectors, (keys_and_rows, data))`, where
        `keys_and_rows` is a list of the `(key, row)` pairs in `key2row`.
    """
    # Materialise the view: dict / OrderedDict `items()` views cannot be
    # pickled, so passing the view itself made pickling fail.
    keys_and_rows = list(self.key2row.items())
    return (unpickle_vectors, (keys_and_rows, self.data))
def __getitem__(self, key):
"""Get a vector by key. If the key is not found, a KeyError is raised.
key (int): The key to get the vector for.
RETURNS (ndarray): The vector for the key.
"""
i = self.key2row[key]
if i is None:
raise KeyError(key)
else:
return self.data[i]
def __setitem__(self, key, vector):
"""Set a vector for the given key.
key (int): The key to set the vector for.
vector (numpy.ndarray): The vector to set.
"""
i = self.key2row[key]
self.data[i] = vector
if i in self._unset:
self._unset.remove(i)
def __iter__(self):
"""Yield vectors from the table.
YIELDS (ndarray): A vector.
"""
yield from self.key2row
def __len__(self):
"""Return the number of vectors in the table.
RETURNS (int): The number of vectors in the data.
"""
return self.data.shape[0]
def __contains__(self, key):
"""Check whether a key has been mapped to a vector entry in the table.
key (int): The key to check.
RETURNS (bool): Whether the key has a vector entry.
"""
return key in self.key2row
def resize(self, shape, inplace=False):
    """Resize the underlying vectors array and update the bookkeeping.

    With `inplace=True`, `self.data.resize(shape, refcheck=False)` is called
    on the existing array; this may invalidate other references to the data,
    so only use it if you're sure that's what you want. Otherwise the array
    module's `resize` builds a new array, which replaces `self.data`.

    Keys mapped to rows that no longer exist are removed from `key2row`.

    shape (tuple): The new size of the table, as (# entries, # columns).
    inplace (bool): Whether to reallocate the existing array.
    RETURNS (list): The removed entries, as `(key, row)` tuples.
    """
    if not inplace:
        xp = get_array_module(self.data)
        self.data = xp.resize(self.data, shape)
    else:
        self.data.resize(shape, refcheck=False)
    n_rows = shape[0]
    # Any row of the new table that no key points at counts as unassigned.
    assigned = set(self.key2row.values())
    self._unset = set(range(n_rows)) - assigned
    dropped = []
    # Iterate over a snapshot, because entries are deleted along the way.
    for key, row in list(self.key2row.items()):
        if row >= n_rows:
            del self.key2row[key]
            dropped.append((key, row))
    return dropped
def keys(self):
    """Iterate over the keys in the table.

    YIELDS: Each key of `key2row`.
    """
    for key in self.key2row.keys():
        yield key
def values(self):
    """Iterate over the vectors whose rows are not in the unassigned set.

    Note that some rows may be unassigned, so the number of vectors
    returned may be less than the length of the vectors table.

    YIELDS (ndarray): A row of `data` that is not listed in `_unset`.
    """
    # Enumerate the table itself. The previous `enumerate(range(n))` paired
    # each row index with itself and so yielded integers, not vectors.
    for row, vector in enumerate(self.data):
        if row not in self._unset:
            yield vector
def items(self):
    """Iterate over `(key, vector)` pairs.

    YIELDS (tuple): A key, and the row of `data` that the key is mapped to.
    """
    table = self.data
    for key, row in self.key2row.items():
        pair = (key, table[row])
        yield pair
def get_keys(self, rows):
    """Map row numbers back to keys.

    rows (iterable): Row numbers to look up.
    RETURNS: A `uint64` array, from the array module of `data`, holding one
        key per requested row.
    RAISES (KeyError): If a row has no key mapped to it.
    """
    xp = get_array_module(self.data)
    # Invert key2row. When several keys share a row, the key that comes
    # last in the mapping's iteration order wins.
    row2key = {}
    for key, row in self.key2row.items():
        row2key[row] = key
    found = [row2key[row] for row in rows]
    return xp.asarray(found, dtype='uint64')
def get_rows(self, keys):
    """Map keys to their row numbers.

    keys (iterable): The keys to look up.
    RETURNS: An array with dtype 'i', from the array module of `data`,
        holding one row per key; keys missing from `key2row` give -1.
    """
    xp = get_array_module(self.data)
    lookup = self.key2row.get
    rows = [lookup(key, -1) for key in keys]
    return xp.asarray(rows, dtype='i')
def add(self, key, *, vector=None, row=None):
    """Add a key to the table. Keys can be mapped to an existing vector
    by setting `row`, or a new vector can be added.

    key (unicode / int): The key to add.
    vector (numpy.ndarray / None): A vector to store in the key's row.
    row (int / None): The row number to map the key to. If omitted, the
        key's current row is reused, or else the lowest unassigned row.
    RETURNS (int): The row the key is mapped to.
    RAISES (ValueError): If a new row is needed but the table is full.
    """
    if row is None:
        if key in self.key2row:
            row = self.key2row[key]
        else:
            if self.is_full:
                raise ValueError("Cannot add new key to vectors -- full")
            row = min(self._unset)
    self.key2row[key] = row
    if vector is not None:
        self.data[row] = vector
    # The row is in use now, so it must not be handed out again.
    self._unset.discard(row)
    return row
def most_similar(self, queries, *, return_scores=False, return_rows=False,
                 batch_size=1024):
    """For each of the given vectors, find the single entry most similar
    to it, by cosine.

    Queries are by vector. If `queries` is large, the calculations are
    performed in chunks, to avoid consuming too much memory. You can set
    the `batch_size` to control the size/space trade-off during the
    calculations. The `queries` array is not modified.

    queries (ndarray): The query vectors, one per row.
    return_scores (bool): Also return the similarity of each best match.
    return_rows (bool): Also return the row number of each best match.
    batch_size (int): Number of queries to compare at once.
    RETURNS: An array of keys, or a tuple `(keys, rows, scores)`,
        `(keys, rows)` or `(keys, scores)`, depending on the flags.
    """
    xp = get_array_module(self.data)
    # Rows that are all zeros have norm 0. Dividing by 1 instead keeps
    # them at zero, so they score 0 rather than NaN (which would otherwise
    # win the argmax below).
    norms = xp.linalg.norm(self.data, axis=1, keepdims=True)
    norms[norms == 0] = 1
    vectors = self.data / norms

    best_rows = xp.zeros((queries.shape[0],), dtype='i')
    scores = xp.zeros((queries.shape[0],), dtype='f')
    # Work in batches, to avoid memory problems.
    for i in range(0, queries.shape[0], batch_size):
        batch = queries[i : i+batch_size]
        batch_norms = xp.linalg.norm(batch, axis=1, keepdims=True)
        batch_norms[batch_norms == 0] = 1
        # Divide out-of-place: `batch` is a view of `queries`, so an
        # in-place division would normalise the caller's array.
        batch = batch / batch_norms
        # batch   e.g. (1024, 300)
        # vectors e.g. (10000, 300)
        # sims    e.g. (1024, 10000)
        sims = xp.dot(batch, vectors.T)
        best_rows[i:i+batch_size] = sims.argmax(axis=1)
        scores[i:i+batch_size] = sims.max(axis=1)
    keys = self.get_keys(best_rows)
    if return_rows and return_scores:
        return (keys, best_rows, scores)
    elif return_rows:
        return (keys, best_rows)
    elif return_scores:
        return (keys, scores)
    else:
        return keys
def from_glove(self, path): def from_glove(self, path):
"""Load GloVe vectors from a directory. Assumes binary format, """Load GloVe vectors from a directory. Assumes binary format,
@ -168,27 +249,33 @@ cdef class Vectors:
By default GloVe outputs 64-bit vectors. By default GloVe outputs 64-bit vectors.
path (unicode / Path): The path to load the GloVe vectors from. path (unicode / Path): The path to load the GloVe vectors from.
RETURNS: A StringStore object, holding the key-to-string mapping.
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
width = None
for name in path.iterdir(): for name in path.iterdir():
if name.parts[-1].startswith('vectors'): if name.parts[-1].startswith('vectors'):
_, dims, dtype, _2 = name.parts[-1].split('.') _, dims, dtype, _2 = name.parts[-1].split('.')
self.width = int(dims) width = int(dims)
break break
else: else:
raise IOError("Expected file named e.g. vectors.128.f.bin") raise IOError("Expected file named e.g. vectors.128.f.bin")
bin_loc = path / 'vectors.{dims}.{dtype}.bin'.format(dims=dims, bin_loc = path / 'vectors.{dims}.{dtype}.bin'.format(dims=dims,
dtype=dtype) dtype=dtype)
xp = get_array_module(self.data)
self.data = None
with bin_loc.open('rb') as file_: with bin_loc.open('rb') as file_:
self.data = numpy.fromfile(file_, dtype='float64') self.data = xp.fromfile(file_, dtype=dtype)
self.data = numpy.ascontiguousarray(self.data, dtype='float32') if dtype != 'float32':
self.data = xp.ascontiguousarray(self.data, dtype='float32')
n = 0 n = 0
strings = StringStore()
with (path / 'vocab.txt').open('r') as file_: with (path / 'vocab.txt').open('r') as file_:
for line in file_: for i, line in enumerate(file_):
self.add(line.strip()) key = strings.add(line.strip())
n += 1 self.add(key, row=i)
if (self.data.size % self.width) == 0: return strings
self.data
def to_disk(self, path, **exclude): def to_disk(self, path, **exclude):
"""Save the current state to a directory. """Save the current state to a directory.

View File

@ -55,7 +55,7 @@ cdef class Vocab:
_ = self[string] _ = self[string]
self.lex_attr_getters = lex_attr_getters self.lex_attr_getters = lex_attr_getters
self.morphology = Morphology(self.strings, tag_map, lemmatizer) self.morphology = Morphology(self.strings, tag_map, lemmatizer)
self.vectors = Vectors(self.strings, width=0) self.vectors = Vectors()
property lang: property lang:
def __get__(self): def __get__(self):
@ -241,15 +241,19 @@ cdef class Vocab:
def vectors_length(self): def vectors_length(self):
return self.vectors.data.shape[1] return self.vectors.data.shape[1]
def clear_vectors(self, width=None): def reset_vectors(self, *, width=None, shape=None):
"""Drop the current vector table. Because all vectors must be the same """Drop the current vector table. Because all vectors must be the same
width, you have to call this to change the size of the vectors. width, you have to call this to change the size of the vectors.
""" """
if width is None: if width is not None and shape is not None:
width = self.vectors.data.shape[1] raise ValueError("Only one of width and shape can be specified")
self.vectors = Vectors(self.strings, width=width) elif shape is not None:
self.vectors = Vectors(shape=shape)
else:
width = width if width is not None else self.vectors.data.shape[1]
self.vectors = Vectors(shape=(self.vectors.shape[0], width))
def prune_vectors(self, nr_row, batch_size=8): def prune_vectors(self, nr_row, batch_size=1024):
"""Reduce the current vector table to `nr_row` unique entries. Words """Reduce the current vector table to `nr_row` unique entries. Words
mapped to the discarded vectors will be remapped to the closest vector mapped to the discarded vectors will be remapped to the closest vector
among those remaining. among those remaining.
@ -275,37 +279,29 @@ cdef class Vocab:
two words. two words.
""" """
xp = get_array_module(self.vectors.data) xp = get_array_module(self.vectors.data)
# Work in batches, to avoid memory problems. # Make prob negative so it sorts by rank ascending
keep = self.vectors.data[:nr_row] # (key2row contains the rank)
keep_keys = [key for key, row in self.vectors.key2row.items() if row < nr_row] priority = [(-lex.prob, self.vectors.key2row[lex.orth], lex.orth)
toss = self.vectors.data[nr_row:] for lex in self if lex.orth in self.vectors.key2row]
# Normalize the vectors, so cosine similarity is just dot product. priority.sort()
# Note we can't modify the ones we're keeping in-place... indices = xp.asarray([i for (prob, i, key) in priority], dtype='i')
keep = keep / (xp.linalg.norm(keep, axis=1, keepdims=True)+1e-12) keys = xp.asarray([key for (prob, i, key) in priority], dtype='uint64')
keep = xp.ascontiguousarray(keep.T)
neighbours = xp.zeros((toss.shape[0],), dtype='i') keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]])
scores = xp.zeros((toss.shape[0],), dtype='f') toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
for i in range(0, toss.shape[0], batch_size):
batch = toss[i : i+batch_size] self.vectors = Vectors(data=keep, keys=keys)
batch /= xp.linalg.norm(batch, axis=1, keepdims=True)+1e-12
sims = xp.dot(batch, keep) syn_keys, syn_rows, scores = self.vectors.most_similar(toss,
matches = sims.argmax(axis=1) return_rows=True, return_scores=True)
neighbours[i:i+batch_size] = matches
scores[i:i+batch_size] = sims.max(axis=1)
i2k = {i: key for key, i in self.vectors.key2row.items()}
remap = {} remap = {}
for lex in list(self): for i, key in enumerate(keys[nr_row:]):
# If we're losing the vector for this word, map it to the nearest self.vectors.add(key, row=syn_rows[i])
# vector we're keeping. word = self.strings[key]
if lex.rank >= nr_row: synonym = self.strings[syn_keys[i]]
lex.rank = neighbours[lex.rank-nr_row] score = scores[i]
self.vectors.add(lex.orth, row=lex.rank) remap[word] = (synonym, score)
remap[lex.orth_] = (self.strings[i2k[lex.rank]], scores[lex.rank])
for key, row in self.vectors.key2row.items():
if row >= nr_row:
self.vectors.key2row[key] = neighbours[row-nr_row]
# Make copy, to encourage the original table to be garbage collected.
self.vectors.data = xp.ascontiguousarray(self.vectors.data[:nr_row])
link_vectors_to_models(self) link_vectors_to_models(self)
return remap return remap
@ -329,11 +325,19 @@ cdef class Vocab:
"""Set a vector for a word in the vocabulary. Words can be referenced """Set a vector for a word in the vocabulary. Words can be referenced
by string or int ID. by string or int ID.
""" """
if self.vectors.data.size == 0: if isinstance(orth, basestring_):
self.clear_vectors(vector.shape[0]) orth = self.strings.add(orth)
lex = self[orth] if self.vectors.is_full and orth not in self.vectors:
new_rows = max(100, int(self.vectors.shape[0]*1.3))
if self.vectors.shape[1] == 0:
width = vector.size
else:
width = self.vectors.shape[1]
self.vectors.resize((new_rows, width))
print(self.vectors.shape)
self.vectors.add(orth, vector=vector)
print("Adding", orth, self.vectors.is_full)
self.vectors.add(orth, vector=vector) self.vectors.add(orth, vector=vector)
lex.rank = self.vectors.key2row[lex.orth]
def has_vector(self, orth): def has_vector(self, orth):
"""Check whether a word has a vector. Returns False if no vectors have """Check whether a word has a vector. Returns False if no vectors have