mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Revise and simplify Vectors class
This commit is contained in:
parent
cb5217012f
commit
77d8f5de9a
|
@ -208,8 +208,8 @@ def test_doc_api_right_edge(en_tokenizer):
|
|||
|
||||
def test_doc_api_has_vector():
|
||||
vocab = Vocab()
|
||||
vocab.clear_vectors(2)
|
||||
vocab.vectors.add('kitten', vector=numpy.asarray([0., 2.], dtype='f'))
|
||||
vocab.reset_vectors(width=2)
|
||||
vocab.set_vector('kitten', vector=numpy.asarray([0., 2.], dtype='f'))
|
||||
doc = Doc(vocab, words=['kitten'])
|
||||
assert doc.has_vector
|
||||
|
||||
|
|
|
@ -72,9 +72,9 @@ def test_doc_token_api_is_properties(en_vocab):
|
|||
|
||||
def test_doc_token_api_vectors():
|
||||
vocab = Vocab()
|
||||
vocab.clear_vectors(2)
|
||||
vocab.vectors.add('apples', vector=numpy.asarray([0., 2.], dtype='f'))
|
||||
vocab.vectors.add('oranges', vector=numpy.asarray([0., 1.], dtype='f'))
|
||||
vocab.reset_vectors(width=2)
|
||||
vocab.set_vector('apples', vector=numpy.asarray([0., 2.], dtype='f'))
|
||||
vocab.set_vector('oranges', vector=numpy.asarray([0., 1.], dtype='f'))
|
||||
doc = Doc(vocab, words=['apples', 'oranges', 'oov'])
|
||||
assert doc.has_vector
|
||||
|
||||
|
|
|
@ -79,9 +79,9 @@ def add_vecs_to_vocab(vocab, vectors):
|
|||
"""Add list of vector tuples to given vocab. All vectors need to have the
|
||||
same length. Format: [("text", [1, 2, 3])]"""
|
||||
length = len(vectors[0][1])
|
||||
vocab.clear_vectors(length)
|
||||
vocab.reset_vectors(width=length)
|
||||
for word, vec in vectors:
|
||||
vocab.set_vector(word, vec)
|
||||
vocab.set_vector(word, vector=vec)
|
||||
return vocab
|
||||
|
||||
|
||||
|
|
|
@ -35,20 +35,18 @@ def vocab(en_vocab, vectors):
|
|||
|
||||
|
||||
def test_init_vectors_with_data(strings, data):
|
||||
v = Vectors(strings, data=data)
|
||||
v = Vectors(data=data)
|
||||
assert v.shape == data.shape
|
||||
|
||||
def test_init_vectors_with_width(strings):
|
||||
v = Vectors(strings, width=3)
|
||||
for string in strings:
|
||||
v.add(string)
|
||||
def test_init_vectors_with_shape(strings):
    """A table created from `shape` alone should report exactly that shape."""
    # One row per string, three columns.
    v = Vectors(shape=(len(strings), 3))
    assert v.shape == (len(strings), 3)
|
||||
|
||||
|
||||
def test_get_vector(strings, data):
|
||||
v = Vectors(strings, data=data)
|
||||
for string in strings:
|
||||
v.add(string)
|
||||
v = Vectors(data=data)
|
||||
for i, string in enumerate(strings):
|
||||
v.add(string, row=i)
|
||||
assert list(v[strings[0]]) == list(data[0])
|
||||
assert list(v[strings[0]]) != list(data[1])
|
||||
assert list(v[strings[1]]) != list(data[0])
|
||||
|
@ -56,9 +54,9 @@ def test_get_vector(strings, data):
|
|||
|
||||
def test_set_vector(strings, data):
|
||||
orig = data.copy()
|
||||
v = Vectors(strings, data=data)
|
||||
for string in strings:
|
||||
v.add(string)
|
||||
v = Vectors(data=data)
|
||||
for i, string in enumerate(strings):
|
||||
v.add(string, row=i)
|
||||
assert list(v[strings[0]]) == list(orig[0])
|
||||
assert list(v[strings[0]]) != list(orig[1])
|
||||
v[strings[0]] = data[1]
|
||||
|
@ -66,7 +64,6 @@ def test_set_vector(strings, data):
|
|||
assert list(v[strings[0]]) != list(orig[0])
|
||||
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def tokenizer_v(vocab):
|
||||
return Tokenizer(vocab, {}, None, None, None)
|
||||
|
|
|
@ -36,5 +36,5 @@ def test_vocab_prune_vectors():
|
|||
remap = vocab.prune_vectors(2)
|
||||
assert list(remap.keys()) == [u'kitten']
|
||||
neighbour, similarity = remap.values()[0]
|
||||
assert neighbour == u'cat'
|
||||
assert neighbour == u'cat', remap
|
||||
assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-6)
|
||||
|
|
|
@ -15,6 +15,12 @@ from .compat import basestring_, path2str
|
|||
from . import util
|
||||
|
||||
|
||||
def unpickle_vectors(keys_and_rows, data):
    """Rebuild a Vectors table from its pickled state.

    keys_and_rows (iterable): `(key, row)` pairs to map onto the table.
    data (numpy.ndarray): The vector data.
    RETURNS (Vectors): The reconstructed table.
    """
    vectors = Vectors(data=data)
    for key, row in keys_and_rows:
        vectors.add(key, row=row)
    # A reconstructor used by pickle must return the rebuilt object;
    # without this return, unpickling would produce None.
    return vectors
|
||||
|
||||
|
||||
cdef class Vectors:
|
||||
"""Store, save and load word vectors.
|
||||
|
||||
|
@ -23,129 +29,34 @@ cdef class Vectors:
|
|||
(for GPU vectors). `vectors.key2row` is a dictionary mapping word hashes to
|
||||
rows in the vectors.data table.
|
||||
|
||||
Multiple keys can be mapped to the same vector, so len(keys) may be greater
|
||||
(but not smaller) than data.shape[0].
|
||||
Multiple keys can be mapped to the same vector, and not all of the rows in
|
||||
the table need to be assigned --- so len(list(vectors.keys())) may be
|
||||
greater or smaller than vectors.shape[0].
|
||||
"""
|
||||
cdef public object data
|
||||
cdef readonly StringStore strings
|
||||
cdef public object key2row
|
||||
cdef public int _i_vec
|
||||
cdef public object _unset
|
||||
|
||||
def __init__(self, strings, width=0, data=None):
|
||||
"""Create a new vector store. To keep the vector table empty, pass
|
||||
`width=0`. You can also create the vector table and add vectors one by
|
||||
one, or set the vector values directly on initialisation.
|
||||
def __init__(self, *, shape=None, data=None, keys=None):
|
||||
"""Create a new vector store.
|
||||
|
||||
strings (StringStore or list): List of strings or StringStore that maps
|
||||
strings to hash values, and vice versa.
|
||||
width (int): Number of dimensions.
|
||||
shape (tuple): Size of the table, as (# entries, # columns)
|
||||
data (numpy.ndarray): The vector data.
|
||||
RETURNS (Vectors): The newly created object.
|
||||
"""
|
||||
if isinstance(strings, StringStore):
|
||||
self.strings = strings
|
||||
if data is None:
|
||||
if shape is None:
|
||||
shape = (0,0)
|
||||
data = numpy.zeros(shape, dtype='f')
|
||||
self.data = data
|
||||
self.key2row = OrderedDict()
|
||||
if self.data is not None:
|
||||
self._unset = set(range(self.data.shape[0]))
|
||||
else:
|
||||
self.strings = StringStore()
|
||||
for string in strings:
|
||||
self.strings.add(string)
|
||||
if data is not None:
|
||||
self.data = numpy.asarray(data, dtype='f')
|
||||
else:
|
||||
self.data = numpy.zeros((len(self.strings), width), dtype='f')
|
||||
self._i_vec = 0
|
||||
self.key2row = {}
|
||||
if data is not None:
|
||||
for i, string in enumerate(self.strings):
|
||||
if i >= self.data.shape[0]:
|
||||
break
|
||||
self.add(self.strings[string], vector=self.data[i])
|
||||
|
||||
def __reduce__(self):
|
||||
return (Vectors, (self.strings, self.data))
|
||||
|
||||
def __getitem__(self, key):
|
||||
"""Get a vector by key. If key is a string, it is hashed to an integer
|
||||
ID using the vectors.strings table. If the integer key is not found in
|
||||
the table, a KeyError is raised.
|
||||
|
||||
key (unicode / int): The key to get the vector for.
|
||||
RETURNS (numpy.ndarray): The vector for the key.
|
||||
"""
|
||||
if isinstance(key, basestring):
|
||||
key = self.strings[key]
|
||||
i = self.key2row[key]
|
||||
if i is None:
|
||||
raise KeyError(key)
|
||||
else:
|
||||
return self.data[i]
|
||||
|
||||
def __setitem__(self, key, vector):
|
||||
"""Set a vector for the given key. If key is a string, it is hashed
|
||||
to an integer ID using the vectors.strings table.
|
||||
|
||||
key (unicode / int): The key to set the vector for.
|
||||
vector (numpy.ndarray): The vector to set.
|
||||
"""
|
||||
if isinstance(key, basestring):
|
||||
key = self.strings.add(key)
|
||||
i = self.key2row[key]
|
||||
self.data[i] = vector
|
||||
|
||||
def __iter__(self):
|
||||
"""Yield vectors from the table.
|
||||
|
||||
YIELDS (numpy.ndarray): A vector.
|
||||
"""
|
||||
yield from self.data
|
||||
|
||||
def __len__(self):
|
||||
"""Return the number of vectors that have been assigned.
|
||||
|
||||
RETURNS (int): The number of vectors in the data.
|
||||
"""
|
||||
return self._i_vec
|
||||
|
||||
def __contains__(self, key):
|
||||
"""Check whether a key has a vector entry in the table.
|
||||
|
||||
key (unicode / int): The key to check.
|
||||
RETURNS (bool): Whether the key has a vector entry.
|
||||
"""
|
||||
if isinstance(key, basestring_):
|
||||
key = self.strings[key]
|
||||
return key in self.key2row
|
||||
|
||||
def add(self, key, *, vector=None, row=None):
|
||||
"""Add a key to the table. Keys can be mapped to an existing vector
|
||||
by setting `row`, or a new vector can be added.
|
||||
|
||||
key (unicode / int): The key to add.
|
||||
vector (numpy.ndarray / None): A vector to add for the key.
|
||||
row (int / None): The row-number of a vector to map the key to.
|
||||
"""
|
||||
if isinstance(key, basestring_):
|
||||
key = self.strings.add(key)
|
||||
if row is None and key in self.key2row:
|
||||
row = self.key2row[key]
|
||||
elif row is None:
|
||||
row = self._i_vec
|
||||
self._i_vec += 1
|
||||
if row >= self.data.shape[0]:
|
||||
self.data.resize((row*2, self.data.shape[1]))
|
||||
|
||||
self.key2row[key] = row
|
||||
if vector is not None:
|
||||
self.data[row] = vector
|
||||
return row
|
||||
|
||||
def items(self):
|
||||
"""Iterate over `(string key, vector)` pairs, in order.
|
||||
|
||||
YIELDS (tuple): A key/vector pair.
|
||||
"""
|
||||
for key, row in self.key2row.items():
|
||||
string = self.strings[key]
|
||||
yield string, self.data[row]
|
||||
self._unset = set()
|
||||
if keys is not None:
|
||||
for i, key in enumerate(keys):
|
||||
self.add(key, row=i)
|
||||
|
||||
@property
|
||||
def shape(self):
|
||||
|
@ -156,9 +67,179 @@ cdef class Vectors:
|
|||
"""
|
||||
return self.data.shape
|
||||
|
||||
def most_similar(self, key):
|
||||
# TODO: implement
|
||||
raise NotImplementedError
|
||||
@property
def size(self):
    """Return the total number of values in the table, i.e. rows * dims.

    RETURNS (int): The product of the two dimensions of the data table.
    """
    return self.data.shape[0] * self.data.shape[1]
|
||||
|
||||
@property
def is_full(self):
    """Returns True if no rows are available for new keys.

    RETURNS (bool): Whether the set of unassigned rows is empty.
    """
    return len(self._unset) == 0
|
||||
|
||||
def __reduce__(self):
    """Support pickling by describing how to rebuild the table.

    RETURNS (tuple): `(unpickle_vectors, (keys_and_rows, data))`, where
        `keys_and_rows` is a list of `(key, row)` pairs.
    """
    # Materialise the pairs as a list: on Python 3, dict.items() returns a
    # view object, which pickle cannot serialise.
    keys_and_rows = list(self.key2row.items())
    return (unpickle_vectors, (keys_and_rows, self.data))
|
||||
|
||||
def __getitem__(self, key):
    """Get a vector by key. If the key is not found, a KeyError is raised.

    key (int): The key to get the vector for.
    RETURNS (ndarray): The vector for the key.
    """
    # The key2row lookup itself raises KeyError for an unknown key.
    i = self.key2row[key]
    # A key that is present but mapped to None is also treated as missing.
    if i is None:
        raise KeyError(key)
    else:
        return self.data[i]
|
||||
|
||||
def __setitem__(self, key, vector):
    """Set a vector for the given key.

    The key must already be mapped to a row: the row is looked up in
    key2row, so an unknown key raises KeyError.

    key (int): The key to set the vector for.
    vector (numpy.ndarray): The vector to set.
    """
    i = self.key2row[key]
    self.data[i] = vector
    # The row now holds a value, so it is no longer free for new keys.
    if i in self._unset:
        self._unset.remove(i)
|
||||
|
||||
def __iter__(self):
    """Yield the keys in the table.

    YIELDS: A key from the key-to-row mapping.
    """
    # Iterating the mapping yields its keys, not the vectors.
    yield from self.key2row
|
||||
|
||||
def __len__(self):
    """Return the number of vectors in the table.

    RETURNS (int): The number of vectors in the data.
    """
    # Counts every row of the data table, whether or not a key is mapped
    # to it.
    return self.data.shape[0]
|
||||
|
||||
def __contains__(self, key):
    """Check whether a key has been mapped to a vector entry in the table.

    key (int): The key to check.
    RETURNS (bool): Whether the key has a vector entry.
    """
    # The key is looked up exactly as given; no string hashing happens here.
    return key in self.key2row
|
||||
|
||||
def resize(self, shape, inplace=False):
    """Resize the underlying vectors array.

    If inplace=True, the memory is reallocated. This may cause other
    references to the data to become invalid, so only use inplace=True if
    you're sure that's what you want.

    If the number of vectors is reduced, keys mapped to rows that have been
    deleted are removed. These removed items are returned as a list of
    (key, row) tuples.

    shape (tuple): New size of the table, as (# entries, # columns).
    inplace (bool): Resize the existing array rather than replacing it.
    RETURNS (list): The removed items, as (key, row) tuples.
    """
    if inplace:
        self.data.resize(shape, refcheck=False)
    else:
        xp = get_array_module(self.data)
        self.data = xp.resize(self.data, shape)
    # Rows of the new table that no key points at are free for new keys.
    filled = set(self.key2row.values())
    self._unset = {row for row in range(shape[0]) if row not in filled}
    removed_items = []
    # Iterate over a snapshot of the (key, row) pairs: entries are popped
    # from key2row inside the loop, and iterating the dict itself would
    # only produce the keys.
    for key, row in list(self.key2row.items()):
        if row >= shape[0]:
            self.key2row.pop(key)
            removed_items.append((key, row))
    return removed_items
|
||||
|
||||
def keys(self):
    """Iterate over the keys in the table.

    YIELDS: A key from the key-to-row mapping.
    """
    yield from self.key2row.keys()
|
||||
|
||||
def values(self):
    """Iterate over vectors that have been assigned to at least one key.

    Note that some vectors may be unassigned, so the number of vectors
    returned may be less than the length of the vectors table.

    YIELDS (ndarray): A vector from the table.
    """
    # Walk the data rows themselves, so that the vector is yielded rather
    # than its row number, and skip rows still marked as unset.
    for row, vector in enumerate(self.data):
        if row not in self._unset:
            yield vector
|
||||
|
||||
def items(self):
    """Iterate over `(key, vector)` pairs.

    YIELDS (tuple): A key/vector pair.
    """
    # One pair per key: if two keys are mapped to the same row, that row's
    # vector is yielded once for each of them.
    for key, row in self.key2row.items():
        yield key, self.data[row]
|
||||
|
||||
def get_keys(self, rows):
    """Look up the key mapped to each of the given rows.

    rows (iterable): The row numbers to look up.
    RETURNS (ndarray): The keys, as an array of dtype uint64.
    """
    xp = get_array_module(self.data)
    # Invert key2row. If several keys share a row, only the one that comes
    # last when iterating key2row is kept for that row.
    row2key = {row: key for key, row in self.key2row.items()}
    # A row that no key is mapped to raises KeyError here.
    keys = xp.asarray([row2key[row] for row in rows],
                      dtype='uint64')
    return keys
|
||||
|
||||
def get_rows(self, keys):
    """Look up the row number for each of the given keys.

    keys (iterable): The keys to look up.
    RETURNS (ndarray): The row numbers, as an integer array. Keys that are
        not in the table get the value -1.
    """
    xp = get_array_module(self.data)
    k2r = self.key2row
    return xp.asarray([k2r.get(key, -1) for key in keys], dtype='i')
|
||||
|
||||
def add(self, key, *, vector=None, row=None):
    """Add a key to the table. Keys can be mapped to an existing vector
    by setting `row`, or a new vector can be added.

    key (unicode / int): The key to add. It is stored in key2row exactly
        as given; no string hashing is done here.
    vector (numpy.ndarray / None): A vector to add for the key.
    row (int / None): The row-number of a vector to map the key to.
    RETURNS (int): The row the key was mapped to.

    Raises ValueError if no row is given, the key is new, and the table has
    no unassigned rows left.
    """
    if row is None and key in self.key2row:
        # Known key and no explicit row: keep its current row.
        row = self.key2row[key]
    elif row is None:
        if self.is_full:
            raise ValueError("Cannot add new key to vectors -- full")
        # New key: take the lowest-numbered unassigned row.
        row = min(self._unset)

    self.key2row[key] = row
    if vector is not None:
        self.data[row] = vector
    # The row is now in use, so it is no longer free for new keys.
    if row in self._unset:
        self._unset.remove(row)
    return row
|
||||
|
||||
def most_similar(self, queries, *, return_scores=False, return_rows=False,
                 batch_size=1024):
    """For each of the given vectors, find the single entry most similar
    to it, by cosine.

    Queries are by vector. If `queries` is large, the calculations are
    performed in chunks, to avoid consuming too much memory. You can set
    the `batch_size` to control the size/space trade-off during the
    calculations. The `queries` array is not modified.

    queries (ndarray): The query vectors, one per row.
    return_scores (bool): Also return the similarity of each best match.
    return_rows (bool): Also return the row number of each best match.
    batch_size (int): The number of queries to process per chunk.
    RETURNS: The array of keys, or a tuple `(keys, rows)`, `(keys, scores)`
        or `(keys, rows, scores)` depending on the flags.
    """
    xp = get_array_module(self.data)

    # Normalise the table once, so cosine similarity is just a dot product.
    vectors = self.data / xp.linalg.norm(self.data, axis=1, keepdims=True)

    best_rows = xp.zeros((queries.shape[0],), dtype='i')
    scores = xp.zeros((queries.shape[0],), dtype='f')
    # Work in batches, to avoid memory problems.
    for i in range(0, queries.shape[0], batch_size):
        batch = queries[i : i+batch_size]
        # Normalise into a new array. An in-place `/=` would write through
        # the slice into the caller's `queries`.
        batch = batch / xp.linalg.norm(batch, axis=1, keepdims=True)
        # batch   e.g. (1024, 300)
        # vectors e.g. (10000, 300)
        # sims    e.g. (1024, 10000)
        sims = xp.dot(batch, vectors.T)
        best_rows[i:i+batch_size] = sims.argmax(axis=1)
        scores[i:i+batch_size] = sims.max(axis=1)
    keys = self.get_keys(best_rows)
    if return_rows and return_scores:
        return (keys, best_rows, scores)
    elif return_rows:
        return (keys, best_rows)
    elif return_scores:
        return (keys, scores)
    else:
        return keys
|
||||
|
||||
def from_glove(self, path):
|
||||
"""Load GloVe vectors from a directory. Assumes binary format,
|
||||
|
@ -168,27 +249,33 @@ cdef class Vectors:
|
|||
By default GloVe outputs 64-bit vectors.
|
||||
|
||||
path (unicode / Path): The path to load the GloVe vectors from.
|
||||
|
||||
RETURNS: A StringStore object, holding the key-to-string mapping.
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
width = None
|
||||
for name in path.iterdir():
|
||||
if name.parts[-1].startswith('vectors'):
|
||||
_, dims, dtype, _2 = name.parts[-1].split('.')
|
||||
self.width = int(dims)
|
||||
width = int(dims)
|
||||
break
|
||||
else:
|
||||
raise IOError("Expected file named e.g. vectors.128.f.bin")
|
||||
bin_loc = path / 'vectors.{dims}.{dtype}.bin'.format(dims=dims,
|
||||
dtype=dtype)
|
||||
xp = get_array_module(self.data)
|
||||
self.data = None
|
||||
with bin_loc.open('rb') as file_:
|
||||
self.data = numpy.fromfile(file_, dtype='float64')
|
||||
self.data = numpy.ascontiguousarray(self.data, dtype='float32')
|
||||
self.data = xp.fromfile(file_, dtype=dtype)
|
||||
if dtype != 'float32':
|
||||
self.data = xp.ascontiguousarray(self.data, dtype='float32')
|
||||
n = 0
|
||||
strings = StringStore()
|
||||
with (path / 'vocab.txt').open('r') as file_:
|
||||
for line in file_:
|
||||
self.add(line.strip())
|
||||
n += 1
|
||||
if (self.data.size % self.width) == 0:
|
||||
self.data
|
||||
for i, line in enumerate(file_):
|
||||
key = strings.add(line.strip())
|
||||
self.add(key, row=i)
|
||||
return strings
|
||||
|
||||
def to_disk(self, path, **exclude):
|
||||
"""Save the current state to a directory.
|
||||
|
|
|
@ -55,7 +55,7 @@ cdef class Vocab:
|
|||
_ = self[string]
|
||||
self.lex_attr_getters = lex_attr_getters
|
||||
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
|
||||
self.vectors = Vectors(self.strings, width=0)
|
||||
self.vectors = Vectors()
|
||||
|
||||
property lang:
|
||||
def __get__(self):
|
||||
|
@ -241,15 +241,19 @@ cdef class Vocab:
|
|||
def vectors_length(self):
|
||||
return self.vectors.data.shape[1]
|
||||
|
||||
def clear_vectors(self, width=None):
|
||||
def reset_vectors(self, *, width=None, shape=None):
|
||||
"""Drop the current vector table. Because all vectors must be the same
|
||||
width, you have to call this to change the size of the vectors.
|
||||
"""
|
||||
if width is None:
|
||||
width = self.vectors.data.shape[1]
|
||||
self.vectors = Vectors(self.strings, width=width)
|
||||
if width is not None and shape is not None:
|
||||
raise ValueError("Only one of width and shape can be specified")
|
||||
elif shape is not None:
|
||||
self.vectors = Vectors(shape=shape)
|
||||
else:
|
||||
width = width if width is not None else self.vectors.data.shape[1]
|
||||
self.vectors = Vectors(shape=(self.vectors.shape[0], width))
|
||||
|
||||
def prune_vectors(self, nr_row, batch_size=8):
|
||||
def prune_vectors(self, nr_row, batch_size=1024):
|
||||
"""Reduce the current vector table to `nr_row` unique entries. Words
|
||||
mapped to the discarded vectors will be remapped to the closest vector
|
||||
among those remaining.
|
||||
|
@ -275,37 +279,29 @@ cdef class Vocab:
|
|||
two words.
|
||||
"""
|
||||
xp = get_array_module(self.vectors.data)
|
||||
# Work in batches, to avoid memory problems.
|
||||
keep = self.vectors.data[:nr_row]
|
||||
keep_keys = [key for key, row in self.vectors.key2row.items() if row < nr_row]
|
||||
toss = self.vectors.data[nr_row:]
|
||||
# Normalize the vectors, so cosine similarity is just dot product.
|
||||
# Note we can't modify the ones we're keeping in-place...
|
||||
keep = keep / (xp.linalg.norm(keep, axis=1, keepdims=True)+1e-12)
|
||||
keep = xp.ascontiguousarray(keep.T)
|
||||
neighbours = xp.zeros((toss.shape[0],), dtype='i')
|
||||
scores = xp.zeros((toss.shape[0],), dtype='f')
|
||||
for i in range(0, toss.shape[0], batch_size):
|
||||
batch = toss[i : i+batch_size]
|
||||
batch /= xp.linalg.norm(batch, axis=1, keepdims=True)+1e-12
|
||||
sims = xp.dot(batch, keep)
|
||||
matches = sims.argmax(axis=1)
|
||||
neighbours[i:i+batch_size] = matches
|
||||
scores[i:i+batch_size] = sims.max(axis=1)
|
||||
i2k = {i: key for key, i in self.vectors.key2row.items()}
|
||||
# Make prob negative so it sorts by rank ascending
|
||||
# (key2row contains the rank)
|
||||
priority = [(-lex.prob, self.vectors.key2row[lex.orth], lex.orth)
|
||||
for lex in self if lex.orth in self.vectors.key2row]
|
||||
priority.sort()
|
||||
indices = xp.asarray([i for (prob, i, key) in priority], dtype='i')
|
||||
keys = xp.asarray([key for (prob, i, key) in priority], dtype='uint64')
|
||||
|
||||
keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]])
|
||||
toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
|
||||
|
||||
self.vectors = Vectors(data=keep, keys=keys)
|
||||
|
||||
syn_keys, syn_rows, scores = self.vectors.most_similar(toss,
|
||||
return_rows=True, return_scores=True)
|
||||
|
||||
remap = {}
|
||||
for lex in list(self):
|
||||
# If we're losing the vector for this word, map it to the nearest
|
||||
# vector we're keeping.
|
||||
if lex.rank >= nr_row:
|
||||
lex.rank = neighbours[lex.rank-nr_row]
|
||||
self.vectors.add(lex.orth, row=lex.rank)
|
||||
remap[lex.orth_] = (self.strings[i2k[lex.rank]], scores[lex.rank])
|
||||
for key, row in self.vectors.key2row.items():
|
||||
if row >= nr_row:
|
||||
self.vectors.key2row[key] = neighbours[row-nr_row]
|
||||
# Make copy, to encourage the original table to be garbage collected.
|
||||
self.vectors.data = xp.ascontiguousarray(self.vectors.data[:nr_row])
|
||||
for i, key in enumerate(keys[nr_row:]):
|
||||
self.vectors.add(key, row=syn_rows[i])
|
||||
word = self.strings[key]
|
||||
synonym = self.strings[syn_keys[i]]
|
||||
score = scores[i]
|
||||
remap[word] = (synonym, score)
|
||||
link_vectors_to_models(self)
|
||||
return remap
|
||||
|
||||
|
@ -329,11 +325,19 @@ cdef class Vocab:
|
|||
"""Set a vector for a word in the vocabulary. Words can be referenced
|
||||
by string or int ID.
|
||||
"""
|
||||
if self.vectors.data.size == 0:
|
||||
self.clear_vectors(vector.shape[0])
|
||||
lex = self[orth]
|
||||
if isinstance(orth, basestring_):
|
||||
orth = self.strings.add(orth)
|
||||
if self.vectors.is_full and orth not in self.vectors:
|
||||
new_rows = max(100, int(self.vectors.shape[0]*1.3))
|
||||
if self.vectors.shape[1] == 0:
|
||||
width = vector.size
|
||||
else:
|
||||
width = self.vectors.shape[1]
|
||||
self.vectors.resize((new_rows, width))
|
||||
print(self.vectors.shape)
|
||||
self.vectors.add(orth, vector=vector)
|
||||
print("Adding", orth, self.vectors.is_full)
|
||||
self.vectors.add(orth, vector=vector)
|
||||
lex.rank = self.vectors.key2row[lex.orth]
|
||||
|
||||
def has_vector(self, orth):
|
||||
"""Check whether a word has a vector. Returns False if no vectors have
|
||||
|
|
Loading…
Reference in New Issue
Block a user