Implement Fast-Text vectors with subword features

Suraj Krishnan Rajan 2018-04-21 01:34:14 +05:30
parent 686225eadd
commit 69d041148f
2 changed files with 59 additions and 3 deletions

spacy/tests/vectors/test_vectors.py

@@ -23,6 +23,18 @@ def vectors():
            ('juice', [5, 5, 10]),
            ('pie', [7, 6.3, 8.9])]
@pytest.fixture
def ngrams_vectors():
    return [
        ("apple", [1, 2, 3]),
        ("app", [-0.1, -0.2, -0.3]),
        ('ppl', [-0.2, -0.3, -0.4]),
        ('pl', [0.7, 0.8, 0.9])
    ]


@pytest.fixture()
def ngrams_vocab(en_vocab, ngrams_vectors):
    add_vecs_to_vocab(en_vocab, ngrams_vectors)
    return en_vocab
@pytest.fixture
def data():
@@ -105,6 +117,18 @@ def test_vectors_token_vector(tokenizer_v, vectors, text):
    assert vectors[1] == (doc[2].text, list(doc[2].vector))
@pytest.mark.parametrize('text', ["apple"])
def test_vectors__ngrams_word(ngrams_vocab, text):
assert list(ngrams_vocab.get_vector(text)) == list(ngrams_vectors()[0][1])
@pytest.mark.parametrize('text', ["applpie"])
def test_vectors__ngrams_subword(ngrams_vocab, text):
truth = list(ngrams_vocab.get_vector(text,1,6))
test = list([(ngrams_vectors()[1][1][i] + ngrams_vectors()[2][1][i] + ngrams_vectors()[3][1][i])/3 for i in range(len(ngrams_vectors()[1][1]))])
eps = [abs(truth[i] - test[i]) for i in range(len(truth))]
for i in eps:
assert i<1e-6
@pytest.mark.parametrize('text', ["apple", "orange"]) @pytest.mark.parametrize('text', ["apple", "orange"])
def test_vectors_lexeme_vector(vocab, text): def test_vectors_lexeme_vector(vocab, text):
lex = vocab[text] lex = vocab[text]
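For reference, the expected value in test_vectors__ngrams_subword above is just the element-wise mean of the fixture vectors for "app", "ppl" and "pl". A minimal NumPy sketch of that arithmetic (values copied from the ngrams_vectors fixture; nothing here is part of the spaCy API):

import numpy

# n-gram vectors from the ngrams_vectors fixture above
app = numpy.array([-0.1, -0.2, -0.3], dtype='f')
ppl = numpy.array([-0.2, -0.3, -0.4], dtype='f')
pl = numpy.array([0.7, 0.8, 0.9], dtype='f')

# vocab.get_vector("applpie", 1, 6) is expected to return the average of the
# n-gram vectors present in the vocab, i.e. roughly [0.133, 0.1, 0.067]
expected = (app + ppl + pl) / 3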

spacy/vocab.pyx

@@ -309,7 +309,7 @@ cdef class Vocab:
        link_vectors_to_models(self)
        return remap
    def get_vector(self, orth, minn=None, maxn=None):
"""Retrieve a vector for a word in the vocabulary. Words can be looked """Retrieve a vector for a word in the vocabulary. Words can be looked
up by string or int ID. If no vectors data is loaded, ValueError is up by string or int ID. If no vectors data is loaded, ValueError is
raised. raised.
@ -320,10 +320,42 @@ cdef class Vocab:
""" """
if isinstance(orth, basestring_): if isinstance(orth, basestring_):
orth = self.strings.add(orth) orth = self.strings.add(orth)
        word = self[orth].orth_
        if orth in self.vectors.key2row:
            return self.vectors[orth]
        # Default minn/maxn to the length of the word: with no explicit
        # limits, only the full word itself is considered as an n-gram,
        # which preserves the previous behaviour of returning a zero
        # vector for words that have no vector of their own.
        if minn is None:
            minn = len(word)
        if maxn is None:
            maxn = len(word)
        vectors = numpy.zeros((self.vectors_length,), dtype='f')
        # Character n-gram scheme adapted from fastText
        # (https://github.com/facebookresearch/fastText). The C++ original
        # iterates over UTF-8 bytes; here the word is a unicode string, so
        # each code point is treated as one character.
        ngrams_size = 0
        for i in range(len(word)):
            ngram = ""
            n = 1
            j = i
            while j < len(word) and n <= maxn:
                ngram += word[j]
                j += 1
                # Keep n-grams of length >= minn, but skip the single
                # characters at the start and end of the word (mirroring
                # fastText, where those positions hold the begin/end-of-word
                # markers).
                if n >= minn and not (n == 1 and (i == 0 or j == len(word))):
                    if self.strings[ngram] in self.vectors.key2row:
                        vectors = numpy.add(self.vectors[self.strings[ngram]], vectors)
                        ngrams_size += 1
                n += 1
        # Return the average of the n-gram vectors that were found.
        if ngrams_size > 0:
            vectors = vectors * (1.0 / ngrams_size)
        return vectors
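To make the loop above easier to follow, here is a standalone sketch of the same n-gram enumeration (the char_ngrams helper is hypothetical, not part of the Vocab API); it lists the substrings that get_vector looks up for a given minn/maxn:

def char_ngrams(word, minn, maxn):
    # Enumerate every substring of length minn..maxn, skipping the single
    # characters at the start and end of the word, matching the boundary
    # rule in the loop above.
    ngrams = []
    for i in range(len(word)):
        for n in range(1, maxn + 1):
            j = i + n
            if j > len(word):
                break
            if n >= minn and not (n == 1 and (i == 0 or j == len(word))):
                ngrams.append(word[i:j])
    return ngrams

# char_ngrams("applpie", 1, 6) includes "app", "ppl" and "pl", which is why
# the subword test above averages exactly those three fixture vectors.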
    def set_vector(self, orth, vector):
        """Set a vector for a word in the vocabulary. Words can be referenced