Merge pull request #2247 from skrcode/1480

1480 - Implement Fast-Text vectors with subword features
This commit is contained in:
Matthew Honnibal 2018-05-21 01:16:21 +02:00 committed by GitHub
commit b096b22c20
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 59 additions and 3 deletions

View File

@ -23,6 +23,18 @@ def vectors():
('juice', [5, 5, 10]), ('juice', [5, 5, 10]),
('pie', [7, 6.3, 8.9])] ('pie', [7, 6.3, 8.9])]
@pytest.fixture
def ngrams_vectors():
    """Vector table for the sub-word tests.

    The first entry is the full word 'apple'; the remaining entries are
    character n-grams ('app', 'ppl', 'pl'). Tests rely on this ordering when
    indexing into the returned list.
    """
    whole_word = ('apple', [1, 2, 3])
    subwords = [
        ('app', [-0.1, -0.2, -0.3]),
        ('ppl', [-0.2, -0.3, -0.4]),
        ('pl', [0.7, 0.8, 0.9]),
    ]
    return [whole_word] + subwords
@pytest.fixture
def ngrams_vocab(en_vocab, ngrams_vectors):
    """Return `en_vocab` after loading the `ngrams_vectors` table into it."""
    # NOTE(review): the same `en_vocab` object is handed back after
    # `add_vecs_to_vocab` runs on it, so it is presumably modified in place --
    # confirm the scope of `en_vocab` if test isolation matters.
    add_vecs_to_vocab(en_vocab, ngrams_vectors)
    return en_vocab
@pytest.fixture @pytest.fixture
def data(): def data():
@ -105,6 +117,18 @@ def test_vectors_token_vector(tokenizer_v, vectors, text):
assert vectors[1] == (doc[2].text, list(doc[2].vector)) assert vectors[1] == (doc[2].text, list(doc[2].vector))
@pytest.mark.parametrize('text', ["apple"])
def test_vectors__ngrams_word(ngrams_vocab, ngrams_vectors, text):
    """A word that has its own row in the table gets exactly that vector."""
    # `ngrams_vectors` is requested as a fixture argument rather than called
    # as a plain function: pytest does not allow fixture functions to be
    # invoked directly from test code.
    expected = list(ngrams_vectors[0][1])
    assert list(ngrams_vocab.get_vector(text)) == expected
@pytest.mark.parametrize('text', ["applpie"])
def test_vectors__ngrams_subword(ngrams_vocab, ngrams_vectors, text):
    """A word without its own row gets the mean of its known n-gram vectors.

    'applpie' is not in the table, but its n-grams 'app', 'ppl' and 'pl' are
    (entries 1-3 of `ngrams_vectors`), so with n-gram lengths 1..6 the
    expected result is the element-wise average of those three vectors.
    """
    # `ngrams_vectors` is requested as a fixture argument rather than called
    # as a plain function: pytest does not allow fixture functions to be
    # invoked directly from test code.
    truth = list(ngrams_vocab.get_vector(text, 1, 6))
    subword_rows = [vector for _, vector in ngrams_vectors[1:]]
    expected = [sum(dims) / len(subword_rows) for dims in zip(*subword_rows)]
    # Guard against a silently truncated comparison in the loop below.
    assert len(truth) == len(expected)
    for actual, wanted in zip(truth, expected):
        assert abs(actual - wanted) < 1e-6
@pytest.mark.parametrize('text', ["apple", "orange"]) @pytest.mark.parametrize('text', ["apple", "orange"])
def test_vectors_lexeme_vector(vocab, text): def test_vectors_lexeme_vector(vocab, text):
lex = vocab[text] lex = vocab[text]

View File

@ -309,7 +309,7 @@ cdef class Vocab:
link_vectors_to_models(self) link_vectors_to_models(self)
return remap return remap
def get_vector(self, orth, minn=None, maxn=None):
    """Retrieve a vector for a word in the vocabulary. Words can be looked
    up by string or int ID. If the word has no row of its own in the vectors
    table, the result is the average of the vectors of those character
    n-grams of the word that do have a row (fastText-style subword
    features), or an all-zero vector if none of them match.

    orth (int / unicode): The ID of a word, or its string.
    minn (int): Minimum n-gram length considered by the subword fallback.
        Defaults to the length of the word.
    maxn (int): Maximum n-gram length considered by the subword fallback.
        Defaults to the length of the word.
    RETURNS (numpy.ndarray): The word's row from `self.vectors` if present;
        otherwise a float array of shape `(self.vectors_length,)`.

    With the default `minn`/`maxn` the only candidate n-gram is the whole
    word, which has already missed the exact lookup, so the fallback yields
    zeros -- the same result as before n-gram support was added.
    """
    # NOTE(review): the earlier docstring said a ValueError is raised when no
    # vectors data is loaded; nothing in this method raises it explicitly --
    # confirm whether that still holds.
    if isinstance(orth, basestring_):
        orth = self.strings.add(orth)
    if orth in self.vectors.key2row:
        return self.vectors[orth]
    # Only the fallback path needs the surface string, so fetch it after the
    # exact-match early return.
    word = self[orth].orth_
    length = len(word)
    if minn is None:
        minn = length
    if maxn is None:
        maxn = length
    total = numpy.zeros((self.vectors_length,), dtype='f')
    n_found = 0
    # N-gram enumeration adapted from fastText
    # (https://github.com/facebookresearch/fastText). fastText walks UTF-8
    # bytes and skips continuation bytes with `(c & 0xC0) == 0x80`; `word`
    # is indexed by character here, so that check is not needed (the earlier
    # port spelled it with `and`, which made it a no-op anyway).
    for start in range(length):
        longest = min(maxn, length - start)
        for n in range(max(minn, 1), longest + 1):
            end = start + n
            # As in the fastText loop, a single character at the very start
            # or very end of the word is not used as an n-gram.
            if n == 1 and (start == 0 or end == length):
                continue
            key = self.strings[word[start:end]]
            if key in self.vectors.key2row:
                total = numpy.add(self.vectors[key], total)
                n_found += 1
    if n_found > 0:
        total = total * (1.0 / n_found)
    return total
def set_vector(self, orth, vector): def set_vector(self, orth, vector):
"""Set a vector for a word in the vocabulary. Words can be referenced """Set a vector for a word in the vocabulary. Words can be referenced