mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Merge pull request #2247 from skrcode/1480
1480 - Implement Fast-Text vectors with subword features
This commit is contained in:
commit
b096b22c20
|
@ -23,6 +23,18 @@ def vectors():
|
||||||
('juice', [5, 5, 10]),
|
('juice', [5, 5, 10]),
|
||||||
('pie', [7, 6.3, 8.9])]
|
('pie', [7, 6.3, 8.9])]
|
||||||
|
|
||||||
|
@pytest.fixture
def ngrams_vectors():
    """Provide (string, vector) pairs for the n-gram vector tests.

    The first entry is a whole word; the remaining three are character
    n-grams, each paired with a 3-dimensional vector.
    """
    whole_word = ("apple", [1, 2, 3])
    subword_entries = [
        ("app", [-0.1, -0.2, -0.3]),
        ("ppl", [-0.2, -0.3, -0.4]),
        ("pl", [0.7, 0.8, 0.9]),
    ]
    return [whole_word] + subword_entries
|
||||||
|
@pytest.fixture()
def ngrams_vocab(en_vocab, ngrams_vectors):
    """Return the English vocab after the n-gram test vectors were added."""
    vocab = en_vocab
    # add_vecs_to_vocab is called for its effect on the vocab; its return
    # value is not used.
    add_vecs_to_vocab(vocab, ngrams_vectors)
    return vocab
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def data():
|
def data():
|
||||||
|
@ -105,6 +117,18 @@ def test_vectors_token_vector(tokenizer_v, vectors, text):
|
||||||
assert vectors[1] == (doc[2].text, list(doc[2].vector))
|
assert vectors[1] == (doc[2].text, list(doc[2].vector))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["apple"])
def test_vectors__ngrams_word(ngrams_vocab, ngrams_vectors, text):
    """A word that has its own vector must get exactly that vector back.

    The expected data is requested through the ``ngrams_vectors`` fixture
    argument rather than by calling the fixture function directly, which
    pytest does not support.
    """
    expected_word, expected_vector = ngrams_vectors[0]
    assert list(ngrams_vocab.get_vector(text)) == list(expected_vector)
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["applpie"])
def test_vectors__ngrams_subword(ngrams_vocab, ngrams_vectors, text):
    """An unknown word must get the mean of its known n-gram vectors.

    With n-gram lengths 1 to 6, the test expects the vector of the word to
    equal the element-wise mean of the three subword entries of the
    ``ngrams_vectors`` fixture (entries 1 to 3).
    """
    truth = list(ngrams_vocab.get_vector(text, 1, 6))
    # Entry 0 is the whole word; entries 1-3 are the subword n-grams.
    subword_vectors = [vector for _, vector in ngrams_vectors[1:4]]
    expected = [sum(dims) / len(subword_vectors)
                for dims in zip(*subword_vectors)]
    # Guard against zip() silently hiding a dimensionality mismatch.
    assert len(truth) == len(expected)
    for got, want in zip(truth, expected):
        assert abs(got - want) < 1e-6
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["apple", "orange"])
|
@pytest.mark.parametrize('text', ["apple", "orange"])
|
||||||
def test_vectors_lexeme_vector(vocab, text):
|
def test_vectors_lexeme_vector(vocab, text):
|
||||||
lex = vocab[text]
|
lex = vocab[text]
|
||||||
|
|
|
@ -309,7 +309,7 @@ cdef class Vocab:
|
||||||
link_vectors_to_models(self)
|
link_vectors_to_models(self)
|
||||||
return remap
|
return remap
|
||||||
|
|
||||||
def get_vector(self, orth, minn=None, maxn=None):
    """Retrieve a vector for a word in the vocabulary. Words can be looked
    up by string or int ID. If no vectors data is loaded, ValueError is
    raised.

    If the word has an entry in the vectors table, that vector is returned
    unchanged. Otherwise a vector is composed FastText-style: every
    character n-gram of the word whose length lies in [minn, maxn] is looked
    up in the vectors table, and the vectors that are found are averaged.
    Single characters at the very start or end of the word are not used as
    n-grams. If no n-gram is found, a zero vector is returned.

    orth (int / unicode): The hash value of a word, or its unicode string.
    minn (int): Minimum n-gram length used for the subword fallback.
        Defaults to the length of the word.
    maxn (int): Maximum n-gram length used for the subword fallback.
        Defaults to the length of the word.
    RETURNS (numpy.ndarray): The stored vector for the word, or a float
        vector of length `self.vectors_length` built from its n-grams.
    """
    if isinstance(orth, basestring_):
        orth = self.strings.add(orth)
    word = self[orth].orth_
    if orth in self.vectors.key2row:
        return self.vectors[orth]

    # By default only the full word counts as an n-gram, so without
    # explicit limits an unknown word resolves to the zero vector.
    length = len(word)
    if minn is None:
        minn = length
    if maxn is None:
        maxn = length
    vectors = numpy.zeros((self.vectors_length,), dtype='f')

    # N-gram enumeration modelled on fastText
    # (https://github.com/facebookresearch/fastText). Python strings are
    # indexed by code point, so no UTF-8 continuation-byte handling is
    # needed here.
    ngrams_size = 0
    for start in range(length):
        longest_end = min(start + maxn, length)
        for end in range(start + 1, longest_end + 1):
            n = end - start
            if n < minn:
                continue
            # Skip single characters sitting on a word boundary.
            if n == 1 and (start == 0 or end == length):
                continue
            key = self.strings[word[start:end]]
            if key in self.vectors.key2row:
                vectors = numpy.add(self.vectors[key], vectors)
                ngrams_size += 1
    if ngrams_size > 0:
        vectors = vectors * (1.0 / ngrams_size)
    return vectors
|
||||||
|
|
||||||
def set_vector(self, orth, vector):
|
def set_vector(self, orth, vector):
|
||||||
"""Set a vector for a word in the vocabulary. Words can be referenced
|
"""Set a vector for a word in the vocabulary. Words can be referenced
|
||||||
|
|
Loading…
Reference in New Issue
Block a user