Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2018-05-21 17:42:55 +02:00
commit d5af38f80c
12 changed files with 117 additions and 27 deletions

View File

@@ -88,11 +88,11 @@ def symlink_to(orig, dest):
 def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
-    return ((python2 is None or python2 == is_python2) and
-            (python3 is None or python3 == is_python3) and
-            (windows is None or windows == is_windows) and
-            (linux is None or linux == is_linux) and
-            (osx is None or osx == is_osx))
+    return (python2 in (None, is_python2) and
+            python3 in (None, is_python3) and
+            windows in (None, is_windows) and
+            linux in (None, is_linux) and
+            osx in (None, is_osx))


 def normalize_string_keys(old):
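The rewrite above replaces each `x is None or x == flag` clause with a membership test against a two-element tuple. A minimal sketch of the equivalence, with an illustrative flag rather than spaCy's real ones:

    # For plain boolean flags, the two forms agree for every input:
    flag = True
    for x in (None, True, False):
        assert (x is None or x == flag) == (x in (None, flag))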

View File

@@ -38,6 +38,14 @@ class Warnings(object):
             "surprising to you, make sure the Doc was processed using a model "
             "that supports named entity recognition, and check the `doc.ents` "
             "property manually if necessary.")
+    W007 = ("The model you're using has no word vectors loaded, so the result "
+            "of the {obj}.similarity method will be based on the tagger, "
+            "parser and NER, which may not give useful similarity judgements. "
+            "This may happen if you're using one of the small models, e.g. "
+            "`en_core_web_sm`, which don't ship with word vectors and only "
+            "use context-sensitive tensors. You can always add your own word "
+            "vectors, or use one of the larger models instead if available.")
+    W008 = ("Evaluating {obj}.similarity based on empty vectors.")


 @add_codes
@@ -286,8 +294,15 @@ def _get_warn_types(arg):
             if w_type.strip() in WARNINGS]


+def _get_warn_excl(arg):
+    if not arg:
+        return []
+    return [w_id.strip() for w_id in arg.split(',')]
+
+
 SPACY_WARNING_FILTER = os.environ.get('SPACY_WARNING_FILTER', 'always')
 SPACY_WARNING_TYPES = _get_warn_types(os.environ.get('SPACY_WARNING_TYPES'))
+SPACY_WARNING_IGNORE = _get_warn_excl(os.environ.get('SPACY_WARNING_IGNORE'))


 def user_warning(message):
@@ -307,7 +322,8 @@ def _warn(message, warn_type='user'):
     message (unicode): The message to display.
     category (Warning): The Warning to show.
     """
-    if warn_type in SPACY_WARNING_TYPES:
+    w_id = message.split('[', 1)[1].split(']', 1)[0]  # get warning ID from string
+    if warn_type in SPACY_WARNING_TYPES and w_id not in SPACY_WARNING_IGNORE:
         category = WARNINGS[warn_type]
         stack = inspect.stack()[-1]
         with warnings.catch_warnings():
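The three environment variables above gate which warnings spaCy emits. A brief usage sketch for the new ignore list; this assumes the warning ID (e.g. `[W008]`) is embedded in the message text, which is what the ID parsing in `_warn` relies on:

    import os
    # Suppress specific warnings by ID, comma-separated. Set this before
    # spaCy is imported, because the filters are read at import time.
    os.environ['SPACY_WARNING_IGNORE'] = 'W007,W008'

    import spacy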

View File

@@ -15,7 +15,7 @@ from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_CURRENCY, IS_OOV
 from .attrs cimport PROB
 from .attrs import intify_attrs
-from .errors import Errors
+from .errors import Errors, Warnings, user_warning


 memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
@@ -122,6 +122,7 @@ cdef class Lexeme:
         if self.c.orth == other[0].orth:
             return 1.0
         if self.vector_norm == 0 or other.vector_norm == 0:
+            user_warning(Warnings.W008.format(obj='Lexeme'))
             return 0.0
         return (numpy.dot(self.vector, other.vector) /
                 (self.vector_norm * other.vector_norm))
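The return expression is plain cosine similarity, with zero-norm vectors short-circuited to 0.0 (an all-zero vector has no direction). A self-contained numpy sketch of the same computation:

    import numpy

    def cosine(u, v):
        # Mirrors the guard above: zero/empty vectors get similarity 0.0.
        norm_u = numpy.linalg.norm(u)
        norm_v = numpy.linalg.norm(v)
        if norm_u == 0 or norm_v == 0:
            return 0.0
        return numpy.dot(u, v) / (norm_u * norm_v)

    print(cosine(numpy.array([1., 0.]), numpy.array([0., 1.])))  # 0.0, orthogonal
    print(cosine(numpy.zeros(3), numpy.ones(3)))                 # 0.0, guard fires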

View File

@@ -253,11 +253,13 @@ def test_doc_api_has_vector():

 def test_doc_api_similarity_match():
     doc = Doc(Vocab(), words=['a'])
-    assert doc.similarity(doc[0]) == 1.0
-    assert doc.similarity(doc.vocab['a']) == 1.0
+    with pytest.warns(None):
+        assert doc.similarity(doc[0]) == 1.0
+        assert doc.similarity(doc.vocab['a']) == 1.0
     doc2 = Doc(doc.vocab, words=['a', 'b', 'c'])
-    assert doc.similarity(doc2[:1]) == 1.0
-    assert doc.similarity(doc2) == 0.0
+    with pytest.warns(None):
+        assert doc.similarity(doc2[:1]) == 1.0
+        assert doc.similarity(doc2) == 0.0


 def test_lowest_common_ancestor(en_tokenizer):
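This and the following test changes wrap the similarity calls in `pytest.warns(None)`, which records any warnings instead of letting them escape (useful when warning filters turn warnings into errors). A minimal standalone illustration of the pattern, with a hypothetical test name:

    import warnings
    import pytest

    def test_pattern():
        # pytest.warns(None) captures whatever warnings occur, if any,
        # without failing when none are raised.
        with pytest.warns(None) as record:
            warnings.warn('something advisory', UserWarning)
        assert len(record) == 1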

View File

@@ -88,9 +88,10 @@ def test_span_similarity_match():
     doc = Doc(Vocab(), words=['a', 'b', 'a', 'b'])
     span1 = doc[:2]
     span2 = doc[2:]
-    assert span1.similarity(span2) == 1.0
-    assert span1.similarity(doc) == 0.0
-    assert span1[:1].similarity(doc.vocab['a']) == 1.0
+    with pytest.warns(None):
+        assert span1.similarity(span2) == 1.0
+        assert span1.similarity(doc) == 0.0
+        assert span1[:1].similarity(doc.vocab['a']) == 1.0


 def test_spans_default_sentiment(en_tokenizer):

View File

@@ -45,7 +45,8 @@ def test_vectors_similarity_TT(vocab, vectors):
 def test_vectors_similarity_TD(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = get_doc(vocab, words=[word1, word2])
-    assert doc.similarity(doc[0]) == doc[0].similarity(doc)
+    with pytest.warns(None):
+        assert doc.similarity(doc[0]) == doc[0].similarity(doc)


 def test_vectors_similarity_DS(vocab, vectors):
@@ -57,4 +58,5 @@ def test_vectors_similarity_TS(vocab, vectors):
 def test_vectors_similarity_TS(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = get_doc(vocab, words=[word1, word2])
-    assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])
+    with pytest.warns(None):
+        assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])

View File

@@ -23,6 +23,18 @@ def vectors():
             ('juice', [5, 5, 10]),
             ('pie', [7, 6.3, 8.9])]


+@pytest.fixture
+def ngrams_vectors():
+    return [
+        ('apple', [1, 2, 3]),
+        ('app', [-0.1, -0.2, -0.3]),
+        ('ppl', [-0.2, -0.3, -0.4]),
+        ('pl', [0.7, 0.8, 0.9])
+    ]
+
+
+@pytest.fixture()
+def ngrams_vocab(en_vocab, ngrams_vectors):
+    add_vecs_to_vocab(en_vocab, ngrams_vectors)
+    return en_vocab


 @pytest.fixture
 def data():
@@ -105,6 +117,18 @@ def test_vectors_token_vector(tokenizer_v, vectors, text):
     assert vectors[1] == (doc[2].text, list(doc[2].vector))


+@pytest.mark.parametrize('text', ["apple"])
+def test_vectors__ngrams_word(ngrams_vocab, text):
+    assert list(ngrams_vocab.get_vector(text)) == list(ngrams_vectors()[0][1])
+
+
+@pytest.mark.parametrize('text', ["applpie"])
+def test_vectors__ngrams_subword(ngrams_vocab, text):
+    truth = list(ngrams_vocab.get_vector(text, 1, 6))
+    test = [(ngrams_vectors()[1][1][i] +
+             ngrams_vectors()[2][1][i] +
+             ngrams_vectors()[3][1][i]) / 3
+            for i in range(len(ngrams_vectors()[1][1]))]
+    eps = [abs(truth[i] - test[i]) for i in range(len(truth))]
+    for i in eps:
+        assert i < 1e-6
+
+
 @pytest.mark.parametrize('text', ["apple", "orange"])
 def test_vectors_lexeme_vector(vocab, text):
     lex = vocab[text]
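In the subword test above, the only character ngrams of "applpie" with vectors in the fixture are 'app', 'ppl' and 'pl' ('apple' itself is not a substring), so the expected result is their elementwise mean. Worked out from the fixture values:

    # app = [-0.1, -0.2, -0.3], ppl = [-0.2, -0.3, -0.4], pl = [0.7, 0.8, 0.9]
    expected = [(-0.1 - 0.2 + 0.7) / 3,   # ~0.1333
                (-0.2 - 0.3 + 0.8) / 3,   # 0.1
                (-0.3 - 0.4 + 0.9) / 3]   # ~0.0667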
@@ -182,15 +206,17 @@ def test_vectors_lexeme_doc_similarity(vocab, text):
 @pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
 def test_vectors_span_span_similarity(vocab, text):
     doc = get_doc(vocab, text)
-    assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
-    assert -1. < doc[0:2].similarity(doc[1:3]) < 1.0
+    with pytest.warns(None):
+        assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
+        assert -1. < doc[0:2].similarity(doc[1:3]) < 1.0


 @pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
 def test_vectors_span_doc_similarity(vocab, text):
     doc = get_doc(vocab, text)
-    assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
-    assert -1. < doc[0:2].similarity(doc) < 1.0
+    with pytest.warns(None):
+        assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
+        assert -1. < doc[0:2].similarity(doc) < 1.0


 @pytest.mark.parametrize('text1,text2', [

View File

@@ -31,7 +31,8 @@ from ..attrs cimport ENT_TYPE, SENT_START
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
 from ..util import normalize_slice
 from ..compat import is_config, copy_reg, pickle, basestring_
-from ..errors import Errors, Warnings, deprecation_warning
+from ..errors import deprecation_warning, models_warning, user_warning
+from ..errors import Errors, Warnings
 from .. import util
 from .underscore import Underscore, get_ext_args
 from ._retokenize import Retokenizer
@@ -318,8 +319,10 @@ cdef class Doc:
                 break
         else:
             return 1.0
+        if self.vocab.vectors.n_keys == 0:
+            models_warning(Warnings.W007.format(obj='Doc'))
         if self.vector_norm == 0 or other.vector_norm == 0:
+            user_warning(Warnings.W008.format(obj='Doc'))
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
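Together the two checks distinguish "the model has no word vectors at all" (W007, a models warning) from "this particular input has an empty vector" (W008, a user warning). A hedged sketch of how W007 surfaces in practice, assuming the small English model named in the warning text is installed:

    import spacy

    nlp = spacy.load('en_core_web_sm')  # small model: ships without word vectors
    doc1 = nlp(u'apple')
    doc2 = nlp(u'orange')
    # vocab.vectors.n_keys == 0 here, so this emits W007: the score comes
    # from context-sensitive tensors rather than word vectors.
    print(doc1.similarity(doc2))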

View File

@@ -16,7 +16,7 @@ from ..util import normalize_slice
 from ..attrs cimport IS_PUNCT, IS_SPACE
 from ..lexeme cimport Lexeme
 from ..compat import is_config
-from ..errors import Errors, TempErrors
+from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning
 from .underscore import Underscore, get_ext_args
@@ -200,7 +200,10 @@ cdef class Span:
                 break
         else:
             return 1.0
+        if self.vocab.vectors.n_keys == 0:
+            models_warning(Warnings.W007.format(obj='Span'))
         if self.vector_norm == 0.0 or other.vector_norm == 0.0:
+            user_warning(Warnings.W008.format(obj='Span'))
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)

View File

@@ -19,7 +19,7 @@ from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM
 from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
 from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
 from ..compat import is_config
-from ..errors import Errors
+from ..errors import Errors, Warnings, user_warning, models_warning
 from .. import util
 from .underscore import Underscore, get_ext_args
@@ -161,7 +161,10 @@ cdef class Token:
         elif hasattr(other, 'orth'):
             if self.c.lex.orth == other.orth:
                 return 1.0
+        if self.vocab.vectors.n_keys == 0:
+            models_warning(Warnings.W007.format(obj='Token'))
         if self.vector_norm == 0 or other.vector_norm == 0:
+            user_warning(Warnings.W008.format(obj='Token'))
             return 0.0
         return (numpy.dot(self.vector, other.vector) /
                 (self.vector_norm * other.vector_norm))

View File

@@ -309,7 +309,7 @@ cdef class Vocab:
         link_vectors_to_models(self)
         return remap

-    def get_vector(self, orth):
+    def get_vector(self, orth, minn=None, maxn=None):
         """Retrieve a vector for a word in the vocabulary. Words can be looked
         up by string or int ID. If no vectors data is loaded, ValueError is
         raised.
@@ -320,10 +320,42 @@
         """
         if isinstance(orth, basestring_):
             orth = self.strings.add(orth)
+        word = self[orth].orth_
         if orth in self.vectors.key2row:
             return self.vectors[orth]
-        else:
-            return numpy.zeros((self.vectors_length,), dtype='f')
+        # Assign default ngram limits: minn and maxn default to the word length.
+        if minn is None:
+            minn = len(word)
+        if maxn is None:
+            maxn = len(word)
+        vectors = numpy.zeros((self.vectors_length,), dtype='f')
+        # Character-ngram scheme ported from fastText:
+        # https://github.com/facebookresearch/fastText
+        ngrams_size = 0
+        for i in range(len(word)):
+            ngram = ""
+            # Skip UTF-8 continuation bytes when scanning the word.
+            if (ord(word[i]) & 0xC0) == 0x80:
+                continue
+            n = 1
+            j = i
+            while j < len(word) and n <= maxn:
+                ngram += word[j]
+                j = j + 1
+                while j < len(word) and (ord(word[j]) & 0xC0) == 0x80:
+                    ngram += word[j]
+                    j = j + 1
+                # Use ngrams of length minn..maxn, excluding single characters
+                # at the start or end of the word.
+                if n >= minn and not (n == 1 and (i == 0 or j == len(word))):
+                    if self.strings[ngram] in self.vectors.key2row:
+                        vectors = numpy.add(self.vectors[self.strings[ngram]], vectors)
+                        ngrams_size += 1
+                n = n + 1
+        if ngrams_size > 0:
+            # Average the vectors of all ngrams found in the table.
+            vectors = vectors * (1.0 / ngrams_size)
+        return vectors

     def set_vector(self, orth, vector):
         """Set a vector for a word in the vocabulary. Words can be referenced

View File

@@ -47,6 +47,7 @@ p
     +row
         +cell other
+        +tag-new(2.1)
         +cell -
         +cell
             | Additional installation options to be passed to