Mirror of https://github.com/explosion/spaCy.git (synced 2024-11-10 19:57:17 +03:00)

Commit d5af38f80c: Merge branch 'develop' of https://github.com/explosion/spaCy into develop
@@ -88,11 +88,11 @@ def symlink_to(orig, dest):
 
 
 def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
-    return ((python2 is None or python2 == is_python2) and
-            (python3 is None or python3 == is_python3) and
-            (windows is None or windows == is_windows) and
-            (linux is None or linux == is_linux) and
-            (osx is None or osx == is_osx))
+    return (python2 in (None, is_python2) and
+            python3 in (None, is_python3) and
+            windows in (None, is_windows) and
+            linux in (None, is_linux) and
+            osx in (None, is_osx))
 
 
 def normalize_string_keys(old):
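Note: the following is a brief usage sketch, not part of the diff, assuming the module-level flags is_python2, is_python3, is_windows, is_linux and is_osx in spacy.compat are plain booleans describing the running platform:

    from spacy.compat import is_config

    # Arguments left as None are "don't care"; for the remaining ones,
    # `python3 in (None, is_python3)` is just a terser spelling of
    # `python3 is None or python3 == is_python3`.
    if is_config(python3=True, windows=True):
        print("running Python 3 on Windows")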
@@ -38,6 +38,14 @@ class Warnings(object):
             "surprising to you, make sure the Doc was processed using a model "
             "that supports named entity recognition, and check the `doc.ents` "
             "property manually if necessary.")
+    W007 = ("The model you're using has no word vectors loaded, so the result "
+            "of the {obj}.similarity method will be based on the tagger, "
+            "parser and NER, which may not give useful similarity judgements. "
+            "This may happen if you're using one of the small models, e.g. "
+            "`en_core_web_sm`, which don't ship with word vectors and only "
+            "use context-sensitive tensors. You can always add your own word "
+            "vectors, or use one of the larger models instead if available.")
+    W008 = ("Evaluating {obj}.similarity based on empty vectors.")
 
 
 @add_codes
@@ -286,8 +294,15 @@ def _get_warn_types(arg):
             if w_type.strip() in WARNINGS]
 
 
+def _get_warn_excl(arg):
+    if not arg:
+        return []
+    return [w_id.strip() for w_id in arg.split(',')]
+
+
 SPACY_WARNING_FILTER = os.environ.get('SPACY_WARNING_FILTER', 'always')
 SPACY_WARNING_TYPES = _get_warn_types(os.environ.get('SPACY_WARNING_TYPES'))
+SPACY_WARNING_IGNORE = _get_warn_excl(os.environ.get('SPACY_WARNING_IGNORE'))
 
 
 def user_warning(message):
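Note: a small sketch of how the new SPACY_WARNING_IGNORE variable would be used; not part of the diff. Because the value is read at import time of the errors module, it has to be set before spaCy is imported:

    import os
    # Comma-separated warning IDs to silence; split and stripped by _get_warn_excl().
    os.environ['SPACY_WARNING_IGNORE'] = 'W007,W008'

    import spacy  # W007/W008 emitted after this point are now skipped

Or from the shell: SPACY_WARNING_IGNORE="W007,W008" python my_script.py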
@@ -307,7 +322,8 @@ def _warn(message, warn_type='user'):
     message (unicode): The message to display.
     category (Warning): The Warning to show.
     """
-    if warn_type in SPACY_WARNING_TYPES:
+    w_id = message.split('[', 1)[1].split(']', 1)[0]  # get ID from string
+    if warn_type in SPACY_WARNING_TYPES and w_id not in SPACY_WARNING_IGNORE:
         category = WARNINGS[warn_type]
         stack = inspect.stack()[-1]
         with warnings.catch_warnings():
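Note: a minimal sketch of the ID extraction above, not part of the diff, assuming warning messages carry their code in square brackets (the form produced by the @add_codes decorator, e.g. "[W008] ..."):

    message = "[W008] Evaluating Doc.similarity based on empty vectors."
    w_id = message.split('[', 1)[1].split(']', 1)[0]
    assert w_id == 'W008'
    # _warn() then drops the warning if w_id appears in SPACY_WARNING_IGNORE.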
@@ -15,7 +15,7 @@ from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_CURRENCY, IS_OOV
 from .attrs cimport PROB
 from .attrs import intify_attrs
-from .errors import Errors
+from .errors import Errors, Warnings, user_warning
 
 
 memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
@@ -122,6 +122,7 @@ cdef class Lexeme:
             if self.c.orth == other[0].orth:
                 return 1.0
         if self.vector_norm == 0 or other.vector_norm == 0:
+            user_warning(Warnings.W008.format(obj='Lexeme'))
             return 0.0
         return (numpy.dot(self.vector, other.vector) /
                 (self.vector_norm * other.vector_norm))
@@ -253,11 +253,13 @@ def test_doc_api_has_vector():
 
 
 def test_doc_api_similarity_match():
     doc = Doc(Vocab(), words=['a'])
-    assert doc.similarity(doc[0]) == 1.0
-    assert doc.similarity(doc.vocab['a']) == 1.0
+    with pytest.warns(None):
+        assert doc.similarity(doc[0]) == 1.0
+        assert doc.similarity(doc.vocab['a']) == 1.0
     doc2 = Doc(doc.vocab, words=['a', 'b', 'c'])
-    assert doc.similarity(doc2[:1]) == 1.0
-    assert doc.similarity(doc2) == 0.0
+    with pytest.warns(None):
+        assert doc.similarity(doc2[:1]) == 1.0
+        assert doc.similarity(doc2) == 0.0
 
 
 def test_lowest_common_ancestor(en_tokenizer):
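Note: not part of the diff. In the pytest versions current at the time, pytest.warns(None) records any warnings raised inside the block without failing when none are raised, so the assertions keep passing while the new W007/W008 warnings are captured instead of spilling into the test output. A minimal sketch:

    import warnings
    import pytest

    def test_sketch():
        with pytest.warns(None) as record:
            warnings.warn("something harmless")
        # Recorded warnings stay inspectable; an empty block would also pass.
        assert len(record) == 1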
@@ -88,9 +88,10 @@ def test_span_similarity_match():
     doc = Doc(Vocab(), words=['a', 'b', 'a', 'b'])
     span1 = doc[:2]
     span2 = doc[2:]
-    assert span1.similarity(span2) == 1.0
-    assert span1.similarity(doc) == 0.0
-    assert span1[:1].similarity(doc.vocab['a']) == 1.0
+    with pytest.warns(None):
+        assert span1.similarity(span2) == 1.0
+        assert span1.similarity(doc) == 0.0
+        assert span1[:1].similarity(doc.vocab['a']) == 1.0
 
 
 def test_spans_default_sentiment(en_tokenizer):
@@ -45,7 +45,8 @@ def test_vectors_similarity_TT(vocab, vectors):
 def test_vectors_similarity_TD(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = get_doc(vocab, words=[word1, word2])
-    assert doc.similarity(doc[0]) == doc[0].similarity(doc)
+    with pytest.warns(None):
+        assert doc.similarity(doc[0]) == doc[0].similarity(doc)
 
 
 def test_vectors_similarity_DS(vocab, vectors):
@@ -57,4 +58,5 @@ def test_vectors_similarity_DS(vocab, vectors):
 def test_vectors_similarity_TS(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = get_doc(vocab, words=[word1, word2])
-    assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])
+    with pytest.warns(None):
+        assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])
@@ -23,6 +23,18 @@ def vectors():
             ('juice', [5, 5, 10]),
             ('pie', [7, 6.3, 8.9])]
 
 
+@pytest.fixture
+def ngrams_vectors():
+    return [
+        ("apple", [1, 2, 3]),
+        ("app", [-0.1, -0.2, -0.3]),
+        ('ppl', [-0.2, -0.3, -0.4]),
+        ('pl', [0.7, 0.8, 0.9])
+    ]
+
+
+@pytest.fixture()
+def ngrams_vocab(en_vocab, ngrams_vectors):
+    add_vecs_to_vocab(en_vocab, ngrams_vectors)
+    return en_vocab
+
+
 @pytest.fixture
 def data():
@@ -105,6 +117,18 @@ def test_vectors_token_vector(tokenizer_v, vectors, text):
     assert vectors[1] == (doc[2].text, list(doc[2].vector))
 
 
+@pytest.mark.parametrize('text', ["apple"])
+def test_vectors__ngrams_word(ngrams_vocab, text):
+    assert list(ngrams_vocab.get_vector(text)) == list(ngrams_vectors()[0][1])
+
+
+@pytest.mark.parametrize('text', ["applpie"])
+def test_vectors__ngrams_subword(ngrams_vocab, text):
+    truth = list(ngrams_vocab.get_vector(text, 1, 6))
+    test = list([(ngrams_vectors()[1][1][i] + ngrams_vectors()[2][1][i] + ngrams_vectors()[3][1][i]) / 3 for i in range(len(ngrams_vectors()[1][1]))])
+    eps = [abs(truth[i] - test[i]) for i in range(len(truth))]
+    for i in eps:
+        assert i < 1e-6
+
+
 @pytest.mark.parametrize('text', ["apple", "orange"])
 def test_vectors_lexeme_vector(vocab, text):
     lex = vocab[text]
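Note: not part of the diff; a short restatement of the arithmetic the subword test checks, using the ngrams_vectors fixture values above. "applpie" has no vector of its own, so with minn=1 and maxn=6 its stored character ngrams are averaged:

    app = [-0.1, -0.2, -0.3]
    ppl = [-0.2, -0.3, -0.4]
    pl = [0.7, 0.8, 0.9]
    expected = [(a + b + c) / 3 for a, b, c in zip(app, ppl, pl)]
    # expected is roughly [0.1333, 0.1, 0.0667], which is what
    # get_vector("applpie", 1, 6) should return within 1e-6 per element.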
@@ -182,15 +206,17 @@ def test_vectors_lexeme_doc_similarity(vocab, text):
 @pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
 def test_vectors_span_span_similarity(vocab, text):
     doc = get_doc(vocab, text)
-    assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
-    assert -1. < doc[0:2].similarity(doc[1:3]) < 1.0
+    with pytest.warns(None):
+        assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
+        assert -1. < doc[0:2].similarity(doc[1:3]) < 1.0
 
 
 @pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
 def test_vectors_span_doc_similarity(vocab, text):
     doc = get_doc(vocab, text)
-    assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
-    assert -1. < doc[0:2].similarity(doc) < 1.0
+    with pytest.warns(None):
+        assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
+        assert -1. < doc[0:2].similarity(doc) < 1.0
 
 
 @pytest.mark.parametrize('text1,text2', [
@@ -31,7 +31,8 @@ from ..attrs cimport ENT_TYPE, SENT_START
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
 from ..util import normalize_slice
 from ..compat import is_config, copy_reg, pickle, basestring_
-from ..errors import Errors, Warnings, deprecation_warning
+from ..errors import deprecation_warning, models_warning, user_warning
+from ..errors import Errors, Warnings
 from .. import util
 from .underscore import Underscore, get_ext_args
 from ._retokenize import Retokenizer
@@ -318,8 +319,10 @@ cdef class Doc:
                     break
             else:
                 return 1.0
+        if self.vocab.vectors.n_keys == 0:
+            models_warning(Warnings.W007.format(obj='Doc'))
         if self.vector_norm == 0 or other.vector_norm == 0:
+            user_warning(Warnings.W008.format(obj='Doc'))
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
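Note: not part of the diff; a sketch of how these warnings surface at runtime. en_core_web_sm is used here only because the W007 message names it as a model without word vectors:

    import warnings
    import spacy

    nlp = spacy.load('en_core_web_sm')      # small model, no word vectors loaded
    doc1, doc2 = nlp(u'apple'), nlp(u'orange')
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        doc1.similarity(doc2)
    # Expect W007 here (similarity falls back to tagger/parser/NER tensors);
    # W008 would appear instead if either side had an all-zero vector.
    print([str(w.message) for w in caught])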
@@ -16,7 +16,7 @@ from ..util import normalize_slice
 from ..attrs cimport IS_PUNCT, IS_SPACE
 from ..lexeme cimport Lexeme
 from ..compat import is_config
-from ..errors import Errors, TempErrors
+from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning
 from .underscore import Underscore, get_ext_args
 
 
@@ -200,7 +200,10 @@ cdef class Span:
                     break
             else:
                 return 1.0
+        if self.vocab.vectors.n_keys == 0:
+            models_warning(Warnings.W007.format(obj='Span'))
         if self.vector_norm == 0.0 or other.vector_norm == 0.0:
+            user_warning(Warnings.W008.format(obj='Span'))
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
@@ -19,7 +19,7 @@ from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM
 from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
 from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
 from ..compat import is_config
-from ..errors import Errors
+from ..errors import Errors, Warnings, user_warning, models_warning
 from .. import util
 from .underscore import Underscore, get_ext_args
 
@@ -161,7 +161,10 @@ cdef class Token:
         elif hasattr(other, 'orth'):
             if self.c.lex.orth == other.orth:
                 return 1.0
+        if self.vocab.vectors.n_keys == 0:
+            models_warning(Warnings.W007.format(obj='Token'))
         if self.vector_norm == 0 or other.vector_norm == 0:
+            user_warning(Warnings.W008.format(obj='Token'))
             return 0.0
         return (numpy.dot(self.vector, other.vector) /
                 (self.vector_norm * other.vector_norm))
@@ -309,7 +309,7 @@ cdef class Vocab:
         link_vectors_to_models(self)
         return remap
 
-    def get_vector(self, orth):
+    def get_vector(self, orth, minn=None, maxn=None):
         """Retrieve a vector for a word in the vocabulary. Words can be looked
         up by string or int ID. If no vectors data is loaded, ValueError is
         raised.
@@ -320,10 +320,42 @@ cdef class Vocab:
         """
         if isinstance(orth, basestring_):
             orth = self.strings.add(orth)
+        word = self[orth].orth_
         if orth in self.vectors.key2row:
             return self.vectors[orth]
-        else:
-            return numpy.zeros((self.vectors_length,), dtype='f')
+        # Assign default ngram limits to minn and maxn which is the length of the word.
+        if minn is None:
+            minn = len(word)
+        if maxn is None:
+            maxn = len(word)
+        vectors = numpy.zeros((self.vectors_length,), dtype='f')
+
+        # Fasttext's ngram computation taken from https://github.com/facebookresearch/fastText
+        ngrams_size = 0
+        for i in range(len(word)):
+            ngram = ""
+            if (word[i] and 0xC0) == 0x80:
+                continue
+            n = 1
+            j = i
+            while (j < len(word) and n <= maxn):
+                if n > maxn:
+                    break
+                ngram += word[j]
+                j = j + 1
+                while (j < len(word) and (word[j] and 0xC0) == 0x80):
+                    ngram += word[j]
+                    j = j + 1
+                if (n >= minn and not (n == 1 and (i == 0 or j == len(word)))):
+                    if self.strings[ngram] in self.vectors.key2row:
+                        vectors = numpy.add(self.vectors[self.strings[ngram]], vectors)
+                        ngrams_size += 1
+                n = n + 1
+        if ngrams_size > 0:
+            vectors = vectors * (1.0 / ngrams_size)
+
+        return vectors
 
     def set_vector(self, orth, vector):
         """Set a vector for a word in the vocabulary. Words can be referenced
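Note: not part of the diff; a rough usage sketch for the extended get_vector() signature, mirroring the ngrams_vectors test fixture above and assuming set_vector() on a fresh Vocab grows the vectors table the way the test helper add_vecs_to_vocab does:

    import numpy
    from spacy.vocab import Vocab

    vocab = Vocab()
    for word, vec in [("app", [-0.1, -0.2, -0.3]),
                      ("ppl", [-0.2, -0.3, -0.4]),
                      ("pl", [0.7, 0.8, 0.9])]:
        vocab.set_vector(word, numpy.asarray(vec, dtype="f"))

    # Exact lookup is unchanged: a word present in vectors.key2row returns
    # its stored row directly.
    print(vocab.get_vector("app"))

    # "applpie" has no vector of its own, so with minn=1, maxn=6 the vectors
    # of its stored character ngrams ("app", "ppl", "pl") are averaged,
    # FastText-style; with no minn/maxn given, an unknown word still yields zeros.
    print(vocab.get_vector("applpie", 1, 6))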
@@ -47,6 +47,7 @@ p
 
     +row
         +cell other
+        +tag-new(2.1)
        +cell -
         +cell
             | Additional installation options to be passed to