Merge branch 'develop' of https://github.com/explosion/spaCy into develop

commit d5af38f80c
				|  | @ -88,11 +88,11 @@ def symlink_to(orig, dest): | |||
| 
 | ||||
| 
 | ||||
| def is_config(python2=None, python3=None, windows=None, linux=None, osx=None): | ||||
|     return ((python2 is None or python2 == is_python2) and | ||||
|             (python3 is None or python3 == is_python3) and | ||||
|             (windows is None or windows == is_windows) and | ||||
|             (linux is None or linux == is_linux) and | ||||
|             (osx is None or osx == is_osx)) | ||||
|     return (python2 in (None, is_python2) and | ||||
|             python3 in (None, is_python3) and | ||||
|             windows in (None, is_windows) and | ||||
|             linux in (None, is_linux) and | ||||
|             osx in (None, is_osx)) | ||||
| 
 | ||||
| 
 | ||||
| def normalize_string_keys(old): | ||||
|  |  | |||
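The is_config() change above is a pure readability refactor: for the boolean platform flags involved, writing flag in (None, current_value) behaves the same as flag is None or flag == current_value. A minimal standalone check (illustrative only, not part of the commit; is_python2 here is a stand-in for spaCy's module-level flag):

    is_python2 = False  # stand-in for the real platform flag

    def old_style(python2=None):
        # original form: explicit None check plus equality
        return python2 is None or python2 == is_python2

    def new_style(python2=None):
        # refactored form: membership test against (None, flag)
        return python2 in (None, is_python2)

    for arg in (None, True, False):
        assert old_style(arg) == new_style(arg)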
|  | @ -38,6 +38,14 @@ class Warnings(object): | |||
|             "surprising to you, make sure the Doc was processed using a model " | ||||
|             "that supports named entity recognition, and check the `doc.ents` " | ||||
|             "property manually if necessary.") | ||||
|     W007 = ("The model you're using has no word vectors loaded, so the result " | ||||
|             "of the {obj}.similarity method will be based on the tagger, " | ||||
|             "parser and NER, which may not give useful similarity judgements. " | ||||
|             "This may happen if you're using one of the small models, e.g. " | ||||
|             "`en_core_web_sm`, which don't ship with word vectors and only " | ||||
|             "use context-sensitive tensors. You can always add your own word " | ||||
|             "vectors, or use one of the larger models instead if available.") | ||||
|     W008 = ("Evaluating {obj}.similarity based on empty vectors.") | ||||
| 
 | ||||
| 
 | ||||
| @add_codes | ||||
|  | @ -286,8 +294,15 @@ def _get_warn_types(arg): | |||
|             if w_type.strip() in WARNINGS] | ||||
| 
 | ||||
| 
 | ||||
| def _get_warn_excl(arg): | ||||
|     if not arg: | ||||
|         return [] | ||||
|     return [w_id.strip() for w_id in arg.split(',')] | ||||
| 
 | ||||
| 
 | ||||
| SPACY_WARNING_FILTER = os.environ.get('SPACY_WARNING_FILTER', 'always') | ||||
| SPACY_WARNING_TYPES = _get_warn_types(os.environ.get('SPACY_WARNING_TYPES')) | ||||
| SPACY_WARNING_IGNORE = _get_warn_excl(os.environ.get('SPACY_WARNING_IGNORE')) | ||||
| 
 | ||||
| 
 | ||||
| def user_warning(message): | ||||
|  | @ -307,7 +322,8 @@ def _warn(message, warn_type='user'): | |||
|     message (unicode): The message to display. | ||||
|     category (Warning): The Warning to show. | ||||
|     """ | ||||
|     if warn_type in SPACY_WARNING_TYPES: | ||||
|     w_id = message.split('[', 1)[1].split(']', 1)[0]  # get ID from string | ||||
|     if warn_type in SPACY_WARNING_TYPES and w_id not in SPACY_WARNING_IGNORE: | ||||
|         category = WARNINGS[warn_type] | ||||
|         stack = inspect.stack()[-1] | ||||
|         with warnings.catch_warnings(): | ||||
|  |  | |||
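The new SPACY_WARNING_IGNORE hook reads a comma-separated list of warning IDs from the environment once, at import time, and _warn() then skips any message whose bracketed ID is on that list. A sketch of silencing the new similarity warnings (this assumes spaCy has not been imported yet in the process, since the variable is read at module load):

    import os

    # Must be set before spaCy is imported; spacy.errors reads it once at import time.
    os.environ['SPACY_WARNING_IGNORE'] = 'W007,W008'

    import spacy  # the W007/W008 similarity warnings are now filtered out by _warn()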
|  | @ -15,7 +15,7 @@ from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP | |||
| from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_CURRENCY, IS_OOV | ||||
| from .attrs cimport PROB | ||||
| from .attrs import intify_attrs | ||||
| from .errors import Errors | ||||
| from .errors import Errors, Warnings, user_warning | ||||
| 
 | ||||
| 
 | ||||
| memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) | ||||
|  | @ -122,6 +122,7 @@ cdef class Lexeme: | |||
|             if self.c.orth == other[0].orth: | ||||
|                 return 1.0 | ||||
|         if self.vector_norm == 0 or other.vector_norm == 0: | ||||
|             user_warning(Warnings.W008.format(obj='Lexeme')) | ||||
|             return 0.0 | ||||
|         return (numpy.dot(self.vector, other.vector) / | ||||
|                 (self.vector_norm * other.vector_norm)) | ||||
|  |  | |||
|  | @ -253,11 +253,13 @@ def test_doc_api_has_vector(): | |||
| 
 | ||||
| def test_doc_api_similarity_match(): | ||||
|     doc = Doc(Vocab(), words=['a']) | ||||
|     assert doc.similarity(doc[0]) == 1.0 | ||||
|     assert doc.similarity(doc.vocab['a']) == 1.0 | ||||
|     with pytest.warns(None): | ||||
|         assert doc.similarity(doc[0]) == 1.0 | ||||
|         assert doc.similarity(doc.vocab['a']) == 1.0 | ||||
|     doc2 = Doc(doc.vocab, words=['a', 'b', 'c']) | ||||
|     assert doc.similarity(doc2[:1]) == 1.0 | ||||
|     assert doc.similarity(doc2) == 0.0 | ||||
|     with pytest.warns(None): | ||||
|         assert doc.similarity(doc2[:1]) == 1.0 | ||||
|         assert doc.similarity(doc2) == 0.0 | ||||
| 
 | ||||
| 
 | ||||
| def test_lowest_common_ancestor(en_tokenizer): | ||||
|  |  | |||
|  | @ -88,9 +88,10 @@ def test_span_similarity_match(): | |||
|     doc = Doc(Vocab(), words=['a', 'b', 'a', 'b']) | ||||
|     span1 = doc[:2] | ||||
|     span2 = doc[2:] | ||||
|     assert span1.similarity(span2) == 1.0 | ||||
|     assert span1.similarity(doc) == 0.0 | ||||
|     assert span1[:1].similarity(doc.vocab['a']) == 1.0 | ||||
|     with pytest.warns(None): | ||||
|         assert span1.similarity(span2) == 1.0 | ||||
|         assert span1.similarity(doc) == 0.0 | ||||
|         assert span1[:1].similarity(doc.vocab['a']) == 1.0 | ||||
| 
 | ||||
| 
 | ||||
| def test_spans_default_sentiment(en_tokenizer): | ||||
|  |  | |||
|  | @ -45,7 +45,8 @@ def test_vectors_similarity_TT(vocab, vectors): | |||
| def test_vectors_similarity_TD(vocab, vectors): | ||||
|     [(word1, vec1), (word2, vec2)] = vectors | ||||
|     doc = get_doc(vocab, words=[word1, word2]) | ||||
|     assert doc.similarity(doc[0]) == doc[0].similarity(doc) | ||||
|     with pytest.warns(None): | ||||
|         assert doc.similarity(doc[0]) == doc[0].similarity(doc) | ||||
| 
 | ||||
| 
 | ||||
| def test_vectors_similarity_DS(vocab, vectors): | ||||
|  | @ -57,4 +58,5 @@ def test_vectors_similarity_DS(vocab, vectors): | |||
| def test_vectors_similarity_TS(vocab, vectors): | ||||
|     [(word1, vec1), (word2, vec2)] = vectors | ||||
|     doc = get_doc(vocab, words=[word1, word2]) | ||||
|     assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2]) | ||||
|     with pytest.warns(None): | ||||
|         assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2]) | ||||
|  |  | |||
|  | @ -23,6 +23,18 @@ def vectors(): | |||
|         ('juice', [5, 5, 10]), | ||||
|         ('pie', [7, 6.3, 8.9])] | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def ngrams_vectors(): | ||||
|     return [ | ||||
|         ("apple", [1, 2, 3]), | ||||
|         ("app", [-0.1, -0.2, -0.3]), | ||||
|         ('ppl', [-0.2, -0.3, -0.4]), | ||||
|         ('pl', [0.7, 0.8, 0.9]) | ||||
|     ] | ||||
| @pytest.fixture() | ||||
| def ngrams_vocab(en_vocab, ngrams_vectors): | ||||
|     add_vecs_to_vocab(en_vocab, ngrams_vectors) | ||||
|     return en_vocab | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def data(): | ||||
|  | @ -105,6 +117,18 @@ def test_vectors_token_vector(tokenizer_v, vectors, text): | |||
|     assert vectors[1] == (doc[2].text, list(doc[2].vector)) | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text', ["apple"]) | ||||
| def test_vectors__ngrams_word(ngrams_vocab, text): | ||||
|     assert list(ngrams_vocab.get_vector(text)) == list(ngrams_vectors()[0][1]) | ||||
| 
 | ||||
| @pytest.mark.parametrize('text', ["applpie"]) | ||||
| def test_vectors__ngrams_subword(ngrams_vocab, text): | ||||
|     truth = list(ngrams_vocab.get_vector(text, 1, 6)) | ||||
|     test = [(ngrams_vectors()[1][1][i] + ngrams_vectors()[2][1][i] + ngrams_vectors()[3][1][i]) / 3 for i in range(len(ngrams_vectors()[1][1]))] | ||||
|     eps = [abs(truth[i] - test[i]) for i in range(len(truth))] | ||||
|     for i in eps: | ||||
|         assert i < 1e-6 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text', ["apple", "orange"]) | ||||
| def test_vectors_lexeme_vector(vocab, text): | ||||
|     lex = vocab[text] | ||||
|  | @ -182,15 +206,17 @@ def test_vectors_lexeme_doc_similarity(vocab, text): | |||
| @pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) | ||||
| def test_vectors_span_span_similarity(vocab, text): | ||||
|     doc = get_doc(vocab, text) | ||||
|     assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2]) | ||||
|     assert -1. < doc[0:2].similarity(doc[1:3]) < 1.0 | ||||
|     with pytest.warns(None): | ||||
|         assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2]) | ||||
|         assert -1. < doc[0:2].similarity(doc[1:3]) < 1.0 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) | ||||
| def test_vectors_span_doc_similarity(vocab, text): | ||||
|     doc = get_doc(vocab, text) | ||||
|     assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2]) | ||||
|     assert -1. < doc[0:2].similarity(doc) < 1.0 | ||||
|     with pytest.warns(None): | ||||
|         assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2]) | ||||
|         assert -1. < doc[0:2].similarity(doc) < 1.0 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text1,text2', [ | ||||
|  |  | |||
|  | @ -31,7 +31,8 @@ from ..attrs cimport ENT_TYPE, SENT_START | |||
| from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t | ||||
| from ..util import normalize_slice | ||||
| from ..compat import is_config, copy_reg, pickle, basestring_ | ||||
| from ..errors import Errors, Warnings, deprecation_warning | ||||
| from ..errors import deprecation_warning, models_warning, user_warning | ||||
| from ..errors import Errors, Warnings | ||||
| from .. import util | ||||
| from .underscore import Underscore, get_ext_args | ||||
| from ._retokenize import Retokenizer | ||||
|  | @ -318,8 +319,10 @@ cdef class Doc: | |||
|                         break | ||||
|                 else: | ||||
|                     return 1.0 | ||||
| 
 | ||||
|         if self.vocab.vectors.n_keys == 0: | ||||
|             models_warning(Warnings.W007.format(obj='Doc')) | ||||
|         if self.vector_norm == 0 or other.vector_norm == 0: | ||||
|             user_warning(Warnings.W008.format(obj='Doc')) | ||||
|             return 0.0 | ||||
|         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) | ||||
| 
 | ||||
|  |  | |||
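With the change above, Doc.similarity (and the matching Lexeme, Span and Token methods in the surrounding files) warns instead of failing silently: W007 fires when the vocabulary has no vector table at all, and W008 fires when either operand's vector norm is zero, in which case 0.0 is still returned as before. A hedged sketch of when each warning surfaces; en_core_web_sm is only named because the W007 message cites it as a model that ships without word vectors, and the package must be installed for this to run:

    import spacy

    nlp = spacy.load('en_core_web_sm')  # small model, no word vectors loaded
    doc1 = nlp("I like apples")
    doc2 = nlp("I like oranges")
    doc1.similarity(doc2)   # emits W007; similarity falls back to the context tensors

    empty = nlp("")
    doc1.similarity(empty)  # emits W008 (empty vector) and returns 0.0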
|  | @ -16,7 +16,7 @@ from ..util import normalize_slice | |||
| from ..attrs cimport IS_PUNCT, IS_SPACE | ||||
| from ..lexeme cimport Lexeme | ||||
| from ..compat import is_config | ||||
| from ..errors import Errors, TempErrors | ||||
| from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning | ||||
| from .underscore import Underscore, get_ext_args | ||||
| 
 | ||||
| 
 | ||||
|  | @ -200,7 +200,10 @@ cdef class Span: | |||
|                     break | ||||
|             else: | ||||
|                 return 1.0 | ||||
|         if self.vocab.vectors.n_keys == 0: | ||||
|             models_warning(Warnings.W007.format(obj='Span')) | ||||
|         if self.vector_norm == 0.0 or other.vector_norm == 0.0: | ||||
|             user_warning(Warnings.W008.format(obj='Span')) | ||||
|             return 0.0 | ||||
|         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) | ||||
| 
 | ||||
|  |  | |||
|  | @ -19,7 +19,7 @@ from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM | |||
| from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX | ||||
| from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP | ||||
| from ..compat import is_config | ||||
| from ..errors import Errors | ||||
| from ..errors import Errors, Warnings, user_warning, models_warning | ||||
| from .. import util | ||||
| from .underscore import Underscore, get_ext_args | ||||
| 
 | ||||
|  | @ -161,7 +161,10 @@ cdef class Token: | |||
|         elif hasattr(other, 'orth'): | ||||
|             if self.c.lex.orth == other.orth: | ||||
|                 return 1.0 | ||||
|         if self.vocab.vectors.n_keys == 0: | ||||
|             models_warning(Warnings.W007.format(obj='Token')) | ||||
|         if self.vector_norm == 0 or other.vector_norm == 0: | ||||
|             user_warning(Warnings.W008.format(obj='Token')) | ||||
|             return 0.0 | ||||
|         return (numpy.dot(self.vector, other.vector) / | ||||
|                 (self.vector_norm * other.vector_norm)) | ||||
|  |  | |||
|  | @ -309,7 +309,7 @@ cdef class Vocab: | |||
|         link_vectors_to_models(self) | ||||
|         return remap | ||||
| 
 | ||||
|     def get_vector(self, orth): | ||||
|     def get_vector(self, orth, minn=None, maxn=None): | ||||
|         """Retrieve a vector for a word in the vocabulary. Words can be looked | ||||
|         up by string or int ID. If no vectors data is loaded, ValueError is | ||||
|         raised. | ||||
|  | @ -320,10 +320,42 @@ cdef class Vocab: | |||
|         """ | ||||
|         if isinstance(orth, basestring_): | ||||
|             orth = self.strings.add(orth) | ||||
|         word = self[orth].orth_ | ||||
|         if orth in self.vectors.key2row: | ||||
|             return self.vectors[orth] | ||||
|         else: | ||||
|             return numpy.zeros((self.vectors_length,), dtype='f') | ||||
| 
 | ||||
|         # Default the ngram limits minn and maxn to the length of the word. | ||||
|         if minn is None: | ||||
|             minn = len(word) | ||||
|         if maxn is None: | ||||
|             maxn = len(word) | ||||
|         vectors = numpy.zeros((self.vectors_length,), dtype='f') | ||||
| 
 | ||||
|         # fastText's ngram computation, adapted from https://github.com/facebookresearch/fastText | ||||
|         ngrams_size = 0 | ||||
|         for i in range(len(word)): | ||||
|             ngram = "" | ||||
|             if (ord(word[i]) & 0xC0) == 0x80: | ||||
|                 continue | ||||
|             n = 1 | ||||
|             j = i | ||||
|             while (j < len(word) and n <= maxn): | ||||
|                 if n > maxn: | ||||
|                     break | ||||
|                 ngram += word[j] | ||||
|                 j = j + 1 | ||||
|                 while (j < len(word) and (ord(word[j]) & 0xC0) == 0x80): | ||||
|                     ngram += word[j] | ||||
|                     j = j + 1 | ||||
|                 if (n >= minn and not (n == 1 and (i == 0 or j == len(word)))): | ||||
|                     if self.strings[ngram] in self.vectors.key2row: | ||||
|                         vectors = numpy.add(self.vectors[self.strings[ngram]], vectors) | ||||
|                         ngrams_size += 1 | ||||
|                 n = n + 1 | ||||
|         if ngrams_size > 0: | ||||
|             vectors = vectors * (1.0/ngrams_size) | ||||
| 
 | ||||
|         return vectors | ||||
| 
 | ||||
|     def set_vector(self, orth, vector): | ||||
|         """Set a vector for a word in the vocabulary. Words can be referenced | ||||
|  |  | |||
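The get_vector() fallback above mirrors fastText's subword trick: when the exact word is missing from the vector table, it sums the vectors of every known character ngram between minn and maxn characters long (skipping single characters at the word edges) and divides by the number of hits. A worked example using the fixture values from test_vectors__ngrams_subword (plain numpy, no spaCy objects, just the arithmetic the test asserts):

    import numpy

    # Ngram vectors the test vocabulary knows about.
    app = numpy.array([-0.1, -0.2, -0.3])
    ppl = numpy.array([-0.2, -0.3, -0.4])
    pl = numpy.array([0.7, 0.8, 0.9])

    # "applpie" itself has no vector; with minn=1 and maxn=6 the only known
    # ngrams it contains are "app", "ppl" and "pl", so the fallback returns
    # their average.
    expected = (app + ppl + pl) / 3
    print(expected)  # approx. [0.1333, 0.1, 0.0667]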
|  | @ -47,6 +47,7 @@ p | |||
| 
 | ||||
|     +row | ||||
|         +cell other | ||||
|             +tag-new(2.1) | ||||
|         +cell - | ||||
|         +cell | ||||
|             |  Additional installation options to be passed to | ||||
|  |  | |||