Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 07:57:35 +03:00)

Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in: commit 1b65115bc2
@@ -14,8 +14,7 @@ os:
env:
  - VIA=compile LC_ALL=en_US.ascii
  - VIA=compile

#  - VIA=sdist
  #- VIA=pypi_nightly

install:
  - "./travis.sh"

@@ -23,7 +22,7 @@ install:
script:
  - "pip install pytest pytest-timeout"
  - if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi
  - if [[ "${VIA}" == "pypi" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(ospath.dirname(spacy.__file__)))"`; fi
  - if [[ "${VIA}" == "pypi_nightly" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
  - if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi

notifications:
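The backtick one-liners in the script section locate the installed spacy package, so pytest runs against what pip installed rather than the source checkout. (Note the pypi line still spells `ospath.dirname` where the other lines use `os.path.dirname`.) The inline `python -c` boils down to:

    import os.path
    import spacy
    # Absolute path of the installed spacy package directory;
    # pytest is then pointed at this path.
    print(os.path.abspath(os.path.dirname(spacy.__file__)))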
spacy/_ml.py (12 changed lines)
@@ -212,12 +212,14 @@ class PrecomputableMaxouts(Model):

def drop_layer(layer, factor=2.):
    def drop_layer_fwd(X, drop=0.):
-        drop *= factor
-        mask = layer.ops.get_dropout_mask((1,), drop)
-        if mask is None or mask > 0:
+        if drop <= 0.:
            return layer.begin_update(X, drop=drop)
        else:
-            return X, lambda dX, sgd=None: dX
+            coinflip = layer.ops.xp.random.random()
+            if (coinflip / factor) >= drop:
+                return layer.begin_update(X, drop=drop)
+            else:
+                return X, lambda dX, sgd=None: dX

    model = wrap(drop_layer_fwd, layer)
    model.predict = layer
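The rewritten drop_layer skips the whole layer stochastically: with probability proportional to `drop` (damped by `factor`), the input passes through untouched with an identity backward pass, replacing the previous dropout-mask test with an explicit coin flip. A minimal standalone sketch of that control flow, with `random.random()` standing in for `layer.ops.xp.random.random()` and `begin_update` for `layer.begin_update`:

    import random

    def drop_layer_fwd(begin_update, X, drop=0., factor=2.):
        # drop <= 0. means dropout is off: always run the layer.
        if drop <= 0.:
            return begin_update(X, drop=drop)
        # Coin flip, damped by factor: most of the time the layer still runs.
        if (random.random() / factor) >= drop:
            return begin_update(X, drop=drop)
        # Layer skipped: pass X through with an identity backward pass.
        return X, lambda dX, sgd=None: dX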
@@ -362,6 +364,8 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
    def backward(d_output, sgd=None):
        return (tokens, d_output)
    return vectors, backward
+
+
def fine_tune(embedding, combine=None):
    if combine is not None:
        raise NotImplementedError(
@@ -3,7 +3,7 @@
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

__title__ = 'spacy-nightly'
-__version__ = '2.0.0a12'
+__version__ = '2.0.0a13'
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
__uri__ = 'https://spacy.io'
__author__ = 'Explosion AI'
@@ -59,7 +59,8 @@ MORPH_RULES = {

    "VBP": {
        "are":          {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
-        "'re":          {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}
+        "'re":          {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
+        "am":           {LEMMA: "be", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind"},
    },

    "VBD": {
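The new entry makes "am" under the VBP tag lemmatize to "be" with first-person, present-tense features. A rough sketch of how such an exception table can be consulted (the dict shape and the "lemma" key are simplified stand-ins for spaCy's LEMMA constant and internals):

    # Simplified rule table: fine-grained tag -> surface form -> attributes.
    MORPH_RULES = {
        "VBP": {
            "am": {"lemma": "be", "VerbForm": "Fin", "Person": "One",
                   "Tense": "Pres", "Mood": "Ind"},
        },
    }

    def morph_lookup(tag, text):
        # Fall back to the surface form when no exceptional rule matches.
        return MORPH_RULES.get(tag, {}).get(text, {"lemma": text})

    assert morph_lookup("VBP", "am")["lemma"] == "be"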
@@ -44,6 +44,11 @@ class Lemmatizer(object):
            return True
        elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
            return True
+        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
+        # morphology
+        elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \
+                                     morphology.get('Tense') == 'pres'):
+            return True
        elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
            return True
        elif VerbForm_inf in morphology:
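The added branch treats finite present-tense verbs (VBP in the Penn tagset) as already being in base form, so forms like "bleed" come back unchanged instead of passing through suffix-stripping rules. A self-contained sketch of the check, assuming a method shaped roughly like spaCy's is_base_form:

    def is_base_form(univ_pos, morphology=None):
        # Return True when the surface form can be used as the lemma directly.
        morphology = morphology if morphology is not None else {}
        if univ_pos == 'noun' and morphology.get('Number') == 'sing':
            return True
        elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
            return True
        # New case from the hunk: finite present-tense verbs (e.g. VBP).
        elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and
                                     morphology.get('Tense') == 'pres'):
            return True
        elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
            return True
        return False

    assert is_base_form('verb', {'VerbForm': 'fin', 'Tense': 'pres'})  # e.g. "bleed"/VBP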
@@ -142,7 +142,7 @@ class BaseThincComponent(object):

        deserialize = OrderedDict((
            ('cfg', lambda b: self.cfg.update(ujson.loads(b))),
-            ('model', lambda b: self.model.from_bytes(b)),
+            ('model', load_model),
            ('vocab', lambda b: self.vocab.from_bytes(b))
        ))
        util.from_bytes(bytes_data, deserialize, exclude)
@@ -417,7 +417,8 @@ class NeuralTagger(BaseThincComponent):
    def from_bytes(self, bytes_data, **exclude):
        def load_model(b):
            if self.model is True:
-                token_vector_width = util.env_opt('token_vector_width', 128)
+                token_vector_width = util.env_opt('token_vector_width',
+                        self.cfg.get('token_vector_width', 128))
                self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
            self.model.from_bytes(b)

@@ -451,7 +452,8 @@ class NeuralTagger(BaseThincComponent):
    def from_disk(self, path, **exclude):
        def load_model(p):
            if self.model is True:
-                token_vector_width = util.env_opt('token_vector_width', 128)
+                token_vector_width = util.env_opt('token_vector_width',
+                        self.cfg.get('token_vector_width', 128))
                self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
            self.model.from_bytes(p.open('rb').read())

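Both hunks make the restored model width come from the component's serialized cfg instead of a hard-coded 128, with the environment override still taking precedence. A self-contained sketch of the lazy-construction pattern these hunks rely on (FakeModel and Component are illustrative stand-ins, not spaCy's API):

    class FakeModel(object):
        def __init__(self, width):
            self.width = width
        def from_bytes(self, b):
            self.weights = b            # stand-in for real weight loading

    class Component(object):
        def __init__(self):
            self.model = True           # spaCy-style marker for "not built yet"
            self.cfg = {}
        def from_bytes(self, cfg, weights):
            self.cfg.update(cfg)        # cfg must be restored before the model
            if self.model is True:
                width = self.cfg.get('token_vector_width', 128)
                self.model = FakeModel(width)
            self.model.from_bytes(weights)

    c = Component()
    c.from_bytes({'token_vector_width': 64}, b'\x00')
    assert c.model.width == 64          # width recovered from cfg, not hard-coded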
@@ -393,7 +393,8 @@ cdef class Parser:

        tokvecs = self.model[0].ops.flatten(tokvecses)
        if USE_FINE_TUNE:
-            tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
+            # TODO: This is incorrect! Unhack when training next model
+            tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))

        nr_state = len(docs)
        nr_class = self.moves.n_moves
@@ -531,8 +532,8 @@ cdef class Parser:
            docs = [docs]
            golds = [golds]
        if USE_FINE_TUNE:
-            tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
-            tokvecs = self.model[0].ops.flatten(tokvecs)
+            my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
+            tokvecs += self.model[0].ops.flatten(my_tokvecs)

        cuda_stream = get_cuda_stream()

@@ -605,8 +606,8 @@ cdef class Parser:
        assert min(lengths) >= 1
        tokvecs = self.model[0].ops.flatten(tokvecs)
        if USE_FINE_TUNE:
-            tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
-            tokvecs = self.model[0].ops.flatten(tokvecs)
+            my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
+            tokvecs += self.model[0].ops.flatten(my_tokvecs)

        states = self.moves.init_batch(docs)
        for gold in golds:
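All three parser hunks switch from overwriting the token vectors with the fine-tune layer's output to adding that output residually (and the predict-path version is flagged as a temporary hack pending retraining). In plain numpy terms, with arbitrary example shapes and values, the change is:

    import numpy

    tokvecs = numpy.ones((4, 8), dtype='f')          # vectors from the shared encoder
    my_tokvecs = numpy.full((4, 8), 0.5, dtype='f')  # fine-tune layer output

    # Before: tokvecs = my_tokvecs   (overwrite, pretrained signal lost)
    # After:  tokvecs += my_tokvecs  (residual update keeps both signals)
    tokvecs += my_tokvecs
    assert float(tokvecs[0, 0]) == 1.5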
@@ -705,7 +706,7 @@ cdef class Parser:
                        lower, stream, drop=dropout)
        return state2vec, upper

-    nr_feature = 8
+    nr_feature = 13

    def get_token_ids(self, states):
        cdef StateClass state
@@ -13,7 +13,7 @@ from .. import util

_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id',
              'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx']
-_models = {'en': ['en_depent_web_sm', 'en_core_web_md'],
+_models = {'en': ['en_core_web_sm'],
           'de': ['de_core_news_md'],
           'fr': ['fr_depvec_web_lg'],
           'xx': ['xx_ent_web_md']}
@@ -2,12 +2,18 @@
from __future__ import unicode_literals

import pytest
+from ....tokens.doc import Doc


@pytest.fixture
def en_lemmatizer(EN):
    return EN.Defaults.create_lemmatizer()

+@pytest.mark.models('en')
+def test_doc_lemmatization(EN):
+    doc = Doc(EN.vocab, words=['bleed'])
+    doc[0].tag_ = 'VBP'
+    assert doc[0].lemma_ == 'bleed'

@pytest.mark.models('en')
@pytest.mark.parametrize('text,lemmas', [("aardwolves", ["aardwolf"]),
@@ -19,6 +25,16 @@ def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
    assert en_lemmatizer.noun(text) == set(lemmas)


+@pytest.mark.models('en')
+@pytest.mark.parametrize('text,lemmas', [("bleed", ["bleed"]),
+                                         ("feed", ["feed"]),
+                                         ("need", ["need"]),
+                                         ("ring", ["ring"]),
+                                         ("axes", ["axis", "axe", "ax"])])
+def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
+    assert en_lemmatizer.noun(text) == set(lemmas)
+
+
@pytest.mark.xfail
@pytest.mark.models('en')
def test_en_lemmatizer_base_forms(en_lemmatizer):
@@ -25,7 +25,6 @@ def test_tag_names(EN):
    doc = EN(text, disable=['parser'])
    assert type(doc[2].pos) == int
    assert isinstance(doc[2].pos_, six.text_type)
-    assert type(doc[2].dep) == int
    assert isinstance(doc[2].dep_, six.text_type)
    assert doc[2].tag_ == u'NNS'
