mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 13:11:03 +03:00 
			
		
		
		
	Fix and add warnings related to spacy-lookups-data (#5588)
* Fix warning message for lemmatization tables
* Add a warning when the `lexeme_norm` table is empty. (Given the relatively language-specific loading for `Lookups`, it seemed like too much overhead to dynamically extract the list of languages, so for now the list is hard-coded.)
This commit is contained in:
		
							parent
							
								
									f698007907
								
							
						
					
					
						commit
						e867e9fa8f
					
				|  | @ -92,9 +92,9 @@ class Warnings(object): | |||
|     W022 = ("Training a new part-of-speech tagger using a model with no " | ||||
|             "lemmatization rules or data. This means that the trained model " | ||||
|             "may not be able to lemmatize correctly. If this is intentional " | ||||
|             "or the language you're using doesn't have lemmatization data. " | ||||
|             "If this is surprising, make sure you have the spacy-lookups-data " | ||||
|             "package installed.") | ||||
|             "or the language you're using doesn't have lemmatization data, " | ||||
|             "please ignore this warning. If this is surprising, make sure you " | ||||
|             "have the spacy-lookups-data package installed.") | ||||
|     W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. " | ||||
|             "'n_process' will be set to 1.") | ||||
|     W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " | ||||
|  | @ -127,6 +127,13 @@ class Warnings(object): | |||
|             "this, download a newer compatible model or retrain your custom " | ||||
|             "model with the current spaCy version. For more details and " | ||||
|             "available updates, run: python -m spacy validate") | ||||
|     W033 = ("Training a new {model} using a model with no lexeme normalization " | ||||
|             "table. This may degrade the performance of the model to some " | ||||
|             "degree. If this is intentional or the language you're using " | ||||
|             "doesn't have a normalization table, please ignore this warning. " | ||||
|             "If this is surprising, make sure you have the spacy-lookups-data " | ||||
|             "package installed. The languages with lexeme normalization tables " | ||||
|             "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.") | ||||
| 
 | ||||
| 
 | ||||
| @add_codes | ||||
|  |  | |||
|  | @ -516,6 +516,8 @@ class Tagger(Pipe): | |||
|         lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] | ||||
|         if not any(table in self.vocab.lookups for table in lemma_tables): | ||||
|             warnings.warn(Warnings.W022) | ||||
|         if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0: | ||||
|             warnings.warn(Warnings.W033.format(model="part-of-speech tagger")) | ||||
|         orig_tag_map = dict(self.vocab.morphology.tag_map) | ||||
|         new_tag_map = OrderedDict() | ||||
|         for raw_text, annots_brackets in get_gold_tuples(): | ||||
|  |  | |||
|  | @ -26,6 +26,7 @@ from thinc.neural.ops import NumpyOps, CupyOps | |||
| from thinc.neural.util import get_array_module | ||||
| from thinc.linalg cimport Vec, VecVec | ||||
| import srsly | ||||
| import warnings | ||||
| 
 | ||||
| from ._parser_model cimport alloc_activations, free_activations | ||||
| from ._parser_model cimport predict_states, arg_max_if_valid | ||||
|  | @ -37,7 +38,7 @@ from .._ml import link_vectors_to_models, create_default_optimizer | |||
| from ..compat import copy_array | ||||
| from ..tokens.doc cimport Doc | ||||
| from ..gold cimport GoldParse | ||||
| from ..errors import Errors, TempErrors | ||||
| from ..errors import Errors, TempErrors, Warnings | ||||
| from .. import util | ||||
| from .stateclass cimport StateClass | ||||
| from ._state cimport StateC | ||||
|  | @ -601,6 +602,8 @@ cdef class Parser: | |||
|                                         **self.cfg.get('optimizer', {})) | ||||
| 
 | ||||
|     def begin_training(self, get_gold_tuples, pipeline=None, sgd=None, **cfg): | ||||
|         if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0: | ||||
|             warnings.warn(Warnings.W033.format(model="parser or NER")) | ||||
|         if 'model' in cfg: | ||||
|             self.model = cfg['model'] | ||||
|         if not hasattr(get_gold_tuples, '__call__'): | ||||
|  |  | |||
|  | @ -4,6 +4,8 @@ from __future__ import unicode_literals | |||
| import pytest | ||||
| from spacy.lang.en import English | ||||
| 
 | ||||
| from spacy.language import Language | ||||
| from spacy.lookups import Lookups | ||||
| from spacy.pipeline import EntityRecognizer, EntityRuler | ||||
| from spacy.vocab import Vocab | ||||
| from spacy.syntax.ner import BiluoPushDown | ||||
|  | @ -305,6 +307,21 @@ def test_change_number_features(): | |||
|     nlp("hello world") | ||||
| 
 | ||||
| 
 | ||||
| def test_ner_warns_no_lookups(): | ||||
|     nlp = Language() | ||||
|     nlp.vocab.lookups = Lookups() | ||||
|     assert not len(nlp.vocab.lookups) | ||||
|     ner = nlp.create_pipe("ner") | ||||
|     nlp.add_pipe(ner) | ||||
|     with pytest.warns(UserWarning): | ||||
|         nlp.begin_training() | ||||
|     nlp.vocab.lookups.add_table("lexeme_norm") | ||||
|     nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A" | ||||
|     with pytest.warns(None) as record: | ||||
|         nlp.begin_training() | ||||
|         assert not record.list | ||||
| 
 | ||||
| 
 | ||||
| class BlockerComponent1(object): | ||||
|     name = "my_blocker" | ||||
| 
 | ||||
|  |  | |||
|  | @ -33,17 +33,17 @@ def test_lemmatizer_reflects_lookups_changes(): | |||
|     assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "world" | ||||
| 
 | ||||
| 
 | ||||
| def test_tagger_warns_no_lemma_lookups(): | ||||
| def test_tagger_warns_no_lookups(): | ||||
|     nlp = Language() | ||||
|     nlp.vocab.lookups = Lookups() | ||||
|     assert not len(nlp.vocab.lookups) | ||||
|     tagger = nlp.create_pipe("tagger") | ||||
|     with pytest.warns(UserWarning): | ||||
|         tagger.begin_training() | ||||
|     nlp.add_pipe(tagger) | ||||
|     with pytest.warns(UserWarning): | ||||
|         nlp.begin_training() | ||||
|     nlp.vocab.lookups.add_table("lemma_lookup") | ||||
|     nlp.vocab.lookups.add_table("lexeme_norm") | ||||
|     nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A" | ||||
|     with pytest.warns(None) as record: | ||||
|         nlp.begin_training() | ||||
|         assert not record.list | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user