diff --git a/spacy/errors.py b/spacy/errors.py
index baed574f8..a25661a20 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -92,9 +92,9 @@ class Warnings(object):
     W022 = ("Training a new part-of-speech tagger using a model with no "
             "lemmatization rules or data. This means that the trained model "
             "may not be able to lemmatize correctly. If this is intentional "
-            "or the language you're using doesn't have lemmatization data. "
-            "If this is surprising, make sure you have the spacy-lookups-data "
-            "package installed.")
+            "or the language you're using doesn't have lemmatization data, "
+            "please ignore this warning. If this is surprising, make sure you "
+            "have the spacy-lookups-data package installed.")
     W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. "
             "'n_process' will be set to 1.")
     W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
@@ -127,6 +127,13 @@ class Warnings(object):
             "this, download a newer compatible model or retrain your custom "
             "model with the current spaCy version. For more details and "
             "available updates, run: python -m spacy validate")
+    W033 = ("Training a new {model} using a model with no lexeme normalization "
+            "table. This may degrade the performance of the model to some "
+            "degree. If this is intentional or the language you're using "
+            "doesn't have a normalization table, please ignore this warning. "
+            "If this is surprising, make sure you have the spacy-lookups-data "
+            "package installed. The languages with lexeme normalization tables "
+            "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.")


 @add_codes
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 01472a6d0..3f40cb545 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -516,6 +516,8 @@ class Tagger(Pipe):
         lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
         if not any(table in self.vocab.lookups for table in lemma_tables):
             warnings.warn(Warnings.W022)
+        if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0:
+            warnings.warn(Warnings.W033.format(model="part-of-speech tagger"))
         orig_tag_map = dict(self.vocab.morphology.tag_map)
         new_tag_map = OrderedDict()
         for raw_text, annots_brackets in get_gold_tuples():
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index d5c6bf2a8..6944e9113 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -26,6 +26,7 @@ from thinc.neural.ops import NumpyOps, CupyOps
 from thinc.neural.util import get_array_module
 from thinc.linalg cimport Vec, VecVec
 import srsly
+import warnings

 from ._parser_model cimport alloc_activations, free_activations
 from ._parser_model cimport predict_states, arg_max_if_valid
@@ -37,7 +38,7 @@ from .._ml import link_vectors_to_models, create_default_optimizer
 from ..compat import copy_array
 from ..tokens.doc cimport Doc
 from ..gold cimport GoldParse
-from ..errors import Errors, TempErrors
+from ..errors import Errors, TempErrors, Warnings
 from .. import util
 from .stateclass cimport StateClass
 from ._state cimport StateC
@@ -601,6 +602,8 @@ cdef class Parser:
                                        **self.cfg.get('optimizer', {}))

     def begin_training(self, get_gold_tuples, pipeline=None, sgd=None, **cfg):
+        if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0:
+            warnings.warn(Warnings.W033.format(model="parser or NER"))
         if 'model' in cfg:
             self.model = cfg['model']
         if not hasattr(get_gold_tuples, '__call__'):
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 244e9fa25..dd623e07f 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -4,6 +4,8 @@ from __future__ import unicode_literals

 import pytest
 from spacy.lang.en import English
+from spacy.language import Language
+from spacy.lookups import Lookups
 from spacy.pipeline import EntityRecognizer, EntityRuler
 from spacy.vocab import Vocab
 from spacy.syntax.ner import BiluoPushDown
@@ -305,6 +307,21 @@ def test_change_number_features():
     nlp("hello world")


+def test_ner_warns_no_lookups():
+    nlp = Language()
+    nlp.vocab.lookups = Lookups()
+    assert not len(nlp.vocab.lookups)
+    ner = nlp.create_pipe("ner")
+    nlp.add_pipe(ner)
+    with pytest.warns(UserWarning):
+        nlp.begin_training()
+    nlp.vocab.lookups.add_table("lexeme_norm")
+    nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
+    with pytest.warns(None) as record:
+        nlp.begin_training()
+    assert not record.list
+
+
 class BlockerComponent1(object):
     name = "my_blocker"
diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py
index bcda2999a..fce3772c4 100644
--- a/spacy/tests/test_lemmatizer.py
+++ b/spacy/tests/test_lemmatizer.py
@@ -33,17 +33,17 @@ def test_lemmatizer_reflects_lookups_changes():
     assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "world"


-def test_tagger_warns_no_lemma_lookups():
+def test_tagger_warns_no_lookups():
     nlp = Language()
     nlp.vocab.lookups = Lookups()
     assert not len(nlp.vocab.lookups)
     tagger = nlp.create_pipe("tagger")
-    with pytest.warns(UserWarning):
-        tagger.begin_training()
     nlp.add_pipe(tagger)
     with pytest.warns(UserWarning):
         nlp.begin_training()
     nlp.vocab.lookups.add_table("lemma_lookup")
+    nlp.vocab.lookups.add_table("lexeme_norm")
+    nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
     with pytest.warns(None) as record:
         nlp.begin_training()
     assert not record.list
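Taken together, the patch makes begin_training() emit the new W033 warning whenever the vocab has no populated "lexeme_norm" table: the Tagger warns for the part-of-speech tagger, and Parser.begin_training covers both the dependency parser and the NER component. The following standalone sketch is not part of the diff; it assumes spaCy v2.x with this patch applied and mirrors test_ner_warns_no_lookups using the plain warnings module instead of pytest:

# Minimal sketch (not part of the patch) of the new W033 behaviour,
# assuming spaCy v2.x with this change applied.
import warnings

from spacy.language import Language
from spacy.lookups import Lookups

nlp = Language()
nlp.vocab.lookups = Lookups()  # start from empty lookups: no "lexeme_norm" table
nlp.add_pipe(nlp.create_pipe("ner"))

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    nlp.begin_training()  # triggers W033, formatted with model="parser or NER"
assert any("W033" in str(w.message) for w in caught)

# Installing spacy-lookups-data normally provides the table; as in the tests,
# adding even a single entry by hand is enough to silence the warning.
nlp.vocab.lookups.add_table("lexeme_norm")
nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    nlp.begin_training()
assert not caught  # no warnings once the table is populated

The check deliberately uses get_table("lexeme_norm", {}) with a default, so a vocab that lacks the table entirely and a vocab with an empty table are treated the same way.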