Mirror of https://github.com/explosion/spaCy.git
Fix and add warnings related to spacy-lookups-data (#5588)

* Fix warning message for lemmatization tables
* Add a warning when the `lexeme_norm` table is empty. (Given the relatively lang-specific loading for `Lookups`, it seemed like too much overhead to dynamically extract the list of languages, so for now it's hard-coded.)
This commit is contained in:
  parent aa5b40fa64
  commit c482f20778
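For context, a minimal sketch of the behavior this commit adds, mirroring the new tests further down (a fresh Lookups() guarantees the lexeme_norm table is absent; names and flow follow the tests, not the diff itself):

    import warnings
    from spacy.language import Language
    from spacy.lookups import Lookups

    nlp = Language()
    nlp.vocab.lookups = Lookups()  # no tables at all, so lexeme_norm is empty
    nlp.add_pipe(nlp.create_pipe("ner"))
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        nlp.begin_training()  # should emit W033 for "parser or NER"
    print([str(w.message) for w in caught])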
spacy/errors.py
@@ -92,9 +92,9 @@ class Warnings(object):
     W022 = ("Training a new part-of-speech tagger using a model with no "
             "lemmatization rules or data. This means that the trained model "
             "may not be able to lemmatize correctly. If this is intentional "
-            "or the language you're using doesn't have lemmatization data. "
-            "If this is surprising, make sure you have the spacy-lookups-data "
-            "package installed.")
+            "or the language you're using doesn't have lemmatization data, "
+            "please ignore this warning. If this is surprising, make sure you "
+            "have the spacy-lookups-data package installed.")
     W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. "
             "'n_process' will be set to 1.")
     W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
@@ -127,6 +127,13 @@ class Warnings(object):
             "this, download a newer compatible model or retrain your custom "
             "model with the current spaCy version. For more details and "
             "available updates, run: python -m spacy validate")
+    W033 = ("Training a new {model} using a model with no lexeme normalization "
+            "table. This may degrade the performance of the model to some "
+            "degree. If this is intentional or the language you're using "
+            "doesn't have a normalization table, please ignore this warning. "
+            "If this is surprising, make sure you have the spacy-lookups-data "
+            "package installed. The languages with lexeme normalization tables "
+            "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.")
 
 
 @add_codes
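A note on the message added above: the {model} placeholder is filled via str.format at each call site below, and the @add_codes decorator (visible at the end of this hunk) prepends the warning code to the template. A quick sketch of the result, assuming v2.3's spacy.errors module:

    from spacy.errors import Warnings

    # @add_codes prepends the code, so the printed text should start
    # with "[W033]" (treat the exact prefix as an assumption):
    print(Warnings.W033.format(model="part-of-speech tagger"))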
spacy/pipeline/pipes.pyx
@@ -516,6 +516,8 @@ class Tagger(Pipe):
         lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
         if not any(table in self.vocab.lookups for table in lemma_tables):
             warnings.warn(Warnings.W022)
+        if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0:
+            warnings.warn(Warnings.W033.format(model="part-of-speech tagger"))
         orig_tag_map = dict(self.vocab.morphology.tag_map)
         new_tag_map = OrderedDict()
         for raw_text, annots_brackets in get_gold_tuples():
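The guard uses Lookups.get_table with a default value, so a table that was never registered doesn't raise. The same check works standalone; a minimal sketch against a bare v2.3 Vocab, which starts with empty lookups:

    from spacy.vocab import Vocab

    vocab = Vocab()
    # The {} default avoids a KeyError when the table was never registered
    norm_table = vocab.lookups.get_table("lexeme_norm", {})
    if len(norm_table) == 0:
        print("lexeme_norm is missing or empty; this is where W033 fires")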
spacy/syntax/nn_parser.pyx
@@ -26,6 +26,7 @@ from thinc.neural.ops import NumpyOps, CupyOps
 from thinc.neural.util import get_array_module
 from thinc.linalg cimport Vec, VecVec
 import srsly
+import warnings
 
 from ._parser_model cimport alloc_activations, free_activations
 from ._parser_model cimport predict_states, arg_max_if_valid
@@ -37,7 +38,7 @@ from .._ml import link_vectors_to_models, create_default_optimizer
 from ..compat import copy_array
 from ..tokens.doc cimport Doc
 from ..gold cimport GoldParse
-from ..errors import Errors, TempErrors
+from ..errors import Errors, TempErrors, Warnings
 from .. import util
 from .stateclass cimport StateClass
 from ._state cimport StateC
@@ -601,6 +602,8 @@ cdef class Parser:
                                             **self.cfg.get('optimizer', {}))
 
     def begin_training(self, get_gold_tuples, pipeline=None, sgd=None, **cfg):
+        if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0:
+            warnings.warn(Warnings.W033.format(model="parser or NER"))
         if 'model' in cfg:
             self.model = cfg['model']
         if not hasattr(get_gold_tuples, '__call__'):
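Because the dependency parser and the entity recognizer share Parser.begin_training, both report as "parser or NER" here. When the missing table is intentional, the warning text itself says to ignore it; a sketch of doing so with the stdlib filter (matching on the "[W033]" prefix that @add_codes prepends is an assumption about the final message text):

    import warnings

    # Suppress only W033, leaving other spaCy warnings intact
    warnings.filterwarnings("ignore", message=r"\[W033\]")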
spacy/tests/parser/test_ner.py
@@ -4,6 +4,8 @@ from __future__ import unicode_literals
 import pytest
 from spacy.lang.en import English
 
+from spacy.language import Language
+from spacy.lookups import Lookups
 from spacy.pipeline import EntityRecognizer, EntityRuler
 from spacy.vocab import Vocab
 from spacy.syntax.ner import BiluoPushDown
@@ -305,6 +307,21 @@ def test_change_number_features():
     nlp("hello world")
 
 
+def test_ner_warns_no_lookups():
+    nlp = Language()
+    nlp.vocab.lookups = Lookups()
+    assert not len(nlp.vocab.lookups)
+    ner = nlp.create_pipe("ner")
+    nlp.add_pipe(ner)
+    with pytest.warns(UserWarning):
+        nlp.begin_training()
+    nlp.vocab.lookups.add_table("lexeme_norm")
+    nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
+    with pytest.warns(None) as record:
+        nlp.begin_training()
+    assert not record.list
+
+
 class BlockerComponent1(object):
     name = "my_blocker"
 
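The test silences the warning by giving lexeme_norm a single entry; the same works in user code when you want to supply your own norms rather than install the data package. A sketch, reusing the test's placeholder "a" -> "A" entry:

    from spacy.language import Language
    from spacy.lookups import Lookups

    nlp = Language()
    nlp.vocab.lookups = Lookups()
    nlp.add_pipe(nlp.create_pipe("ner"))
    nlp.vocab.lookups.add_table("lexeme_norm")
    nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"  # any non-empty content
    nlp.begin_training()  # no W033 expected now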
spacy/tests/test_lemmatizer.py
@@ -33,17 +33,19 @@ def test_lemmatizer_reflects_lookups_changes():
     assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "world"
 
 
-def test_tagger_warns_no_lemma_lookups():
+def test_tagger_warns_no_lookups():
     nlp = Language()
     nlp.vocab.lookups = Lookups()
     assert not len(nlp.vocab.lookups)
     tagger = nlp.create_pipe("tagger")
     with pytest.warns(UserWarning):
         tagger.begin_training()
     nlp.add_pipe(tagger)
     with pytest.warns(UserWarning):
         nlp.begin_training()
     nlp.vocab.lookups.add_table("lemma_lookup")
+    nlp.vocab.lookups.add_table("lexeme_norm")
+    nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
     with pytest.warns(None) as record:
         nlp.begin_training()
     assert not record.list
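Both tests, like the warnings themselves, point at the spacy-lookups-data package as the normal source of these tables. A sketch of the happy path (that a blank v2.3 pipeline picks the tables up automatically once the package is installed is an assumption based on the warning text):

    # pip install spacy spacy-lookups-data
    import spacy

    nlp = spacy.blank("en")
    print("lexeme_norm" in nlp.vocab.lookups)  # expected: True with the package installed
    nlp.add_pipe(nlp.create_pipe("ner"))
    nlp.begin_training()  # no W033 expected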