Update lexeme_norm checks

* Add util method for check
* Add new languages to list with lexeme norm tables
* Add check to all relevant components
* Add config details to warning message

Note that we're not actually inspecting the model config to see if `NORM` is used as an attribute, so it may warn in cases where it's not relevant.
2025-10-25 05:01:02 +03:00 · 2021-03-19 10:45:16 +01:00 · 2021-03-19 10:45:16 +01:00 · 39153ef90f
commit 39153ef90f
parent 34e13c1161
6 changed files with 31 additions and 12 deletions
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -73,8 +73,13 @@ class Warnings:
            "degree. If this is intentional or the language you're using "
            "doesn't have a normalization table, please ignore this warning. "
            "If this is surprising, make sure you have the spacy-lookups-data "
-            "package installed. The languages with lexeme normalization tables "
-            "are currently: {langs}")
+            "package installed and load the table in your config. The "
+            "languages with lexeme normalization tables are currently: "
+            "{langs}\n\nLoad the table in your config with:\n\n"
+            "[initialize.lookups]\n"
+            "@misc = \"spacy.LookupsDataLoader.v1\"\n"
+            "lang = ${{nlp.lang}}\n"
+            "tables = [\"lexeme_norm\"]\n")
    W035 = ('Discarding subpattern "{pattern}" due to an unrecognized '
            "attribute or operator.")

--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@ -137,6 +137,7 @@ class Morphologizer(Tagger):
        DOCS: https://spacy.io/api/morphologizer#initialize
        """
        validate_get_examples(get_examples, "Morphologizer.initialize")
+        util.check_lexeme_norms(self.vocab, "morphologizer")
        if labels is not None:
            self.cfg["labels_morph"] = labels["morph"]
            self.cfg["labels_pos"] = labels["pos"]
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@ -138,6 +138,7 @@ class SentenceRecognizer(Tagger):
        DOCS: https://spacy.io/api/sentencerecognizer#initialize
        """
        validate_get_examples(get_examples, "SentenceRecognizer.initialize")
+        util.check_lexeme_norms(self.vocab, "senter")
        doc_sample = []
        label_sample = []
        assert self.labels, Errors.E924.format(name=self.name)
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@ -249,6 +249,7 @@ class Tagger(TrainablePipe):
        DOCS: https://spacy.io/api/tagger#initialize
        """
        validate_get_examples(get_examples, "Tagger.initialize")
+        util.check_lexeme_norms(self.vocab, "tagger")
        if labels is not None:
            for tag in labels:
                self.add_label(tag)
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@ -493,10 +493,7 @@ cdef class Parser(TrainablePipe):

    def initialize(self, get_examples, nlp=None, labels=None):
        validate_get_examples(get_examples, "Parser.initialize")
-        lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
-        if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
-            langs = ", ".join(util.LEXEME_NORM_LANGS)
-            util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs))
+        util.check_lexeme_norms(self.vocab, "parser or NER")
        if labels is not None:
            actions = dict(labels)
        else:
--- a/spacy/util.py
+++ b/spacy/util.py
@ -59,7 +59,7 @@ if TYPE_CHECKING:

 OOV_RANK = numpy.iinfo(numpy.uint64).max
 DEFAULT_OOV_PROB = -20
-LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
+LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]

 # Default order of sections in the config.cfg. Not all sections needs to exist,
 # and additional sections are added at the end, in alphabetical order.
@ -70,7 +70,9 @@ CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "co

 logger = logging.getLogger("spacy")
 logger_stream_handler = logging.StreamHandler()
-logger_stream_handler.setFormatter(logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s"))
+logger_stream_handler.setFormatter(
+    logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s")
+)
 logger.addHandler(logger_stream_handler)


@ -1454,8 +1456,11 @@ def is_cython_func(func: Callable) -> bool:
    if hasattr(func, attr):  # function or class instance
        return True
    # https://stackoverflow.com/a/55767059
-    if hasattr(func, "__qualname__") and hasattr(func, "__module__") \
-        and func.__module__ in sys.modules:  # method
+    if (
+        hasattr(func, "__qualname__")
+        and hasattr(func, "__module__")
+        and func.__module__ in sys.modules
+    ):  # method
        cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
        return hasattr(cls_func, attr)
    return False
@ -1508,7 +1513,16 @@ def warn_if_jupyter_cupy():
    """
    if is_in_jupyter():
        from thinc.backends.cupy_ops import CupyOps
+
        if CupyOps.xp is not None:
            from thinc.backends import contextvars_eq_thread_ops
+
            if not contextvars_eq_thread_ops():
                warnings.warn(Warnings.W111)
+
+
+def check_lexeme_norms(vocab, component_name):
+    lexeme_norms = vocab.lookups.get_table("lexeme_norm", {})
+    if len(lexeme_norms) == 0 and vocab.lang in LEXEME_NORM_LANGS:
+        langs = ", ".join(LEXEME_NORM_LANGS)
+        logger.debug(Warnings.W033.format(model=component_name, langs=langs))