mirror of https://github.com/explosion/spaCy.git
Merge pull request #7495 from adrianeboyd/bugfix/norm-ux
Update lexeme_norm checks
commit d545ab4ca4
spacy/errors.py
@@ -73,8 +73,13 @@ class Warnings:
             "degree. If this is intentional or the language you're using "
             "doesn't have a normalization table, please ignore this warning. "
             "If this is surprising, make sure you have the spacy-lookups-data "
-            "package installed. The languages with lexeme normalization tables "
-            "are currently: {langs}")
+            "package installed and load the table in your config. The "
+            "languages with lexeme normalization tables are currently: "
+            "{langs}\n\nLoad the table in your config with:\n\n"
+            "[initialize.lookups]\n"
+            "@misc = \"spacy.LookupsDataLoader.v1\"\n"
+            "lang = ${{nlp.lang}}\n"
+            "tables = [\"lexeme_norm\"]\n")
     W035 = ('Discarding subpattern "{pattern}" due to an unrecognized '
             "attribute or operator.")
 
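The [initialize.lookups] block that the updated warning suggests is resolved when nlp.initialize() runs. A minimal sketch of the same effect done by hand, assuming spaCy v3 and the spacy-lookups-data package are installed:

import spacy
from spacy.lookups import load_lookups

nlp = spacy.blank("en")
# load_lookups fetches the named tables for a language from spacy-lookups-data
lookups = load_lookups(lang=nlp.lang, tables=["lexeme_norm"])
nlp.vocab.lookups.add_table("lexeme_norm", lookups.get_table("lexeme_norm"))
assert nlp.vocab.lookups.has_table("lexeme_norm")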
spacy/pipeline/morphologizer.pyx
@@ -137,6 +137,7 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#initialize
         """
         validate_get_examples(get_examples, "Morphologizer.initialize")
+        util.check_lexeme_norms(self.vocab, "morphologizer")
         if labels is not None:
             self.cfg["labels_morph"] = labels["morph"]
             self.cfg["labels_pos"] = labels["pos"]
spacy/pipeline/senter.pyx
@@ -138,6 +138,7 @@ class SentenceRecognizer(Tagger):
         DOCS: https://spacy.io/api/sentencerecognizer#initialize
         """
         validate_get_examples(get_examples, "SentenceRecognizer.initialize")
+        util.check_lexeme_norms(self.vocab, "senter")
         doc_sample = []
         label_sample = []
         assert self.labels, Errors.E924.format(name=self.name)
spacy/pipeline/tagger.pyx
@@ -249,6 +249,7 @@ class Tagger(TrainablePipe):
         DOCS: https://spacy.io/api/tagger#initialize
         """
         validate_get_examples(get_examples, "Tagger.initialize")
+        util.check_lexeme_norms(self.vocab, "tagger")
         if labels is not None:
             for tag in labels:
                 self.add_label(tag)
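The same one-line guard now runs in the morphologizer, senter, and tagger initializers. A minimal sketch of the check firing, assuming this change is applied and no lookups tables are loaded:

import logging
import spacy
from spacy import util

logging.getLogger("spacy").setLevel(logging.DEBUG)  # W033 is logged at DEBUG
nlp = spacy.blank("en")  # "en" is in LEXEME_NORM_LANGS
util.check_lexeme_norms(nlp.vocab, "tagger")  # no lexeme_norm table -> logs W033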
spacy/pipeline/transition_parser.pyx
@@ -493,10 +493,7 @@ cdef class Parser(TrainablePipe):
 
     def initialize(self, get_examples, nlp=None, labels=None):
         validate_get_examples(get_examples, "Parser.initialize")
-        lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
-        if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
-            langs = ", ".join(util.LEXEME_NORM_LANGS)
-            util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs))
+        util.check_lexeme_norms(self.vocab, "parser or NER")
         if labels is not None:
             actions = dict(labels)
         else:
spacy/util.py
@@ -59,7 +59,7 @@ if TYPE_CHECKING:
 
 OOV_RANK = numpy.iinfo(numpy.uint64).max
 DEFAULT_OOV_PROB = -20
-LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
+LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
 
 # Default order of sections in the config.cfg. Not all sections needs to exist,
 # and additional sections are added at the end, in alphabetical order.
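"cs" and "mk" are the new entries. A hedged sketch to confirm that the installed spacy-lookups-data version actually ships lexeme_norm tables for them:

from spacy.lookups import load_lookups

for lang in ("cs", "mk"):
    # strict=False skips missing tables instead of raising an error
    lookups = load_lookups(lang, ["lexeme_norm"], strict=False)
    print(lang, lookups.has_table("lexeme_norm"))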
spacy/util.py
@@ -70,7 +70,9 @@ CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "co
 
 logger = logging.getLogger("spacy")
 logger_stream_handler = logging.StreamHandler()
-logger_stream_handler.setFormatter(logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s"))
+logger_stream_handler.setFormatter(
+    logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s")
+)
 logger.addHandler(logger_stream_handler)
 
 
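With this handler attached, spaCy's log records print like "[2021-03-22 12:00:00,000] [DEBUG] message". The logger sets no level of its own, so it inherits WARNING from the root logger; to surface the DEBUG-level W033 message from the new check, lower it explicitly:

import logging

logging.getLogger("spacy").setLevel(logging.DEBUG)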
spacy/util.py
@@ -1454,10 +1456,13 @@ def is_cython_func(func: Callable) -> bool:
     if hasattr(func, attr):  # function or class instance
         return True
     # https://stackoverflow.com/a/55767059
-    if hasattr(func, "__qualname__") and hasattr(func, "__module__") \
-            and func.__module__ in sys.modules:  # method
-        cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
-        return hasattr(cls_func, attr)
+    if (
+        hasattr(func, "__qualname__")
+        and hasattr(func, "__module__")
+        and func.__module__ in sys.modules
+    ):  # method
+        cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
+        return hasattr(cls_func, attr)
     return False
 
 
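A small self-contained illustration of the lookup in the rewritten condition: a method's __qualname__ prefix names its defining class, which can be fetched from the function's module. The Pipe and predict names are made up for illustration:

import sys

class Pipe:
    def predict(self, docs):
        return docs

func = Pipe.predict
module = sys.modules[func.__module__]  # the module where the class is defined
cls = vars(module)[func.__qualname__.split(".")[0]]  # "Pipe.predict" -> Pipe
assert cls is Pipe  # is_cython_func then probes this class for the Cython attr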
spacy/util.py
@@ -1508,9 +1513,16 @@ def warn_if_jupyter_cupy():
     """
     if is_in_jupyter():
         from thinc.backends.cupy_ops import CupyOps
 
         if CupyOps.xp is not None:
             from thinc.backends import contextvars_eq_thread_ops
 
             if not contextvars_eq_thread_ops():
                 warnings.warn(Warnings.W111)
+
+
+def check_lexeme_norms(vocab, component_name):
+    lexeme_norms = vocab.lookups.get_table("lexeme_norm", {})
+    if len(lexeme_norms) == 0 and vocab.lang in LEXEME_NORM_LANGS:
+        langs = ", ".join(LEXEME_NORM_LANGS)
+        logger.debug(Warnings.W033.format(model=component_name, langs=langs))
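A hedged usage sketch for the new helper with a tiny hand-made table, so it runs without spacy-lookups-data (the table entry is made up):

import spacy
from spacy import util

nlp = spacy.blank("en")
nlp.vocab.lookups.add_table("lexeme_norm", {"foo": "bar"})
util.check_lexeme_norms(nlp.vocab, "tagger")  # table present: stays silent

Centralizing the check here means the parser's former inline version and the new per-component calls all share one code path and one W033 message.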