Mirror of https://github.com/explosion/spaCy.git (synced 2025-02-03)
Merge pull request #8285 from adrianeboyd/feature/refactor-logger-warnings

Refactor warnings

Commit dbbeab2506
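This PR replaces spaCy's user-facing logger.warning calls with the standard
warnings.warn, and centralizes the default filters in a new
setup_default_warnings() helper in spacy/errors.py. A practical consequence,
sketched below, is that spaCy warnings can now be managed with Python's
standard warnings machinery instead of by muting the "spacy" logger. The
sketch assumes the usual bracketed code prefix ("[W036]", "[W108]", ...) at
the start of each message, which is what the new filters in this diff match
against.

import warnings

import spacy

# Promote the "component has no patterns" warning (W036) to a hard error.
# warnings.filterwarnings treats `message` as a regex matched against the
# start of the warning text, hence the escaped brackets.
warnings.filterwarnings("error", message=r"\[W036\]")

nlp = spacy.blank("en")
nlp.add_pipe("entity_ruler")  # no patterns added
# nlp("hello")  # with the filter above, W036 would now raise an exception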
spacy/__init__.py
@@ -1,10 +1,10 @@
 from typing import Union, Iterable, Dict, Any
 from pathlib import Path
-import warnings
 import sys

-warnings.filterwarnings("ignore", message="numpy.dtype size changed")  # noqa
-warnings.filterwarnings("ignore", message="numpy.ufunc size changed")  # noqa
+# set library-specific custom warning handling before doing anything else
+from .errors import setup_default_warnings
+setup_default_warnings()

 # These are imported as part of the API
 from thinc.api import prefer_gpu, require_gpu, require_cpu  # noqa: F401
spacy/errors.py
@@ -1,3 +1,6 @@
+import warnings
+
+
 def add_codes(err_cls):
     """Add error codes to string messages via class attribute names."""

@@ -12,6 +15,33 @@ def add_codes(err_cls):
     return ErrorsWithCodes()


+def setup_default_warnings():
+    # ignore certain numpy warnings
+    filter_warning("ignore", error_msg="numpy.dtype size changed")  # noqa
+    filter_warning("ignore", error_msg="numpy.ufunc size changed")  # noqa
+
+    # warn about entity_ruler & matcher having no patterns only once
+    for pipe in ["matcher", "entity_ruler"]:
+        filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
+
+    # warn once about lemmatizer without required POS
+    filter_warning("once", error_msg="[W108]")
+
+
+def filter_warning(action: str, error_msg: str):
+    """Customize how spaCy should handle a certain warning.
+
+    error_msg (str): e.g. "W006", or a full error message
+    action (str): "default", "error", "ignore", "always", "module" or "once"
+    """
+    warnings.filterwarnings(action, message=_escape_warning_msg(error_msg))
+
+
+def _escape_warning_msg(msg):
+    """To filter with warnings.filterwarnings, the [] brackets need to be escaped"""
+    return msg.replace("[", "\\[").replace("]", "\\]")
+
+
 # fmt: off

 @add_codes
@@ -80,8 +110,9 @@ class Warnings:
             "@misc = \"spacy.LookupsDataLoader.v1\"\n"
             "lang = ${{nlp.lang}}\n"
             "tables = [\"lexeme_norm\"]\n")
-    W035 = ('Discarding subpattern "{pattern}" due to an unrecognized '
+    W035 = ("Discarding subpattern '{pattern}' due to an unrecognized "
             "attribute or operator.")
+    W036 = ("The component '{name}' does not have any patterns defined.")

     # New warnings added in v3.x
     W086 = ("Component '{listener}' will be (re)trained, but it needs the component "
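Why the escaping helper matters: warnings.filterwarnings compiles its message
argument as a regular expression and matches it against the start of the
warning text, so an unescaped code such as "[W108]" would be parsed as a
character class rather than as a literal prefix. A minimal standalone sketch
of the difference:

import re

# As a regex, "[W108]" is a character class matching one of W, 1, 0, 8,
# so it does not match the literal "[" that starts the warning text:
assert re.compile(r"[W108]").match("[W108] some warning text") is None

# The escaped form produced by _escape_warning_msg matches literally:
assert re.compile(r"\[W108\]").match("[W108] some warning text")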
spacy/language.py
@@ -689,7 +689,7 @@ class Language:
         if self.vocab.vectors.shape != source.vocab.vectors.shape or \
                 self.vocab.vectors.key2row != source.vocab.vectors.key2row or \
                 self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes():
-            util.logger.warning(Warnings.W113.format(name=source_name))
+            warnings.warn(Warnings.W113.format(name=source_name))
         if not source_name in source.component_names:
             raise KeyError(
                 Errors.E944.format(
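Because W113 is now routed through warnings.warn, the filter_warning helper
added in spacy/errors.py can change how it is handled. A hypothetical strict
setup that promotes the vectors-mismatch warning to an exception:

from spacy.errors import filter_warning

# "error" is one of the standard filter actions listed in filter_warning's
# docstring; the helper escapes the brackets before registering the filter.
filter_warning("error", error_msg="[W113]")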
spacy/matcher/dependencymatcher.pyx
@@ -4,6 +4,7 @@ from collections import defaultdict
 from itertools import product

 import numpy
+import warnings

 from .matcher cimport Matcher
 from ..vocab cimport Vocab
@@ -11,7 +12,6 @@ from ..tokens.doc cimport Doc

 from ..errors import Errors, Warnings
 from ..tokens import Span
-from ..util import logger


 DELIMITER = "||"
@@ -282,7 +282,7 @@ cdef class DependencyMatcher:
         keys_to_position_maps = defaultdict(lambda: defaultdict(list))
         for match_id, start, end in self._matcher(doc):
             if start + 1 != end:
-                logger.warning(Warnings.W110.format(tokens=[t.text for t in doc[start:end]], pattern=self._matcher.get(match_id)[1][0][0]))
+                warnings.warn(Warnings.W110.format(tokens=[t.text for t in doc[start:end]], pattern=self._matcher.get(match_id)[1][0][0]))
             token = doc[start]
             root = ([token] + list(token.ancestors))[-1]
             keys_to_position_maps[root.i][match_id].append(start)
spacy/matcher/matcher.pyx
@@ -138,6 +138,11 @@ cdef class Matcher:
         self._filter[key] = greedy
         self._patterns[key].extend(patterns)

+    def _require_patterns(self) -> None:
+        """Raise a warning if this component has no patterns defined."""
+        if len(self) == 0:
+            warnings.warn(Warnings.W036.format(name="matcher"))
+
     def remove(self, key):
         """Remove a rule from the matcher. A KeyError is raised if the key does
         not exist.
@@ -215,6 +220,7 @@ cdef class Matcher:
         If with_alignments is set to True and as_spans is set to False,
         a list of `(match_id, start, end, alignments)` tuples is returned.
         """
+        self._require_patterns()
         if isinstance(doclike, Doc):
             doc = doclike
             length = len(doc)
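The effect of the new _require_patterns check, as a short usage sketch (not
part of the diff; a blank English pipeline is assumed):

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)  # no patterns added yet

doc = nlp("This is quite something")
# Emits W036 as a UserWarning (once per process by default) instead of
# silently returning nothing:
matches = matcher(doc)
assert matches == []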
spacy/pipeline/entityruler.py
@@ -1,3 +1,4 @@
+import warnings
 from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence
 from collections import defaultdict
 from pathlib import Path
@@ -6,7 +7,7 @@ import srsly
 from .pipe import Pipe
 from ..training import Example
 from ..language import Language
-from ..errors import Errors
+from ..errors import Errors, Warnings
 from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
 from ..tokens import Doc, Span
 from ..matcher import Matcher, PhraseMatcher
@@ -139,6 +140,7 @@ class EntityRuler(Pipe):
             error_handler(self.name, self, [doc], e)

     def match(self, doc: Doc):
+        self._require_patterns()
         matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
         matches = set(
             [(m_id, start, end) for m_id, start, end in matches if start != end]
@@ -327,6 +329,11 @@ class EntityRuler(Pipe):
             self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
         )

+    def _require_patterns(self) -> None:
+        """Raise a warning if this component has no patterns defined."""
+        if len(self) == 0:
+            warnings.warn(Warnings.W036.format(name=self.name))
+
     def _split_label(self, label: str) -> Tuple[str, str]:
         """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
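Since the EntityRuler warning goes through the standard machinery, a caller
that intentionally adds the component before loading its patterns can silence
it locally. A sketch, assuming the patterns arrive later (e.g. from disk):

import warnings

import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")  # patterns to be loaded later
assert len(ruler) == 0

with warnings.catch_warnings():
    # Ignore W036 only inside this block; brackets are regex-escaped.
    warnings.filterwarnings("ignore", message=r"\[W036\]")
    doc = nlp("hello world")
assert len(doc.ents) == 0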
spacy/pipeline/lemmatizer.py
@@ -2,6 +2,8 @@ from typing import Optional, List, Dict, Any, Callable, Iterable, Union, Tuple
 from thinc.api import Model
 from pathlib import Path

+import warnings
+
 from .pipe import Pipe
 from ..errors import Errors, Warnings
 from ..language import Language
@@ -182,7 +184,7 @@ class Lemmatizer(Pipe):
         univ_pos = token.pos_.lower()
         if univ_pos in ("", "eol", "space"):
             if univ_pos == "":
-                logger.warning(Warnings.W108.format(text=string))
+                warnings.warn(Warnings.W108.format(text=string))
             return [string.lower()]
         # See Issue #435 for an example of where this logic is required.
         if self.is_base_form(token):
spacy/tests/doc/test_doc_api.py
@@ -2,8 +2,6 @@ import weakref

 import pytest
 import numpy
-import logging
-import mock

 from spacy.lang.xx import MultiLanguage
 from spacy.tokens import Doc, Span, Token
@@ -158,13 +156,10 @@ def test_doc_api_serialize(en_tokenizer, text):
     def inner_func(d1, d2):
         return "hello!"

-    logger = logging.getLogger("spacy")
-    with mock.patch.object(logger, "warning") as mock_warning:
-        _ = tokens.to_bytes()  # noqa: F841
-        mock_warning.assert_not_called()
+    _ = tokens.to_bytes()  # noqa: F841
+    with pytest.warns(UserWarning):
         tokens.user_hooks["similarity"] = inner_func
         _ = tokens.to_bytes()  # noqa: F841
-        mock_warning.assert_called_once()


 def test_doc_api_set_ents(en_tokenizer):
spacy/tests/matcher/test_matcher_api.py
@@ -33,6 +33,15 @@ def test_matcher_from_api_docs(en_vocab):
     assert len(patterns[0])


+def test_matcher_empty_patterns_warns(en_vocab):
+    matcher = Matcher(en_vocab)
+    assert len(matcher) == 0
+    doc = Doc(en_vocab, words=["This", "is", "quite", "something"])
+    with pytest.warns(UserWarning):
+        matcher(doc)
+    assert len(doc.ents) == 0
+
+
 def test_matcher_from_usage_docs(en_vocab):
     text = "Wow 😀 This is really cool! 😂 😂"
     doc = Doc(en_vocab, words=text.split(" "))
spacy/tests/pipeline/test_entity_ruler.py
@@ -46,6 +46,17 @@ def test_entity_ruler_init(nlp, patterns):
     assert doc.ents[1].label_ == "BYE"


+def test_entity_ruler_no_patterns_warns(nlp):
+    ruler = EntityRuler(nlp)
+    assert len(ruler) == 0
+    assert len(ruler.labels) == 0
+    nlp.add_pipe("entity_ruler")
+    assert nlp.pipe_names == ["entity_ruler"]
+    with pytest.warns(UserWarning):
+        doc = nlp("hello world bye bye")
+    assert len(doc.ents) == 0
+
+
 def test_entity_ruler_init_patterns(nlp, patterns):
     # initialize with patterns
     ruler = nlp.add_pipe("entity_ruler")
spacy/tests/pipeline/test_lemmatizer.py
@@ -1,6 +1,4 @@
 import pytest
-import logging
-import mock
 import pickle
 from spacy import util, registry
 from spacy.lang.en import English
@@ -59,10 +57,10 @@ def test_lemmatizer_config(nlp):

     # warning if no POS assigned
     doc = nlp.make_doc("coping")
-    logger = logging.getLogger("spacy")
-    with mock.patch.object(logger, "warning") as mock_warning:
+    with pytest.warns(UserWarning):
         doc = lemmatizer(doc)
-        mock_warning.assert_called_once()
+    # warns once by default
+    doc = lemmatizer(doc)

     # works with POS
     doc = nlp.make_doc("coping")
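The "# warns once by default" step relies on setup_default_warnings
registering W108 with the "once" action, and on Python's "once" filter
suppressing later warnings that carry the same message text. A standalone
sketch of that behavior (the message below is illustrative, not spaCy's
exact W108 text):

import warnings

with warnings.catch_warnings(record=True) as caught:
    # The narrow filter is prepended, so it matches before the defaults.
    warnings.filterwarnings("once", message=r"\[W108\]")
    warnings.warn("[W108] illustrative lemmatizer message")
    warnings.warn("[W108] illustrative lemmatizer message")  # suppressed

assert len(caught) == 1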
spacy/tests/pipeline/test_pipe_factories.py
@@ -1,6 +1,4 @@
 import pytest
-import mock
-import logging
 from spacy.language import Language
 from spacy.lang.en import English
 from spacy.lang.de import German
@@ -437,10 +435,8 @@ def test_pipe_factories_from_source_language_subclass():
     nlp = English()
     nlp.vocab.vectors.resize((1, 4))
     nlp.vocab.vectors.add("cat", vector=[1, 2, 3, 4])
-    logger = logging.getLogger("spacy")
-    with mock.patch.object(logger, "warning") as mock_warning:
+    with pytest.warns(UserWarning):
         nlp.add_pipe("tagger", source=source_nlp)
-        mock_warning.assert_called()


 def test_pipe_factories_from_source_custom():
spacy/tokens/doc.pyx
@@ -1318,7 +1318,7 @@ cdef class Doc:
         if "user_data_values" not in exclude:
             serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values)
         if "user_hooks" not in exclude and any((self.user_hooks, self.user_token_hooks, self.user_span_hooks)):
-            util.logger.warning(Warnings.W109)
+            warnings.warn(Warnings.W109)
         return util.to_dict(serializers, exclude)

     def from_dict(self, msg, *, exclude=tuple()):
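For callers that knowingly serialize a Doc with user hooks attached, W109 can
now be silenced at the call site rather than by muting the "spacy" logger. A
self-contained sketch:

import warnings

import spacy

nlp = spacy.blank("en")
doc = nlp("hello world")
doc.user_hooks["similarity"] = lambda d1, d2: 1.0  # hooks are not serialized

with warnings.catch_warnings():
    # W109 warns that user hooks are lost during serialization.
    warnings.filterwarnings("ignore", message=r"\[W109\]")
    data = doc.to_bytes()  # no W109 reaches the user here
assert data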