diff --git a/spacy/__init__.py b/spacy/__init__.py
index 1eef7e621..d07931cfd 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -1,10 +1,10 @@
 from typing import Union, Iterable, Dict, Any
 from pathlib import Path
-import warnings
 import sys
 
-warnings.filterwarnings("ignore", message="numpy.dtype size changed")  # noqa
-warnings.filterwarnings("ignore", message="numpy.ufunc size changed")  # noqa
+# set library-specific custom warning handling before doing anything else
+from .errors import setup_default_warnings
+setup_default_warnings()
 
 # These are imported as part of the API
 from thinc.api import prefer_gpu, require_gpu, require_cpu  # noqa: F401
diff --git a/spacy/errors.py b/spacy/errors.py
index 7cf9e54e4..2e8cc4494 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1,3 +1,6 @@
+import warnings
+
+
 def add_codes(err_cls):
     """Add error codes to string messages via class attribute names."""
 
@@ -12,6 +15,33 @@ def add_codes(err_cls):
     return ErrorsWithCodes()
 
 
+def setup_default_warnings():
+    # ignore certain numpy warnings
+    filter_warning("ignore", error_msg="numpy.dtype size changed")  # noqa
+    filter_warning("ignore", error_msg="numpy.ufunc size changed")  # noqa
+
+    # warn about entity_ruler & matcher having no patterns only once
+    for pipe in ["matcher", "entity_ruler"]:
+        filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
+
+    # warn once about lemmatizer without required POS
+    filter_warning("once", error_msg="[W108]")
+
+
+def filter_warning(action: str, error_msg: str):
+    """Customize how spaCy should handle a certain warning.
+
+    error_msg (str): e.g. "W006", or a full error message
+    action (str): "default", "error", "ignore", "always", "module" or "once"
+    """
+    warnings.filterwarnings(action, message=_escape_warning_msg(error_msg))
+
+
+def _escape_warning_msg(msg):
+    """To filter with warnings.filterwarnings, the [] brackets need to be escaped"""
+    return msg.replace("[", "\\[").replace("]", "\\]")
+
+
 # fmt: off
 
 @add_codes
@@ -80,8 +110,9 @@ class Warnings:
             "@misc = \"spacy.LookupsDataLoader.v1\"\n"
             "lang = ${{nlp.lang}}\n"
             "tables = [\"lexeme_norm\"]\n")
-    W035 = ('Discarding subpattern "{pattern}" due to an unrecognized '
+    W035 = ("Discarding subpattern '{pattern}' due to an unrecognized "
             "attribute or operator.")
+    W036 = ("The component '{name}' does not have any patterns defined.")
 
     # New warnings added in v3.x
     W086 = ("Component '{listener}' will be (re)trained, but it needs the component "
diff --git a/spacy/language.py b/spacy/language.py
index 1a447c11b..7786089a5 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -689,7 +689,7 @@ class Language:
         if self.vocab.vectors.shape != source.vocab.vectors.shape or \
             self.vocab.vectors.key2row != source.vocab.vectors.key2row or \
             self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes():
-            util.logger.warning(Warnings.W113.format(name=source_name))
+            warnings.warn(Warnings.W113.format(name=source_name))
         if not source_name in source.component_names:
             raise KeyError(
                 Errors.E944.format(
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index 0e601281a..b6e84a5da 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -4,6 +4,7 @@ from collections import defaultdict
 from itertools import product
 
 import numpy
+import warnings
 
 from .matcher cimport Matcher
 from ..vocab cimport Vocab
@@ -11,7 +12,6 @@ from ..tokens.doc cimport Doc
 
 from ..errors import Errors, Warnings
 from ..tokens import Span
-from ..util import logger
DELIMITER = "||" @@ -282,7 +282,7 @@ cdef class DependencyMatcher: keys_to_position_maps = defaultdict(lambda: defaultdict(list)) for match_id, start, end in self._matcher(doc): if start + 1 != end: - logger.warning(Warnings.W110.format(tokens=[t.text for t in doc[start:end]], pattern=self._matcher.get(match_id)[1][0][0])) + warnings.warn(Warnings.W110.format(tokens=[t.text for t in doc[start:end]], pattern=self._matcher.get(match_id)[1][0][0])) token = doc[start] root = ([token] + list(token.ancestors))[-1] keys_to_position_maps[root.i][match_id].append(start) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index dae12c3f6..6fd8bdb03 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -138,6 +138,11 @@ cdef class Matcher: self._filter[key] = greedy self._patterns[key].extend(patterns) + def _require_patterns(self) -> None: + """Raise a warning if this component has no patterns defined.""" + if len(self) == 0: + warnings.warn(Warnings.W036.format(name="matcher")) + def remove(self, key): """Remove a rule from the matcher. A KeyError is raised if the key does not exist. @@ -215,6 +220,7 @@ cdef class Matcher: If with_alignments is set to True and as_spans is set to False, A list of `(match_id, start, end, alignments)` tuples is returned. """ + self._require_patterns() if isinstance(doclike, Doc): doc = doclike length = len(doc) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 03730f772..78269f180 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -1,3 +1,4 @@ +import warnings from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence from collections import defaultdict from pathlib import Path @@ -6,7 +7,7 @@ import srsly from .pipe import Pipe from ..training import Example from ..language import Language -from ..errors import Errors +from ..errors import Errors, Warnings from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList from ..tokens import Doc, Span from ..matcher import Matcher, PhraseMatcher @@ -139,6 +140,7 @@ class EntityRuler(Pipe): error_handler(self.name, self, [doc], e) def match(self, doc: Doc): + self._require_patterns() matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc)) matches = set( [(m_id, start, end) for m_id, start, end in matches if start != end] @@ -327,6 +329,11 @@ class EntityRuler(Pipe): self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate ) + def _require_patterns(self) -> None: + """Raise a warning if this component has no patterns defined.""" + if len(self) == 0: + warnings.warn(Warnings.W036.format(name=self.name)) + def _split_label(self, label: str) -> Tuple[str, str]: """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index cfe405efa..87504fade 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -2,6 +2,8 @@ from typing import Optional, List, Dict, Any, Callable, Iterable, Union, Tuple from thinc.api import Model from pathlib import Path +import warnings + from .pipe import Pipe from ..errors import Errors, Warnings from ..language import Language @@ -182,7 +184,7 @@ class Lemmatizer(Pipe): univ_pos = token.pos_.lower() if univ_pos in ("", "eol", "space"): if univ_pos == "": - logger.warning(Warnings.W108.format(text=string)) + warnings.warn(Warnings.W108.format(text=string)) return [string.lower()] # See Issue #435 for example of where this logic is 
requied. if self.is_base_form(token): diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index d7452a802..358724517 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -2,8 +2,6 @@ import weakref import pytest import numpy -import logging -import mock from spacy.lang.xx import MultiLanguage from spacy.tokens import Doc, Span, Token @@ -158,13 +156,10 @@ def test_doc_api_serialize(en_tokenizer, text): def inner_func(d1, d2): return "hello!" - logger = logging.getLogger("spacy") - with mock.patch.object(logger, "warning") as mock_warning: - _ = tokens.to_bytes() # noqa: F841 - mock_warning.assert_not_called() + _ = tokens.to_bytes() # noqa: F841 + with pytest.warns(UserWarning): tokens.user_hooks["similarity"] = inner_func _ = tokens.to_bytes() # noqa: F841 - mock_warning.assert_called_once() def test_doc_api_set_ents(en_tokenizer): diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 094bf22a6..4e6b4bfae 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -33,6 +33,15 @@ def test_matcher_from_api_docs(en_vocab): assert len(patterns[0]) +def test_matcher_empty_patterns_warns(en_vocab): + matcher = Matcher(en_vocab) + assert len(matcher) == 0 + doc = Doc(en_vocab, words=["This", "is", "quite", "something"]) + with pytest.warns(UserWarning): + matcher(doc) + assert len(doc.ents) == 0 + + def test_matcher_from_usage_docs(en_vocab): text = "Wow 😀 This is really cool! 😂 😂" doc = Doc(en_vocab, words=text.split(" ")) diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index 79ad44abd..4a01ce183 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -46,6 +46,17 @@ def test_entity_ruler_init(nlp, patterns): assert doc.ents[1].label_ == "BYE" +def test_entity_ruler_no_patterns_warns(nlp): + ruler = EntityRuler(nlp) + assert len(ruler) == 0 + assert len(ruler.labels) == 0 + nlp.add_pipe("entity_ruler") + assert nlp.pipe_names == ["entity_ruler"] + with pytest.warns(UserWarning): + doc = nlp("hello world bye bye") + assert len(doc.ents) == 0 + + def test_entity_ruler_init_patterns(nlp, patterns): # initialize with patterns ruler = nlp.add_pipe("entity_ruler") diff --git a/spacy/tests/pipeline/test_lemmatizer.py b/spacy/tests/pipeline/test_lemmatizer.py index 3c16d3bcb..1bec8696c 100644 --- a/spacy/tests/pipeline/test_lemmatizer.py +++ b/spacy/tests/pipeline/test_lemmatizer.py @@ -1,6 +1,4 @@ import pytest -import logging -import mock import pickle from spacy import util, registry from spacy.lang.en import English @@ -59,10 +57,10 @@ def test_lemmatizer_config(nlp): # warning if no POS assigned doc = nlp.make_doc("coping") - logger = logging.getLogger("spacy") - with mock.patch.object(logger, "warning") as mock_warning: + with pytest.warns(UserWarning): doc = lemmatizer(doc) - mock_warning.assert_called_once() + # warns once by default + doc = lemmatizer(doc) # works with POS doc = nlp.make_doc("coping") diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index c5cc62661..b99e9a863 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -1,6 +1,4 @@ import pytest -import mock -import logging from spacy.language import Language from spacy.lang.en import English from spacy.lang.de import German @@ -437,10 +435,8 @@ def 
test_pipe_factories_from_source_language_subclass(): nlp = English() nlp.vocab.vectors.resize((1, 4)) nlp.vocab.vectors.add("cat", vector=[1, 2, 3, 4]) - logger = logging.getLogger("spacy") - with mock.patch.object(logger, "warning") as mock_warning: + with pytest.warns(UserWarning): nlp.add_pipe("tagger", source=source_nlp) - mock_warning.assert_called() def test_pipe_factories_from_source_custom(): diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index aae0ff374..28f8debf3 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1318,7 +1318,7 @@ cdef class Doc: if "user_data_values" not in exclude: serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values) if "user_hooks" not in exclude and any((self.user_hooks, self.user_token_hooks, self.user_span_hooks)): - util.logger.warning(Warnings.W109) + warnings.warn(Warnings.W109) return util.to_dict(serializers, exclude) def from_dict(self, msg, *, exclude=tuple()):
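Usage note (an illustration only, not part of the patch): a minimal sketch of the behaviour the diff introduces, assuming a build of spaCy with this change applied and a blank English pipeline. filter_warning and Warnings.W036 are the helpers added in spacy/errors.py above; everything else is existing spaCy or standard-library API.

    import warnings

    import spacy
    from spacy.errors import Warnings, filter_warning
    from spacy.matcher import Matcher

    nlp = spacy.blank("en")
    doc = nlp("This is quite something")

    # An empty Matcher now emits W036 as a standard UserWarning instead of
    # writing to the "spacy" logger, so it can be caught, filtered, or tested
    # with the stdlib warnings machinery (or pytest.warns, as in the tests).
    matcher = Matcher(nlp.vocab)  # no patterns added
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")  # override the default "once" filter
        matcher(doc)
    print(caught[0].message)  # [W036] The component 'matcher' does not have any patterns defined.

    # Callers who would rather fail hard can escalate the same warning:
    filter_warning("error", error_msg=Warnings.W036.format(name="matcher"))

The "once" filters registered by setup_default_warnings() are process-wide defaults; warnings.filterwarnings prepends to the filter list, so any later call to it (or to filter_warning) takes precedence, which is why the patch installs the defaults at import time in spacy/__init__.py.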