Merge pull request #8285 from adrianeboyd/feature/refactor-logger-warnings

Refactor warnings
This commit is contained in:
Adriane Boyd 2021-06-11 10:20:02 +02:00 committed by GitHub
commit dbbeab2506
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 82 additions and 27 deletions

View File

@ -1,10 +1,10 @@
from typing import Union, Iterable, Dict, Any
from pathlib import Path
import warnings
import sys
warnings.filterwarnings("ignore", message="numpy.dtype size changed") # noqa
warnings.filterwarnings("ignore", message="numpy.ufunc size changed") # noqa
# set library-specific custom warning handling before doing anything else
from .errors import setup_default_warnings
setup_default_warnings()
# These are imported as part of the API
from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401

View File

@ -1,3 +1,6 @@
import warnings
def add_codes(err_cls):
"""Add error codes to string messages via class attribute names."""
@ -12,6 +15,33 @@ def add_codes(err_cls):
return ErrorsWithCodes()
def setup_default_warnings():
# ignore certain numpy warnings
filter_warning("ignore", error_msg="numpy.dtype size changed") # noqa
filter_warning("ignore", error_msg="numpy.ufunc size changed") # noqa
# warn about entity_ruler & matcher having no patterns only once
for pipe in ["matcher", "entity_ruler"]:
filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
# warn once about lemmatizer without required POS
filter_warning("once", error_msg="[W108]")
def filter_warning(action: str, error_msg: str):
"""Customize how spaCy should handle a certain warning.
error_msg (str): e.g. "W006", or a full error message
action (str): "default", "error", "ignore", "always", "module" or "once"
"""
warnings.filterwarnings(action, message=_escape_warning_msg(error_msg))
def _escape_warning_msg(msg):
"""To filter with warnings.filterwarnings, the [] brackets need to be escaped"""
return msg.replace("[", "\\[").replace("]", "\\]")
# fmt: off
@add_codes
@ -80,8 +110,9 @@ class Warnings:
"@misc = \"spacy.LookupsDataLoader.v1\"\n"
"lang = ${{nlp.lang}}\n"
"tables = [\"lexeme_norm\"]\n")
W035 = ('Discarding subpattern "{pattern}" due to an unrecognized '
W035 = ("Discarding subpattern '{pattern}' due to an unrecognized "
"attribute or operator.")
W036 = ("The component '{name}' does not have any patterns defined.")
# New warnings added in v3.x
W086 = ("Component '{listener}' will be (re)trained, but it needs the component "

View File

@ -689,7 +689,7 @@ class Language:
if self.vocab.vectors.shape != source.vocab.vectors.shape or \
self.vocab.vectors.key2row != source.vocab.vectors.key2row or \
self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes():
util.logger.warning(Warnings.W113.format(name=source_name))
warnings.warn(Warnings.W113.format(name=source_name))
if not source_name in source.component_names:
raise KeyError(
Errors.E944.format(

View File

@ -4,6 +4,7 @@ from collections import defaultdict
from itertools import product
import numpy
import warnings
from .matcher cimport Matcher
from ..vocab cimport Vocab
@ -11,7 +12,6 @@ from ..tokens.doc cimport Doc
from ..errors import Errors, Warnings
from ..tokens import Span
from ..util import logger
DELIMITER = "||"
@ -282,7 +282,7 @@ cdef class DependencyMatcher:
keys_to_position_maps = defaultdict(lambda: defaultdict(list))
for match_id, start, end in self._matcher(doc):
if start + 1 != end:
logger.warning(Warnings.W110.format(tokens=[t.text for t in doc[start:end]], pattern=self._matcher.get(match_id)[1][0][0]))
warnings.warn(Warnings.W110.format(tokens=[t.text for t in doc[start:end]], pattern=self._matcher.get(match_id)[1][0][0]))
token = doc[start]
root = ([token] + list(token.ancestors))[-1]
keys_to_position_maps[root.i][match_id].append(start)

View File

@ -138,6 +138,11 @@ cdef class Matcher:
self._filter[key] = greedy
self._patterns[key].extend(patterns)
def _require_patterns(self) -> None:
"""Raise a warning if this component has no patterns defined."""
if len(self) == 0:
warnings.warn(Warnings.W036.format(name="matcher"))
def remove(self, key):
"""Remove a rule from the matcher. A KeyError is raised if the key does
not exist.
@ -215,6 +220,7 @@ cdef class Matcher:
If with_alignments is set to True and as_spans is set to False,
A list of `(match_id, start, end, alignments)` tuples is returned.
"""
self._require_patterns()
if isinstance(doclike, Doc):
doc = doclike
length = len(doc)

View File

@ -1,3 +1,4 @@
import warnings
from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence
from collections import defaultdict
from pathlib import Path
@ -6,7 +7,7 @@ import srsly
from .pipe import Pipe
from ..training import Example
from ..language import Language
from ..errors import Errors
from ..errors import Errors, Warnings
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
from ..tokens import Doc, Span
from ..matcher import Matcher, PhraseMatcher
@ -139,6 +140,7 @@ class EntityRuler(Pipe):
error_handler(self.name, self, [doc], e)
def match(self, doc: Doc):
self._require_patterns()
matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
matches = set(
[(m_id, start, end) for m_id, start, end in matches if start != end]
@ -327,6 +329,11 @@ class EntityRuler(Pipe):
self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
)
def _require_patterns(self) -> None:
"""Raise a warning if this component has no patterns defined."""
if len(self) == 0:
warnings.warn(Warnings.W036.format(name=self.name))
def _split_label(self, label: str) -> Tuple[str, str]:
"""Split Entity label into ent_label and ent_id if it contains self.ent_id_sep

View File

@ -2,6 +2,8 @@ from typing import Optional, List, Dict, Any, Callable, Iterable, Union, Tuple
from thinc.api import Model
from pathlib import Path
import warnings
from .pipe import Pipe
from ..errors import Errors, Warnings
from ..language import Language
@ -182,7 +184,7 @@ class Lemmatizer(Pipe):
univ_pos = token.pos_.lower()
if univ_pos in ("", "eol", "space"):
if univ_pos == "":
logger.warning(Warnings.W108.format(text=string))
warnings.warn(Warnings.W108.format(text=string))
return [string.lower()]
# See Issue #435 for example of where this logic is requied.
if self.is_base_form(token):

View File

@ -2,8 +2,6 @@ import weakref
import pytest
import numpy
import logging
import mock
from spacy.lang.xx import MultiLanguage
from spacy.tokens import Doc, Span, Token
@ -158,13 +156,10 @@ def test_doc_api_serialize(en_tokenizer, text):
def inner_func(d1, d2):
return "hello!"
logger = logging.getLogger("spacy")
with mock.patch.object(logger, "warning") as mock_warning:
_ = tokens.to_bytes() # noqa: F841
mock_warning.assert_not_called()
with pytest.warns(UserWarning):
tokens.user_hooks["similarity"] = inner_func
_ = tokens.to_bytes() # noqa: F841
mock_warning.assert_called_once()
def test_doc_api_set_ents(en_tokenizer):

View File

@ -33,6 +33,15 @@ def test_matcher_from_api_docs(en_vocab):
assert len(patterns[0])
def test_matcher_empty_patterns_warns(en_vocab):
matcher = Matcher(en_vocab)
assert len(matcher) == 0
doc = Doc(en_vocab, words=["This", "is", "quite", "something"])
with pytest.warns(UserWarning):
matcher(doc)
assert len(doc.ents) == 0
def test_matcher_from_usage_docs(en_vocab):
text = "Wow 😀 This is really cool! 😂 😂"
doc = Doc(en_vocab, words=text.split(" "))

View File

@ -46,6 +46,17 @@ def test_entity_ruler_init(nlp, patterns):
assert doc.ents[1].label_ == "BYE"
def test_entity_ruler_no_patterns_warns(nlp):
ruler = EntityRuler(nlp)
assert len(ruler) == 0
assert len(ruler.labels) == 0
nlp.add_pipe("entity_ruler")
assert nlp.pipe_names == ["entity_ruler"]
with pytest.warns(UserWarning):
doc = nlp("hello world bye bye")
assert len(doc.ents) == 0
def test_entity_ruler_init_patterns(nlp, patterns):
# initialize with patterns
ruler = nlp.add_pipe("entity_ruler")

View File

@ -1,6 +1,4 @@
import pytest
import logging
import mock
import pickle
from spacy import util, registry
from spacy.lang.en import English
@ -59,10 +57,10 @@ def test_lemmatizer_config(nlp):
# warning if no POS assigned
doc = nlp.make_doc("coping")
logger = logging.getLogger("spacy")
with mock.patch.object(logger, "warning") as mock_warning:
with pytest.warns(UserWarning):
doc = lemmatizer(doc)
# warns once by default
doc = lemmatizer(doc)
mock_warning.assert_called_once()
# works with POS
doc = nlp.make_doc("coping")

View File

@ -1,6 +1,4 @@
import pytest
import mock
import logging
from spacy.language import Language
from spacy.lang.en import English
from spacy.lang.de import German
@ -437,10 +435,8 @@ def test_pipe_factories_from_source_language_subclass():
nlp = English()
nlp.vocab.vectors.resize((1, 4))
nlp.vocab.vectors.add("cat", vector=[1, 2, 3, 4])
logger = logging.getLogger("spacy")
with mock.patch.object(logger, "warning") as mock_warning:
with pytest.warns(UserWarning):
nlp.add_pipe("tagger", source=source_nlp)
mock_warning.assert_called()
def test_pipe_factories_from_source_custom():

View File

@ -1318,7 +1318,7 @@ cdef class Doc:
if "user_data_values" not in exclude:
serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values)
if "user_hooks" not in exclude and any((self.user_hooks, self.user_token_hooks, self.user_span_hooks)):
util.logger.warning(Warnings.W109)
warnings.warn(Warnings.W109)
return util.to_dict(serializers, exclude)
def from_dict(self, msg, *, exclude=tuple()):