Update lexeme_norm checks

* Add util method for check
* Add new languages to list with lexeme norm tables
* Add check to all relevant components
* Add config details to warning message

Note that we're not actually inspecting the model config to see if
`NORM` is used as an attribute, so it may warn in cases where it's not
relevant.
This commit is contained in:
Adriane Boyd 2021-03-19 10:45:16 +01:00
parent 34e13c1161
commit 39153ef90f
6 changed files with 31 additions and 12 deletions

View File

@ -73,8 +73,13 @@ class Warnings:
"degree. If this is intentional or the language you're using "
"doesn't have a normalization table, please ignore this warning. "
"If this is surprising, make sure you have the spacy-lookups-data "
"package installed. The languages with lexeme normalization tables "
"are currently: {langs}")
"package installed and load the table in your config. The "
"languages with lexeme normalization tables are currently: "
"{langs}\n\nLoad the table in your config with:\n\n"
"[initialize.lookups]\n"
"@misc = \"spacy.LookupsDataLoader.v1\"\n"
"lang = ${{nlp.lang}}\n"
"tables = [\"lexeme_norm\"]\n")
W035 = ('Discarding subpattern "{pattern}" due to an unrecognized '
"attribute or operator.")

View File

@ -137,6 +137,7 @@ class Morphologizer(Tagger):
DOCS: https://spacy.io/api/morphologizer#initialize
"""
validate_get_examples(get_examples, "Morphologizer.initialize")
util.check_lexeme_norms(self.vocab, "morphologizer")
if labels is not None:
self.cfg["labels_morph"] = labels["morph"]
self.cfg["labels_pos"] = labels["pos"]

View File

@ -138,6 +138,7 @@ class SentenceRecognizer(Tagger):
DOCS: https://spacy.io/api/sentencerecognizer#initialize
"""
validate_get_examples(get_examples, "SentenceRecognizer.initialize")
util.check_lexeme_norms(self.vocab, "senter")
doc_sample = []
label_sample = []
assert self.labels, Errors.E924.format(name=self.name)

View File

@ -249,6 +249,7 @@ class Tagger(TrainablePipe):
DOCS: https://spacy.io/api/tagger#initialize
"""
validate_get_examples(get_examples, "Tagger.initialize")
util.check_lexeme_norms(self.vocab, "tagger")
if labels is not None:
for tag in labels:
self.add_label(tag)

View File

@ -493,10 +493,7 @@ cdef class Parser(TrainablePipe):
def initialize(self, get_examples, nlp=None, labels=None):
validate_get_examples(get_examples, "Parser.initialize")
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
langs = ", ".join(util.LEXEME_NORM_LANGS)
util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs))
util.check_lexeme_norms(self.vocab, "parser or NER")
if labels is not None:
actions = dict(labels)
else:

View File

@ -59,7 +59,7 @@ if TYPE_CHECKING:
OOV_RANK = numpy.iinfo(numpy.uint64).max
DEFAULT_OOV_PROB = -20
LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
# Default order of sections in the config.cfg. Not all sections need to exist,
# and additional sections are added at the end, in alphabetical order.
@ -70,7 +70,9 @@ CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "co
logger = logging.getLogger("spacy")
logger_stream_handler = logging.StreamHandler()
logger_stream_handler.setFormatter(logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s"))
logger_stream_handler.setFormatter(
logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s")
)
logger.addHandler(logger_stream_handler)
@ -1454,10 +1456,13 @@ def is_cython_func(func: Callable) -> bool:
if hasattr(func, attr): # function or class instance
return True
# https://stackoverflow.com/a/55767059
if hasattr(func, "__qualname__") and hasattr(func, "__module__") \
and func.__module__ in sys.modules: # method
cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
return hasattr(cls_func, attr)
if (
hasattr(func, "__qualname__")
and hasattr(func, "__module__")
and func.__module__ in sys.modules
): # method
cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
return hasattr(cls_func, attr)
return False
@ -1508,7 +1513,16 @@ def warn_if_jupyter_cupy():
"""
if is_in_jupyter():
from thinc.backends.cupy_ops import CupyOps
if CupyOps.xp is not None:
from thinc.backends import contextvars_eq_thread_ops
if not contextvars_eq_thread_ops():
warnings.warn(Warnings.W111)
def check_lexeme_norms(vocab, component_name):
    """Emit a debug-level warning if the vocab is missing its lexeme_norm
    table for a language that is expected to provide one.

    vocab: the component's Vocab, whose lookups are inspected.
    component_name (str): component label interpolated into the warning text.
    """
    norm_table = vocab.lookups.get_table("lexeme_norm", {})
    if vocab.lang in LEXEME_NORM_LANGS and not norm_table:
        supported = ", ".join(LEXEME_NORM_LANGS)
        logger.debug(Warnings.W033.format(model=component_name, langs=supported))