Fix and add warnings related to spacy-lookups-data (#5588)

* Fix warning message for lemmatization tables

* Add a warning when the `lexeme_norm` table is empty. (Because `Lookups`
tables are loaded in a fairly language-specific way, dynamically extracting
the list of supported languages seemed like too much overhead, so for now
the list is hard-coded in the warning message.)
This commit is contained in:
Adriane Boyd 2020-06-15 14:56:04 +02:00 committed by GitHub
parent aa5b40fa64
commit c482f20778
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 36 additions and 7 deletions

View File

@ -92,9 +92,9 @@ class Warnings(object):
W022 = ("Training a new part-of-speech tagger using a model with no " W022 = ("Training a new part-of-speech tagger using a model with no "
"lemmatization rules or data. This means that the trained model " "lemmatization rules or data. This means that the trained model "
"may not be able to lemmatize correctly. If this is intentional " "may not be able to lemmatize correctly. If this is intentional "
"or the language you're using doesn't have lemmatization data. " "or the language you're using doesn't have lemmatization data, "
"If this is surprising, make sure you have the spacy-lookups-data " "please ignore this warning. If this is surprising, make sure you "
"package installed.") "have the spacy-lookups-data package installed.")
W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. " W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. "
"'n_process' will be set to 1.") "'n_process' will be set to 1.")
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
@ -127,6 +127,13 @@ class Warnings(object):
"this, download a newer compatible model or retrain your custom " "this, download a newer compatible model or retrain your custom "
"model with the current spaCy version. For more details and " "model with the current spaCy version. For more details and "
"available updates, run: python -m spacy validate") "available updates, run: python -m spacy validate")
W033 = ("Training a new {model} using a model with no lexeme normalization "
"table. This may degrade the performance of the model to some "
"degree. If this is intentional or the language you're using "
"doesn't have a normalization table, please ignore this warning. "
"If this is surprising, make sure you have the spacy-lookups-data "
"package installed. The languages with lexeme normalization tables "
"are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.")
@add_codes @add_codes

View File

@ -516,6 +516,8 @@ class Tagger(Pipe):
lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
if not any(table in self.vocab.lookups for table in lemma_tables): if not any(table in self.vocab.lookups for table in lemma_tables):
warnings.warn(Warnings.W022) warnings.warn(Warnings.W022)
if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0:
warnings.warn(Warnings.W033.format(model="part-of-speech tagger"))
orig_tag_map = dict(self.vocab.morphology.tag_map) orig_tag_map = dict(self.vocab.morphology.tag_map)
new_tag_map = OrderedDict() new_tag_map = OrderedDict()
for raw_text, annots_brackets in get_gold_tuples(): for raw_text, annots_brackets in get_gold_tuples():

View File

@ -26,6 +26,7 @@ from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module from thinc.neural.util import get_array_module
from thinc.linalg cimport Vec, VecVec from thinc.linalg cimport Vec, VecVec
import srsly import srsly
import warnings
from ._parser_model cimport alloc_activations, free_activations from ._parser_model cimport alloc_activations, free_activations
from ._parser_model cimport predict_states, arg_max_if_valid from ._parser_model cimport predict_states, arg_max_if_valid
@ -37,7 +38,7 @@ from .._ml import link_vectors_to_models, create_default_optimizer
from ..compat import copy_array from ..compat import copy_array
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..gold cimport GoldParse from ..gold cimport GoldParse
from ..errors import Errors, TempErrors from ..errors import Errors, TempErrors, Warnings
from .. import util from .. import util
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ._state cimport StateC from ._state cimport StateC
@ -601,6 +602,8 @@ cdef class Parser:
**self.cfg.get('optimizer', {})) **self.cfg.get('optimizer', {}))
def begin_training(self, get_gold_tuples, pipeline=None, sgd=None, **cfg): def begin_training(self, get_gold_tuples, pipeline=None, sgd=None, **cfg):
if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0:
warnings.warn(Warnings.W033.format(model="parser or NER"))
if 'model' in cfg: if 'model' in cfg:
self.model = cfg['model'] self.model = cfg['model']
if not hasattr(get_gold_tuples, '__call__'): if not hasattr(get_gold_tuples, '__call__'):

View File

@ -4,6 +4,8 @@ from __future__ import unicode_literals
import pytest import pytest
from spacy.lang.en import English from spacy.lang.en import English
from spacy.language import Language
from spacy.lookups import Lookups
from spacy.pipeline import EntityRecognizer, EntityRuler from spacy.pipeline import EntityRecognizer, EntityRuler
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.syntax.ner import BiluoPushDown from spacy.syntax.ner import BiluoPushDown
@ -305,6 +307,21 @@ def test_change_number_features():
nlp("hello world") nlp("hello world")
def test_ner_warns_no_lookups():
    """Training an NER pipe warns only while ``lexeme_norm`` is empty."""
    nlp = Language()
    nlp.vocab.lookups = Lookups()
    # Start from a vocab that has no lookup tables at all.
    assert len(nlp.vocab.lookups) == 0
    nlp.add_pipe(nlp.create_pipe("ner"))
    # With no normalization table, starting training must emit a warning.
    with pytest.warns(UserWarning):
        nlp.begin_training()
    # Register the table and give it one entry so it is no longer empty.
    nlp.vocab.lookups.add_table("lexeme_norm")
    norm_table = nlp.vocab.lookups.get_table("lexeme_norm")
    norm_table["a"] = "A"
    # A populated table must silence the warning entirely.
    with pytest.warns(None) as caught:
        nlp.begin_training()
        assert len(caught.list) == 0
class BlockerComponent1(object): class BlockerComponent1(object):
name = "my_blocker" name = "my_blocker"

View File

@ -33,17 +33,17 @@ def test_lemmatizer_reflects_lookups_changes():
assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "world" assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "world"
def test_tagger_warns_no_lemma_lookups(): def test_tagger_warns_no_lookups():
nlp = Language() nlp = Language()
nlp.vocab.lookups = Lookups() nlp.vocab.lookups = Lookups()
assert not len(nlp.vocab.lookups) assert not len(nlp.vocab.lookups)
tagger = nlp.create_pipe("tagger") tagger = nlp.create_pipe("tagger")
with pytest.warns(UserWarning):
tagger.begin_training()
nlp.add_pipe(tagger) nlp.add_pipe(tagger)
with pytest.warns(UserWarning): with pytest.warns(UserWarning):
nlp.begin_training() nlp.begin_training()
nlp.vocab.lookups.add_table("lemma_lookup") nlp.vocab.lookups.add_table("lemma_lookup")
nlp.vocab.lookups.add_table("lexeme_norm")
nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
with pytest.warns(None) as record: with pytest.warns(None) as record:
nlp.begin_training() nlp.begin_training()
assert not record.list assert not record.list