mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Fix and add warnings related to spacy-lookups-data (#5588)
* Fix warning message for lemmatization tables * Add a warning when the `lexeme_norm` table is empty. (Given the relatively lang-specific loading for `Lookups`, it seemed like too much overhead to dynamically extract the list of languages, so for now it's hard-coded.)
This commit is contained in:
parent
aa5b40fa64
commit
c482f20778
|
@ -92,9 +92,9 @@ class Warnings(object):
|
|||
W022 = ("Training a new part-of-speech tagger using a model with no "
|
||||
"lemmatization rules or data. This means that the trained model "
|
||||
"may not be able to lemmatize correctly. If this is intentional "
|
||||
"or the language you're using doesn't have lemmatization data. "
|
||||
"If this is surprising, make sure you have the spacy-lookups-data "
|
||||
"package installed.")
|
||||
"or the language you're using doesn't have lemmatization data, "
|
||||
"please ignore this warning. If this is surprising, make sure you "
|
||||
"have the spacy-lookups-data package installed.")
|
||||
W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. "
|
||||
"'n_process' will be set to 1.")
|
||||
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
|
||||
|
@ -127,6 +127,13 @@ class Warnings(object):
|
|||
"this, download a newer compatible model or retrain your custom "
|
||||
"model with the current spaCy version. For more details and "
|
||||
"available updates, run: python -m spacy validate")
|
||||
W033 = ("Training a new {model} using a model with no lexeme normalization "
|
||||
"table. This may degrade the performance of the model to some "
|
||||
"degree. If this is intentional or the language you're using "
|
||||
"doesn't have a normalization table, please ignore this warning. "
|
||||
"If this is surprising, make sure you have the spacy-lookups-data "
|
||||
"package installed. The languages with lexeme normalization tables "
|
||||
"are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
|
|
@ -516,6 +516,8 @@ class Tagger(Pipe):
|
|||
lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
|
||||
if not any(table in self.vocab.lookups for table in lemma_tables):
|
||||
warnings.warn(Warnings.W022)
|
||||
if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0:
|
||||
warnings.warn(Warnings.W033.format(model="part-of-speech tagger"))
|
||||
orig_tag_map = dict(self.vocab.morphology.tag_map)
|
||||
new_tag_map = OrderedDict()
|
||||
for raw_text, annots_brackets in get_gold_tuples():
|
||||
|
|
|
@ -26,6 +26,7 @@ from thinc.neural.ops import NumpyOps, CupyOps
|
|||
from thinc.neural.util import get_array_module
|
||||
from thinc.linalg cimport Vec, VecVec
|
||||
import srsly
|
||||
import warnings
|
||||
|
||||
from ._parser_model cimport alloc_activations, free_activations
|
||||
from ._parser_model cimport predict_states, arg_max_if_valid
|
||||
|
@ -37,7 +38,7 @@ from .._ml import link_vectors_to_models, create_default_optimizer
|
|||
from ..compat import copy_array
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..gold cimport GoldParse
|
||||
from ..errors import Errors, TempErrors
|
||||
from ..errors import Errors, TempErrors, Warnings
|
||||
from .. import util
|
||||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC
|
||||
|
@ -601,6 +602,8 @@ cdef class Parser:
|
|||
**self.cfg.get('optimizer', {}))
|
||||
|
||||
def begin_training(self, get_gold_tuples, pipeline=None, sgd=None, **cfg):
|
||||
if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0:
|
||||
warnings.warn(Warnings.W033.format(model="parser or NER"))
|
||||
if 'model' in cfg:
|
||||
self.model = cfg['model']
|
||||
if not hasattr(get_gold_tuples, '__call__'):
|
||||
|
|
|
@ -4,6 +4,8 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
from spacy.lang.en import English
|
||||
|
||||
from spacy.language import Language
|
||||
from spacy.lookups import Lookups
|
||||
from spacy.pipeline import EntityRecognizer, EntityRuler
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.syntax.ner import BiluoPushDown
|
||||
|
@ -305,6 +307,21 @@ def test_change_number_features():
|
|||
nlp("hello world")
|
||||
|
||||
|
||||
def test_ner_warns_no_lookups():
|
||||
nlp = Language()
|
||||
nlp.vocab.lookups = Lookups()
|
||||
assert not len(nlp.vocab.lookups)
|
||||
ner = nlp.create_pipe("ner")
|
||||
nlp.add_pipe(ner)
|
||||
with pytest.warns(UserWarning):
|
||||
nlp.begin_training()
|
||||
nlp.vocab.lookups.add_table("lexeme_norm")
|
||||
nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
|
||||
with pytest.warns(None) as record:
|
||||
nlp.begin_training()
|
||||
assert not record.list
|
||||
|
||||
|
||||
class BlockerComponent1(object):
|
||||
name = "my_blocker"
|
||||
|
||||
|
|
|
@ -33,17 +33,17 @@ def test_lemmatizer_reflects_lookups_changes():
|
|||
assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "world"
|
||||
|
||||
|
||||
def test_tagger_warns_no_lemma_lookups():
|
||||
def test_tagger_warns_no_lookups():
|
||||
nlp = Language()
|
||||
nlp.vocab.lookups = Lookups()
|
||||
assert not len(nlp.vocab.lookups)
|
||||
tagger = nlp.create_pipe("tagger")
|
||||
with pytest.warns(UserWarning):
|
||||
tagger.begin_training()
|
||||
nlp.add_pipe(tagger)
|
||||
with pytest.warns(UserWarning):
|
||||
nlp.begin_training()
|
||||
nlp.vocab.lookups.add_table("lemma_lookup")
|
||||
nlp.vocab.lookups.add_table("lexeme_norm")
|
||||
nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
|
||||
with pytest.warns(None) as record:
|
||||
nlp.begin_training()
|
||||
assert not record.list
|
||||
|
|
Loading…
Reference in New Issue
Block a user