mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Warn in Tagger.begin_training if no lemma tables are available (#4351)
This commit is contained in:
parent
bc7e7db208
commit
3297a19545
|
@ -88,6 +88,13 @@ class Warnings(object):
|
||||||
"loaded. (Shape: {shape})")
|
"loaded. (Shape: {shape})")
|
||||||
W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
|
W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
|
||||||
"incorrect. Modify PhraseMatcher._terminal_hash to fix.")
|
"incorrect. Modify PhraseMatcher._terminal_hash to fix.")
|
||||||
|
W022 = ("Training a new part-of-speech tagger using a model with no "
|
||||||
|
"lemmatization rules or data. This means that the trained model "
|
||||||
|
"may not be able to lemmatize correctly. If this is intentional "
|
||||||
|
"or the language you're using doesn't have lemmatization data, "
|
||||||
|
"you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. "
|
||||||
|
"If this is surprising, make sure you have the spacy-lookups-data "
|
||||||
|
"package installed.")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
|
|
@ -30,7 +30,7 @@ from .._ml import build_text_classifier, build_simple_cnn_text_classifier
|
||||||
from .._ml import build_bow_text_classifier, build_nel_encoder
|
from .._ml import build_bow_text_classifier, build_nel_encoder
|
||||||
from .._ml import link_vectors_to_models, zero_init, flatten
|
from .._ml import link_vectors_to_models, zero_init, flatten
|
||||||
from .._ml import masked_language_model, create_default_optimizer
|
from .._ml import masked_language_model, create_default_optimizer
|
||||||
from ..errors import Errors, TempErrors
|
from ..errors import Errors, TempErrors, user_warning, Warnings
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
|
@ -501,6 +501,9 @@ class Tagger(Pipe):
|
||||||
|
|
||||||
def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
|
def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
|
||||||
**kwargs):
|
**kwargs):
|
||||||
|
lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
|
||||||
|
if not any(table in self.vocab.lookups for table in lemma_tables):
|
||||||
|
user_warning(Warnings.W022)
|
||||||
orig_tag_map = dict(self.vocab.morphology.tag_map)
|
orig_tag_map = dict(self.vocab.morphology.tag_map)
|
||||||
new_tag_map = OrderedDict()
|
new_tag_map = OrderedDict()
|
||||||
for raw_text, annots_brackets in get_gold_tuples():
|
for raw_text, annots_brackets in get_gold_tuples():
|
||||||
|
|
22
spacy/tests/pipeline/test_tagger.py
Normal file
22
spacy/tests/pipeline/test_tagger.py
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from spacy.lang.en import English
|
||||||
|
from spacy.lookups import Lookups
|
||||||
|
|
||||||
|
|
||||||
|
def test_tagger_warns_no_lemma_lookups():
    """Check the tagger's missing-lemma-data warning at the start of training.

    A UserWarning is expected while the vocab's lookups contain no tables,
    both when the tagger's own begin_training is called and when training
    is started through the pipeline. Once a "lemma_lookup" table has been
    added, starting training must not produce any warning.
    """
    # Local import keeps this block self-contained.
    import warnings

    nlp = English()
    # Swap in an empty Lookups container so no lemma table is available.
    nlp.vocab.lookups = Lookups()
    assert not len(nlp.vocab.lookups)
    tagger = nlp.create_pipe("tagger")
    # Calling the component directly should warn.
    with pytest.warns(UserWarning):
        tagger.begin_training()
    nlp.add_pipe(tagger)
    # Starting training through the pipeline should warn as well.
    with pytest.warns(UserWarning):
        nlp.begin_training()
    nlp.vocab.lookups.add_table("lemma_lookup")
    # pytest.warns(None) is deprecated in pytest 7 and rejected by pytest 8,
    # so record with the standard library instead. The "always" filter makes
    # sure a warning already shown earlier in this test is not suppressed.
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        nlp.begin_training()
    assert not record
|
Loading…
Reference in New Issue
Block a user