Improve vocab data integration and warning

Ines Montani 2020-07-25 11:51:30 +02:00
parent 38f6ea7a78
commit b9aaa4e457
8 changed files with 22 additions and 17 deletions

View File

@@ -1,6 +1,5 @@
 [nlp]
 lang = null
-vocab_data = {}
 pipeline = []
 
 [nlp.tokenizer]

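With vocab_data gone, the [nlp] block of the default config (the + side of this hunk) reduces to:

    [nlp]
    lang = null
    pipeline = []

    [nlp.tokenizer]

Lookup tables are now loaded straight from the spacy-lookups-data package when the vocab is created (see the create_vocab hunk at the end of this commit) instead of being passed in through the config.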
View File

@@ -83,7 +83,7 @@ class Warnings:
             "doesn't have a normalization table, please ignore this warning. "
             "If this is surprising, make sure you have the spacy-lookups-data "
             "package installed. The languages with lexeme normalization tables "
-            "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.")
+            "are currently: {langs}")
 
     # TODO: fix numbering after merging develop into master
     W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")

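A minimal sketch of how the parametrized message is filled in at the new call sites (util.LEXEME_NORM_LANGS is added in the util.py hunk below):

    import warnings
    from spacy import util
    from spacy.errors import Warnings

    langs = ", ".join(util.LEXEME_NORM_LANGS)
    warnings.warn(Warnings.W033.format(model="part-of-speech tagger", langs=langs))

Keeping the language list in one constant means the warning text can no longer drift out of sync with the languages that actually ship a lexeme_norm table.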
View File

@@ -184,8 +184,10 @@ class Tagger(Pipe):
         lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
         if not any(table in self.vocab.lookups for table in lemma_tables):
             warnings.warn(Warnings.W022)
-        if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0:
-            warnings.warn(Warnings.W033.format(model="part-of-speech tagger"))
+        lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
+        if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
+            langs = ", ".join(util.LEXEME_NORM_LANGS)
+            warnings.warn(Warnings.W033.format(model="part-of-speech tagger", langs=langs))
         orig_tag_map = dict(self.vocab.morphology.tag_map)
         new_tag_map = {}
         for example in get_examples():

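The rewritten check adds a second condition: W033 only fires when the language is known to ship a lexeme_norm table. A standalone sketch of the guard, with vocab standing in for self.vocab:

    # Warn only if no table is loaded AND one exists upstream for this language.
    lexeme_norms = vocab.lookups.get_table("lexeme_norm", {})
    should_warn = len(lexeme_norms) == 0 and vocab.lang in util.LEXEME_NORM_LANGS

So a blank pipeline for a language without published tables no longer triggers the warning. The same guard is added to Parser.begin_training in the parser hunk below.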
View File

@@ -239,7 +239,6 @@ class ConfigSchemaNlp(BaseModel):
     pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
     tokenizer: Callable = Field(..., title="The tokenizer to use")
     lemmatizer: Callable = Field(..., title="The lemmatizer to use")
-    vocab_data: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Vocabulary data, e.g. lexeme normalization tables")
     # fmt: on
 
     class Config:

View File

@@ -431,8 +431,10 @@ cdef class Parser:
     def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
         self.cfg.update(kwargs)
-        if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0:
-            warnings.warn(Warnings.W033.format(model="parser or NER"))
+        lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
+        if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
+            langs = ", ".join(util.LEXEME_NORM_LANGS)
+            warnings.warn(Warnings.W033.format(model="parser or NER", langs=langs))
         if not hasattr(get_examples, '__call__'):
             gold_tuples = get_examples
             get_examples = lambda: gold_tuples

View File

@@ -342,7 +342,8 @@ def test_overfitting_IO():
 
 
 def test_ner_warns_no_lookups():
-    nlp = Language()
+    nlp = English()
+    assert nlp.lang in util.LEXEME_NORM_LANGS
     nlp.vocab.lookups = Lookups()
     assert not len(nlp.vocab.lookups)
     nlp.add_pipe("ner")

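The rest of the test body is not shown in this hunk; a hedged sketch of how the warning could be asserted after this setup (the real test may differ):

    import pytest

    # English is in LEXEME_NORM_LANGS, but the lookups were replaced with an
    # empty table set above, so begin_training should emit W033 as a UserWarning.
    with pytest.warns(UserWarning):
        nlp.begin_training()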
View File

@@ -53,6 +53,7 @@ if TYPE_CHECKING:
 _PRINT_ENV = False
 OOV_RANK = numpy.iinfo(numpy.uint64).max
+LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
 
 
 class registry(thinc.registry):

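The constant is a plain list of language codes, so call sites can both test membership and render the list into the warning text:

    from spacy import util

    assert "en" in util.LEXEME_NORM_LANGS
    print(", ".join(util.LEXEME_NORM_LANGS))  # da, de, el, en, id, lb, pt, ru, sr, ta, th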
View File

@@ -17,13 +17,20 @@ from .lemmatizer import Lemmatizer
 from .attrs import intify_attrs, NORM, IS_STOP
 from .vectors import Vectors
 from .util import link_vectors_to_models, registry
-from .lookups import Lookups
+from .lookups import Lookups, load_lookups
 from . import util
 from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
 
 
-def create_vocab(lang, defaults, lemmatizer=None, vocab_data={}, vectors_name=None):
+def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_lookups_data=True):
+    # If the spacy-lookups-data package is installed, we pre-populate the lookups
+    # with lexeme data, if available
+    if load_lookups_data:
+        tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
+        lookups = load_lookups(lang, tables=tables, strict=False)
+    else:
+        lookups = Lookups()
     lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
     # This is messy, but it's the minimal working fix to Issue #639.
     lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)
@@ -32,13 +39,8 @@ def create_vocab(lang, defaults, lemmatizer=None, vocab_data={}, vectors_name=None):
     lex_attrs[NORM] = util.add_lookups(
         lex_attrs.get(NORM, LEX_ATTRS[NORM]),
         BASE_NORMS,
-        vocab_data.get("lexeme_norm", {}),
+        lookups.get_table("lexeme_norm", {}),
     )
-    lookups = Lookups()
-    for name, data in vocab_data.items():
-        if name not in lookups:
-            data = data if data is not None else {}
-            lookups.add_table(name, data)
     return Vocab(
         lex_attr_getters=lex_attrs,
         lemmatizer=lemmatizer,
@@ -49,7 +51,6 @@ def create_vocab(lang, defaults, lemmatizer=None, vocab_data={}, vectors_name=None):
     )
 
 
 cdef class Vocab:
     """A look-up table that allows you to access `Lexeme` objects. The `Vocab`
     instance also provides access to the `StringStore`, and owns underlying
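A hedged sketch of the new loading path in isolation, using only names that appear in this diff; the exact semantics of strict=False are an assumption based on how it is used here:

    from spacy.lookups import Lookups, load_lookups

    tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
    # strict=False presumably tolerates languages with no entry in
    # spacy-lookups-data, yielding a Lookups object without the missing tables.
    lookups = load_lookups("en", tables=tables, strict=False)
    norm_table = lookups.get_table("lexeme_norm", {})

Because create_vocab now owns the loading step, callers no longer have to thread a vocab_data dict through the config, and an empty Lookups() is still available via load_lookups_data=False.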