Improve vocab data integration and warning

Ines Montani 2020-07-25 11:51:30 +02:00
parent 38f6ea7a78
commit b9aaa4e457
8 changed files with 22 additions and 17 deletions

View File

@@ -1,6 +1,5 @@
 [nlp]
 lang = null
-vocab_data = {}
 pipeline = []
 
 [nlp.tokenizer]

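With vocab_data gone, the [nlp] block of the default config (the + side of this hunk) reduces to:

    [nlp]
    lang = null
    pipeline = []

    [nlp.tokenizer]

Lookup tables are now loaded straight from the spacy-lookups-data package when the vocab is created (see the create_vocab hunk at the end of this commit) instead of being passed in through the config.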
View File

@@ -83,7 +83,7 @@ class Warnings:
             "doesn't have a normalization table, please ignore this warning. "
             "If this is surprising, make sure you have the spacy-lookups-data "
             "package installed. The languages with lexeme normalization tables "
-            "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.")
+            "are currently: {langs}")
 
     # TODO: fix numbering after merging develop into master
     W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")

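A minimal sketch of how the parametrized message is filled in at the new call sites (util.LEXEME_NORM_LANGS is added in the util.py hunk below):

    import warnings
    from spacy import util
    from spacy.errors import Warnings

    langs = ", ".join(util.LEXEME_NORM_LANGS)
    warnings.warn(Warnings.W033.format(model="part-of-speech tagger", langs=langs))

Keeping the language list in one constant means the warning text can no longer drift out of sync with the languages that actually ship a lexeme_norm table.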
View File

@@ -184,8 +184,10 @@ class Tagger(Pipe):
         lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
         if not any(table in self.vocab.lookups for table in lemma_tables):
             warnings.warn(Warnings.W022)
-        if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0:
-            warnings.warn(Warnings.W033.format(model="part-of-speech tagger"))
+        lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
+        if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
+            langs = ", ".join(util.LEXEME_NORM_LANGS)
+            warnings.warn(Warnings.W033.format(model="part-of-speech tagger", langs=langs))
         orig_tag_map = dict(self.vocab.morphology.tag_map)
         new_tag_map = {}
         for example in get_examples():

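The rewritten check adds a second condition: W033 only fires when the language is known to ship a lexeme_norm table. A standalone sketch of the guard, with vocab standing in for self.vocab:

    # Warn only if no table is loaded AND one exists upstream for this language.
    lexeme_norms = vocab.lookups.get_table("lexeme_norm", {})
    should_warn = len(lexeme_norms) == 0 and vocab.lang in util.LEXEME_NORM_LANGS

So a blank pipeline for a language without published tables no longer triggers the warning. The same guard is added to Parser.begin_training in the parser hunk below.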
View File

@@ -239,7 +239,6 @@ class ConfigSchemaNlp(BaseModel):
     pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
     tokenizer: Callable = Field(..., title="The tokenizer to use")
     lemmatizer: Callable = Field(..., title="The lemmatizer to use")
-    vocab_data: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Vocabulary data, e.g. lexeme normalization tables")
     # fmt: on
 
     class Config:

View File

@@ -431,8 +431,10 @@ cdef class Parser:
     def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
         self.cfg.update(kwargs)
-        if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0:
-            warnings.warn(Warnings.W033.format(model="parser or NER"))
+        lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
+        if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
+            langs = ", ".join(util.LEXEME_NORM_LANGS)
+            warnings.warn(Warnings.W033.format(model="parser or NER", langs=langs))
         if not hasattr(get_examples, '__call__'):
             gold_tuples = get_examples
             get_examples = lambda: gold_tuples

View File

@@ -342,7 +342,8 @@ def test_overfitting_IO():
 
 
 def test_ner_warns_no_lookups():
-    nlp = Language()
+    nlp = English()
+    assert nlp.lang in util.LEXEME_NORM_LANGS
     nlp.vocab.lookups = Lookups()
     assert not len(nlp.vocab.lookups)
     nlp.add_pipe("ner")

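The rest of the test body is not shown in this hunk; a hedged sketch of how the warning could be asserted after this setup (the real test may differ):

    import pytest

    # English is in LEXEME_NORM_LANGS, but the lookups were replaced with an
    # empty table set above, so begin_training should emit W033 as a UserWarning.
    with pytest.warns(UserWarning):
        nlp.begin_training()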
View File

@@ -53,6 +53,7 @@ if TYPE_CHECKING:
 _PRINT_ENV = False
 OOV_RANK = numpy.iinfo(numpy.uint64).max
+LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
 
 
 class registry(thinc.registry):

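The constant is a plain list of language codes, so call sites can both test membership and render the list into the warning text:

    from spacy import util

    assert "en" in util.LEXEME_NORM_LANGS
    print(", ".join(util.LEXEME_NORM_LANGS))  # da, de, el, en, id, lb, pt, ru, sr, ta, th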
View File

@@ -17,13 +17,20 @@ from .lemmatizer import Lemmatizer
 from .attrs import intify_attrs, NORM, IS_STOP
 from .vectors import Vectors
 from .util import link_vectors_to_models, registry
-from .lookups import Lookups
+from .lookups import Lookups, load_lookups
 from . import util
 from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
 
 
-def create_vocab(lang, defaults, lemmatizer=None, vocab_data={}, vectors_name=None):
+def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_lookups_data=True):
+    # If the spacy-lookups-data package is installed, we pre-populate the lookups
+    # with lexeme data, if available
+    if load_lookups_data:
+        tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
+        lookups = load_lookups(lang, tables=tables, strict=False)
+    else:
+        lookups = Lookups()
     lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
     # This is messy, but it's the minimal working fix to Issue #639.
     lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)
@@ -32,13 +39,8 @@ def create_vocab(lang, defaults, lemmatizer=None, vocab_data={}, vectors_name=None):
     lex_attrs[NORM] = util.add_lookups(
         lex_attrs.get(NORM, LEX_ATTRS[NORM]),
         BASE_NORMS,
-        vocab_data.get("lexeme_norm", {}),
+        lookups.get_table("lexeme_norm", {}),
     )
-    lookups = Lookups()
-    for name, data in vocab_data.items():
-        if name not in lookups:
-            data = data if data is not None else {}
-            lookups.add_table(name, data)
     return Vocab(
         lex_attr_getters=lex_attrs,
         lemmatizer=lemmatizer,
@@ -49,7 +51,6 @@ def create_vocab(lang, defaults, lemmatizer=None, vocab_data={}, vectors_name=None):
     )
 
 
 cdef class Vocab:
     """A look-up table that allows you to access `Lexeme` objects. The `Vocab`
     instance also provides access to the `StringStore`, and owns underlying
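A hedged sketch of the new loading path in isolation, using only names that appear in this diff; the exact semantics of strict=False are an assumption based on how it is used here:

    from spacy.lookups import Lookups, load_lookups

    tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
    # strict=False presumably tolerates languages with no entry in
    # spacy-lookups-data, yielding a Lookups object without the missing tables.
    lookups = load_lookups("en", tables=tables, strict=False)
    norm_table = lookups.get_table("lexeme_norm", {})

Because create_vocab now owns the loading step, callers no longer have to thread a vocab_data dict through the config, and an empty Lookups() is still available via load_lookups_data=False.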