diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 7ba008fb6..f1786e04b 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -1,6 +1,5 @@
 [nlp]
 lang = null
-vocab_data = {}
 pipeline = []
 
 [nlp.tokenizer]
diff --git a/spacy/errors.py b/spacy/errors.py
index 04d831c41..07c3df686 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -83,7 +83,7 @@ class Warnings:
             "doesn't have a normalization table, please ignore this warning. "
             "If this is surprising, make sure you have the spacy-lookups-data "
             "package installed. The languages with lexeme normalization tables "
-            "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.")
+            "are currently: {langs}")
 
     # TODO: fix numbering after merging develop into master
     W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index e4250b932..dfbb943f8 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -184,8 +184,10 @@ class Tagger(Pipe):
         lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
         if not any(table in self.vocab.lookups for table in lemma_tables):
             warnings.warn(Warnings.W022)
-        if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0:
-            warnings.warn(Warnings.W033.format(model="part-of-speech tagger"))
+        lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
+        if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
+            langs = ", ".join(util.LEXEME_NORM_LANGS)
+            warnings.warn(Warnings.W033.format(model="part-of-speech tagger", langs=langs))
         orig_tag_map = dict(self.vocab.morphology.tag_map)
         new_tag_map = {}
         for example in get_examples():
diff --git a/spacy/schemas.py b/spacy/schemas.py
index ad16f3233..e55123e14 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -239,7 +239,6 @@ class ConfigSchemaNlp(BaseModel):
     pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
     tokenizer: Callable = Field(..., title="The tokenizer to use")
     lemmatizer: Callable = Field(..., title="The lemmatizer to use")
-    vocab_data: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Vocabulary data, e.g. lexeme normalization tables")
     # fmt: on
 
     class Config:
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 09616ee75..f640e2e8d 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -431,8 +431,10 @@ cdef class Parser:
 
     def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
         self.cfg.update(kwargs)
-        if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0:
-            warnings.warn(Warnings.W033.format(model="parser or NER"))
+        lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
+        if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
+            langs = ", ".join(util.LEXEME_NORM_LANGS)
+            warnings.warn(Warnings.W033.format(model="parser or NER", langs=langs))
         if not hasattr(get_examples, '__call__'):
             gold_tuples = get_examples
             get_examples = lambda: gold_tuples
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 71539fe60..4a6bf73a5 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -342,7 +342,8 @@ def test_overfitting_IO():
 
 
 def test_ner_warns_no_lookups():
-    nlp = Language()
+    nlp = English()
+    assert nlp.lang in util.LEXEME_NORM_LANGS
     nlp.vocab.lookups = Lookups()
     assert not len(nlp.vocab.lookups)
     nlp.add_pipe("ner")
diff --git a/spacy/util.py b/spacy/util.py
index 0d732034f..18ce7e474 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -53,6 +53,7 @@ if TYPE_CHECKING:
 
 _PRINT_ENV = False
 OOV_RANK = numpy.iinfo(numpy.uint64).max
+LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
 
 
 class registry(thinc.registry):
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 0f99a45f5..56e62834a 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -17,13 +17,20 @@ from .lemmatizer import Lemmatizer
 from .attrs import intify_attrs, NORM, IS_STOP
 from .vectors import Vectors
 from .util import link_vectors_to_models, registry
-from .lookups import Lookups
+from .lookups import Lookups, load_lookups
 from . import util
 from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
 
 
-def create_vocab(lang, defaults, lemmatizer=None, vocab_data={}, vectors_name=None):
+def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_lookups_data=True):
+    # If the spacy-lookups-data package is installed, we pre-populate the lookups
+    # with lexeme data, if available
+    if load_lookups_data:
+        tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
+        lookups = load_lookups(lang, tables=tables, strict=False)
+    else:
+        lookups = Lookups()
     lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
     # This is messy, but it's the minimal working fix to Issue #639.
     lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)
@@ -32,13 +39,8 @@ def create_vocab(lang, defaults, lemmatizer=None, vectors_name=No
     lex_attrs[NORM] = util.add_lookups(
         lex_attrs.get(NORM, LEX_ATTRS[NORM]),
         BASE_NORMS,
-        vocab_data.get("lexeme_norm", {}),
+        lookups.get_table("lexeme_norm", {}),
     )
-    lookups = Lookups()
-    for name, data in vocab_data.items():
-        if name not in lookups:
-            data = data if data is not None else {}
-            lookups.add_table(name, data)
     return Vocab(
         lex_attr_getters=lex_attrs,
         lemmatizer=lemmatizer,
@@ -49,7 +51,6 @@ def create_vocab(lang, defaults, lemmatizer=None, vocab_data={}, vectors_name=No
     )
 
 
-
 cdef class Vocab:
     """A look-up table that allows you to access `Lexeme` objects.
     The `Vocab` instance also provides access to the `StringStore`, and owns underlying