Fix lang check and error handling in Language.from_config

This commit is contained in:
Ines Montani 2020-09-15 14:24:06 +02:00
parent 2ed6e2a218
commit d3d7f92f05
3 changed files with 16 additions and 2 deletions

View File

@ -552,7 +552,10 @@ class Errors:
"to register a simple stateless function component that just takes "
"a Doc and returns it.")
E958 = ("Language code defined in config ({bad_lang_code}) does not match "
"language code of current Language subclass {lang} ({lang_code})")
"language code of current Language subclass {lang} ({lang_code}). "
"If you want to create an nlp object from a config, make sure to "
"use the matching subclass with the language-specific settings and "
"data.")
E959 = ("Can't insert component {dir} index {idx}. Existing components: {opts}")
E960 = ("No config data found for component '{name}'. This is likely a bug "
"in spaCy.")

View File

@ -1487,7 +1487,7 @@ class Language:
if "nlp" not in config:
raise ValueError(Errors.E985.format(config=config))
config_lang = config["nlp"]["lang"]
if cls.lang is not None and config_lang is not None and config_lang != cls.lang:
if config_lang is not None and config_lang != cls.lang:
raise ValueError(
Errors.E958.format(
bad_lang_code=config["nlp"]["lang"],

View File

@ -5,6 +5,7 @@ from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
from spacy.training import Example
from spacy.lang.en import English
from spacy.lang.de import German
from spacy.util import registry
from .util import add_vecs_to_vocab, assert_docs_equal
@ -266,3 +267,13 @@ def test_language_custom_tokenizer():
assert [t.text for t in doc] == ["_hello", "_world"]
doc = list(nlp.pipe(["hello world"]))[0]
assert [t.text for t in doc] == ["_hello", "_world"]
def test_language_from_config_invalid_lang():
    """Check that from_config rejects a config whose language code does not
    match the class it is called on. A config declaring lang "en" must raise
    a ValueError both on Language itself and on the German class."""
    en_config = {"nlp": {"lang": "en"}}
    # Neither class has "en" as its language code, so both calls must fail.
    for lang_cls in (Language, German):
        with pytest.raises(ValueError):
            lang_cls.from_config(en_config)