Fix lang check and error handling in Language.from_config

This commit is contained in:
Ines Montani 2020-09-15 14:24:06 +02:00
parent 2ed6e2a218
commit d3d7f92f05
3 changed files with 16 additions and 2 deletions

View File

@ -552,7 +552,10 @@ class Errors:
"to register a simple stateless function component that just takes " "to register a simple stateless function component that just takes "
"a Doc and returns it.") "a Doc and returns it.")
E958 = ("Language code defined in config ({bad_lang_code}) does not match " E958 = ("Language code defined in config ({bad_lang_code}) does not match "
"language code of current Language subclass {lang} ({lang_code})") "language code of current Language subclass {lang} ({lang_code}). "
"If you want to create an nlp object from a config, make sure to "
"use the matching subclass with the language-specific settings and "
"data.")
E959 = ("Can't insert component {dir} index {idx}. Existing components: {opts}") E959 = ("Can't insert component {dir} index {idx}. Existing components: {opts}")
E960 = ("No config data found for component '{name}'. This is likely a bug " E960 = ("No config data found for component '{name}'. This is likely a bug "
"in spaCy.") "in spaCy.")

View File

@ -1487,7 +1487,7 @@ class Language:
if "nlp" not in config: if "nlp" not in config:
raise ValueError(Errors.E985.format(config=config)) raise ValueError(Errors.E985.format(config=config))
config_lang = config["nlp"]["lang"] config_lang = config["nlp"]["lang"]
if cls.lang is not None and config_lang is not None and config_lang != cls.lang: if config_lang is not None and config_lang != cls.lang:
raise ValueError( raise ValueError(
Errors.E958.format( Errors.E958.format(
bad_lang_code=config["nlp"]["lang"], bad_lang_code=config["nlp"]["lang"],

View File

@ -5,6 +5,7 @@ from spacy.tokens import Doc, Span
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.training import Example from spacy.training import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.lang.de import German
from spacy.util import registry from spacy.util import registry
from .util import add_vecs_to_vocab, assert_docs_equal from .util import add_vecs_to_vocab, assert_docs_equal
@ -266,3 +267,13 @@ def test_language_custom_tokenizer():
assert [t.text for t in doc] == ["_hello", "_world"] assert [t.text for t in doc] == ["_hello", "_world"]
doc = list(nlp.pipe(["hello world"]))[0] doc = list(nlp.pipe(["hello world"]))[0]
assert [t.text for t in doc] == ["_hello", "_world"] assert [t.text for t in doc] == ["_hello", "_world"]
def test_language_from_config_invalid_lang():
    """Check that from_config rejects a config with a mismatched lang code.

    The config declares "en" as its language. Both the base Language class
    and the German subclass are expected to refuse it by raising a
    ValueError instead of building a pipeline.
    """
    mismatched_config = {"nlp": {"lang": "en"}}
    # Same expectation for the generic base class and for a subclass of a
    # different language, so run the identical check against each in order.
    for lang_cls in (Language, German):
        with pytest.raises(ValueError):
            lang_cls.from_config(mismatched_config)