diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py
index 8cac30b26..7128338af 100644
--- a/spacy/lang/da/__init__.py
+++ b/spacy/lang/da/__init__.py
@@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class DanishDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py
index b645d3480..99c161961 100644
--- a/spacy/lang/de/__init__.py
+++ b/spacy/lang/de/__init__.py
@@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class GermanDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py
index 1a7b19914..818405842 100644
--- a/spacy/lang/el/__init__.py
+++ b/spacy/lang/el/__init__.py
@@ -9,9 +9,21 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
 from .lemmatizer import GreekLemmatizer
 from ...lookups import Lookups
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class GreekDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py
index bf7e9987f..f4ea10f9c 100644
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -1,5 +1,4 @@
 from typing import Optional
-
 from thinc.api import Model
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -10,9 +9,21 @@ from .punctuation import TOKENIZER_INFIXES
 from .lemmatizer import EnglishLemmatizer
 from ...language import Language
 from ...lookups import Lookups
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class EnglishDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     lex_attr_getters = LEX_ATTRS
diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py
index 87373551c..46bef57ca 100644
--- a/spacy/lang/id/__init__.py
+++ b/spacy/lang/id/__init__.py
@@ -4,9 +4,21 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1" +lang = ${nlp.lang} +tables = ["lexeme_norm"] +""" class IndonesianDefaults(Language.Defaults): + config = load_config_from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index e7cc1ef3b..4e6bf9d3c 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -2,7 +2,6 @@ from typing import Optional, Union, Dict, Any from pathlib import Path import srsly from collections import namedtuple -from thinc.api import Config from .stop_words import STOP_WORDS from .syntax_iterators import SYNTAX_ITERATORS @@ -16,7 +15,7 @@ from ...scorer import Scorer from ...symbols import POS from ...tokens import Doc from ...training import validate_examples -from ...util import DummyTokenizer, registry +from ...util import DummyTokenizer, registry, load_config_from_str from ... import util @@ -166,7 +165,7 @@ class JapaneseTokenizer(DummyTokenizer): class JapaneseDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) + config = load_config_from_str(DEFAULT_CONFIG) stop_words = STOP_WORDS syntax_iterators = SYNTAX_ITERATORS writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index dd07ef89c..83c9f4962 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -1,5 +1,4 @@ from typing import Optional, Any, Dict -from thinc.api import Config from .stop_words import STOP_WORDS from .tag_map import TAG_MAP @@ -10,7 +9,7 @@ from ...compat import copy_reg from ...scorer import Scorer from ...symbols import POS from ...training import validate_examples -from ...util import DummyTokenizer, registry +from ...util import DummyTokenizer, registry, load_config_from_str DEFAULT_CONFIG = """ @@ -70,7 +69,7 @@ class KoreanTokenizer(DummyTokenizer): class KoreanDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) + config = load_config_from_str(DEFAULT_CONFIG) lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py index da6fe55d7..ead5f5d10 100644 --- a/spacy/lang/lb/__init__.py +++ b/spacy/lang/lb/__init__.py @@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_INFIXES from .lex_attrs import LEX_ATTRS from .stop_words import STOP_WORDS from ...language import Language +from ...util import load_config_from_str + + +DEFAULT_CONFIG = """ +[initialize] + +[initialize.lookups] +@misc = "spacy.LookupsDataLoader.v1" +lang = ${nlp.lang} +tables = ["lexeme_norm"] +""" class LuxembourgishDefaults(Language.Defaults): + config = load_config_from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES lex_attr_getters = LEX_ATTRS diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py index 0447099f0..1c95c11d9 100644 --- a/spacy/lang/pt/__init__.py +++ b/spacy/lang/pt/__init__.py @@ -3,9 +3,21 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES from ...language import Language +from ...util import load_config_from_str + + +DEFAULT_CONFIG = """ +[initialize] + +[initialize.lookups] +@misc = "spacy.LookupsDataLoader.v1" +lang = ${nlp.lang} +tables = ["lexeme_norm"] +""" class PortugueseDefaults(Language.Defaults): + config = 
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     prefixes = TOKENIZER_PREFIXES
diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py
index 4a296dd23..857e197e9 100644
--- a/spacy/lang/ru/__init__.py
+++ b/spacy/lang/ru/__init__.py
@@ -1,5 +1,4 @@
 from typing import Optional
-
 from thinc.api import Model
 
 from .stop_words import STOP_WORDS
@@ -8,9 +7,21 @@ from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
 from ...language import Language
 from ...lookups import Lookups
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class RussianDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py
index 165e54975..5da19c6f3 100644
--- a/spacy/lang/sr/__init__.py
+++ b/spacy/lang/sr/__init__.py
@@ -2,9 +2,21 @@ from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class SerbianDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
diff --git a/spacy/lang/ta/__init__.py b/spacy/lang/ta/__init__.py
index ac5fc7124..7a5a3ac8f 100644
--- a/spacy/lang/ta/__init__.py
+++ b/spacy/lang/ta/__init__.py
@@ -1,9 +1,21 @@
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class TamilDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py
index a35ae987f..834fe1871 100644
--- a/spacy/lang/th/__init__.py
+++ b/spacy/lang/th/__init__.py
@@ -1,10 +1,8 @@
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
-from ...util import DummyTokenizer, registry
+from ...util import DummyTokenizer, registry, load_config_from_str
 
 
 DEFAULT_CONFIG = """
@@ -12,6 +10,13 @@ DEFAULT_CONFIG = """
 
 [nlp.tokenizer]
 @tokenizers = "spacy.th.ThaiTokenizer"
+
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
 """
 
 
@@ -42,7 +47,7 @@ class ThaiTokenizer(DummyTokenizer):
 
 
 class ThaiDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
diff --git a/spacy/lang/vi/__init__.py b/spacy/lang/vi/__init__.py
index 1db762adb..e2f7b3e35 100644
--- a/spacy/lang/vi/__init__.py
+++ b/spacy/lang/vi/__init__.py
@@ -1,10 +1,8 @@
-from thinc.api import Config
-
+from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
-from .stop_words import STOP_WORDS
-from ...util import DummyTokenizer, registry
-from .lex_attrs import LEX_ATTRS
+from ...util import DummyTokenizer, registry, load_config_from_str
 
 
 DEFAULT_CONFIG = """
@@ -55,7 +53,7 @@ class VietnameseTokenizer(DummyTokenizer):
 
 
 class VietnameseDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index 457502e21..a413d86eb 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -4,14 +4,13 @@ import tempfile
 import srsly
 import warnings
 from pathlib import Path
-from thinc.api import Config
 
 from ...errors import Warnings, Errors
 from ...language import Language
 from ...scorer import Scorer
 from ...tokens import Doc
 from ...training import validate_examples, Example
-from ...util import DummyTokenizer, registry
+from ...util import DummyTokenizer, registry, load_config_from_str
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from ... import util
@@ -329,7 +328,7 @@ class ChineseTokenizer(DummyTokenizer):
 
 
 class ChineseDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index b657ae2e8..78a20c1e8 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -339,6 +339,7 @@ def test_ner_warns_no_lookups(caplog):
     nlp.vocab.lookups = Lookups()
     assert not len(nlp.vocab.lookups)
     nlp.add_pipe("ner")
+    nlp.config["initialize"]["lookups"] = None
     with caplog.at_level(logging.DEBUG):
         nlp.initialize()
     assert "W033" in caplog.text
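
Usage note (illustrative, not part of the patch): with the [initialize.lookups] block added to each language's default config, the "lexeme_norm" table is no longer loaded eagerly; the registered spacy.LookupsDataLoader.v1 callback loads it when nlp.initialize() runs, and setting nlp.config["initialize"]["lookups"] to None (as in the test above) suppresses it. A minimal sketch of the resulting behavior, assuming spaCy v3 with the optional spacy-lookups-data package installed:

import spacy

# Creating a blank pipeline picks up the language's DEFAULT_CONFIG above,
# including the [initialize.lookups] block.
nlp = spacy.blank("da")

# initialize() resolves the [initialize] section; the lookups callback
# loads the "lexeme_norm" table for ${nlp.lang} into nlp.vocab.lookups.
nlp.initialize()
print("lexeme_norm" in nlp.vocab.lookups.tables)  # True when the data package is present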