diff --git a/spacy/errors.py b/spacy/errors.py
index 4ba51f669..1c934d188 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -562,7 +562,10 @@ class Errors:
     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
     E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
             "component.")
-    E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.")
+    E955 = ("Can't find table(s) '{table}' for language '{lang}' in "
+            "spacy-lookups-data. If you want to initialize a blank nlp object, "
+            "make sure you have the spacy-lookups-data package installed or "
+            "remove the [initialize.lookups] block from your config.")
     E956 = ("Can't find component '{name}' in [components] block in the config. "
             "Available components: {opts}")
     E957 = ("Writing directly to Language.factories isn't needed anymore in "
@@ -680,14 +683,9 @@ class Errors:
     E999 = ("Unable to merge the `Doc` objects because they do not all share "
             "the same `Vocab`.")
     E1000 = ("The Chinese word segmenter is pkuseg but no pkuseg model was "
-             "specified. Provide the name of a pretrained model or the path to "
-             "a model when initializing the pipeline:\n"
-             'config = {\n'
-             '   "@tokenizers": "spacy.zh.ChineseTokenizer",\n'
-             '   "segmenter": "pkuseg",\n'
-             '   "pkuseg_model": "default", # or "/path/to/pkuseg_model" \n'
-             '}\n'
-             'nlp = Chinese.from_config({"nlp": {"tokenizer": config}})')
+             "loaded. Provide the name of a pretrained model or the path to "
+             "a model and initialize the pipeline:\n\n"
+             'nlp.tokenizer.initialize(pkuseg_model="default")')
     E1001 = ("Target token outside of matched span for match with tokens "
              "'{span}' and offset '{index}' matched by patterns '{patterns}'.")
     E1002 = ("Span index out of range.")
diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py
index 8cac30b26..7128338af 100644
--- a/spacy/lang/da/__init__.py
+++ b/spacy/lang/da/__init__.py
@@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""


 class DanishDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py
index b645d3480..99c161961 100644
--- a/spacy/lang/de/__init__.py
+++ b/spacy/lang/de/__init__.py
@@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""


 class GermanDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
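The [initialize.lookups] block added to these language defaults means the lexeme_norm table is no longer assumed to be present: it is fetched from spacy-lookups-data when the pipeline is initialized, which is what the reworded E955 message points at. A minimal sketch of the intended flow, assuming the spacy-lookups-data package is installed:

    import spacy

    # The blank pipeline's config now carries the [initialize.lookups] block
    nlp = spacy.blank("da")
    # Initialization runs spacy.LookupsDataLoader.v1, which loads the
    # "lexeme_norm" table from spacy-lookups-data into nlp.vocab.lookups
    nlp.initialize()
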
diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py
index 1a7b19914..818405842 100644
--- a/spacy/lang/el/__init__.py
+++ b/spacy/lang/el/__init__.py
@@ -9,9 +9,21 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
 from .lemmatizer import GreekLemmatizer
 from ...lookups import Lookups
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""


 class GreekDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py
index bf7e9987f..cc01f1aea 100644
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -1,5 +1,4 @@
 from typing import Optional
-
 from thinc.api import Model

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py
index 87373551c..46bef57ca 100644
--- a/spacy/lang/id/__init__.py
+++ b/spacy/lang/id/__init__.py
@@ -4,9 +4,21 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""


 class IndonesianDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index e7cc1ef3b..4e6bf9d3c 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -2,7 +2,6 @@ from typing import Optional, Union, Dict, Any
 from pathlib import Path
 import srsly
 from collections import namedtuple
-from thinc.api import Config

 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
@@ -16,7 +15,7 @@ from ...scorer import Scorer
 from ...symbols import POS
 from ...tokens import Doc
 from ...training import validate_examples
-from ...util import DummyTokenizer, registry
+from ...util import DummyTokenizer, registry, load_config_from_str
 from ... import util


@@ -166,7 +165,7 @@ class JapaneseTokenizer(DummyTokenizer):


 class JapaneseDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py
index dd07ef89c..83c9f4962 100644
--- a/spacy/lang/ko/__init__.py
+++ b/spacy/lang/ko/__init__.py
@@ -1,5 +1,4 @@
 from typing import Optional, Any, Dict
-from thinc.api import Config

 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
@@ -10,7 +9,7 @@ from ...compat import copy_reg
 from ...scorer import Scorer
 from ...symbols import POS
 from ...training import validate_examples
-from ...util import DummyTokenizer, registry
+from ...util import DummyTokenizer, registry, load_config_from_str


 DEFAULT_CONFIG = """
@@ -70,7 +69,7 @@ class KoreanTokenizer(DummyTokenizer):


 class KoreanDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py
index da6fe55d7..ead5f5d10 100644
--- a/spacy/lang/lb/__init__.py
+++ b/spacy/lang/lb/__init__.py
@@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_INFIXES
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""


 class LuxembourgishDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     lex_attr_getters = LEX_ATTRS
diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py
index 0447099f0..1c95c11d9 100644
--- a/spacy/lang/pt/__init__.py
+++ b/spacy/lang/pt/__init__.py
@@ -3,9 +3,21 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""


 class PortugueseDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     prefixes = TOKENIZER_PREFIXES
diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py
index 4a296dd23..857e197e9 100644
--- a/spacy/lang/ru/__init__.py
+++ b/spacy/lang/ru/__init__.py
@@ -1,5 +1,4 @@
 from typing import Optional
-
 from thinc.api import Model

 from .stop_words import STOP_WORDS
@@ -8,9 +7,21 @@ from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
 from ...language import Language
 from ...lookups import Lookups
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""


 class RussianDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
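These modules now build their defaults with load_config_from_str from spacy.util instead of thinc's Config().from_str. A small illustrative call, under the assumption that variables like ${nlp.lang} stay uninterpolated until the language defaults are merged into the full pipeline config:

    from spacy.util import load_config_from_str

    partial = load_config_from_str("""
    [initialize]

    [initialize.lookups]
    @misc = "spacy.LookupsDataLoader.v1"
    lang = ${nlp.lang}
    tables = ["lexeme_norm"]
    """)
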
diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py
index 165e54975..5da19c6f3 100644
--- a/spacy/lang/sr/__init__.py
+++ b/spacy/lang/sr/__init__.py
@@ -2,9 +2,21 @@ from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""


 class SerbianDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
diff --git a/spacy/lang/ta/__init__.py b/spacy/lang/ta/__init__.py
index ac5fc7124..7a5a3ac8f 100644
--- a/spacy/lang/ta/__init__.py
+++ b/spacy/lang/ta/__init__.py
@@ -1,9 +1,21 @@
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""


 class TamilDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py
index a35ae987f..834fe1871 100644
--- a/spacy/lang/th/__init__.py
+++ b/spacy/lang/th/__init__.py
@@ -1,10 +1,8 @@
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
-from ...util import DummyTokenizer, registry
+from ...util import DummyTokenizer, registry, load_config_from_str


 DEFAULT_CONFIG = """
@@ -12,6 +10,13 @@ DEFAULT_CONFIG = """

 [nlp.tokenizer]
 @tokenizers = "spacy.th.ThaiTokenizer"
+
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
 """


@@ -42,7 +47,7 @@ class ThaiTokenizer(DummyTokenizer):


 class ThaiDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
diff --git a/spacy/lang/vi/__init__.py b/spacy/lang/vi/__init__.py
index 71f51eac6..1328de495 100644
--- a/spacy/lang/vi/__init__.py
+++ b/spacy/lang/vi/__init__.py
@@ -1,10 +1,8 @@
-from thinc.api import Config
-
+from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
-from .stop_words import STOP_WORDS
-from ...util import DummyTokenizer, registry
-from .lex_attrs import LEX_ATTRS
+from ...util import DummyTokenizer, registry, load_config_from_str


 DEFAULT_CONFIG = """
@@ -55,7 +53,7 @@ class VietnameseTokenizer(DummyTokenizer):


 class VietnameseDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index 752f77d11..858f41f65 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -1,17 +1,16 @@
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, Callable, Iterable
 from enum import Enum
 import tempfile
 import srsly
 import warnings
 from pathlib import Path
-from thinc.api import Config

 from ...errors import Warnings, Errors
 from ...language import Language
 from ...scorer import Scorer
 from ...tokens import Doc
-from ...training import validate_examples
-from ...util import DummyTokenizer, registry
+from ...training import validate_examples, Example
+from ...util import DummyTokenizer, registry, load_config_from_str
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from ... import util
@@ -28,6 +27,10 @@ DEFAULT_CONFIG = """
 [nlp.tokenizer]
 @tokenizers = "spacy.zh.ChineseTokenizer"
 segmenter = "char"
+
+[initialize]
+
+[initialize.tokenizer]
 pkuseg_model = null
 pkuseg_user_dict = "default"
 """
@@ -44,41 +47,23 @@ class Segmenter(str, Enum):


 @registry.tokenizers("spacy.zh.ChineseTokenizer")
-def create_chinese_tokenizer(
-    segmenter: Segmenter = Segmenter.char,
-    pkuseg_model: Optional[str] = None,
-    pkuseg_user_dict: Optional[str] = "default",
-):
+def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char,):
     def chinese_tokenizer_factory(nlp):
-        return ChineseTokenizer(
-            nlp,
-            segmenter=segmenter,
-            pkuseg_model=pkuseg_model,
-            pkuseg_user_dict=pkuseg_user_dict,
-        )
+        return ChineseTokenizer(nlp, segmenter=segmenter)

     return chinese_tokenizer_factory


 class ChineseTokenizer(DummyTokenizer):
     def __init__(
-        self,
-        nlp: Language,
-        segmenter: Segmenter = Segmenter.char,
-        pkuseg_model: Optional[str] = None,
-        pkuseg_user_dict: Optional[str] = None,
+        self, nlp: Language, segmenter: Segmenter = Segmenter.char,
     ):
         self.vocab = nlp.vocab
         if isinstance(segmenter, Segmenter):
             segmenter = segmenter.value
         self.segmenter = segmenter
-        self.pkuseg_model = pkuseg_model
-        self.pkuseg_user_dict = pkuseg_user_dict
         self.pkuseg_seg = None
         self.jieba_seg = None
-        self.configure_segmenter(segmenter)
-
-    def configure_segmenter(self, segmenter: str):
         if segmenter not in Segmenter.values():
             warn_msg = Warnings.W103.format(
                 lang="Chinese",
@@ -88,12 +73,21 @@ class ChineseTokenizer(DummyTokenizer):
             )
             warnings.warn(warn_msg)
             self.segmenter = Segmenter.char
-        self.jieba_seg = try_jieba_import(self.segmenter)
-        self.pkuseg_seg = try_pkuseg_import(
-            self.segmenter,
-            pkuseg_model=self.pkuseg_model,
-            pkuseg_user_dict=self.pkuseg_user_dict,
-        )
+        if segmenter == Segmenter.jieba:
+            self.jieba_seg = try_jieba_import()
+
+    def initialize(
+        self,
+        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
+        *,
+        nlp: Optional[Language] = None,
+        pkuseg_model: Optional[str] = None,
+        pkuseg_user_dict: str = "default",
+    ):
+        if self.segmenter == Segmenter.pkuseg:
+            self.pkuseg_seg = try_pkuseg_import(
+                pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict,
+            )

     def __call__(self, text: str) -> Doc:
         if self.segmenter == Segmenter.jieba:
@@ -148,14 +142,10 @@ class ChineseTokenizer(DummyTokenizer):
     def _get_config(self) -> Dict[str, Any]:
         return {
             "segmenter": self.segmenter,
-            "pkuseg_model": self.pkuseg_model,
-            "pkuseg_user_dict": self.pkuseg_user_dict,
         }

     def _set_config(self, config: Dict[str, Any] = {}) -> None:
         self.segmenter = config.get("segmenter", Segmenter.char)
-        self.pkuseg_model = config.get("pkuseg_model", None)
-        self.pkuseg_user_dict = config.get("pkuseg_user_dict", "default")

     def to_bytes(self, **kwargs):
         pkuseg_features_b = b""
@@ -322,7 +312,7 @@ class ChineseTokenizer(DummyTokenizer):


 class ChineseDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
@@ -333,42 +323,33 @@ class Chinese(Language):
     Defaults = ChineseDefaults


-def try_jieba_import(segmenter: str) -> None:
+def try_jieba_import() -> None:
     try:
         import jieba

-        if segmenter == Segmenter.jieba:
-            # segment a short text to have jieba initialize its cache in advance
-            list(jieba.cut("作为", cut_all=False))
+        # segment a short text to have jieba initialize its cache in advance
+        list(jieba.cut("作为", cut_all=False))

         return jieba
     except ImportError:
-        if segmenter == Segmenter.jieba:
-            msg = (
-                "Jieba not installed. To use jieba, install it with `pip "
-                " install jieba` or from https://github.com/fxsjy/jieba"
-            )
-            raise ImportError(msg) from None
+        msg = (
+            "Jieba not installed. To use jieba, install it with `pip "
+            " install jieba` or from https://github.com/fxsjy/jieba"
+        )
+        raise ImportError(msg) from None


-def try_pkuseg_import(
-    segmenter: str, pkuseg_model: Optional[str], pkuseg_user_dict: str
-) -> None:
+def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None:
     try:
         import pkuseg

-        if pkuseg_model is None:
-            return None
-        else:
-            return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
+        return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
     except ImportError:
-        if segmenter == Segmenter.pkuseg:
-            msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
-            raise ImportError(msg) from None
+        msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
+        raise ImportError(msg) from None
     except FileNotFoundError:
-        if segmenter == Segmenter.pkuseg:
-            msg = "Unable to load pkuseg model from: " + pkuseg_model
-            raise FileNotFoundError(msg) from None
+        msg = "Unable to load pkuseg model from: " + pkuseg_model
+        raise FileNotFoundError(msg) from None


 def _get_pkuseg_trie_data(node, path=""):
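With this change the segmenter choice remains a construction-time setting under [nlp.tokenizer], while the pkuseg model is only loaded at initialization time, either through nlp.initialize() with an [initialize.tokenizer] block or by calling the tokenizer's initialize method directly, as the updated E1000 message suggests. A hedged sketch of the direct route, mirroring the config used in the tests below:

    from spacy.lang.zh import Chinese

    cfg = {
        "nlp": {
            "tokenizer": {
                "@tokenizers": "spacy.zh.ChineseTokenizer",
                "segmenter": "pkuseg",
            }
        }
    }
    nlp = Chinese.from_config(cfg)
    # "default" names a pretrained pkuseg model; a path to a local model also works (per E1000)
    nlp.tokenizer.initialize(pkuseg_model="default")
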
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 23fc5e98f..bcf582388 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -272,10 +272,14 @@ def zh_tokenizer_char():
 def zh_tokenizer_jieba():
     pytest.importorskip("jieba")
     config = {
-        "@tokenizers": "spacy.zh.ChineseTokenizer",
-        "segmenter": "jieba",
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.zh.ChineseTokenizer",
+                "segmenter": "jieba",
+            }
+        }
     }
-    nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}})
+    nlp = get_lang_class("zh").from_config(config)
     return nlp.tokenizer


@@ -284,11 +288,19 @@ def zh_tokenizer_pkuseg():
     pytest.importorskip("pkuseg")
     pytest.importorskip("pickle5")
     config = {
-        "@tokenizers": "spacy.zh.ChineseTokenizer",
-        "segmenter": "pkuseg",
-        "pkuseg_model": "default",
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.zh.ChineseTokenizer",
+                "segmenter": "pkuseg",
+            }
+        },
+        "initialize": {"tokenizer": {
+            "pkuseg_model": "default",
+        }
+        },
     }
-    nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}})
+    nlp = get_lang_class("zh").from_config(config)
+    nlp.initialize()
     return nlp.tokenizer


diff --git a/spacy/tests/lang/zh/test_serialize.py b/spacy/tests/lang/zh/test_serialize.py
index 5491314e2..58c084ec8 100644
--- a/spacy/tests/lang/zh/test_serialize.py
+++ b/spacy/tests/lang/zh/test_serialize.py
@@ -28,9 +28,17 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
 @pytest.mark.slow
 def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
     config = {
-        "@tokenizers": "spacy.zh.ChineseTokenizer",
-        "segmenter": "pkuseg",
-        "pkuseg_model": "medicine",
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.zh.ChineseTokenizer",
+                "segmenter": "pkuseg",
+            }
+        },
+        "initialize": {"tokenizer": {
+            "pkuseg_model": "medicine",
+        }
+        },
     }
-    nlp = Chinese.from_config({"nlp": {"tokenizer": config}})
+    nlp = Chinese.from_config(config)
+    nlp.initialize()
     zh_tokenizer_serialize(nlp.tokenizer)
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index b657ae2e8..78a20c1e8 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -339,6 +339,7 @@ def test_ner_warns_no_lookups(caplog):
     nlp.vocab.lookups = Lookups()
     assert not len(nlp.vocab.lookups)
     nlp.add_pipe("ner")
+    nlp.config["initialize"]["lookups"] = None
     with caplog.at_level(logging.DEBUG):
         nlp.initialize()
     assert "W033" in caplog.text
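The test pins down how the new defaults interact with a pipeline that deliberately has no lookups: nulling out the block before initialization keeps nlp.initialize() from trying to load tables from spacy-lookups-data, which is the same escape hatch the updated E955 message describes (removing [initialize.lookups] from the config). A sketch of that pattern outside the test suite, using the same config override as the test:

    import spacy

    nlp = spacy.blank("da")
    nlp.add_pipe("ner")
    # Skip the lookups tables entirely; without lexeme_norm the ner component
    # logs W033 during initialization, as the test above asserts
    nlp.config["initialize"]["lookups"] = None
    nlp.initialize()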