Mirror of https://github.com/explosion/spaCy.git
Update errors and make Tokenizer.initialize args less strict
parent 6b7bb32834
commit 6f29f68f69
@@ -554,7 +554,10 @@ class Errors:
     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
     E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
             "component.")
-    E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.")
+    E955 = ("Can't find table(s) '{table}' for language '{lang}' in "
+            "spacy-lookups-data. If you want to initialize a blank nlp object, "
+            "make sure you have the spacy-lookups-data package installed or "
+            "remove the [initialize.lookups] block from your config.")
     E956 = ("Can't find component '{name}' in [components] block in the config. "
             "Available components: {opts}")
     E957 = ("Writing directly to Language.factories isn't needed anymore in "
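The reworded E955 describes a failure during pipeline initialization. As a rough illustration (not part of the commit, and assuming a blank pipeline whose config still contains an [initialize.lookups] block), the code path the message refers to looks like this:

import spacy

# Hypothetical scenario matching the new E955 wording: initializing a blank
# pipeline whose config requests lookup tables from spacy-lookups-data.
nlp = spacy.blank("en")
nlp.initialize()  # E955 is raised here if spacy-lookups-data is missing
# Fixes suggested by the message: `pip install spacy-lookups-data`, or
# remove the [initialize.lookups] block from the config.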
@@ -674,20 +677,7 @@ class Errors:
     E1000 = ("The Chinese word segmenter is pkuseg but no pkuseg model was "
              "loaded. Provide the name of a pretrained model or the path to "
              "a model and initialize the pipeline:\n\n"
-             'config = {\n'
-             '   "nlp": {\n'
-             '       "tokenizer": {\n'
-             '           "@tokenizers": "spacy.zh.ChineseTokenizer",\n'
-             '           "segmenter": "pkuseg",\n'
-             '       }\n'
-             '   },\n'
-             '   "initialize": {"tokenizer": {\n'
-             '       "pkuseg_model": "default", # or /path/to/model\n'
-             '   }\n'
-             '   },\n'
-             '}\n'
-             'nlp = Chinese.from_config(config)\n'
-             'nlp.initialize()')
+             'nlp.tokenizer.initialize(pkuseg_model="default")')
     E1001 = ("Target token outside of matched span for match with tokens "
              "'{span}' and offset '{index}' matched by patterns '{patterns}'.")
     E1002 = ("Span index out of range.")
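For context, the removed config walkthrough and the new one-line hint describe the same flow. A sketch of it (mine, not part of the diff, assuming the spacy-pkuseg "default" model data is available):

from spacy.lang.zh import Chinese

# Build a Chinese pipeline whose tokenizer uses the pkuseg segmenter,
# mirroring the config from the old E1000 message.
config = {
    "nlp": {
        "tokenizer": {
            "@tokenizers": "spacy.zh.ChineseTokenizer",
            "segmenter": "pkuseg",
        }
    }
}
nlp = Chinese.from_config(config)
# The shortened message now points users straight at the tokenizer:
nlp.tokenizer.initialize(pkuseg_model="default")  # or a local /path/to/model
doc = nlp("这是一个句子")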
@@ -56,9 +56,7 @@ def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char,):
 
 class ChineseTokenizer(DummyTokenizer):
     def __init__(
-        self,
-        nlp: Language,
-        segmenter: Segmenter = Segmenter.char,
+        self, nlp: Language, segmenter: Segmenter = Segmenter.char,
     ):
         self.vocab = nlp.vocab
         if isinstance(segmenter, Segmenter):
@@ -80,9 +78,9 @@ class ChineseTokenizer(DummyTokenizer):
 
     def initialize(
         self,
-        get_examples: Callable[[], Iterable[Example]],
+        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
         *,
-        nlp: Optional[Language],
+        nlp: Optional[Language] = None,
         pkuseg_model: Optional[str] = None,
         pkuseg_user_dict: str = "default",
     ):
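The relaxed signature above is what makes the direct call in the new E1000 message legal: get_examples and nlp now default to None. A minimal sketch, under the same assumptions as the snippet after the E1000 hunk:

from spacy.lang.zh import Chinese

nlp = Chinese.from_config({"nlp": {"tokenizer": {
    "@tokenizers": "spacy.zh.ChineseTokenizer",
    "segmenter": "pkuseg",
}}})
# Neither a get_examples callable nor an nlp argument has to be passed anymore:
nlp.tokenizer.initialize(pkuseg_model="default", pkuseg_user_dict="default")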