mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Update errors and make Tokenizer.initialize args less strict
This commit is contained in:
parent
6b7bb32834
commit
6f29f68f69
|
@@ -554,7 +554,10 @@ class Errors:
|
|||
E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
|
||||
E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
|
||||
"component.")
|
||||
- E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.")
|
||||
+ E955 = ("Can't find table(s) '{table}' for language '{lang}' in "
|
||||
+ "spacy-lookups-data. If you want to initialize a blank nlp object, "
|
||||
+ "make sure you have the spacy-lookups-data package installed or "
|
||||
+ "remove the [initialize.lookups] block from your config.")
|
||||
E956 = ("Can't find component '{name}' in [components] block in the config. "
|
||||
"Available components: {opts}")
|
||||
E957 = ("Writing directly to Language.factories isn't needed anymore in "
|
||||
|
@@ -674,20 +677,7 @@ class Errors:
|
|||
E1000 = ("The Chinese word segmenter is pkuseg but no pkuseg model was "
|
||||
"loaded. Provide the name of a pretrained model or the path to "
|
||||
"a model and initialize the pipeline:\n\n"
|
||||
- 'config = {\n'
|
||||
- ' "nlp": {\n'
|
||||
- ' "tokenizer": {\n'
|
||||
- ' "@tokenizers": "spacy.zh.ChineseTokenizer",\n'
|
||||
- ' "segmenter": "pkuseg",\n'
|
||||
- ' }\n'
|
||||
- ' },\n'
|
||||
- ' "initialize": {"tokenizer": {\n'
|
||||
- ' "pkuseg_model": "default", # or /path/to/model\n'
|
||||
- ' }\n'
|
||||
- ' },\n'
|
||||
- '}\n'
|
||||
- 'nlp = Chinese.from_config(config)\n'
|
||||
- 'nlp.initialize()')
|
||||
+ 'nlp.tokenizer.initialize(pkuseg_model="default")')
|
||||
E1001 = ("Target token outside of matched span for match with tokens "
|
||||
"'{span}' and offset '{index}' matched by patterns '{patterns}'.")
|
||||
E1002 = ("Span index out of range.")
|
||||
|
|
|
@@ -56,9 +56,7 @@ def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char,):
|
|||
|
||||
class ChineseTokenizer(DummyTokenizer):
|
||||
def __init__(
|
||||
- self,
|
||||
- nlp: Language,
|
||||
- segmenter: Segmenter = Segmenter.char,
|
||||
+ self, nlp: Language, segmenter: Segmenter = Segmenter.char,
|
||||
):
|
||||
self.vocab = nlp.vocab
|
||||
if isinstance(segmenter, Segmenter):
|
||||
|
@@ -80,9 +78,9 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
|
||||
def initialize(
|
||||
self,
|
||||
- get_examples: Callable[[], Iterable[Example]],
|
||||
+ get_examples: Optional[Callable[[], Iterable[Example]]] = None,
|
||||
*,
|
||||
- nlp: Optional[Language],
|
||||
+ nlp: Optional[Language] = None,
|
||||
pkuseg_model: Optional[str] = None,
|
||||
pkuseg_user_dict: str = "default",
|
||||
):
|
||||
|
|
Loading…
Reference in New Issue
Block a user