Merge pull request #6165 from explosion/feature/update-tokenizers-initialize

Ines Montani 2020-10-01 09:49:47 +02:00 committed by GitHub
commit 381258b75b
19 changed files with 204 additions and 97 deletions

View File

@@ -562,7 +562,10 @@ class Errors:
     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
     E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
             "component.")
-    E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.")
+    E955 = ("Can't find table(s) '{table}' for language '{lang}' in "
+            "spacy-lookups-data. If you want to initialize a blank nlp object, "
+            "make sure you have the spacy-lookups-data package installed or "
+            "remove the [initialize.lookups] block from your config.")
     E956 = ("Can't find component '{name}' in [components] block in the config. "
             "Available components: {opts}")
     E957 = ("Writing directly to Language.factories isn't needed anymore in "
@@ -680,14 +683,9 @@ class Errors:
     E999 = ("Unable to merge the `Doc` objects because they do not all share "
             "the same `Vocab`.")
     E1000 = ("The Chinese word segmenter is pkuseg but no pkuseg model was "
-             "specified. Provide the name of a pretrained model or the path to "
-             "a model when initializing the pipeline:\n"
-             'config = {\n'
-             '   "@tokenizers": "spacy.zh.ChineseTokenizer",\n'
-             '   "segmenter": "pkuseg",\n'
-             '   "pkuseg_model": "default", # or "/path/to/pkuseg_model" \n'
-             '}\n'
-             'nlp = Chinese.from_config({"nlp": {"tokenizer": config}})')
+             "loaded. Provide the name of a pretrained model or the path to "
+             "a model and initialize the pipeline:\n\n"
+             'nlp.tokenizer.initialize(pkuseg_model="default")')
     E1001 = ("Target token outside of matched span for match with tokens "
              "'{span}' and offset '{index}' matched by patterns '{patterns}'.")
     E1002 = ("Span index out of range.")

View File

@@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class DanishDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
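Note: the [initialize.lookups] block added here (and to the other lexeme_norm languages below) means a blank pipeline pulls its normalization table from the spacy-lookups-data package when it is initialized. A rough sketch of the intended behaviour, assuming spacy-lookups-data is installed (the table inspection at the end is illustrative):

    import spacy

    nlp = spacy.blank("da")   # DanishDefaults now carries the [initialize] block
    nlp.initialize()          # resolves spacy.LookupsDataLoader.v1 for "lexeme_norm"
    print(nlp.vocab.lookups.tables)

If the package is missing, the updated E955 message above explains how to either install it or drop the [initialize.lookups] block from the config.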

View File

@@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class GermanDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES

View File

@@ -9,9 +9,21 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
 from .lemmatizer import GreekLemmatizer
 from ...lookups import Lookups
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class GreekDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES

View File

@@ -1,5 +1,4 @@
 from typing import Optional
 from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS

View File

@@ -4,9 +4,21 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class IndonesianDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES

View File

@@ -2,7 +2,6 @@ from typing import Optional, Union, Dict, Any
 from pathlib import Path
 import srsly
 from collections import namedtuple
-from thinc.api import Config
 
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
@@ -16,7 +15,7 @@ from ...scorer import Scorer
 from ...symbols import POS
 from ...tokens import Doc
 from ...training import validate_examples
-from ...util import DummyTokenizer, registry
+from ...util import DummyTokenizer, registry, load_config_from_str
 from ... import util
@@ -166,7 +165,7 @@ class JapaneseTokenizer(DummyTokenizer):
 
 class JapaneseDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

View File

@@ -1,5 +1,4 @@
 from typing import Optional, Any, Dict
-from thinc.api import Config
 
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
@@ -10,7 +9,7 @@ from ...compat import copy_reg
 from ...scorer import Scorer
 from ...symbols import POS
 from ...training import validate_examples
-from ...util import DummyTokenizer, registry
+from ...util import DummyTokenizer, registry, load_config_from_str
 
 
 DEFAULT_CONFIG = """
@@ -70,7 +69,7 @@ class KoreanTokenizer(DummyTokenizer):
 
 class KoreanDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

View File

@@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_INFIXES
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class LuxembourgishDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     lex_attr_getters = LEX_ATTRS

View File

@@ -3,9 +3,21 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class PortugueseDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     prefixes = TOKENIZER_PREFIXES

View File

@@ -1,5 +1,4 @@
 from typing import Optional
 from thinc.api import Model
 from .stop_words import STOP_WORDS
@@ -8,9 +7,21 @@ from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
 from ...language import Language
 from ...lookups import Lookups
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class RussianDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
stop_words = STOP_WORDS stop_words = STOP_WORDS

View File

@@ -2,9 +2,21 @@ from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class SerbianDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

View File

@@ -1,9 +1,21 @@
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class TamilDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

View File

@@ -1,10 +1,8 @@
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
-from ...util import DummyTokenizer, registry
+from ...util import DummyTokenizer, registry, load_config_from_str
 
 
 DEFAULT_CONFIG = """
@@ -12,6 +10,13 @@ DEFAULT_CONFIG = """
 [nlp.tokenizer]
 @tokenizers = "spacy.th.ThaiTokenizer"
+
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
 """
@@ -42,7 +47,7 @@ class ThaiTokenizer(DummyTokenizer):
 
 class ThaiDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

View File

@@ -1,10 +1,8 @@
-from thinc.api import Config
-
+from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
-from .stop_words import STOP_WORDS
-from ...util import DummyTokenizer, registry
-from .lex_attrs import LEX_ATTRS
+from ...util import DummyTokenizer, registry, load_config_from_str
 
 
 DEFAULT_CONFIG = """
@@ -55,7 +53,7 @@ class VietnameseTokenizer(DummyTokenizer):
 
 class VietnameseDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

View File

@@ -1,17 +1,16 @@
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, Callable, Iterable
 from enum import Enum
 import tempfile
 import srsly
 import warnings
 from pathlib import Path
-from thinc.api import Config
 
 from ...errors import Warnings, Errors
 from ...language import Language
 from ...scorer import Scorer
 from ...tokens import Doc
-from ...training import validate_examples
-from ...util import DummyTokenizer, registry
+from ...training import validate_examples, Example
+from ...util import DummyTokenizer, registry, load_config_from_str
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from ... import util
@@ -28,6 +27,10 @@ DEFAULT_CONFIG = """
 [nlp.tokenizer]
 @tokenizers = "spacy.zh.ChineseTokenizer"
 segmenter = "char"
+
+[initialize]
+
+[initialize.tokenizer]
 pkuseg_model = null
 pkuseg_user_dict = "default"
 """
@@ -44,41 +47,23 @@ class Segmenter(str, Enum):
 
 @registry.tokenizers("spacy.zh.ChineseTokenizer")
-def create_chinese_tokenizer(
-    segmenter: Segmenter = Segmenter.char,
-    pkuseg_model: Optional[str] = None,
-    pkuseg_user_dict: Optional[str] = "default",
-):
+def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char,):
     def chinese_tokenizer_factory(nlp):
-        return ChineseTokenizer(
-            nlp,
-            segmenter=segmenter,
-            pkuseg_model=pkuseg_model,
-            pkuseg_user_dict=pkuseg_user_dict,
-        )
+        return ChineseTokenizer(nlp, segmenter=segmenter)
 
     return chinese_tokenizer_factory
 
 
 class ChineseTokenizer(DummyTokenizer):
     def __init__(
-        self,
-        nlp: Language,
-        segmenter: Segmenter = Segmenter.char,
-        pkuseg_model: Optional[str] = None,
-        pkuseg_user_dict: Optional[str] = None,
+        self, nlp: Language, segmenter: Segmenter = Segmenter.char,
     ):
         self.vocab = nlp.vocab
         if isinstance(segmenter, Segmenter):
             segmenter = segmenter.value
         self.segmenter = segmenter
-        self.pkuseg_model = pkuseg_model
-        self.pkuseg_user_dict = pkuseg_user_dict
         self.pkuseg_seg = None
         self.jieba_seg = None
-        self.configure_segmenter(segmenter)
-
-    def configure_segmenter(self, segmenter: str):
         if segmenter not in Segmenter.values():
             warn_msg = Warnings.W103.format(
                 lang="Chinese",
@@ -88,11 +73,20 @@ class ChineseTokenizer(DummyTokenizer):
             )
             warnings.warn(warn_msg)
             self.segmenter = Segmenter.char
-        self.jieba_seg = try_jieba_import(self.segmenter)
+        if segmenter == Segmenter.jieba:
+            self.jieba_seg = try_jieba_import()
+
+    def initialize(
+        self,
+        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
+        *,
+        nlp: Optional[Language] = None,
+        pkuseg_model: Optional[str] = None,
+        pkuseg_user_dict: str = "default",
+    ):
+        if self.segmenter == Segmenter.pkuseg:
             self.pkuseg_seg = try_pkuseg_import(
-            self.segmenter,
-            pkuseg_model=self.pkuseg_model,
-            pkuseg_user_dict=self.pkuseg_user_dict,
+                pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict,
             )
 
     def __call__(self, text: str) -> Doc:
@@ -148,14 +142,10 @@ class ChineseTokenizer(DummyTokenizer):
     def _get_config(self) -> Dict[str, Any]:
         return {
             "segmenter": self.segmenter,
-            "pkuseg_model": self.pkuseg_model,
-            "pkuseg_user_dict": self.pkuseg_user_dict,
         }
 
     def _set_config(self, config: Dict[str, Any] = {}) -> None:
         self.segmenter = config.get("segmenter", Segmenter.char)
-        self.pkuseg_model = config.get("pkuseg_model", None)
-        self.pkuseg_user_dict = config.get("pkuseg_user_dict", "default")
 
     def to_bytes(self, **kwargs):
         pkuseg_features_b = b""
@@ -322,7 +312,7 @@ class ChineseTokenizer(DummyTokenizer):
 
 class ChineseDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
@@ -333,17 +323,15 @@ class Chinese(Language):
     Defaults = ChineseDefaults
 
 
-def try_jieba_import(segmenter: str) -> None:
+def try_jieba_import() -> None:
     try:
         import jieba
 
-        if segmenter == Segmenter.jieba:
         # segment a short text to have jieba initialize its cache in advance
         list(jieba.cut("作为", cut_all=False))
 
         return jieba
     except ImportError:
-        if segmenter == Segmenter.jieba:
         msg = (
             "Jieba not installed. To use jieba, install it with `pip "
             " install jieba` or from https://github.com/fxsjy/jieba"
@@ -351,22 +339,15 @@ def try_jieba_import(segmenter: str) -> None:
         raise ImportError(msg) from None
 
 
-def try_pkuseg_import(
-    segmenter: str, pkuseg_model: Optional[str], pkuseg_user_dict: str
-) -> None:
+def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None:
     try:
         import pkuseg
 
-        if pkuseg_model is None:
-            return None
-        else:
         return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
     except ImportError:
-        if segmenter == Segmenter.pkuseg:
         msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
         raise ImportError(msg) from None
     except FileNotFoundError:
-        if segmenter == Segmenter.pkuseg:
         msg = "Unable to load pkuseg model from: " + pkuseg_model
         raise FileNotFoundError(msg) from None
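Note: with pkuseg_model and pkuseg_user_dict moved into [initialize.tokenizer], the settings are applied when nlp.initialize() runs rather than when the tokenizer is constructed. A sketch mirroring the updated test fixtures below (assumes pkuseg is installed; "default" is the same placeholder model name the config uses):

    from spacy.lang.zh import Chinese

    config = {
        "nlp": {"tokenizer": {"@tokenizers": "spacy.zh.ChineseTokenizer",
                              "segmenter": "pkuseg"}},
        "initialize": {"tokenizer": {"pkuseg_model": "default",
                                     "pkuseg_user_dict": "default"}},
    }
    nlp = Chinese.from_config(config)
    # initialize() forwards the [initialize.tokenizer] settings to
    # ChineseTokenizer.initialize(), which loads the pkuseg model.
    nlp.initialize()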

View File

@@ -272,10 +272,14 @@ def zh_tokenizer_char():
 def zh_tokenizer_jieba():
     pytest.importorskip("jieba")
     config = {
-        "@tokenizers": "spacy.zh.ChineseTokenizer",
-        "segmenter": "jieba",
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.zh.ChineseTokenizer",
+                "segmenter": "jieba",
+            }
+        }
     }
-    nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}})
+    nlp = get_lang_class("zh").from_config(config)
     return nlp.tokenizer
@@ -284,11 +288,19 @@ def zh_tokenizer_pkuseg():
     pytest.importorskip("pkuseg")
     pytest.importorskip("pickle5")
     config = {
-        "@tokenizers": "spacy.zh.ChineseTokenizer",
-        "segmenter": "pkuseg",
-        "pkuseg_model": "default",
-    }
-    nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}})
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.zh.ChineseTokenizer",
+                "segmenter": "pkuseg",
+            }
+        },
+        "initialize": {"tokenizer": {
+            "pkuseg_model": "default",
+        }},
+    }
+    nlp = get_lang_class("zh").from_config(config)
+    nlp.initialize()
     return nlp.tokenizer

View File

@@ -28,9 +28,17 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
 
 @pytest.mark.slow
 def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
     config = {
-        "@tokenizers": "spacy.zh.ChineseTokenizer",
-        "segmenter": "pkuseg",
-        "pkuseg_model": "medicine",
-    }
-    nlp = Chinese.from_config({"nlp": {"tokenizer": config}})
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.zh.ChineseTokenizer",
+                "segmenter": "pkuseg",
+            }
+        },
+        "initialize": {"tokenizer": {
+            "pkuseg_model": "medicine",
+        }},
+    }
+    nlp = Chinese.from_config(config)
+    nlp.initialize()
     zh_tokenizer_serialize(nlp.tokenizer)

View File

@@ -339,6 +339,7 @@ def test_ner_warns_no_lookups(caplog):
     nlp.vocab.lookups = Lookups()
     assert not len(nlp.vocab.lookups)
     nlp.add_pipe("ner")
+    nlp.config["initialize"]["lookups"] = None
     with caplog.at_level(logging.DEBUG):
         nlp.initialize()
     assert "W033" in caplog.text