mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-23 15:54:13 +03:00
Merge pull request #6165 from explosion/feature/update-tokenizers-initialize
This commit is contained in:
commit
381258b75b
|
@ -562,7 +562,10 @@ class Errors:
|
|||
E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
|
||||
E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
|
||||
"component.")
|
||||
E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.")
|
||||
E955 = ("Can't find table(s) '{table}' for language '{lang}' in "
|
||||
"spacy-lookups-data. If you want to initialize a blank nlp object, "
|
||||
"make sure you have the spacy-lookups-data package installed or "
|
||||
"remove the [initialize.lookups] block from your config.")
|
||||
E956 = ("Can't find component '{name}' in [components] block in the config. "
|
||||
"Available components: {opts}")
|
||||
E957 = ("Writing directly to Language.factories isn't needed anymore in "
|
||||
|
@ -680,14 +683,9 @@ class Errors:
|
|||
E999 = ("Unable to merge the `Doc` objects because they do not all share "
|
||||
"the same `Vocab`.")
|
||||
E1000 = ("The Chinese word segmenter is pkuseg but no pkuseg model was "
|
||||
"specified. Provide the name of a pretrained model or the path to "
|
||||
"a model when initializing the pipeline:\n"
|
||||
'config = {\n'
|
||||
' "@tokenizers": "spacy.zh.ChineseTokenizer",\n'
|
||||
' "segmenter": "pkuseg",\n'
|
||||
' "pkuseg_model": "default", # or "/path/to/pkuseg_model" \n'
|
||||
'}\n'
|
||||
'nlp = Chinese.from_config({"nlp": {"tokenizer": config}})')
|
||||
"loaded. Provide the name of a pretrained model or the path to "
|
||||
"a model and initialize the pipeline:\n\n"
|
||||
'nlp.tokenizer.initialize(pkuseg_model="default")')
|
||||
E1001 = ("Target token outside of matched span for match with tokens "
|
||||
"'{span}' and offset '{index}' matched by patterns '{patterns}'.")
|
||||
E1002 = ("Span index out of range.")
|
||||
|
|
|
@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
|||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language
|
||||
from ...util import load_config_from_str
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[initialize]
|
||||
|
||||
[initialize.lookups]
|
||||
@misc = "spacy.LookupsDataLoader.v1"
|
||||
lang = ${nlp.lang}
|
||||
tables = ["lexeme_norm"]
|
||||
"""
|
||||
|
||||
|
||||
class DanishDefaults(Language.Defaults):
|
||||
config = load_config_from_str(DEFAULT_CONFIG)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
|
|
|
@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
|
|||
from .stop_words import STOP_WORDS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language
|
||||
from ...util import load_config_from_str
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[initialize]
|
||||
|
||||
[initialize.lookups]
|
||||
@misc = "spacy.LookupsDataLoader.v1"
|
||||
lang = ${nlp.lang}
|
||||
tables = ["lexeme_norm"]
|
||||
"""
|
||||
|
||||
|
||||
class GermanDefaults(Language.Defaults):
|
||||
config = load_config_from_str(DEFAULT_CONFIG)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
|
|
|
@ -9,9 +9,21 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
|
|||
from .lemmatizer import GreekLemmatizer
|
||||
from ...lookups import Lookups
|
||||
from ...language import Language
|
||||
from ...util import load_config_from_str
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[initialize]
|
||||
|
||||
[initialize.lookups]
|
||||
@misc = "spacy.LookupsDataLoader.v1"
|
||||
lang = ${nlp.lang}
|
||||
tables = ["lexeme_norm"]
|
||||
"""
|
||||
|
||||
|
||||
class GreekDefaults(Language.Defaults):
|
||||
config = load_config_from_str(DEFAULT_CONFIG)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
from typing import Optional
|
||||
|
||||
from thinc.api import Model
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
|
|
|
@ -4,9 +4,21 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language
|
||||
from ...util import load_config_from_str
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[initialize]
|
||||
|
||||
[initialize.lookups]
|
||||
@misc = "spacy.LookupsDataLoader.v1"
|
||||
lang = ${nlp.lang}
|
||||
tables = ["lexeme_norm"]
|
||||
"""
|
||||
|
||||
|
||||
class IndonesianDefaults(Language.Defaults):
|
||||
config = load_config_from_str(DEFAULT_CONFIG)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
|
|
|
@ -2,7 +2,6 @@ from typing import Optional, Union, Dict, Any
|
|||
from pathlib import Path
|
||||
import srsly
|
||||
from collections import namedtuple
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
@ -16,7 +15,7 @@ from ...scorer import Scorer
|
|||
from ...symbols import POS
|
||||
from ...tokens import Doc
|
||||
from ...training import validate_examples
|
||||
from ...util import DummyTokenizer, registry
|
||||
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||
from ... import util
|
||||
|
||||
|
||||
|
@ -166,7 +165,7 @@ class JapaneseTokenizer(DummyTokenizer):
|
|||
|
||||
|
||||
class JapaneseDefaults(Language.Defaults):
|
||||
config = Config().from_str(DEFAULT_CONFIG)
|
||||
config = load_config_from_str(DEFAULT_CONFIG)
|
||||
stop_words = STOP_WORDS
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
from typing import Optional, Any, Dict
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tag_map import TAG_MAP
|
||||
|
@ -10,7 +9,7 @@ from ...compat import copy_reg
|
|||
from ...scorer import Scorer
|
||||
from ...symbols import POS
|
||||
from ...training import validate_examples
|
||||
from ...util import DummyTokenizer, registry
|
||||
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -70,7 +69,7 @@ class KoreanTokenizer(DummyTokenizer):
|
|||
|
||||
|
||||
class KoreanDefaults(Language.Defaults):
|
||||
config = Config().from_str(DEFAULT_CONFIG)
|
||||
config = load_config_from_str(DEFAULT_CONFIG)
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||
|
|
|
@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_INFIXES
|
|||
from .lex_attrs import LEX_ATTRS
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...util import load_config_from_str
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[initialize]
|
||||
|
||||
[initialize.lookups]
|
||||
@misc = "spacy.LookupsDataLoader.v1"
|
||||
lang = ${nlp.lang}
|
||||
tables = ["lexeme_norm"]
|
||||
"""
|
||||
|
||||
|
||||
class LuxembourgishDefaults(Language.Defaults):
|
||||
config = load_config_from_str(DEFAULT_CONFIG)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
|
|
|
@ -3,9 +3,21 @@ from .stop_words import STOP_WORDS
|
|||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
|
||||
from ...language import Language
|
||||
from ...util import load_config_from_str
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[initialize]
|
||||
|
||||
[initialize.lookups]
|
||||
@misc = "spacy.LookupsDataLoader.v1"
|
||||
lang = ${nlp.lang}
|
||||
tables = ["lexeme_norm"]
|
||||
"""
|
||||
|
||||
|
||||
class PortugueseDefaults(Language.Defaults):
|
||||
config = load_config_from_str(DEFAULT_CONFIG)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
from typing import Optional
|
||||
|
||||
from thinc.api import Model
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
|
@ -8,9 +7,21 @@ from .lex_attrs import LEX_ATTRS
|
|||
from .lemmatizer import RussianLemmatizer
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...util import load_config_from_str
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[initialize]
|
||||
|
||||
[initialize.lookups]
|
||||
@misc = "spacy.LookupsDataLoader.v1"
|
||||
lang = ${nlp.lang}
|
||||
tables = ["lexeme_norm"]
|
||||
"""
|
||||
|
||||
|
||||
class RussianDefaults(Language.Defaults):
|
||||
config = load_config_from_str(DEFAULT_CONFIG)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
|
|
@ -2,9 +2,21 @@ from .stop_words import STOP_WORDS
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language
|
||||
from ...util import load_config_from_str
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[initialize]
|
||||
|
||||
[initialize.lookups]
|
||||
@misc = "spacy.LookupsDataLoader.v1"
|
||||
lang = ${nlp.lang}
|
||||
tables = ["lexeme_norm"]
|
||||
"""
|
||||
|
||||
|
||||
class SerbianDefaults(Language.Defaults):
|
||||
config = load_config_from_str(DEFAULT_CONFIG)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
|
|
@ -1,9 +1,21 @@
|
|||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language
|
||||
from ...util import load_config_from_str
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[initialize]
|
||||
|
||||
[initialize.lookups]
|
||||
@misc = "spacy.LookupsDataLoader.v1"
|
||||
lang = ${nlp.lang}
|
||||
tables = ["lexeme_norm"]
|
||||
"""
|
||||
|
||||
|
||||
class TamilDefaults(Language.Defaults):
|
||||
config = load_config_from_str(DEFAULT_CONFIG)
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
|
|
@ -1,10 +1,8 @@
|
|||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language
|
||||
from ...tokens import Doc
|
||||
from ...util import DummyTokenizer, registry
|
||||
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -12,6 +10,13 @@ DEFAULT_CONFIG = """
|
|||
|
||||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.th.ThaiTokenizer"
|
||||
|
||||
[initialize]
|
||||
|
||||
[initialize.lookups]
|
||||
@misc = "spacy.LookupsDataLoader.v1"
|
||||
lang = ${nlp.lang}
|
||||
tables = ["lexeme_norm"]
|
||||
"""
|
||||
|
||||
|
||||
|
@ -42,7 +47,7 @@ class ThaiTokenizer(DummyTokenizer):
|
|||
|
||||
|
||||
class ThaiDefaults(Language.Defaults):
|
||||
config = Config().from_str(DEFAULT_CONFIG)
|
||||
config = load_config_from_str(DEFAULT_CONFIG)
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
|
|
@ -1,10 +1,8 @@
|
|||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language
|
||||
from ...tokens import Doc
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...util import DummyTokenizer, registry
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -55,7 +53,7 @@ class VietnameseTokenizer(DummyTokenizer):
|
|||
|
||||
|
||||
class VietnameseDefaults(Language.Defaults):
|
||||
config = Config().from_str(DEFAULT_CONFIG)
|
||||
config = load_config_from_str(DEFAULT_CONFIG)
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
|
|
@ -1,17 +1,16 @@
|
|||
from typing import Optional, List, Dict, Any
|
||||
from typing import Optional, List, Dict, Any, Callable, Iterable
|
||||
from enum import Enum
|
||||
import tempfile
|
||||
import srsly
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from thinc.api import Config
|
||||
|
||||
from ...errors import Warnings, Errors
|
||||
from ...language import Language
|
||||
from ...scorer import Scorer
|
||||
from ...tokens import Doc
|
||||
from ...training import validate_examples
|
||||
from ...util import DummyTokenizer, registry
|
||||
from ...training import validate_examples, Example
|
||||
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .stop_words import STOP_WORDS
|
||||
from ... import util
|
||||
|
@ -28,6 +27,10 @@ DEFAULT_CONFIG = """
|
|||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.zh.ChineseTokenizer"
|
||||
segmenter = "char"
|
||||
|
||||
[initialize]
|
||||
|
||||
[initialize.tokenizer]
|
||||
pkuseg_model = null
|
||||
pkuseg_user_dict = "default"
|
||||
"""
|
||||
|
@ -44,41 +47,23 @@ class Segmenter(str, Enum):
|
|||
|
||||
|
||||
@registry.tokenizers("spacy.zh.ChineseTokenizer")
|
||||
def create_chinese_tokenizer(
|
||||
segmenter: Segmenter = Segmenter.char,
|
||||
pkuseg_model: Optional[str] = None,
|
||||
pkuseg_user_dict: Optional[str] = "default",
|
||||
):
|
||||
def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char,):
|
||||
def chinese_tokenizer_factory(nlp):
|
||||
return ChineseTokenizer(
|
||||
nlp,
|
||||
segmenter=segmenter,
|
||||
pkuseg_model=pkuseg_model,
|
||||
pkuseg_user_dict=pkuseg_user_dict,
|
||||
)
|
||||
return ChineseTokenizer(nlp, segmenter=segmenter)
|
||||
|
||||
return chinese_tokenizer_factory
|
||||
|
||||
|
||||
class ChineseTokenizer(DummyTokenizer):
|
||||
def __init__(
|
||||
self,
|
||||
nlp: Language,
|
||||
segmenter: Segmenter = Segmenter.char,
|
||||
pkuseg_model: Optional[str] = None,
|
||||
pkuseg_user_dict: Optional[str] = None,
|
||||
self, nlp: Language, segmenter: Segmenter = Segmenter.char,
|
||||
):
|
||||
self.vocab = nlp.vocab
|
||||
if isinstance(segmenter, Segmenter):
|
||||
segmenter = segmenter.value
|
||||
self.segmenter = segmenter
|
||||
self.pkuseg_model = pkuseg_model
|
||||
self.pkuseg_user_dict = pkuseg_user_dict
|
||||
self.pkuseg_seg = None
|
||||
self.jieba_seg = None
|
||||
self.configure_segmenter(segmenter)
|
||||
|
||||
def configure_segmenter(self, segmenter: str):
|
||||
if segmenter not in Segmenter.values():
|
||||
warn_msg = Warnings.W103.format(
|
||||
lang="Chinese",
|
||||
|
@ -88,12 +73,21 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
)
|
||||
warnings.warn(warn_msg)
|
||||
self.segmenter = Segmenter.char
|
||||
self.jieba_seg = try_jieba_import(self.segmenter)
|
||||
self.pkuseg_seg = try_pkuseg_import(
|
||||
self.segmenter,
|
||||
pkuseg_model=self.pkuseg_model,
|
||||
pkuseg_user_dict=self.pkuseg_user_dict,
|
||||
)
|
||||
if segmenter == Segmenter.jieba:
|
||||
self.jieba_seg = try_jieba_import()
|
||||
|
||||
def initialize(
|
||||
self,
|
||||
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
|
||||
*,
|
||||
nlp: Optional[Language] = None,
|
||||
pkuseg_model: Optional[str] = None,
|
||||
pkuseg_user_dict: str = "default",
|
||||
):
|
||||
if self.segmenter == Segmenter.pkuseg:
|
||||
self.pkuseg_seg = try_pkuseg_import(
|
||||
pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict,
|
||||
)
|
||||
|
||||
def __call__(self, text: str) -> Doc:
|
||||
if self.segmenter == Segmenter.jieba:
|
||||
|
@ -148,14 +142,10 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
def _get_config(self) -> Dict[str, Any]:
|
||||
return {
|
||||
"segmenter": self.segmenter,
|
||||
"pkuseg_model": self.pkuseg_model,
|
||||
"pkuseg_user_dict": self.pkuseg_user_dict,
|
||||
}
|
||||
|
||||
def _set_config(self, config: Dict[str, Any] = {}) -> None:
|
||||
self.segmenter = config.get("segmenter", Segmenter.char)
|
||||
self.pkuseg_model = config.get("pkuseg_model", None)
|
||||
self.pkuseg_user_dict = config.get("pkuseg_user_dict", "default")
|
||||
|
||||
def to_bytes(self, **kwargs):
|
||||
pkuseg_features_b = b""
|
||||
|
@ -322,7 +312,7 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
|
||||
|
||||
class ChineseDefaults(Language.Defaults):
|
||||
config = Config().from_str(DEFAULT_CONFIG)
|
||||
config = load_config_from_str(DEFAULT_CONFIG)
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||
|
@ -333,42 +323,33 @@ class Chinese(Language):
|
|||
Defaults = ChineseDefaults
|
||||
|
||||
|
||||
def try_jieba_import(segmenter: str) -> None:
|
||||
def try_jieba_import() -> None:
|
||||
try:
|
||||
import jieba
|
||||
|
||||
if segmenter == Segmenter.jieba:
|
||||
# segment a short text to have jieba initialize its cache in advance
|
||||
list(jieba.cut("作为", cut_all=False))
|
||||
# segment a short text to have jieba initialize its cache in advance
|
||||
list(jieba.cut("作为", cut_all=False))
|
||||
|
||||
return jieba
|
||||
except ImportError:
|
||||
if segmenter == Segmenter.jieba:
|
||||
msg = (
|
||||
"Jieba not installed. To use jieba, install it with `pip "
|
||||
" install jieba` or from https://github.com/fxsjy/jieba"
|
||||
)
|
||||
raise ImportError(msg) from None
|
||||
msg = (
|
||||
"Jieba not installed. To use jieba, install it with `pip "
|
||||
" install jieba` or from https://github.com/fxsjy/jieba"
|
||||
)
|
||||
raise ImportError(msg) from None
|
||||
|
||||
|
||||
def try_pkuseg_import(
|
||||
segmenter: str, pkuseg_model: Optional[str], pkuseg_user_dict: str
|
||||
) -> None:
|
||||
def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None:
|
||||
try:
|
||||
import pkuseg
|
||||
|
||||
if pkuseg_model is None:
|
||||
return None
|
||||
else:
|
||||
return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
|
||||
return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
|
||||
except ImportError:
|
||||
if segmenter == Segmenter.pkuseg:
|
||||
msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
|
||||
raise ImportError(msg) from None
|
||||
msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
|
||||
raise ImportError(msg) from None
|
||||
except FileNotFoundError:
|
||||
if segmenter == Segmenter.pkuseg:
|
||||
msg = "Unable to load pkuseg model from: " + pkuseg_model
|
||||
raise FileNotFoundError(msg) from None
|
||||
msg = "Unable to load pkuseg model from: " + pkuseg_model
|
||||
raise FileNotFoundError(msg) from None
|
||||
|
||||
|
||||
def _get_pkuseg_trie_data(node, path=""):
|
||||
|
|
|
@ -272,10 +272,14 @@ def zh_tokenizer_char():
|
|||
def zh_tokenizer_jieba():
|
||||
pytest.importorskip("jieba")
|
||||
config = {
|
||||
"@tokenizers": "spacy.zh.ChineseTokenizer",
|
||||
"segmenter": "jieba",
|
||||
"nlp": {
|
||||
"tokenizer": {
|
||||
"@tokenizers": "spacy.zh.ChineseTokenizer",
|
||||
"segmenter": "jieba",
|
||||
}
|
||||
}
|
||||
}
|
||||
nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}})
|
||||
nlp = get_lang_class("zh").from_config(config)
|
||||
return nlp.tokenizer
|
||||
|
||||
|
||||
|
@ -284,11 +288,19 @@ def zh_tokenizer_pkuseg():
|
|||
pytest.importorskip("pkuseg")
|
||||
pytest.importorskip("pickle5")
|
||||
config = {
|
||||
"@tokenizers": "spacy.zh.ChineseTokenizer",
|
||||
"segmenter": "pkuseg",
|
||||
"pkuseg_model": "default",
|
||||
"nlp": {
|
||||
"tokenizer": {
|
||||
"@tokenizers": "spacy.zh.ChineseTokenizer",
|
||||
"segmenter": "pkuseg",
|
||||
}
|
||||
},
|
||||
"initialize": {"tokenizer": {
|
||||
"pkuseg_model": "default",
|
||||
}
|
||||
},
|
||||
}
|
||||
nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}})
|
||||
nlp = get_lang_class("zh").from_config(config)
|
||||
nlp.initialize()
|
||||
return nlp.tokenizer
|
||||
|
||||
|
||||
|
|
|
@ -28,9 +28,17 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
|
|||
@pytest.mark.slow
|
||||
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
|
||||
config = {
|
||||
"@tokenizers": "spacy.zh.ChineseTokenizer",
|
||||
"segmenter": "pkuseg",
|
||||
"pkuseg_model": "medicine",
|
||||
"nlp": {
|
||||
"tokenizer": {
|
||||
"@tokenizers": "spacy.zh.ChineseTokenizer",
|
||||
"segmenter": "pkuseg",
|
||||
}
|
||||
},
|
||||
"initialize": {"tokenizer": {
|
||||
"pkuseg_model": "medicine",
|
||||
}
|
||||
},
|
||||
}
|
||||
nlp = Chinese.from_config({"nlp": {"tokenizer": config}})
|
||||
nlp = Chinese.from_config(config)
|
||||
nlp.initialize()
|
||||
zh_tokenizer_serialize(nlp.tokenizer)
|
||||
|
|
|
@ -339,6 +339,7 @@ def test_ner_warns_no_lookups(caplog):
|
|||
nlp.vocab.lookups = Lookups()
|
||||
assert not len(nlp.vocab.lookups)
|
||||
nlp.add_pipe("ner")
|
||||
nlp.config["initialize"]["lookups"] = None
|
||||
with caplog.at_level(logging.DEBUG):
|
||||
nlp.initialize()
|
||||
assert "W033" in caplog.text
|
||||
|
|
Loading…
Reference in New Issue
Block a user