Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 17:06:29 +03:00)
Merge pull request #6165 from explosion/feature/update-tokenizers-initialize
This commit is contained in: commit 381258b75b
@@ -562,7 +562,10 @@ class Errors:
     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
     E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
             "component.")
-    E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.")
+    E955 = ("Can't find table(s) '{table}' for language '{lang}' in "
+            "spacy-lookups-data. If you want to initialize a blank nlp object, "
+            "make sure you have the spacy-lookups-data package installed or "
+            "remove the [initialize.lookups] block from your config.")
     E956 = ("Can't find component '{name}' in [components] block in the config. "
             "Available components: {opts}")
     E957 = ("Writing directly to Language.factories isn't needed anymore in "
@@ -680,14 +683,9 @@ class Errors:
     E999 = ("Unable to merge the `Doc` objects because they do not all share "
             "the same `Vocab`.")
     E1000 = ("The Chinese word segmenter is pkuseg but no pkuseg model was "
-             "specified. Provide the name of a pretrained model or the path to "
-             "a model when initializing the pipeline:\n"
-             'config = {\n'
-             '  "@tokenizers": "spacy.zh.ChineseTokenizer",\n'
-             '  "segmenter": "pkuseg",\n'
-             '  "pkuseg_model": "default", # or "/path/to/pkuseg_model" \n'
-             '}\n'
-             'nlp = Chinese.from_config({"nlp": {"tokenizer": config}})')
+             "loaded. Provide the name of a pretrained model or the path to "
+             "a model and initialize the pipeline:\n\n"
+             'nlp.tokenizer.initialize(pkuseg_model="default")')
     E1001 = ("Target token outside of matched span for match with tokens "
              "'{span}' and offset '{index}' matched by patterns '{patterns}'.")
     E1002 = ("Span index out of range.")
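Note: the reworded E1000 points users at the new initialize step instead of tokenizer construction. A minimal sketch of the suggested call, assuming spacy-nightly with this change and the pkuseg package installed (the config dict mirrors the tokenizer settings used in the updated tests below):

    from spacy.lang.zh import Chinese

    # Build a Chinese pipeline whose tokenizer is configured for pkuseg
    # but which has not loaded any pkuseg model yet.
    tok_config = {"@tokenizers": "spacy.zh.ChineseTokenizer", "segmenter": "pkuseg"}
    nlp = Chinese.from_config({"nlp": {"tokenizer": tok_config}})
    # The fix E1000 suggests: load a pkuseg model before tokenizing.
    nlp.tokenizer.initialize(pkuseg_model="default")
    doc = nlp("这是一个句子")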
@@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class DanishDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
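For context, a hedged sketch of how a default [initialize.lookups] block like the one added here is meant to be consumed; the resolution itself happens inside nlp.initialize(), and the example assumes the spacy-lookups-data package is installed:

    import spacy

    # Assumption: with spacy-lookups-data installed, initializing a blank Danish
    # pipeline resolves [initialize.lookups] via "spacy.LookupsDataLoader.v1"
    # and fills nlp.vocab.lookups with the "lexeme_norm" table.
    nlp = spacy.blank("da")
    nlp.initialize()
    print(nlp.vocab.lookups.tables)  # expected to include "lexeme_norm"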
@@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class GermanDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
@@ -9,9 +9,21 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
 from .lemmatizer import GreekLemmatizer
 from ...lookups import Lookups
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class GreekDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
@@ -1,5 +1,4 @@
 from typing import Optional
-
 from thinc.api import Model
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -4,9 +4,21 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class IndonesianDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
@@ -2,7 +2,6 @@ from typing import Optional, Union, Dict, Any
 from pathlib import Path
 import srsly
 from collections import namedtuple
-from thinc.api import Config
 
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
@@ -16,7 +15,7 @@ from ...scorer import Scorer
 from ...symbols import POS
 from ...tokens import Doc
 from ...training import validate_examples
-from ...util import DummyTokenizer, registry
+from ...util import DummyTokenizer, registry, load_config_from_str
 from ... import util
 
 
@@ -166,7 +165,7 @@ class JapaneseTokenizer(DummyTokenizer):
 
 
 class JapaneseDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
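Several Defaults classes in this commit switch from thinc's Config().from_str to a shared spacy.util helper. A minimal sketch of what load_config_from_str presumably wraps; the keyword names here are assumptions for illustration, not copied from the source:

    from thinc.api import Config

    # Hypothetical stand-in for spacy.util.load_config_from_str: parse a config
    # string into a thinc Config, without interpolating ${...} variables yet.
    def load_config_from_str(text: str, overrides: dict = {}, interpolate: bool = False) -> Config:
        return Config().from_str(text, overrides=overrides, interpolate=interpolate)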
@@ -1,5 +1,4 @@
 from typing import Optional, Any, Dict
-from thinc.api import Config
 
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
@@ -10,7 +9,7 @@ from ...compat import copy_reg
 from ...scorer import Scorer
 from ...symbols import POS
 from ...training import validate_examples
-from ...util import DummyTokenizer, registry
+from ...util import DummyTokenizer, registry, load_config_from_str
 
 
 DEFAULT_CONFIG = """
@@ -70,7 +69,7 @@ class KoreanTokenizer(DummyTokenizer):
 
 
 class KoreanDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
@@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_INFIXES
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class LuxembourgishDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     lex_attr_getters = LEX_ATTRS
@@ -3,9 +3,21 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class PortugueseDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     prefixes = TOKENIZER_PREFIXES
@@ -1,5 +1,4 @@
 from typing import Optional
-
 from thinc.api import Model
 
 from .stop_words import STOP_WORDS
@@ -8,9 +7,21 @@ from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
 from ...language import Language
 from ...lookups import Lookups
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class RussianDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
@@ -2,9 +2,21 @@ from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class SerbianDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
@@ -1,9 +1,21 @@
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class TamilDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
 
@@ -1,10 +1,8 @@
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
-from ...util import DummyTokenizer, registry
+from ...util import DummyTokenizer, registry, load_config_from_str
 
 
 DEFAULT_CONFIG = """
@@ -12,6 +10,13 @@ DEFAULT_CONFIG = """
 
 [nlp.tokenizer]
 @tokenizers = "spacy.th.ThaiTokenizer"
+
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
 """
 
 
@@ -42,7 +47,7 @@ class ThaiTokenizer(DummyTokenizer):
 
 
 class ThaiDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
 
@@ -1,10 +1,8 @@
-from thinc.api import Config
-
+from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
-from .stop_words import STOP_WORDS
-from ...util import DummyTokenizer, registry
-from .lex_attrs import LEX_ATTRS
+from ...util import DummyTokenizer, registry, load_config_from_str
 
 
 DEFAULT_CONFIG = """
@@ -55,7 +53,7 @@ class VietnameseTokenizer(DummyTokenizer):
 
 
 class VietnameseDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
 
@@ -1,17 +1,16 @@
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, Callable, Iterable
 from enum import Enum
 import tempfile
 import srsly
 import warnings
 from pathlib import Path
-from thinc.api import Config
 
 from ...errors import Warnings, Errors
 from ...language import Language
 from ...scorer import Scorer
 from ...tokens import Doc
-from ...training import validate_examples
-from ...util import DummyTokenizer, registry
+from ...training import validate_examples, Example
+from ...util import DummyTokenizer, registry, load_config_from_str
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from ... import util
@@ -28,6 +27,10 @@ DEFAULT_CONFIG = """
 [nlp.tokenizer]
 @tokenizers = "spacy.zh.ChineseTokenizer"
 segmenter = "char"
+
+[initialize]
+
+[initialize.tokenizer]
 pkuseg_model = null
 pkuseg_user_dict = "default"
 """
@@ -44,41 +47,23 @@ class Segmenter(str, Enum):
 
 
 @registry.tokenizers("spacy.zh.ChineseTokenizer")
-def create_chinese_tokenizer(
-    segmenter: Segmenter = Segmenter.char,
-    pkuseg_model: Optional[str] = None,
-    pkuseg_user_dict: Optional[str] = "default",
-):
+def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char,):
     def chinese_tokenizer_factory(nlp):
-        return ChineseTokenizer(
-            nlp,
-            segmenter=segmenter,
-            pkuseg_model=pkuseg_model,
-            pkuseg_user_dict=pkuseg_user_dict,
-        )
+        return ChineseTokenizer(nlp, segmenter=segmenter)
 
     return chinese_tokenizer_factory
 
 
 class ChineseTokenizer(DummyTokenizer):
     def __init__(
-        self,
-        nlp: Language,
-        segmenter: Segmenter = Segmenter.char,
-        pkuseg_model: Optional[str] = None,
-        pkuseg_user_dict: Optional[str] = None,
+        self, nlp: Language, segmenter: Segmenter = Segmenter.char,
     ):
         self.vocab = nlp.vocab
         if isinstance(segmenter, Segmenter):
             segmenter = segmenter.value
         self.segmenter = segmenter
-        self.pkuseg_model = pkuseg_model
-        self.pkuseg_user_dict = pkuseg_user_dict
         self.pkuseg_seg = None
         self.jieba_seg = None
-        self.configure_segmenter(segmenter)
-
-    def configure_segmenter(self, segmenter: str):
         if segmenter not in Segmenter.values():
             warn_msg = Warnings.W103.format(
                 lang="Chinese",
@@ -88,12 +73,21 @@ class ChineseTokenizer(DummyTokenizer):
             )
             warnings.warn(warn_msg)
             self.segmenter = Segmenter.char
-        self.jieba_seg = try_jieba_import(self.segmenter)
-        self.pkuseg_seg = try_pkuseg_import(
-            self.segmenter,
-            pkuseg_model=self.pkuseg_model,
-            pkuseg_user_dict=self.pkuseg_user_dict,
-        )
+        if segmenter == Segmenter.jieba:
+            self.jieba_seg = try_jieba_import()
+
+    def initialize(
+        self,
+        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
+        *,
+        nlp: Optional[Language] = None,
+        pkuseg_model: Optional[str] = None,
+        pkuseg_user_dict: str = "default",
+    ):
+        if self.segmenter == Segmenter.pkuseg:
+            self.pkuseg_seg = try_pkuseg_import(
+                pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict,
+            )
 
     def __call__(self, text: str) -> Doc:
         if self.segmenter == Segmenter.jieba:
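With this change the pkuseg model is no longer loaded in __init__ but during pipeline initialization. A hedged sketch of the intended usage, mirroring the updated test fixtures further down (assumes spacy-nightly with this change and the pkuseg package installed):

    from spacy.lang.zh import Chinese

    # Tokenizer settings stay under [nlp.tokenizer]; the model to load moves to
    # [initialize.tokenizer] and is only resolved when the pipeline is initialized.
    config = {
        "nlp": {
            "tokenizer": {"@tokenizers": "spacy.zh.ChineseTokenizer", "segmenter": "pkuseg"}
        },
        "initialize": {"tokenizer": {"pkuseg_model": "default"}},
    }
    nlp = Chinese.from_config(config)
    nlp.initialize()  # forwards pkuseg_model to ChineseTokenizer.initialize()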
@@ -148,14 +142,10 @@ class ChineseTokenizer(DummyTokenizer):
     def _get_config(self) -> Dict[str, Any]:
         return {
             "segmenter": self.segmenter,
-            "pkuseg_model": self.pkuseg_model,
-            "pkuseg_user_dict": self.pkuseg_user_dict,
         }
 
     def _set_config(self, config: Dict[str, Any] = {}) -> None:
         self.segmenter = config.get("segmenter", Segmenter.char)
-        self.pkuseg_model = config.get("pkuseg_model", None)
-        self.pkuseg_user_dict = config.get("pkuseg_user_dict", "default")
 
     def to_bytes(self, **kwargs):
         pkuseg_features_b = b""
@@ -322,7 +312,7 @@ class ChineseTokenizer(DummyTokenizer):
 
 
 class ChineseDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
@@ -333,42 +323,33 @@ class Chinese(Language):
     Defaults = ChineseDefaults
 
 
-def try_jieba_import(segmenter: str) -> None:
+def try_jieba_import() -> None:
     try:
         import jieba
 
-        if segmenter == Segmenter.jieba:
-            # segment a short text to have jieba initialize its cache in advance
-            list(jieba.cut("作为", cut_all=False))
+        # segment a short text to have jieba initialize its cache in advance
+        list(jieba.cut("作为", cut_all=False))
 
         return jieba
     except ImportError:
-        if segmenter == Segmenter.jieba:
-            msg = (
-                "Jieba not installed. To use jieba, install it with `pip "
-                " install jieba` or from https://github.com/fxsjy/jieba"
-            )
-            raise ImportError(msg) from None
+        msg = (
+            "Jieba not installed. To use jieba, install it with `pip "
+            " install jieba` or from https://github.com/fxsjy/jieba"
+        )
+        raise ImportError(msg) from None
 
 
-def try_pkuseg_import(
-    segmenter: str, pkuseg_model: Optional[str], pkuseg_user_dict: str
-) -> None:
+def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None:
     try:
         import pkuseg
 
-        if pkuseg_model is None:
-            return None
-        else:
-            return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
+        return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
     except ImportError:
-        if segmenter == Segmenter.pkuseg:
-            msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
-            raise ImportError(msg) from None
+        msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
+        raise ImportError(msg) from None
     except FileNotFoundError:
-        if segmenter == Segmenter.pkuseg:
-            msg = "Unable to load pkuseg model from: " + pkuseg_model
-            raise FileNotFoundError(msg) from None
+        msg = "Unable to load pkuseg model from: " + pkuseg_model
+        raise FileNotFoundError(msg) from None
 
 
 def _get_pkuseg_trie_data(node, path=""):
@@ -272,10 +272,14 @@ def zh_tokenizer_char():
 def zh_tokenizer_jieba():
     pytest.importorskip("jieba")
     config = {
-        "@tokenizers": "spacy.zh.ChineseTokenizer",
-        "segmenter": "jieba",
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.zh.ChineseTokenizer",
+                "segmenter": "jieba",
+            }
+        }
     }
-    nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}})
+    nlp = get_lang_class("zh").from_config(config)
     return nlp.tokenizer
 
 
@@ -284,11 +288,19 @@ def zh_tokenizer_pkuseg():
     pytest.importorskip("pkuseg")
     pytest.importorskip("pickle5")
     config = {
-        "@tokenizers": "spacy.zh.ChineseTokenizer",
-        "segmenter": "pkuseg",
-        "pkuseg_model": "default",
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.zh.ChineseTokenizer",
+                "segmenter": "pkuseg",
+            }
+        },
+        "initialize": {"tokenizer": {
+            "pkuseg_model": "default",
+        }
+        },
     }
-    nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}})
+    nlp = get_lang_class("zh").from_config(config)
+    nlp.initialize()
     return nlp.tokenizer
 
 
@@ -28,9 +28,17 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
 @pytest.mark.slow
 def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
     config = {
-        "@tokenizers": "spacy.zh.ChineseTokenizer",
-        "segmenter": "pkuseg",
-        "pkuseg_model": "medicine",
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.zh.ChineseTokenizer",
+                "segmenter": "pkuseg",
+            }
+        },
+        "initialize": {"tokenizer": {
+            "pkuseg_model": "medicine",
+        }
+        },
     }
-    nlp = Chinese.from_config({"nlp": {"tokenizer": config}})
+    nlp = Chinese.from_config(config)
+    nlp.initialize()
     zh_tokenizer_serialize(nlp.tokenizer)
@@ -339,6 +339,7 @@ def test_ner_warns_no_lookups(caplog):
     nlp.vocab.lookups = Lookups()
     assert not len(nlp.vocab.lookups)
     nlp.add_pipe("ner")
+    nlp.config["initialize"]["lookups"] = None
     with caplog.at_level(logging.DEBUG):
         nlp.initialize()
     assert "W033" in caplog.text
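A hedged sketch of the same override from user code, i.e. opting out of the default lookups loading that the new [initialize.lookups] blocks would otherwise trigger (the choice of "da" is illustrative; any language that ships the block behaves the same way):

    import spacy

    nlp = spacy.blank("da")                     # illustrative: a language with [initialize.lookups]
    nlp.config["initialize"]["lookups"] = None  # drop the default lookups loader
    nlp.initialize()                            # no tables are fetched from spacy-lookups-data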