Merge pull request #6165 from explosion/feature/update-tokenizers-initialize

commit 381258b75b
Ines Montani, 2020-10-01 09:49:47 +02:00 (committed by GitHub)
19 changed files with 204 additions and 97 deletions

View File

@@ -562,7 +562,10 @@ class Errors:
E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
"component.")
E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.")
E955 = ("Can't find table(s) '{table}' for language '{lang}' in "
"spacy-lookups-data. If you want to initialize a blank nlp object, "
"make sure you have the spacy-lookups-data package installed or "
"remove the [initialize.lookups] block from your config.")
E956 = ("Can't find component '{name}' in [components] block in the config. "
"Available components: {opts}")
E957 = ("Writing directly to Language.factories isn't needed anymore in "
@@ -680,14 +683,9 @@ class Errors:
E999 = ("Unable to merge the `Doc` objects because they do not all share "
"the same `Vocab`.")
E1000 = ("The Chinese word segmenter is pkuseg but no pkuseg model was "
"specified. Provide the name of a pretrained model or the path to "
"a model when initializing the pipeline:\n"
'config = {\n'
' "@tokenizers": "spacy.zh.ChineseTokenizer",\n'
' "segmenter": "pkuseg",\n'
' "pkuseg_model": "default", # or "/path/to/pkuseg_model" \n'
'}\n'
'nlp = Chinese.from_config({"nlp": {"tokenizer": config}})')
"loaded. Provide the name of a pretrained model or the path to "
"a model and initialize the pipeline:\n\n"
'nlp.tokenizer.initialize(pkuseg_model="default")')
E1001 = ("Target token outside of matched span for match with tokens "
"'{span}' and offset '{index}' matched by patterns '{patterns}'.")
E1002 = ("Span index out of range.")

View File

@@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class DanishDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
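
The same [initialize.lookups] block is added to each of the language defaults below. A rough sketch of what it means for a blank pipeline, assuming the spacy-lookups-data package is installed (per the expanded E955 message above):

```python
# Illustrative sketch, not part of the diff.
import spacy

nlp = spacy.blank("da")
nlp.initialize()  # resolves [initialize.lookups] via spacy.LookupsDataLoader.v1
# The lexeme_norm table should now be available, e.g.
# nlp.vocab.lookups.get_table("lexeme_norm")
# Without spacy-lookups-data this raises E955; dropping the block avoids the lookup:
# nlp.config["initialize"]["lookups"] = None
```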

View File

@@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class GermanDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES

View File

@@ -9,9 +9,21 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
from .lemmatizer import GreekLemmatizer
from ...lookups import Lookups
from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class GreekDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES

View File

@@ -1,5 +1,4 @@
from typing import Optional
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS

View File

@@ -4,9 +4,21 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class IndonesianDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES

View File

@@ -2,7 +2,6 @@ from typing import Optional, Union, Dict, Any
from pathlib import Path
import srsly
from collections import namedtuple
from thinc.api import Config
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
@@ -16,7 +15,7 @@ from ...scorer import Scorer
from ...symbols import POS
from ...tokens import Doc
from ...training import validate_examples
from ...util import DummyTokenizer, registry
from ...util import DummyTokenizer, registry, load_config_from_str
from ... import util
@@ -166,7 +165,7 @@ class JapaneseTokenizer(DummyTokenizer):
class JapaneseDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
config = load_config_from_str(DEFAULT_CONFIG)
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

View File

@@ -1,5 +1,4 @@
from typing import Optional, Any, Dict
from thinc.api import Config
from .stop_words import STOP_WORDS
from .tag_map import TAG_MAP
@@ -10,7 +9,7 @@ from ...compat import copy_reg
from ...scorer import Scorer
from ...symbols import POS
from ...training import validate_examples
from ...util import DummyTokenizer, registry
from ...util import DummyTokenizer, registry, load_config_from_str
DEFAULT_CONFIG = """
@@ -70,7 +69,7 @@ class KoreanTokenizer(DummyTokenizer):
class KoreanDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
config = load_config_from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

View File

@@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_INFIXES
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class LuxembourgishDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS

View File

@@ -3,9 +3,21 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class PortugueseDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
prefixes = TOKENIZER_PREFIXES

View File

@@ -1,5 +1,4 @@
from typing import Optional
from thinc.api import Model
from .stop_words import STOP_WORDS
@@ -8,9 +7,21 @@ from .lex_attrs import LEX_ATTRS
from .lemmatizer import RussianLemmatizer
from ...language import Language
from ...lookups import Lookups
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class RussianDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS

View File

@@ -2,9 +2,21 @@ from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class SerbianDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS

View File

@@ -1,9 +1,21 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class TamilDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS

View File

@@ -1,10 +1,8 @@
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...tokens import Doc
from ...util import DummyTokenizer, registry
from ...util import DummyTokenizer, registry, load_config_from_str
DEFAULT_CONFIG = """
@ -12,6 +10,13 @@ DEFAULT_CONFIG = """
[nlp.tokenizer]
@tokenizers = "spacy.th.ThaiTokenizer"
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
@@ -42,7 +47,7 @@ class ThaiTokenizer(DummyTokenizer):
class ThaiDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
config = load_config_from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS

View File

@@ -1,10 +1,8 @@
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...tokens import Doc
from .stop_words import STOP_WORDS
from ...util import DummyTokenizer, registry
from .lex_attrs import LEX_ATTRS
from ...util import DummyTokenizer, registry, load_config_from_str
DEFAULT_CONFIG = """
@@ -55,7 +53,7 @@ class VietnameseTokenizer(DummyTokenizer):
class VietnameseDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
config = load_config_from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS

View File

@@ -1,17 +1,16 @@
from typing import Optional, List, Dict, Any
from typing import Optional, List, Dict, Any, Callable, Iterable
from enum import Enum
import tempfile
import srsly
import warnings
from pathlib import Path
from thinc.api import Config
from ...errors import Warnings, Errors
from ...language import Language
from ...scorer import Scorer
from ...tokens import Doc
from ...training import validate_examples
from ...util import DummyTokenizer, registry
from ...training import validate_examples, Example
from ...util import DummyTokenizer, registry, load_config_from_str
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from ... import util
@ -28,6 +27,10 @@ DEFAULT_CONFIG = """
[nlp.tokenizer]
@tokenizers = "spacy.zh.ChineseTokenizer"
segmenter = "char"
[initialize]
[initialize.tokenizer]
pkuseg_model = null
pkuseg_user_dict = "default"
"""
@@ -44,41 +47,23 @@ class Segmenter(str, Enum):
@registry.tokenizers("spacy.zh.ChineseTokenizer")
def create_chinese_tokenizer(
segmenter: Segmenter = Segmenter.char,
pkuseg_model: Optional[str] = None,
pkuseg_user_dict: Optional[str] = "default",
):
def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char,):
def chinese_tokenizer_factory(nlp):
return ChineseTokenizer(
nlp,
segmenter=segmenter,
pkuseg_model=pkuseg_model,
pkuseg_user_dict=pkuseg_user_dict,
)
return ChineseTokenizer(nlp, segmenter=segmenter)
return chinese_tokenizer_factory
class ChineseTokenizer(DummyTokenizer):
def __init__(
self,
nlp: Language,
segmenter: Segmenter = Segmenter.char,
pkuseg_model: Optional[str] = None,
pkuseg_user_dict: Optional[str] = None,
self, nlp: Language, segmenter: Segmenter = Segmenter.char,
):
self.vocab = nlp.vocab
if isinstance(segmenter, Segmenter):
segmenter = segmenter.value
self.segmenter = segmenter
self.pkuseg_model = pkuseg_model
self.pkuseg_user_dict = pkuseg_user_dict
self.pkuseg_seg = None
self.jieba_seg = None
self.configure_segmenter(segmenter)
def configure_segmenter(self, segmenter: str):
if segmenter not in Segmenter.values():
warn_msg = Warnings.W103.format(
lang="Chinese",
@@ -88,12 +73,21 @@ class ChineseTokenizer(DummyTokenizer):
)
warnings.warn(warn_msg)
self.segmenter = Segmenter.char
self.jieba_seg = try_jieba_import(self.segmenter)
self.pkuseg_seg = try_pkuseg_import(
self.segmenter,
pkuseg_model=self.pkuseg_model,
pkuseg_user_dict=self.pkuseg_user_dict,
)
if segmenter == Segmenter.jieba:
self.jieba_seg = try_jieba_import()
def initialize(
self,
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
*,
nlp: Optional[Language] = None,
pkuseg_model: Optional[str] = None,
pkuseg_user_dict: str = "default",
):
if self.segmenter == Segmenter.pkuseg:
self.pkuseg_seg = try_pkuseg_import(
pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict,
)
def __call__(self, text: str) -> Doc:
if self.segmenter == Segmenter.jieba:
@@ -148,14 +142,10 @@ class ChineseTokenizer(DummyTokenizer):
def _get_config(self) -> Dict[str, Any]:
return {
"segmenter": self.segmenter,
"pkuseg_model": self.pkuseg_model,
"pkuseg_user_dict": self.pkuseg_user_dict,
}
def _set_config(self, config: Dict[str, Any] = {}) -> None:
self.segmenter = config.get("segmenter", Segmenter.char)
self.pkuseg_model = config.get("pkuseg_model", None)
self.pkuseg_user_dict = config.get("pkuseg_user_dict", "default")
def to_bytes(self, **kwargs):
pkuseg_features_b = b""
@@ -322,7 +312,7 @@ class ChineseTokenizer(DummyTokenizer):
class ChineseDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
config = load_config_from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
@@ -333,42 +323,33 @@ class Chinese(Language):
Defaults = ChineseDefaults
def try_jieba_import(segmenter: str) -> None:
def try_jieba_import() -> None:
try:
import jieba
if segmenter == Segmenter.jieba:
# segment a short text to have jieba initialize its cache in advance
list(jieba.cut("作为", cut_all=False))
# segment a short text to have jieba initialize its cache in advance
list(jieba.cut("作为", cut_all=False))
return jieba
except ImportError:
if segmenter == Segmenter.jieba:
msg = (
"Jieba not installed. To use jieba, install it with `pip "
" install jieba` or from https://github.com/fxsjy/jieba"
)
raise ImportError(msg) from None
msg = (
"Jieba not installed. To use jieba, install it with `pip "
" install jieba` or from https://github.com/fxsjy/jieba"
)
raise ImportError(msg) from None
def try_pkuseg_import(
segmenter: str, pkuseg_model: Optional[str], pkuseg_user_dict: str
) -> None:
def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None:
try:
import pkuseg
if pkuseg_model is None:
return None
else:
return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
except ImportError:
if segmenter == Segmenter.pkuseg:
msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
raise ImportError(msg) from None
msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
raise ImportError(msg) from None
except FileNotFoundError:
if segmenter == Segmenter.pkuseg:
msg = "Unable to load pkuseg model from: " + pkuseg_model
raise FileNotFoundError(msg) from None
msg = "Unable to load pkuseg model from: " + pkuseg_model
raise FileNotFoundError(msg) from None
def _get_pkuseg_trie_data(node, path=""):

View File

@@ -272,10 +272,14 @@ def zh_tokenizer_char():
def zh_tokenizer_jieba():
pytest.importorskip("jieba")
config = {
"@tokenizers": "spacy.zh.ChineseTokenizer",
"segmenter": "jieba",
"nlp": {
"tokenizer": {
"@tokenizers": "spacy.zh.ChineseTokenizer",
"segmenter": "jieba",
}
}
}
nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}})
nlp = get_lang_class("zh").from_config(config)
return nlp.tokenizer
@@ -284,11 +288,19 @@ def zh_tokenizer_pkuseg():
pytest.importorskip("pkuseg")
pytest.importorskip("pickle5")
config = {
"@tokenizers": "spacy.zh.ChineseTokenizer",
"segmenter": "pkuseg",
"pkuseg_model": "default",
"nlp": {
"tokenizer": {
"@tokenizers": "spacy.zh.ChineseTokenizer",
"segmenter": "pkuseg",
}
},
"initialize": {"tokenizer": {
"pkuseg_model": "default",
}
},
}
nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}})
nlp = get_lang_class("zh").from_config(config)
nlp.initialize()
return nlp.tokenizer

View File

@@ -28,9 +28,17 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
@pytest.mark.slow
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
config = {
"@tokenizers": "spacy.zh.ChineseTokenizer",
"segmenter": "pkuseg",
"pkuseg_model": "medicine",
"nlp": {
"tokenizer": {
"@tokenizers": "spacy.zh.ChineseTokenizer",
"segmenter": "pkuseg",
}
},
"initialize": {"tokenizer": {
"pkuseg_model": "medicine",
}
},
}
nlp = Chinese.from_config({"nlp": {"tokenizer": config}})
nlp = Chinese.from_config(config)
nlp.initialize()
zh_tokenizer_serialize(nlp.tokenizer)

View File

@@ -339,6 +339,7 @@ def test_ner_warns_no_lookups(caplog):
nlp.vocab.lookups = Lookups()
assert not len(nlp.vocab.lookups)
nlp.add_pipe("ner")
nlp.config["initialize"]["lookups"] = None
with caplog.at_level(logging.DEBUG):
nlp.initialize()
assert "W033" in caplog.text