From 6467a560e30052d79c3a9dd1b5649f12ddcb13f6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 21:10:22 +0200 Subject: [PATCH 1/5] WIP: Test updating Chinese tokenizer --- spacy/lang/zh/__init__.py | 33 ++++++++++++++++++++------------- spacy/tests/conftest.py | 13 +++++++++---- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 752f77d11..457502e21 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional, List, Dict, Any +from typing import Optional, List, Dict, Any, Callable, Iterable from enum import Enum import tempfile import srsly @@ -10,7 +10,7 @@ from ...errors import Warnings, Errors from ...language import Language from ...scorer import Scorer from ...tokens import Doc -from ...training import validate_examples +from ...training import validate_examples, Example from ...util import DummyTokenizer, registry from .lex_attrs import LEX_ATTRS from .stop_words import STOP_WORDS @@ -28,6 +28,10 @@ DEFAULT_CONFIG = """ [nlp.tokenizer] @tokenizers = "spacy.zh.ChineseTokenizer" segmenter = "char" + +[initialize] + +[initialize.tokenizer] pkuseg_model = null pkuseg_user_dict = "default" """ @@ -44,18 +48,9 @@ class Segmenter(str, Enum): @registry.tokenizers("spacy.zh.ChineseTokenizer") -def create_chinese_tokenizer( - segmenter: Segmenter = Segmenter.char, - pkuseg_model: Optional[str] = None, - pkuseg_user_dict: Optional[str] = "default", -): +def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char,): def chinese_tokenizer_factory(nlp): - return ChineseTokenizer( - nlp, - segmenter=segmenter, - pkuseg_model=pkuseg_model, - pkuseg_user_dict=pkuseg_user_dict, - ) + return ChineseTokenizer(nlp, segmenter=segmenter) return chinese_tokenizer_factory @@ -78,6 +73,18 @@ class ChineseTokenizer(DummyTokenizer): self.jieba_seg = None self.configure_segmenter(segmenter) + def initialize( + self, + get_examples: Callable[[], Iterable[Example]], + *, + nlp: Optional[Language], + pkuseg_model: Optional[str] = None, + pkuseg_user_dict: Optional[str] = None + ): + self.pkuseg_model = pkuseg_model + self.pkuseg_user_dict = pkuseg_user_dict + self.configure_segmenter(self.segmenter) + def configure_segmenter(self, segmenter: str): if segmenter not in Segmenter.values(): warn_msg = Warnings.W103.format( diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 23fc5e98f..6cf019173 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -284,11 +284,16 @@ def zh_tokenizer_pkuseg(): pytest.importorskip("pkuseg") pytest.importorskip("pickle5") config = { - "@tokenizers": "spacy.zh.ChineseTokenizer", - "segmenter": "pkuseg", - "pkuseg_model": "default", + "nlp": { + "tokenizer": { + "@tokenizers": "spacy.zh.ChineseTokenizer", + "segmenter": "pkuseg", + } + }, + "initialize": {"tokenizer": {"pkuseg_model": "default"}}, } - nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}}) + nlp = get_lang_class("zh").from_config(config) + nlp.initialize() return nlp.tokenizer From 34f9c26c6235842db219a543897baba95fd980ff Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 30 Sep 2020 10:20:14 +0200 Subject: [PATCH 2/5] Add lexeme norm defaults --- spacy/lang/da/__init__.py | 12 ++++++++++++ spacy/lang/de/__init__.py | 12 ++++++++++++ spacy/lang/el/__init__.py | 12 ++++++++++++ spacy/lang/en/__init__.py | 13 ++++++++++++- spacy/lang/id/__init__.py | 12 ++++++++++++ spacy/lang/ja/__init__.py | 5 ++--- 
spacy/lang/ko/__init__.py | 5 ++--- spacy/lang/lb/__init__.py | 12 ++++++++++++ spacy/lang/pt/__init__.py | 12 ++++++++++++ spacy/lang/ru/__init__.py | 13 ++++++++++++- spacy/lang/sr/__init__.py | 12 ++++++++++++ spacy/lang/ta/__init__.py | 12 ++++++++++++ spacy/lang/th/__init__.py | 13 +++++++++---- spacy/lang/vi/__init__.py | 10 ++++------ spacy/lang/zh/__init__.py | 5 ++--- spacy/tests/parser/test_ner.py | 1 + 16 files changed, 140 insertions(+), 21 deletions(-) diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py index 8cac30b26..7128338af 100644 --- a/spacy/lang/da/__init__.py +++ b/spacy/lang/da/__init__.py @@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ...language import Language +from ...util import load_config_from_str + + +DEFAULT_CONFIG = """ +[initialize] + +[initialize.lookups] +@misc = "spacy.LookupsDataLoader.v1" +lang = ${nlp.lang} +tables = ["lexeme_norm"] +""" class DanishDefaults(Language.Defaults): + config = load_config_from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py index b645d3480..99c161961 100644 --- a/spacy/lang/de/__init__.py +++ b/spacy/lang/de/__init__.py @@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX from .stop_words import STOP_WORDS from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language +from ...util import load_config_from_str + + +DEFAULT_CONFIG = """ +[initialize] + +[initialize.lookups] +@misc = "spacy.LookupsDataLoader.v1" +lang = ${nlp.lang} +tables = ["lexeme_norm"] +""" class GermanDefaults(Language.Defaults): + config = load_config_from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py index 1a7b19914..818405842 100644 --- a/spacy/lang/el/__init__.py +++ b/spacy/lang/el/__init__.py @@ -9,9 +9,21 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX from .lemmatizer import GreekLemmatizer from ...lookups import Lookups from ...language import Language +from ...util import load_config_from_str + + +DEFAULT_CONFIG = """ +[initialize] + +[initialize.lookups] +@misc = "spacy.LookupsDataLoader.v1" +lang = ${nlp.lang} +tables = ["lexeme_norm"] +""" class GreekDefaults(Language.Defaults): + config = load_config_from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index bf7e9987f..f4ea10f9c 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -1,5 +1,4 @@ from typing import Optional - from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS @@ -10,9 +9,21 @@ from .punctuation import TOKENIZER_INFIXES from .lemmatizer import EnglishLemmatizer from ...language import Language from ...lookups import Lookups +from ...util import load_config_from_str + + +DEFAULT_CONFIG = """ +[initialize] + +[initialize.lookups] +@misc = "spacy.LookupsDataLoader.v1" +lang = ${nlp.lang} +tables = ["lexeme_norm"] +""" class EnglishDefaults(Language.Defaults): + config = load_config_from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES 
lex_attr_getters = LEX_ATTRS diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py index 87373551c..46bef57ca 100644 --- a/spacy/lang/id/__init__.py +++ b/spacy/lang/id/__init__.py @@ -4,9 +4,21 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language +from ...util import load_config_from_str + + +DEFAULT_CONFIG = """ +[initialize] + +[initialize.lookups] +@misc = "spacy.LookupsDataLoader.v1" +lang = ${nlp.lang} +tables = ["lexeme_norm"] +""" class IndonesianDefaults(Language.Defaults): + config = load_config_from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index e7cc1ef3b..4e6bf9d3c 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -2,7 +2,6 @@ from typing import Optional, Union, Dict, Any from pathlib import Path import srsly from collections import namedtuple -from thinc.api import Config from .stop_words import STOP_WORDS from .syntax_iterators import SYNTAX_ITERATORS @@ -16,7 +15,7 @@ from ...scorer import Scorer from ...symbols import POS from ...tokens import Doc from ...training import validate_examples -from ...util import DummyTokenizer, registry +from ...util import DummyTokenizer, registry, load_config_from_str from ... import util @@ -166,7 +165,7 @@ class JapaneseTokenizer(DummyTokenizer): class JapaneseDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) + config = load_config_from_str(DEFAULT_CONFIG) stop_words = STOP_WORDS syntax_iterators = SYNTAX_ITERATORS writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index dd07ef89c..83c9f4962 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -1,5 +1,4 @@ from typing import Optional, Any, Dict -from thinc.api import Config from .stop_words import STOP_WORDS from .tag_map import TAG_MAP @@ -10,7 +9,7 @@ from ...compat import copy_reg from ...scorer import Scorer from ...symbols import POS from ...training import validate_examples -from ...util import DummyTokenizer, registry +from ...util import DummyTokenizer, registry, load_config_from_str DEFAULT_CONFIG = """ @@ -70,7 +69,7 @@ class KoreanTokenizer(DummyTokenizer): class KoreanDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) + config = load_config_from_str(DEFAULT_CONFIG) lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py index da6fe55d7..ead5f5d10 100644 --- a/spacy/lang/lb/__init__.py +++ b/spacy/lang/lb/__init__.py @@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_INFIXES from .lex_attrs import LEX_ATTRS from .stop_words import STOP_WORDS from ...language import Language +from ...util import load_config_from_str + + +DEFAULT_CONFIG = """ +[initialize] + +[initialize.lookups] +@misc = "spacy.LookupsDataLoader.v1" +lang = ${nlp.lang} +tables = ["lexeme_norm"] +""" class LuxembourgishDefaults(Language.Defaults): + config = load_config_from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES lex_attr_getters = LEX_ATTRS diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py index 0447099f0..1c95c11d9 100644 --- 
a/spacy/lang/pt/__init__.py +++ b/spacy/lang/pt/__init__.py @@ -3,9 +3,21 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES from ...language import Language +from ...util import load_config_from_str + + +DEFAULT_CONFIG = """ +[initialize] + +[initialize.lookups] +@misc = "spacy.LookupsDataLoader.v1" +lang = ${nlp.lang} +tables = ["lexeme_norm"] +""" class PortugueseDefaults(Language.Defaults): + config = load_config_from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES prefixes = TOKENIZER_PREFIXES diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index 4a296dd23..857e197e9 100644 --- a/spacy/lang/ru/__init__.py +++ b/spacy/lang/ru/__init__.py @@ -1,5 +1,4 @@ from typing import Optional - from thinc.api import Model from .stop_words import STOP_WORDS @@ -8,9 +7,21 @@ from .lex_attrs import LEX_ATTRS from .lemmatizer import RussianLemmatizer from ...language import Language from ...lookups import Lookups +from ...util import load_config_from_str + + +DEFAULT_CONFIG = """ +[initialize] + +[initialize.lookups] +@misc = "spacy.LookupsDataLoader.v1" +lang = ${nlp.lang} +tables = ["lexeme_norm"] +""" class RussianDefaults(Language.Defaults): + config = load_config_from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py index 165e54975..5da19c6f3 100644 --- a/spacy/lang/sr/__init__.py +++ b/spacy/lang/sr/__init__.py @@ -2,9 +2,21 @@ from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .lex_attrs import LEX_ATTRS from ...language import Language +from ...util import load_config_from_str + + +DEFAULT_CONFIG = """ +[initialize] + +[initialize.lookups] +@misc = "spacy.LookupsDataLoader.v1" +lang = ${nlp.lang} +tables = ["lexeme_norm"] +""" class SerbianDefaults(Language.Defaults): + config = load_config_from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/ta/__init__.py b/spacy/lang/ta/__init__.py index ac5fc7124..7a5a3ac8f 100644 --- a/spacy/lang/ta/__init__.py +++ b/spacy/lang/ta/__init__.py @@ -1,9 +1,21 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ...language import Language +from ...util import load_config_from_str + + +DEFAULT_CONFIG = """ +[initialize] + +[initialize.lookups] +@misc = "spacy.LookupsDataLoader.v1" +lang = ${nlp.lang} +tables = ["lexeme_norm"] +""" class TamilDefaults(Language.Defaults): + config = load_config_from_str(DEFAULT_CONFIG) lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py index a35ae987f..834fe1871 100644 --- a/spacy/lang/th/__init__.py +++ b/spacy/lang/th/__init__.py @@ -1,10 +1,8 @@ -from thinc.api import Config - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ...language import Language from ...tokens import Doc -from ...util import DummyTokenizer, registry +from ...util import DummyTokenizer, registry, load_config_from_str DEFAULT_CONFIG = """ @@ -12,6 +10,13 @@ DEFAULT_CONFIG = """ [nlp.tokenizer] @tokenizers = "spacy.th.ThaiTokenizer" + +[initialize] + +[initialize.lookups] +@misc = "spacy.LookupsDataLoader.v1" +lang = ${nlp.lang} +tables = ["lexeme_norm"] """ @@ -42,7 +47,7 @@ class ThaiTokenizer(DummyTokenizer): class 
ThaiDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) + config = load_config_from_str(DEFAULT_CONFIG) lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/vi/__init__.py b/spacy/lang/vi/__init__.py index 1db762adb..e2f7b3e35 100644 --- a/spacy/lang/vi/__init__.py +++ b/spacy/lang/vi/__init__.py @@ -1,10 +1,8 @@ -from thinc.api import Config - +from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS from ...language import Language from ...tokens import Doc -from .stop_words import STOP_WORDS -from ...util import DummyTokenizer, registry -from .lex_attrs import LEX_ATTRS +from ...util import DummyTokenizer, registry, load_config_from_str DEFAULT_CONFIG = """ @@ -55,7 +53,7 @@ class VietnameseTokenizer(DummyTokenizer): class VietnameseDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) + config = load_config_from_str(DEFAULT_CONFIG) lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 457502e21..a413d86eb 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -4,14 +4,13 @@ import tempfile import srsly import warnings from pathlib import Path -from thinc.api import Config from ...errors import Warnings, Errors from ...language import Language from ...scorer import Scorer from ...tokens import Doc from ...training import validate_examples, Example -from ...util import DummyTokenizer, registry +from ...util import DummyTokenizer, registry, load_config_from_str from .lex_attrs import LEX_ATTRS from .stop_words import STOP_WORDS from ... import util @@ -329,7 +328,7 @@ class ChineseTokenizer(DummyTokenizer): class ChineseDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) + config = load_config_from_str(DEFAULT_CONFIG) lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index b657ae2e8..78a20c1e8 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -339,6 +339,7 @@ def test_ner_warns_no_lookups(caplog): nlp.vocab.lookups = Lookups() assert not len(nlp.vocab.lookups) nlp.add_pipe("ner") + nlp.config["initialize"]["lookups"] = None with caplog.at_level(logging.DEBUG): nlp.initialize() assert "W033" in caplog.text From 6b7bb32834c412367c2d49aa62a3bd1deeb4f921 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 30 Sep 2020 11:46:45 +0200 Subject: [PATCH 3/5] Refactor Chinese initialization --- spacy/errors.py | 20 +++++-- spacy/lang/zh/__init__.py | 81 ++++++++++----------------- spacy/tests/conftest.py | 15 +++-- spacy/tests/lang/zh/test_serialize.py | 16 ++++-- 4 files changed, 66 insertions(+), 66 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 09b722a7b..f8fb7dd8b 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -672,14 +672,22 @@ class Errors: E999 = ("Unable to merge the `Doc` objects because they do not all share " "the same `Vocab`.") E1000 = ("The Chinese word segmenter is pkuseg but no pkuseg model was " - "specified. Provide the name of a pretrained model or the path to " - "a model when initializing the pipeline:\n" + "loaded. 
Provide the name of a pretrained model or the path to " + "a model and initialize the pipeline:\n\n" 'config = {\n' - ' "@tokenizers": "spacy.zh.ChineseTokenizer",\n' - ' "segmenter": "pkuseg",\n' - ' "pkuseg_model": "default", # or "/path/to/pkuseg_model" \n' + ' "nlp": {\n' + ' "tokenizer": {\n' + ' "@tokenizers": "spacy.zh.ChineseTokenizer",\n' + ' "segmenter": "pkuseg",\n' + ' }\n' + ' },\n' + ' "initialize": {"tokenizer": {\n' + ' "pkuseg_model": "default", # or /path/to/model\n' + ' }\n' + ' },\n' '}\n' - 'nlp = Chinese.from_config({"nlp": {"tokenizer": config}})') + 'nlp = Chinese.from_config(config)\n' + 'nlp.initialize()') E1001 = ("Target token outside of matched span for match with tokens " "'{span}' and offset '{index}' matched by patterns '{patterns}'.") E1002 = ("Span index out of range.") diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index a413d86eb..ecabb6555 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -59,32 +59,13 @@ class ChineseTokenizer(DummyTokenizer): self, nlp: Language, segmenter: Segmenter = Segmenter.char, - pkuseg_model: Optional[str] = None, - pkuseg_user_dict: Optional[str] = None, ): self.vocab = nlp.vocab if isinstance(segmenter, Segmenter): segmenter = segmenter.value self.segmenter = segmenter - self.pkuseg_model = pkuseg_model - self.pkuseg_user_dict = pkuseg_user_dict self.pkuseg_seg = None self.jieba_seg = None - self.configure_segmenter(segmenter) - - def initialize( - self, - get_examples: Callable[[], Iterable[Example]], - *, - nlp: Optional[Language], - pkuseg_model: Optional[str] = None, - pkuseg_user_dict: Optional[str] = None - ): - self.pkuseg_model = pkuseg_model - self.pkuseg_user_dict = pkuseg_user_dict - self.configure_segmenter(self.segmenter) - - def configure_segmenter(self, segmenter: str): if segmenter not in Segmenter.values(): warn_msg = Warnings.W103.format( lang="Chinese", @@ -94,12 +75,21 @@ class ChineseTokenizer(DummyTokenizer): ) warnings.warn(warn_msg) self.segmenter = Segmenter.char - self.jieba_seg = try_jieba_import(self.segmenter) - self.pkuseg_seg = try_pkuseg_import( - self.segmenter, - pkuseg_model=self.pkuseg_model, - pkuseg_user_dict=self.pkuseg_user_dict, - ) + if segmenter == Segmenter.jieba: + self.jieba_seg = try_jieba_import() + + def initialize( + self, + get_examples: Callable[[], Iterable[Example]], + *, + nlp: Optional[Language], + pkuseg_model: Optional[str] = None, + pkuseg_user_dict: str = "default", + ): + if self.segmenter == Segmenter.pkuseg: + self.pkuseg_seg = try_pkuseg_import( + pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict, + ) def __call__(self, text: str) -> Doc: if self.segmenter == Segmenter.jieba: @@ -154,14 +144,10 @@ class ChineseTokenizer(DummyTokenizer): def _get_config(self) -> Dict[str, Any]: return { "segmenter": self.segmenter, - "pkuseg_model": self.pkuseg_model, - "pkuseg_user_dict": self.pkuseg_user_dict, } def _set_config(self, config: Dict[str, Any] = {}) -> None: self.segmenter = config.get("segmenter", Segmenter.char) - self.pkuseg_model = config.get("pkuseg_model", None) - self.pkuseg_user_dict = config.get("pkuseg_user_dict", "default") def to_bytes(self, **kwargs): pkuseg_features_b = b"" @@ -339,42 +325,33 @@ class Chinese(Language): Defaults = ChineseDefaults -def try_jieba_import(segmenter: str) -> None: +def try_jieba_import() -> None: try: import jieba - if segmenter == Segmenter.jieba: - # segment a short text to have jieba initialize its cache in advance - list(jieba.cut("作为", cut_all=False)) + # segment 
a short text to have jieba initialize its cache in advance + list(jieba.cut("作为", cut_all=False)) return jieba except ImportError: - if segmenter == Segmenter.jieba: - msg = ( - "Jieba not installed. To use jieba, install it with `pip " - " install jieba` or from https://github.com/fxsjy/jieba" - ) - raise ImportError(msg) from None + msg = ( + "Jieba not installed. To use jieba, install it with `pip " + " install jieba` or from https://github.com/fxsjy/jieba" + ) + raise ImportError(msg) from None -def try_pkuseg_import( - segmenter: str, pkuseg_model: Optional[str], pkuseg_user_dict: str -) -> None: +def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None: try: import pkuseg - if pkuseg_model is None: - return None - else: - return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict) + return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict) except ImportError: - if segmenter == Segmenter.pkuseg: - msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG - raise ImportError(msg) from None + msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG + raise ImportError(msg) from None except FileNotFoundError: - if segmenter == Segmenter.pkuseg: - msg = "Unable to load pkuseg model from: " + pkuseg_model - raise FileNotFoundError(msg) from None + msg = "Unable to load pkuseg model from: " + pkuseg_model + raise FileNotFoundError(msg) from None def _get_pkuseg_trie_data(node, path=""): diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 6cf019173..bcf582388 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -272,10 +272,14 @@ def zh_tokenizer_char(): def zh_tokenizer_jieba(): pytest.importorskip("jieba") config = { - "@tokenizers": "spacy.zh.ChineseTokenizer", - "segmenter": "jieba", + "nlp": { + "tokenizer": { + "@tokenizers": "spacy.zh.ChineseTokenizer", + "segmenter": "jieba", + } + } } - nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}}) + nlp = get_lang_class("zh").from_config(config) return nlp.tokenizer @@ -290,7 +294,10 @@ def zh_tokenizer_pkuseg(): "segmenter": "pkuseg", } }, - "initialize": {"tokenizer": {"pkuseg_model": "default"}}, + "initialize": {"tokenizer": { + "pkuseg_model": "default", + } + }, } nlp = get_lang_class("zh").from_config(config) nlp.initialize() diff --git a/spacy/tests/lang/zh/test_serialize.py b/spacy/tests/lang/zh/test_serialize.py index 5491314e2..58c084ec8 100644 --- a/spacy/tests/lang/zh/test_serialize.py +++ b/spacy/tests/lang/zh/test_serialize.py @@ -28,9 +28,17 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba): @pytest.mark.slow def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg): config = { - "@tokenizers": "spacy.zh.ChineseTokenizer", - "segmenter": "pkuseg", - "pkuseg_model": "medicine", + "nlp": { + "tokenizer": { + "@tokenizers": "spacy.zh.ChineseTokenizer", + "segmenter": "pkuseg", + } + }, + "initialize": {"tokenizer": { + "pkuseg_model": "medicine", + } + }, } - nlp = Chinese.from_config({"nlp": {"tokenizer": config}}) + nlp = Chinese.from_config(config) + nlp.initialize() zh_tokenizer_serialize(nlp.tokenizer) From 6f29f68f694d183b58ff8091d473e909231b52ec Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 30 Sep 2020 23:48:47 +0200 Subject: [PATCH 4/5] Update errors and make Tokenizer.initialize args less strict --- spacy/errors.py | 20 +++++--------------- spacy/lang/zh/__init__.py | 8 +++----- 2 files changed, 8 insertions(+), 20 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index f8fb7dd8b..1263796b3 100644 
--- a/spacy/errors.py +++ b/spacy/errors.py @@ -554,7 +554,10 @@ class Errors: E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}") E954 = ("The Tok2Vec listener did not receive any valid input from an upstream " "component.") - E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.") + E955 = ("Can't find table(s) '{table}' for language '{lang}' in " + "spacy-lookups-data. If you want to initialize a blank nlp object, " + "make sure you have the spacy-lookups-data package installed or " + "remove the [initialize.lookups] block from your config.") E956 = ("Can't find component '{name}' in [components] block in the config. " "Available components: {opts}") E957 = ("Writing directly to Language.factories isn't needed anymore in " @@ -674,20 +677,7 @@ class Errors: E1000 = ("The Chinese word segmenter is pkuseg but no pkuseg model was " "loaded. Provide the name of a pretrained model or the path to " "a model and initialize the pipeline:\n\n" - 'config = {\n' - ' "nlp": {\n' - ' "tokenizer": {\n' - ' "@tokenizers": "spacy.zh.ChineseTokenizer",\n' - ' "segmenter": "pkuseg",\n' - ' }\n' - ' },\n' - ' "initialize": {"tokenizer": {\n' - ' "pkuseg_model": "default", # or /path/to/model\n' - ' }\n' - ' },\n' - '}\n' - 'nlp = Chinese.from_config(config)\n' - 'nlp.initialize()') + 'nlp.tokenizer.initialize(pkuseg_model="default")') E1001 = ("Target token outside of matched span for match with tokens " "'{span}' and offset '{index}' matched by patterns '{patterns}'.") E1002 = ("Span index out of range.") diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index ecabb6555..858f41f65 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -56,9 +56,7 @@ def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char,): class ChineseTokenizer(DummyTokenizer): def __init__( - self, - nlp: Language, - segmenter: Segmenter = Segmenter.char, + self, nlp: Language, segmenter: Segmenter = Segmenter.char, ): self.vocab = nlp.vocab if isinstance(segmenter, Segmenter): @@ -80,9 +78,9 @@ class ChineseTokenizer(DummyTokenizer): def initialize( self, - get_examples: Callable[[], Iterable[Example]], + get_examples: Optional[Callable[[], Iterable[Example]]] = None, *, - nlp: Optional[Language], + nlp: Optional[Language] = None, pkuseg_model: Optional[str] = None, pkuseg_user_dict: str = "default", ): From 4b6afd36114fbe1871f17998f9e3f4ec0e116f0f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 30 Sep 2020 23:49:29 +0200 Subject: [PATCH 5/5] Remove English [initialize] default block for now to get tests to pass --- spacy/lang/en/__init__.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index f4ea10f9c..cc01f1aea 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -9,21 +9,9 @@ from .punctuation import TOKENIZER_INFIXES from .lemmatizer import EnglishLemmatizer from ...language import Language from ...lookups import Lookups -from ...util import load_config_from_str - - -DEFAULT_CONFIG = """ -[initialize] - -[initialize.lookups] -@misc = "spacy.LookupsDataLoader.v1" -lang = ${nlp.lang} -tables = ["lexeme_norm"] -""" class EnglishDefaults(Language.Defaults): - config = load_config_from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES lex_attr_getters = LEX_ATTRS
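Usage note: after this series, the pkuseg model and user dictionary are no longer settings on the tokenizer itself; they are supplied when the pipeline is initialized. A minimal sketch of the resulting workflow, based on the updated conftest.py fixture and the new E1000 hint (it assumes the pkuseg package is installed and that the "default" model can be downloaded):

    from spacy.lang.zh import Chinese

    config = {
        "nlp": {
            "tokenizer": {
                "@tokenizers": "spacy.zh.ChineseTokenizer",
                "segmenter": "pkuseg",
            }
        },
        # pkuseg settings now live under [initialize.tokenizer]
        "initialize": {"tokenizer": {"pkuseg_model": "default"}},
    }
    nlp = Chinese.from_config(config)
    nlp.initialize()  # calls ChineseTokenizer.initialize() and loads the pkuseg model
    doc = nlp("西门子将努力参与中国的三峡工程")

    # Equivalent direct call, as suggested by the updated E1000 message:
    # nlp.tokenizer.initialize(pkuseg_model="default")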
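Similarly, the per-language [initialize.lookups] defaults added here load the lexeme_norm tables from spacy-lookups-data during nlp.initialize(). If that package is not available, the loader can be dropped from the config at runtime, mirroring the updated NER test (a sketch; "da" stands in for any of the languages that gained the default block):

    import spacy

    nlp = spacy.blank("da")
    nlp.config["initialize"]["lookups"] = None  # skip the lexeme_norm loader
    nlp.add_pipe("ner")
    nlp.initialize()  # without the tables, NER logs W033 rather than E955 being raised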