diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 506380b0b..68cb572ea 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -243,6 +243,8 @@ def show_validation_error( yield except ConfigValidationError as e: title = title if title is not None else e.title + if e.desc: + desc = f"{e.desc}" if not desc else f"{e.desc}\n\n{desc}" # Re-generate a new error object with overrides err = e.from_error(e, title="", desc=desc, show_config=show_config) msg.fail(title) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 117514c09..e7cc1ef3b 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -12,8 +12,10 @@ from .tag_bigram_map import TAG_BIGRAM_MAP from ...compat import copy_reg from ...errors import Errors from ...language import Language +from ...scorer import Scorer from ...symbols import POS from ...tokens import Doc +from ...training import validate_examples from ...util import DummyTokenizer, registry from ... import util @@ -130,6 +132,10 @@ class JapaneseTokenizer(DummyTokenizer): ) return sub_tokens_list + def score(self, examples): + validate_examples(examples, "JapaneseTokenizer.score") + return Scorer.score_tokenization(examples) + def _get_config(self) -> Dict[str, Any]: return {"split_mode": self.split_mode} diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index 47a3887a6..dd07ef89c 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -7,7 +7,9 @@ from .lex_attrs import LEX_ATTRS from ...language import Language from ...tokens import Doc from ...compat import copy_reg +from ...scorer import Scorer from ...symbols import POS +from ...training import validate_examples from ...util import DummyTokenizer, registry @@ -62,6 +64,10 @@ class KoreanTokenizer(DummyTokenizer): lemma = surface yield {"surface": surface, "lemma": lemma, "tag": tag} + def score(self, examples): + validate_examples(examples, "KoreanTokenizer.score") + return Scorer.score_tokenization(examples) + class KoreanDefaults(Language.Defaults): config = Config().from_str(DEFAULT_CONFIG) diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 5d3bd2a96..fa9bb810d 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -8,7 +8,9 @@ from thinc.api import Config from ...errors import Warnings, Errors from ...language import Language +from ...scorer import Scorer from ...tokens import Doc +from ...training import validate_examples from ...util import DummyTokenizer, registry from .lex_attrs import LEX_ATTRS from .stop_words import STOP_WORDS @@ -136,6 +138,10 @@ class ChineseTokenizer(DummyTokenizer): warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter) warnings.warn(warn_msg) + def score(self, examples): + validate_examples(examples, "ChineseTokenizer.score") + return Scorer.score_tokenization(examples) + def _get_config(self) -> Dict[str, Any]: return { "segmenter": self.segmenter, diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index fcfe216ba..cc0f61cea 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -29,7 +29,8 @@ cdef class Morphology: FEATURE_SEP = "|" FIELD_SEP = "=" VALUE_SEP = "," - EMPTY_MORPH = "_" # not an empty string so that the PreshMap key is not 0 + # not an empty string so that the PreshMap key is not 0 + EMPTY_MORPH = symbols.NAMES[symbols._] def __init__(self, StringStore strings): self.mem = Pool() diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index e516f3ed9..bc15d9b80 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -466,3 +466,4 @@ cdef enum symbol_t: ENT_ID IDX + _ diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 92607e120..b0345c710 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -465,6 +465,7 @@ IDS = { "acl": acl, "LAW": LAW, "MORPH": MORPH, + "_": _, }