diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 117514c09..e7cc1ef3b 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -12,8 +12,10 @@ from .tag_bigram_map import TAG_BIGRAM_MAP
 from ...compat import copy_reg
 from ...errors import Errors
 from ...language import Language
+from ...scorer import Scorer
 from ...symbols import POS
 from ...tokens import Doc
+from ...training import validate_examples
 from ...util import DummyTokenizer, registry
 from ... import util

@@ -130,6 +132,10 @@ class JapaneseTokenizer(DummyTokenizer):
                 )
         return sub_tokens_list

+    def score(self, examples):
+        validate_examples(examples, "JapaneseTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
     def _get_config(self) -> Dict[str, Any]:
         return {"split_mode": self.split_mode}

diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py
index 47a3887a6..dd07ef89c 100644
--- a/spacy/lang/ko/__init__.py
+++ b/spacy/lang/ko/__init__.py
@@ -7,7 +7,9 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...compat import copy_reg
+from ...scorer import Scorer
 from ...symbols import POS
+from ...training import validate_examples
 from ...util import DummyTokenizer, registry


@@ -62,6 +64,10 @@
                lemma = surface
            yield {"surface": surface, "lemma": lemma, "tag": tag}

+    def score(self, examples):
+        validate_examples(examples, "KoreanTokenizer.score")
+        return Scorer.score_tokenization(examples)
+

 class KoreanDefaults(Language.Defaults):
     config = Config().from_str(DEFAULT_CONFIG)
diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index 5d3bd2a96..fa9bb810d 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -8,7 +8,9 @@ from thinc.api import Config

 from ...errors import Warnings, Errors
 from ...language import Language
+from ...scorer import Scorer
 from ...tokens import Doc
+from ...training import validate_examples
 from ...util import DummyTokenizer, registry
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
@@ -136,6 +138,10 @@ class ChineseTokenizer(DummyTokenizer):
             warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
             warnings.warn(warn_msg)

+    def score(self, examples):
+        validate_examples(examples, "ChineseTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
     def _get_config(self) -> Dict[str, Any]:
         return {
             "segmenter": self.segmenter,
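
All three added `score` methods follow the same pattern: validate the incoming `Example` objects, then delegate to `Scorer.score_tokenization`, which compares the predicted token boundaries against the reference tokenization. Below is a minimal usage sketch, not part of the diff itself; it assumes the patched tokenizers above, uses Chinese because its default character segmenter needs no external dependencies, and the sample text and printed score keys are illustrative:

    import spacy
    from spacy.training import Example

    nlp = spacy.blank("zh")  # default "char" segmenter, no jieba/pkuseg required
    text = "我喜欢猫"

    # Predicted tokenization comes from the tokenizer itself.
    predicted = nlp.make_doc(text)
    # Reference tokenization is supplied as gold-standard words.
    example = Example.from_dict(predicted, {"words": ["我", "喜欢", "猫"]})

    # Calls the ChineseTokenizer.score method added in this diff.
    scores = nlp.tokenizer.score([example])
    print(scores)  # token_acc plus token_p / token_r / token_f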