Add tokenizer scoring to ja / ko / zh (#6152)

Adriane Boyd 2020-09-27 22:20:45 +02:00 committed by GitHub
parent a6548ead17
commit 013b66de05
3 changed files with 18 additions and 0 deletions

spacy/lang/ja/__init__.py

@@ -12,8 +12,10 @@ from .tag_bigram_map import TAG_BIGRAM_MAP
 from ...compat import copy_reg
 from ...errors import Errors
 from ...language import Language
+from ...scorer import Scorer
 from ...symbols import POS
 from ...tokens import Doc
+from ...training import validate_examples
 from ...util import DummyTokenizer, registry
 from ... import util
@@ -130,6 +132,10 @@ class JapaneseTokenizer(DummyTokenizer):
         )
         return sub_tokens_list

+    def score(self, examples):
+        validate_examples(examples, "JapaneseTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
     def _get_config(self) -> Dict[str, Any]:
         return {"split_mode": self.split_mode}

spacy/lang/ko/__init__.py

@@ -7,7 +7,9 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...compat import copy_reg
+from ...scorer import Scorer
 from ...symbols import POS
+from ...training import validate_examples
 from ...util import DummyTokenizer, registry
@@ -62,6 +64,10 @@ class KoreanTokenizer(DummyTokenizer):
             lemma = surface
             yield {"surface": surface, "lemma": lemma, "tag": tag}

+    def score(self, examples):
+        validate_examples(examples, "KoreanTokenizer.score")
+        return Scorer.score_tokenization(examples)
+

 class KoreanDefaults(Language.Defaults):
     config = Config().from_str(DEFAULT_CONFIG)
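
The Korean implementation is identical to the Japanese one apart from the
component name passed to validate_examples. All three tokenizers delegate to
Scorer.score_tokenization, a static method that can also be called on a list
of Example objects without any pipeline; a self-contained sketch with
illustrative hand-built docs:

from spacy.scorer import Scorer
from spacy.tokens import Doc
from spacy.training import Example
from spacy.vocab import Vocab

# Build one predicted/reference pair by hand; both docs must share the
# same underlying text, so whitespace is set explicitly.
vocab = Vocab()
pred = Doc(vocab, words=["안녕하세요", "."], spaces=[False, False])
ref = Doc(vocab, words=["안녕", "하세요", "."], spaces=[False, False, False])
scores = Scorer.score_tokenization([Example(pred, ref)])
print(scores["token_p"], scores["token_r"], scores["token_f"])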

spacy/lang/zh/__init__.py

@@ -8,7 +8,9 @@ from thinc.api import Config
 from ...errors import Warnings, Errors
 from ...language import Language
+from ...scorer import Scorer
 from ...tokens import Doc
+from ...training import validate_examples
 from ...util import DummyTokenizer, registry
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
@@ -136,6 +138,10 @@ class ChineseTokenizer(DummyTokenizer):
             warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
             warnings.warn(warn_msg)

+    def score(self, examples):
+        validate_examples(examples, "ChineseTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
     def _get_config(self) -> Dict[str, Any]:
         return {
             "segmenter": self.segmenter,