Add tokenizer scoring to ja / ko / zh (#6152)

Adriane Boyd 2020-09-27 22:20:45 +02:00 committed by GitHub
parent a6548ead17
commit 013b66de05
3 changed files with 18 additions and 0 deletions

spacy/lang/ja/__init__.py

@@ -12,8 +12,10 @@ from .tag_bigram_map import TAG_BIGRAM_MAP
 from ...compat import copy_reg
 from ...errors import Errors
 from ...language import Language
+from ...scorer import Scorer
 from ...symbols import POS
 from ...tokens import Doc
+from ...training import validate_examples
 from ...util import DummyTokenizer, registry
 from ... import util
@@ -130,6 +132,10 @@ class JapaneseTokenizer(DummyTokenizer):
             )
         return sub_tokens_list
 
+    def score(self, examples):
+        validate_examples(examples, "JapaneseTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
     def _get_config(self) -> Dict[str, Any]:
         return {"split_mode": self.split_mode}

spacy/lang/ko/__init__.py

@@ -7,7 +7,9 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...compat import copy_reg
+from ...scorer import Scorer
 from ...symbols import POS
+from ...training import validate_examples
 from ...util import DummyTokenizer, registry
@@ -62,6 +64,10 @@ class KoreanTokenizer(DummyTokenizer):
             lemma = surface
             yield {"surface": surface, "lemma": lemma, "tag": tag}
 
+    def score(self, examples):
+        validate_examples(examples, "KoreanTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
 
 class KoreanDefaults(Language.Defaults):
     config = Config().from_str(DEFAULT_CONFIG)

spacy/lang/zh/__init__.py

@@ -8,7 +8,9 @@ from thinc.api import Config
 from ...errors import Warnings, Errors
 from ...language import Language
+from ...scorer import Scorer
 from ...tokens import Doc
+from ...training import validate_examples
 from ...util import DummyTokenizer, registry
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
@@ -136,6 +138,10 @@ class ChineseTokenizer(DummyTokenizer):
             warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
             warnings.warn(warn_msg)
 
+    def score(self, examples):
+        validate_examples(examples, "ChineseTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
     def _get_config(self) -> Dict[str, Any]:
         return {
             "segmenter": self.segmenter,