mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
Add tokenizer scoring to ja / ko / zh (#6152)
This commit is contained in:
parent
a6548ead17
commit
013b66de05
|
@ -12,8 +12,10 @@ from .tag_bigram_map import TAG_BIGRAM_MAP
|
|||
from ...compat import copy_reg
|
||||
from ...errors import Errors
|
||||
from ...language import Language
|
||||
from ...scorer import Scorer
|
||||
from ...symbols import POS
|
||||
from ...tokens import Doc
|
||||
from ...training import validate_examples
|
||||
from ...util import DummyTokenizer, registry
|
||||
from ... import util
|
||||
|
||||
|
@ -130,6 +132,10 @@ class JapaneseTokenizer(DummyTokenizer):
|
|||
)
|
||||
return sub_tokens_list
|
||||
|
||||
def score(self, examples):
    """Evaluate tokenization quality on a batch of examples.

    examples (Iterable[Example]): Examples pairing predicted and
        reference docs.
    RETURNS (Dict[str, Any]): Tokenization scores produced by
        Scorer.score_tokenization.
    """
    validate_examples(examples, "JapaneseTokenizer.score")
    scores = Scorer.score_tokenization(examples)
    return scores
|
||||
|
||||
def _get_config(self) -> Dict[str, Any]:
    """Return the serializable tokenizer settings.

    RETURNS (Dict[str, Any]): Config dict holding the current
        split_mode value.
    """
    config = {"split_mode": self.split_mode}
    return config
|
||||
|
||||
|
|
|
@ -7,7 +7,9 @@ from .lex_attrs import LEX_ATTRS
|
|||
from ...language import Language
|
||||
from ...tokens import Doc
|
||||
from ...compat import copy_reg
|
||||
from ...scorer import Scorer
|
||||
from ...symbols import POS
|
||||
from ...training import validate_examples
|
||||
from ...util import DummyTokenizer, registry
|
||||
|
||||
|
||||
|
@ -62,6 +64,10 @@ class KoreanTokenizer(DummyTokenizer):
|
|||
lemma = surface
|
||||
yield {"surface": surface, "lemma": lemma, "tag": tag}
|
||||
|
||||
def score(self, examples):
    """Evaluate tokenization quality on a batch of examples.

    examples (Iterable[Example]): Examples pairing predicted and
        reference docs.
    RETURNS (Dict[str, Any]): Tokenization scores produced by
        Scorer.score_tokenization.
    """
    validate_examples(examples, "KoreanTokenizer.score")
    scores = Scorer.score_tokenization(examples)
    return scores
|
||||
|
||||
|
||||
class KoreanDefaults(Language.Defaults):
|
||||
config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
|
|
@ -8,7 +8,9 @@ from thinc.api import Config
|
|||
|
||||
from ...errors import Warnings, Errors
|
||||
from ...language import Language
|
||||
from ...scorer import Scorer
|
||||
from ...tokens import Doc
|
||||
from ...training import validate_examples
|
||||
from ...util import DummyTokenizer, registry
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .stop_words import STOP_WORDS
|
||||
|
@ -136,6 +138,10 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
|
||||
warnings.warn(warn_msg)
|
||||
|
||||
def score(self, examples):
    """Evaluate tokenization quality on a batch of examples.

    examples (Iterable[Example]): Examples pairing predicted and
        reference docs.
    RETURNS (Dict[str, Any]): Tokenization scores produced by
        Scorer.score_tokenization.
    """
    validate_examples(examples, "ChineseTokenizer.score")
    scores = Scorer.score_tokenization(examples)
    return scores
|
||||
|
||||
def _get_config(self) -> Dict[str, Any]:
|
||||
return {
|
||||
"segmenter": self.segmenter,
|
||||
|
|
Loading…
Reference in New Issue
Block a user