Merge branch 'develop' into feature/new-thinc-config-resolution

Ines Montani 2020-09-27 22:34:46 +02:00
commit e04bd16f7f
7 changed files with 24 additions and 1 deletion

@@ -243,6 +243,8 @@ def show_validation_error(
         yield
     except ConfigValidationError as e:
         title = title if title is not None else e.title
+        if e.desc:
+            desc = f"{e.desc}" if not desc else f"{e.desc}\n\n{desc}"
         # Re-generate a new error object with overrides
         err = e.from_error(e, title="", desc=desc, show_config=show_config)
         msg.fail(title)
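For reference, show_validation_error is a context manager the CLI wraps around config loading; the two added lines merge the error's own description into any description passed by the caller. A minimal usage sketch (the config path is a placeholder):

    from spacy import util
    from spacy.cli._util import show_validation_error

    # Wrap config loading so a ConfigValidationError surfaces as a
    # formatted CLI failure rather than a raw traceback.
    with show_validation_error(title="Config validation error"):
        config = util.load_config("config.cfg")  # placeholder path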

@@ -12,8 +12,10 @@ from .tag_bigram_map import TAG_BIGRAM_MAP
 from ...compat import copy_reg
 from ...errors import Errors
 from ...language import Language
+from ...scorer import Scorer
 from ...symbols import POS
 from ...tokens import Doc
+from ...training import validate_examples
 from ...util import DummyTokenizer, registry
 from ... import util
@@ -130,6 +132,10 @@ class JapaneseTokenizer(DummyTokenizer):
         )
         return sub_tokens_list
 
+    def score(self, examples):
+        validate_examples(examples, "JapaneseTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
     def _get_config(self) -> Dict[str, Any]:
         return {"split_mode": self.split_mode}

@@ -7,7 +7,9 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...compat import copy_reg
+from ...scorer import Scorer
 from ...symbols import POS
+from ...training import validate_examples
 from ...util import DummyTokenizer, registry
@@ -62,6 +64,10 @@ class KoreanTokenizer(DummyTokenizer):
             lemma = surface
             yield {"surface": surface, "lemma": lemma, "tag": tag}
 
+    def score(self, examples):
+        validate_examples(examples, "KoreanTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
 
 class KoreanDefaults(Language.Defaults):
     config = Config().from_str(DEFAULT_CONFIG)
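These hooks are picked up dynamically at evaluation time: the scorer checks whether the tokenizer (and each pipeline component) exposes a score method and merges the returned metrics into one dict. Roughly, and not the verbatim library code:

    # Simplified sketch of the scorer's dispatch loop
    scores = {}
    if hasattr(nlp.tokenizer, "score"):
        scores.update(nlp.tokenizer.score(examples))
    for name, component in nlp.pipeline:
        if hasattr(component, "score"):
            scores.update(component.score(examples))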

@@ -8,7 +8,9 @@ from thinc.api import Config
 from ...errors import Warnings, Errors
 from ...language import Language
+from ...scorer import Scorer
 from ...tokens import Doc
+from ...training import validate_examples
 from ...util import DummyTokenizer, registry
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
@@ -136,6 +138,10 @@ class ChineseTokenizer(DummyTokenizer):
             warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
             warnings.warn(warn_msg)
 
+    def score(self, examples):
+        validate_examples(examples, "ChineseTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
     def _get_config(self) -> Dict[str, Any]:
         return {
             "segmenter": self.segmenter,

@@ -29,7 +29,8 @@ cdef class Morphology:
     FEATURE_SEP = "|"
     FIELD_SEP = "="
     VALUE_SEP = ","
-    EMPTY_MORPH = "_" # not an empty string so that the PreshMap key is not 0
+    # not an empty string so that the PreshMap key is not 0
+    EMPTY_MORPH = symbols.NAMES[symbols._]
 
     def __init__(self, StringStore strings):
         self.mem = Pool()
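Background on the comment kept above: spaCy's string store maps the empty string to hash 0, and PreshMap treats key 0 as "missing", so an empty EMPTY_MORPH marker could never be stored or retrieved. A quick illustration of the convention (a sketch, assuming spaCy v3):

    from spacy.strings import StringStore

    strings = StringStore()
    assert strings[""] == 0       # the empty string always maps to key 0
    assert strings.add("_") != 0  # "_" gets a proper nonzero key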

@@ -466,3 +466,4 @@ cdef enum symbol_t:
     ENT_ID
     IDX
+    _

@@ -465,6 +465,7 @@ IDS = {
     "acl": acl,
     "LAW": LAW,
     "MORPH": MORPH,
+    "_": _,
 }
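This ties back to the Morphology change above: symbols.pyx derives NAMES from IDS, so registering "_" here is what lets symbols.NAMES[symbols._] resolve to the string "_". A simplified sketch of the relationship (the real module builds NAMES much like this, modulo details):

    # Symbol names ordered by their enum value
    NAMES = [name for name, value in sorted(IDS.items(), key=lambda kv: kv[1])]
    assert NAMES[IDS["_"]] == "_"  # i.e. symbols.NAMES[symbols._] == "_"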