Merge branch 'develop' into feature/new-thinc-config-resolution

This commit is contained in:
Ines Montani 2020-09-27 22:34:46 +02:00
commit e04bd16f7f
7 changed files with 24 additions and 1 deletions

View File

@@ -243,6 +243,8 @@ def show_validation_error(
         yield
     except ConfigValidationError as e:
         title = title if title is not None else e.title
+        if e.desc:
+            desc = f"{e.desc}" if not desc else f"{e.desc}\n\n{desc}"
         # Re-generate a new error object with overrides
         err = e.from_error(e, title="", desc=desc, show_config=show_config)
         msg.fail(title)

View File

@@ -12,8 +12,10 @@ from .tag_bigram_map import TAG_BIGRAM_MAP
 from ...compat import copy_reg
 from ...errors import Errors
 from ...language import Language
+from ...scorer import Scorer
 from ...symbols import POS
 from ...tokens import Doc
+from ...training import validate_examples
 from ...util import DummyTokenizer, registry
 from ... import util
@@ -130,6 +132,10 @@ class JapaneseTokenizer(DummyTokenizer):
         )
         return sub_tokens_list
 
+    def score(self, examples):
+        validate_examples(examples, "JapaneseTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
     def _get_config(self) -> Dict[str, Any]:
         return {"split_mode": self.split_mode}

View File

@@ -7,7 +7,9 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...compat import copy_reg
+from ...scorer import Scorer
 from ...symbols import POS
+from ...training import validate_examples
 from ...util import DummyTokenizer, registry
@@ -62,6 +64,10 @@ class KoreanTokenizer(DummyTokenizer):
             lemma = surface
             yield {"surface": surface, "lemma": lemma, "tag": tag}
 
+    def score(self, examples):
+        validate_examples(examples, "KoreanTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
 
 class KoreanDefaults(Language.Defaults):
     config = Config().from_str(DEFAULT_CONFIG)

View File

@@ -8,7 +8,9 @@ from thinc.api import Config
 from ...errors import Warnings, Errors
 from ...language import Language
+from ...scorer import Scorer
 from ...tokens import Doc
+from ...training import validate_examples
 from ...util import DummyTokenizer, registry
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
@@ -136,6 +138,10 @@ class ChineseTokenizer(DummyTokenizer):
                 warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
                 warnings.warn(warn_msg)
 
+    def score(self, examples):
+        validate_examples(examples, "ChineseTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
     def _get_config(self) -> Dict[str, Any]:
         return {
             "segmenter": self.segmenter,

View File

@@ -29,7 +29,8 @@ cdef class Morphology:
     FEATURE_SEP = "|"
     FIELD_SEP = "="
     VALUE_SEP = ","
-    EMPTY_MORPH = "_"  # not an empty string so that the PreshMap key is not 0
+    # not an empty string so that the PreshMap key is not 0
+    EMPTY_MORPH = symbols.NAMES[symbols._]
 
     def __init__(self, StringStore strings):
         self.mem = Pool()

View File

@@ -466,3 +466,4 @@ cdef enum symbol_t:
     ENT_ID
     IDX
+    _

View File

@@ -465,6 +465,7 @@ IDS = {
     "acl": acl,
     "LAW": LAW,
     "MORPH": MORPH,
+    "_": _,
 }