Mirror of https://github.com/explosion/spaCy.git

Merge branch 'develop' into feature/new-thinc-config-resolution

commit e04bd16f7f
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -243,6 +243,8 @@ def show_validation_error(
         yield
     except ConfigValidationError as e:
         title = title if title is not None else e.title
+        if e.desc:
+            desc = f"{e.desc}" if not desc else f"{e.desc}\n\n{desc}"
         # Re-generate a new error object with overrides
         err = e.from_error(e, title="", desc=desc, show_config=show_config)
         msg.fail(title)
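The two added lines prepend the validation error's own description to any caller-supplied desc, separated by a blank line. A minimal standalone sketch of that merging logic; DummyError is a hypothetical stand-in for thinc's ConfigValidationError, which carries the same desc attribute:

    class DummyError(Exception):
        # Hypothetical stand-in for ConfigValidationError's `desc` field.
        def __init__(self, desc):
            self.desc = desc

    def merge_desc(e, desc=""):
        # Mirrors the added branch: the error's own description comes first,
        # separated from the caller's desc by a blank line.
        if e.desc:
            desc = f"{e.desc}" if not desc else f"{e.desc}\n\n{desc}"
        return desc

    print(merge_desc(DummyError("bad [training] block"), "Check your config"))
    # bad [training] block
    #
    # Check your config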
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -12,8 +12,10 @@ from .tag_bigram_map import TAG_BIGRAM_MAP
 from ...compat import copy_reg
 from ...errors import Errors
 from ...language import Language
+from ...scorer import Scorer
 from ...symbols import POS
 from ...tokens import Doc
+from ...training import validate_examples
 from ...util import DummyTokenizer, registry
 from ... import util
@@ -130,6 +132,10 @@ class JapaneseTokenizer(DummyTokenizer):
             )
         return sub_tokens_list
 
+    def score(self, examples):
+        validate_examples(examples, "JapaneseTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
     def _get_config(self) -> Dict[str, Any]:
         return {"split_mode": self.split_mode}
 
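The same three-line score method is added to the Korean and Chinese tokenizers below: validate the incoming Example objects, then delegate to Scorer.score_tokenization. A hedged usage sketch of that call, using spacy.blank("en") as a stand-in because the ja/ko/zh pipelines need extra tokenizer dependencies (sudachipy, natto-py, pkuseg) installed:

    import spacy
    from spacy.scorer import Scorer
    from spacy.training import Example

    nlp = spacy.blank("en")  # stand-in pipeline for this sketch
    doc = nlp("The quick brown fox")
    example = Example.from_dict(doc, {"words": ["The", "quick", "brown", "fox"]})

    # The same call the new tokenizer.score methods make internally.
    print(Scorer.score_tokenization([example]))
    # expected: {'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0}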
--- a/spacy/lang/ko/__init__.py
+++ b/spacy/lang/ko/__init__.py
@@ -7,7 +7,9 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...compat import copy_reg
+from ...scorer import Scorer
 from ...symbols import POS
+from ...training import validate_examples
 from ...util import DummyTokenizer, registry
 
 
@@ -62,6 +64,10 @@ class KoreanTokenizer(DummyTokenizer):
             lemma = surface
             yield {"surface": surface, "lemma": lemma, "tag": tag}
 
+    def score(self, examples):
+        validate_examples(examples, "KoreanTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
 
 class KoreanDefaults(Language.Defaults):
     config = Config().from_str(DEFAULT_CONFIG)
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -8,7 +8,9 @@ from thinc.api import Config
 
 from ...errors import Warnings, Errors
 from ...language import Language
+from ...scorer import Scorer
 from ...tokens import Doc
+from ...training import validate_examples
 from ...util import DummyTokenizer, registry
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
@@ -136,6 +138,10 @@ class ChineseTokenizer(DummyTokenizer):
             warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
             warnings.warn(warn_msg)
 
+    def score(self, examples):
+        validate_examples(examples, "ChineseTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
     def _get_config(self) -> Dict[str, Any]:
         return {
             "segmenter": self.segmenter,
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -29,7 +29,8 @@ cdef class Morphology:
     FEATURE_SEP = "|"
     FIELD_SEP = "="
     VALUE_SEP = ","
-    EMPTY_MORPH = "_"  # not an empty string so that the PreshMap key is not 0
+    # not an empty string so that the PreshMap key is not 0
+    EMPTY_MORPH = symbols.NAMES[symbols._]
 
     def __init__(self, StringStore strings):
         self.mem = Pool()
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@@ -466,3 +466,4 @@ cdef enum symbol_t:
     ENT_ID
 
     IDX
+    _
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@@ -465,6 +465,7 @@ IDS = {
     "acl": acl,
    "LAW": LAW,
    "MORPH": MORPH,
+    "_": _,
 }
 
 
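Together, the last three hunks register "_" as a real entry in the symbol table and make Morphology.EMPTY_MORPH resolve through it, so the empty-morphology marker hashes to a stable, nonzero symbol ID (a PreshMap key of 0 would read as "missing"). A small sketch of what the lookup yields, assuming a spaCy build that includes this change and that IDS and NAMES are importable from spacy.symbols:

    from spacy.symbols import IDS, NAMES

    # EMPTY_MORPH is now symbols.NAMES[symbols._] rather than a bare "_"
    # literal; both spellings produce the same one-character string.
    empty_morph = NAMES[IDS["_"]]
    print(empty_morph == "_")  # True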