From a6548ead1764e4bcff4b19ebba6588780b93d334 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sun, 27 Sep 2020 22:20:14 +0200 Subject: [PATCH 1/3] Add _ as a symbol (#6153) * Add _ to StringStore in Morphology * Add _ as a symbol Add `_` as a symbol instead of adding to the `StringStore`. --- spacy/morphology.pyx | 3 ++- spacy/symbols.pxd | 1 + spacy/symbols.pyx | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index fcfe216ba..cc0f61cea 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -29,7 +29,8 @@ cdef class Morphology: FEATURE_SEP = "|" FIELD_SEP = "=" VALUE_SEP = "," - EMPTY_MORPH = "_" # not an empty string so that the PreshMap key is not 0 + # not an empty string so that the PreshMap key is not 0 + EMPTY_MORPH = symbols.NAMES[symbols._] def __init__(self, StringStore strings): self.mem = Pool() diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index e516f3ed9..bc15d9b80 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -466,3 +466,4 @@ cdef enum symbol_t: ENT_ID IDX + _ diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 92607e120..b0345c710 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -465,6 +465,7 @@ IDS = { "acl": acl, "LAW": LAW, "MORPH": MORPH, + "_": _, } From 013b66de05ee31e5e05a440ab5b29173530929fa Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sun, 27 Sep 2020 22:20:45 +0200 Subject: [PATCH 2/3] Add tokenizer scoring to ja / ko / zh (#6152) --- spacy/lang/ja/__init__.py | 6 ++++++ spacy/lang/ko/__init__.py | 6 ++++++ spacy/lang/zh/__init__.py | 6 ++++++ 3 files changed, 18 insertions(+) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 117514c09..e7cc1ef3b 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -12,8 +12,10 @@ from .tag_bigram_map import TAG_BIGRAM_MAP from ...compat import copy_reg from ...errors import Errors from ...language import Language +from ...scorer import Scorer from ...symbols import POS from ...tokens import Doc +from ...training import validate_examples from ...util import DummyTokenizer, registry from ... import util @@ -130,6 +132,10 @@ class JapaneseTokenizer(DummyTokenizer): ) return sub_tokens_list + def score(self, examples): + validate_examples(examples, "JapaneseTokenizer.score") + return Scorer.score_tokenization(examples) + def _get_config(self) -> Dict[str, Any]: return {"split_mode": self.split_mode} diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index 47a3887a6..dd07ef89c 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -7,7 +7,9 @@ from .lex_attrs import LEX_ATTRS from ...language import Language from ...tokens import Doc from ...compat import copy_reg +from ...scorer import Scorer from ...symbols import POS +from ...training import validate_examples from ...util import DummyTokenizer, registry @@ -62,6 +64,10 @@ class KoreanTokenizer(DummyTokenizer): lemma = surface yield {"surface": surface, "lemma": lemma, "tag": tag} + def score(self, examples): + validate_examples(examples, "KoreanTokenizer.score") + return Scorer.score_tokenization(examples) + class KoreanDefaults(Language.Defaults): config = Config().from_str(DEFAULT_CONFIG) diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 5d3bd2a96..fa9bb810d 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -8,7 +8,9 @@ from thinc.api import Config from ...errors import Warnings, Errors from ...language import Language +from ...scorer import Scorer from ...tokens import Doc +from ...training import validate_examples from ...util import DummyTokenizer, registry from .lex_attrs import LEX_ATTRS from .stop_words import STOP_WORDS @@ -136,6 +138,10 @@ class ChineseTokenizer(DummyTokenizer): warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter) warnings.warn(warn_msg) + def score(self, examples): + validate_examples(examples, "ChineseTokenizer.score") + return Scorer.score_tokenization(examples) + def _get_config(self) -> Dict[str, Any]: return { "segmenter": self.segmenter, From d7ad65a9bbfd09395de933ec38cac2d258e1a94b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 27 Sep 2020 22:31:57 +0200 Subject: [PATCH 3/3] Fix handling of error description [ci skip] --- spacy/cli/_util.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 506380b0b..68cb572ea 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -243,6 +243,8 @@ def show_validation_error( yield except ConfigValidationError as e: title = title if title is not None else e.title + if e.desc: + desc = f"{e.desc}" if not desc else f"{e.desc}\n\n{desc}" # Re-generate a new error object with overrides err = e.from_error(e, title="", desc=desc, show_config=show_config) msg.fail(title)