mirror of https://github.com/explosion/spaCy.git
Merge pull request #7495 from adrianeboyd/bugfix/norm-ux
Update lexeme_norm checks
commit d545ab4ca4
spacy/errors.py
@@ -73,8 +73,13 @@ class Warnings:
             "degree. If this is intentional or the language you're using "
             "doesn't have a normalization table, please ignore this warning. "
             "If this is surprising, make sure you have the spacy-lookups-data "
-            "package installed. The languages with lexeme normalization tables "
-            "are currently: {langs}")
+            "package installed and load the table in your config. The "
+            "languages with lexeme normalization tables are currently: "
+            "{langs}\n\nLoad the table in your config with:\n\n"
+            "[initialize.lookups]\n"
+            "@misc = \"spacy.LookupsDataLoader.v1\"\n"
+            "lang = ${{nlp.lang}}\n"
+            "tables = [\"lexeme_norm\"]\n")
     W035 = ('Discarding subpattern "{pattern}" due to an unrecognized '
             "attribute or operator.")
 
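The [initialize.lookups] block that the updated warning suggests is resolved when nlp.initialize() runs. A minimal sketch of the same effect done by hand, assuming spaCy v3 and the spacy-lookups-data package are installed:

import spacy
from spacy.lookups import load_lookups

nlp = spacy.blank("en")
# load_lookups fetches the named tables for a language from spacy-lookups-data
lookups = load_lookups(lang=nlp.lang, tables=["lexeme_norm"])
nlp.vocab.lookups.add_table("lexeme_norm", lookups.get_table("lexeme_norm"))
assert nlp.vocab.lookups.has_table("lexeme_norm")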
spacy/pipeline/morphologizer.pyx
@@ -137,6 +137,7 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#initialize
         """
         validate_get_examples(get_examples, "Morphologizer.initialize")
+        util.check_lexeme_norms(self.vocab, "morphologizer")
         if labels is not None:
             self.cfg["labels_morph"] = labels["morph"]
             self.cfg["labels_pos"] = labels["pos"]
spacy/pipeline/senter.pyx
@@ -138,6 +138,7 @@ class SentenceRecognizer(Tagger):
         DOCS: https://spacy.io/api/sentencerecognizer#initialize
         """
         validate_get_examples(get_examples, "SentenceRecognizer.initialize")
+        util.check_lexeme_norms(self.vocab, "senter")
         doc_sample = []
         label_sample = []
         assert self.labels, Errors.E924.format(name=self.name)
spacy/pipeline/tagger.pyx
@@ -249,6 +249,7 @@ class Tagger(TrainablePipe):
         DOCS: https://spacy.io/api/tagger#initialize
         """
         validate_get_examples(get_examples, "Tagger.initialize")
+        util.check_lexeme_norms(self.vocab, "tagger")
         if labels is not None:
             for tag in labels:
                 self.add_label(tag)
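The same one-line guard now runs in the morphologizer, senter, and tagger initializers. A minimal sketch of the check firing, assuming this change is applied and no lookups tables are loaded:

import logging
import spacy
from spacy import util

logging.getLogger("spacy").setLevel(logging.DEBUG)  # W033 is logged at DEBUG
nlp = spacy.blank("en")  # "en" is in LEXEME_NORM_LANGS
util.check_lexeme_norms(nlp.vocab, "tagger")  # no lexeme_norm table -> logs W033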
spacy/pipeline/transition_parser.pyx
@@ -493,10 +493,7 @@ cdef class Parser(TrainablePipe):
 
     def initialize(self, get_examples, nlp=None, labels=None):
         validate_get_examples(get_examples, "Parser.initialize")
-        lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
-        if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
-            langs = ", ".join(util.LEXEME_NORM_LANGS)
-            util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs))
+        util.check_lexeme_norms(self.vocab, "parser or NER")
         if labels is not None:
             actions = dict(labels)
         else:
spacy/util.py
@@ -59,7 +59,7 @@ if TYPE_CHECKING:
 
 OOV_RANK = numpy.iinfo(numpy.uint64).max
 DEFAULT_OOV_PROB = -20
-LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
+LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
 
 # Default order of sections in the config.cfg. Not all sections needs to exist,
 # and additional sections are added at the end, in alphabetical order.
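"cs" and "mk" are the new entries. A hedged sketch to confirm that the installed spacy-lookups-data version actually ships lexeme_norm tables for them:

from spacy.lookups import load_lookups

for lang in ("cs", "mk"):
    # strict=False skips missing tables instead of raising an error
    lookups = load_lookups(lang, ["lexeme_norm"], strict=False)
    print(lang, lookups.has_table("lexeme_norm"))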
spacy/util.py
@@ -70,7 +70,9 @@ CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "co
 
 logger = logging.getLogger("spacy")
 logger_stream_handler = logging.StreamHandler()
-logger_stream_handler.setFormatter(logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s"))
+logger_stream_handler.setFormatter(
+    logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s")
+)
 logger.addHandler(logger_stream_handler)
 
 
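With this handler attached, spaCy's log records print like "[2021-03-22 12:00:00,000] [DEBUG] message". The logger sets no level of its own, so it inherits WARNING from the root logger; to surface the DEBUG-level W033 message from the new check, lower it explicitly:

import logging

logging.getLogger("spacy").setLevel(logging.DEBUG)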
spacy/util.py
@@ -1454,10 +1456,13 @@ def is_cython_func(func: Callable) -> bool:
     if hasattr(func, attr):  # function or class instance
         return True
     # https://stackoverflow.com/a/55767059
-    if hasattr(func, "__qualname__") and hasattr(func, "__module__") \
-            and func.__module__ in sys.modules:  # method
-        cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
-        return hasattr(cls_func, attr)
+    if (
+        hasattr(func, "__qualname__")
+        and hasattr(func, "__module__")
+        and func.__module__ in sys.modules
+    ):  # method
+        cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
+        return hasattr(cls_func, attr)
     return False
 
 
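A small self-contained illustration of the lookup in the rewritten condition: a method's __qualname__ prefix names its defining class, which can be fetched from the function's module. The Pipe and predict names are made up for illustration:

import sys

class Pipe:
    def predict(self, docs):
        return docs

func = Pipe.predict
module = sys.modules[func.__module__]  # the module where the class is defined
cls = vars(module)[func.__qualname__.split(".")[0]]  # "Pipe.predict" -> Pipe
assert cls is Pipe  # is_cython_func then probes this class for the Cython attr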
spacy/util.py
@@ -1508,9 +1513,16 @@ def warn_if_jupyter_cupy():
     """
     if is_in_jupyter():
         from thinc.backends.cupy_ops import CupyOps
 
         if CupyOps.xp is not None:
             from thinc.backends import contextvars_eq_thread_ops
 
             if not contextvars_eq_thread_ops():
                 warnings.warn(Warnings.W111)
+
+
+def check_lexeme_norms(vocab, component_name):
+    lexeme_norms = vocab.lookups.get_table("lexeme_norm", {})
+    if len(lexeme_norms) == 0 and vocab.lang in LEXEME_NORM_LANGS:
+        langs = ", ".join(LEXEME_NORM_LANGS)
+        logger.debug(Warnings.W033.format(model=component_name, langs=langs))
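A hedged usage sketch for the new helper with a tiny hand-made table, so it runs without spacy-lookups-data (the table entry is made up):

import spacy
from spacy import util

nlp = spacy.blank("en")
nlp.vocab.lookups.add_table("lexeme_norm", {"foo": "bar"})
util.check_lexeme_norms(nlp.vocab, "tagger")  # table present: stays silent

Centralizing the check here means the parser's former inline version and the new per-component calls all share one code path and one W033 message.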