mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
Update lexeme_norm checks
* Add util method for the check
* Add new languages to the list of languages with lexeme norm tables
* Add the check to all relevant components
* Add config details to the warning message

Note that we're not actually inspecting the model config to see if `NORM` is used as an attribute, so the warning may appear in cases where it's not relevant.
This commit is contained in:
parent
34e13c1161
commit
39153ef90f
|
@ -73,8 +73,13 @@ class Warnings:
|
|||
"degree. If this is intentional or the language you're using "
|
||||
"doesn't have a normalization table, please ignore this warning. "
|
||||
"If this is surprising, make sure you have the spacy-lookups-data "
|
||||
"package installed. The languages with lexeme normalization tables "
|
||||
"are currently: {langs}")
|
||||
"package installed and load the table in your config. The "
|
||||
"languages with lexeme normalization tables are currently: "
|
||||
"{langs}\n\nLoad the table in your config with:\n\n"
|
||||
"[initialize.lookups]\n"
|
||||
"@misc = \"spacy.LookupsDataLoader.v1\"\n"
|
||||
"lang = ${{nlp.lang}}\n"
|
||||
"tables = [\"lexeme_norm\"]\n")
|
||||
W035 = ('Discarding subpattern "{pattern}" due to an unrecognized '
|
||||
"attribute or operator.")
|
||||
|
||||
|
|
|
@ -137,6 +137,7 @@ class Morphologizer(Tagger):
|
|||
DOCS: https://spacy.io/api/morphologizer#initialize
|
||||
"""
|
||||
validate_get_examples(get_examples, "Morphologizer.initialize")
|
||||
util.check_lexeme_norms(self.vocab, "morphologizer")
|
||||
if labels is not None:
|
||||
self.cfg["labels_morph"] = labels["morph"]
|
||||
self.cfg["labels_pos"] = labels["pos"]
|
||||
|
|
|
@ -138,6 +138,7 @@ class SentenceRecognizer(Tagger):
|
|||
DOCS: https://spacy.io/api/sentencerecognizer#initialize
|
||||
"""
|
||||
validate_get_examples(get_examples, "SentenceRecognizer.initialize")
|
||||
util.check_lexeme_norms(self.vocab, "senter")
|
||||
doc_sample = []
|
||||
label_sample = []
|
||||
assert self.labels, Errors.E924.format(name=self.name)
|
||||
|
|
|
@ -249,6 +249,7 @@ class Tagger(TrainablePipe):
|
|||
DOCS: https://spacy.io/api/tagger#initialize
|
||||
"""
|
||||
validate_get_examples(get_examples, "Tagger.initialize")
|
||||
util.check_lexeme_norms(self.vocab, "tagger")
|
||||
if labels is not None:
|
||||
for tag in labels:
|
||||
self.add_label(tag)
|
||||
|
|
|
@ -493,10 +493,7 @@ cdef class Parser(TrainablePipe):
|
|||
|
||||
def initialize(self, get_examples, nlp=None, labels=None):
|
||||
validate_get_examples(get_examples, "Parser.initialize")
|
||||
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
|
||||
if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
|
||||
langs = ", ".join(util.LEXEME_NORM_LANGS)
|
||||
util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs))
|
||||
util.check_lexeme_norms(self.vocab, "parser or NER")
|
||||
if labels is not None:
|
||||
actions = dict(labels)
|
||||
else:
|
||||
|
|
|
@ -59,7 +59,7 @@ if TYPE_CHECKING:
|
|||
|
||||
OOV_RANK = numpy.iinfo(numpy.uint64).max
|
||||
DEFAULT_OOV_PROB = -20
|
||||
LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
|
||||
LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
|
||||
|
||||
# Default order of sections in the config.cfg. Not all sections needs to exist,
|
||||
# and additional sections are added at the end, in alphabetical order.
|
||||
|
@ -70,7 +70,9 @@ CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "co
|
|||
|
||||
logger = logging.getLogger("spacy")
|
||||
logger_stream_handler = logging.StreamHandler()
|
||||
logger_stream_handler.setFormatter(logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s"))
|
||||
logger_stream_handler.setFormatter(
|
||||
logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s")
|
||||
)
|
||||
logger.addHandler(logger_stream_handler)
|
||||
|
||||
|
||||
|
@ -1454,8 +1456,11 @@ def is_cython_func(func: Callable) -> bool:
|
|||
if hasattr(func, attr): # function or class instance
|
||||
return True
|
||||
# https://stackoverflow.com/a/55767059
|
||||
if hasattr(func, "__qualname__") and hasattr(func, "__module__") \
|
||||
and func.__module__ in sys.modules: # method
|
||||
if (
|
||||
hasattr(func, "__qualname__")
|
||||
and hasattr(func, "__module__")
|
||||
and func.__module__ in sys.modules
|
||||
): # method
|
||||
cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
|
||||
return hasattr(cls_func, attr)
|
||||
return False
|
||||
|
@ -1508,7 +1513,16 @@ def warn_if_jupyter_cupy():
|
|||
"""
|
||||
if is_in_jupyter():
|
||||
from thinc.backends.cupy_ops import CupyOps
|
||||
|
||||
if CupyOps.xp is not None:
|
||||
from thinc.backends import contextvars_eq_thread_ops
|
||||
|
||||
if not contextvars_eq_thread_ops():
|
||||
warnings.warn(Warnings.W111)
|
||||
|
||||
|
||||
def check_lexeme_norms(vocab, component_name):
    """Emit a debug-level warning if the vocab has no lexeme norm table
    even though the language is one for which a table is available.

    vocab: the shared Vocab whose lookups are inspected.
    component_name (str): name of the component performing the check,
        interpolated into the warning message.
    """
    norm_table = vocab.lookups.get_table("lexeme_norm", {})
    if vocab.lang in LEXEME_NORM_LANGS and not len(norm_table):
        message = Warnings.W033.format(
            model=component_name, langs=", ".join(LEXEME_NORM_LANGS)
        )
        logger.debug(message)
|
||||
|
|
Loading…
Reference in New Issue
Block a user