Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 16:07:41 +03:00)
	Update lexeme_norm checks
* Add util method for check
* Add new languages to list with lexeme norm tables
* Add check to all relevant components
* Add config details to warning message

Note that we're not actually inspecting the model config to see if `NORM` is used as an attribute, so it may warn in cases where it's not relevant.
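For context: the check looks for a `lexeme_norm` table in `vocab.lookups`, which the `[initialize.lookups]` block recommended by the updated warning text (see the diff below) loads from spacy-lookups-data. A minimal sketch of doing the same thing programmatically, assuming spacy and spacy-lookups-data are installed; `load_lookups` is the helper wrapped by the registered `spacy.LookupsDataLoader.v1` function:

# Minimal sketch, assuming spacy and spacy-lookups-data are installed.
# Loads the lexeme_norm table that the new check looks for.
import spacy
from spacy.lookups import load_lookups

nlp = spacy.blank("da")  # Danish is one of the languages with a table
lookups = load_lookups(lang=nlp.lang, tables=["lexeme_norm"])
nlp.vocab.lookups.add_table("lexeme_norm", lookups.get_table("lexeme_norm"))
# With a non-empty table present, the W033 debug message is not emitted.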
parent 34e13c1161
commit 39153ef90f
@@ -73,8 +73,13 @@ class Warnings:
             "degree. If this is intentional or the language you're using "
             "doesn't have a normalization table, please ignore this warning. "
             "If this is surprising, make sure you have the spacy-lookups-data "
-            "package installed. The languages with lexeme normalization tables "
-            "are currently: {langs}")
+            "package installed and load the table in your config. The "
+            "languages with lexeme normalization tables are currently: "
+            "{langs}\n\nLoad the table in your config with:\n\n"
+            "[initialize.lookups]\n"
+            "@misc = \"spacy.LookupsDataLoader.v1\"\n"
+            "lang = ${{nlp.lang}}\n"
+            "tables = [\"lexeme_norm\"]\n")
     W035 = ('Discarding subpattern "{pattern}" due to an unrecognized '
             "attribute or operator.")
 
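A note on the escaping in the new message: `{langs}` is a live `str.format` field, while the doubled braces in `${{nlp.lang}}` render as the literal `${nlp.lang}` that the config system interpolates later. A quick illustration:

# Illustration: {{...}} survives str.format() as literal braces, so the
# rendered warning tells users to write ${nlp.lang} in their config.
template = (
    "[initialize.lookups]\n"
    "@misc = \"spacy.LookupsDataLoader.v1\"\n"
    "lang = ${{nlp.lang}}\n"
    "tables = [\"lexeme_norm\"]\n"
)
print(template.format(), end="")
# [initialize.lookups]
# @misc = "spacy.LookupsDataLoader.v1"
# lang = ${nlp.lang}
# tables = ["lexeme_norm"]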
@@ -137,6 +137,7 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#initialize
         """
         validate_get_examples(get_examples, "Morphologizer.initialize")
+        util.check_lexeme_norms(self.vocab, "morphologizer")
         if labels is not None:
             self.cfg["labels_morph"] = labels["morph"]
             self.cfg["labels_pos"] = labels["pos"]
@@ -138,6 +138,7 @@ class SentenceRecognizer(Tagger):
         DOCS: https://spacy.io/api/sentencerecognizer#initialize
         """
         validate_get_examples(get_examples, "SentenceRecognizer.initialize")
+        util.check_lexeme_norms(self.vocab, "senter")
         doc_sample = []
         label_sample = []
         assert self.labels, Errors.E924.format(name=self.name)
@@ -249,6 +249,7 @@ class Tagger(TrainablePipe):
         DOCS: https://spacy.io/api/tagger#initialize
         """
         validate_get_examples(get_examples, "Tagger.initialize")
+        util.check_lexeme_norms(self.vocab, "tagger")
         if labels is not None:
             for tag in labels:
                 self.add_label(tag)
@@ -493,10 +493,7 @@ cdef class Parser(TrainablePipe):
 
     def initialize(self, get_examples, nlp=None, labels=None):
         validate_get_examples(get_examples, "Parser.initialize")
-        lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
-        if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
-            langs = ", ".join(util.LEXEME_NORM_LANGS)
-            util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs))
+        util.check_lexeme_norms(self.vocab, "parser or NER")
         if labels is not None:
             actions = dict(labels)
         else:
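One detail carried over from the old inline check: W033 is emitted via `logger.debug` on the "spacy" logger rather than `warnings.warn`, so it stays silent unless debug logging is enabled. A minimal sketch:

# Minimal sketch: surface the W033 message by enabling debug logging
# on the "spacy" logger before initializing the pipeline.
import logging

logging.getLogger("spacy").setLevel(logging.DEBUG)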
@@ -59,7 +59,7 @@ if TYPE_CHECKING:
 
 OOV_RANK = numpy.iinfo(numpy.uint64).max
 DEFAULT_OOV_PROB = -20
-LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
+LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
 
 # Default order of sections in the config.cfg. Not all sections needs to exist,
 # and additional sections are added at the end, in alphabetical order.
@@ -70,7 +70,9 @@ CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "co
 
 logger = logging.getLogger("spacy")
 logger_stream_handler = logging.StreamHandler()
-logger_stream_handler.setFormatter(logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s"))
+logger_stream_handler.setFormatter(
+    logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s")
+)
 logger.addHandler(logger_stream_handler)
 
 
@@ -1454,10 +1456,13 @@ def is_cython_func(func: Callable) -> bool:
     if hasattr(func, attr):  # function or class instance
         return True
     # https://stackoverflow.com/a/55767059
-    if hasattr(func, "__qualname__") and hasattr(func, "__module__") \
-        and func.__module__ in sys.modules:  # method
-            cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
-            return hasattr(cls_func, attr)
+    if (
+        hasattr(func, "__qualname__")
+        and hasattr(func, "__module__")
+        and func.__module__ in sys.modules
+    ):  # method
+        cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
+        return hasattr(cls_func, attr)
     return False
 
 
@@ -1508,7 +1513,16 @@ def warn_if_jupyter_cupy():
     """
     if is_in_jupyter():
         from thinc.backends.cupy_ops import CupyOps
+
         if CupyOps.xp is not None:
             from thinc.backends import contextvars_eq_thread_ops
+
             if not contextvars_eq_thread_ops():
                 warnings.warn(Warnings.W111)
+
+
+def check_lexeme_norms(vocab, component_name):
+    lexeme_norms = vocab.lookups.get_table("lexeme_norm", {})
+    if len(lexeme_norms) == 0 and vocab.lang in LEXEME_NORM_LANGS:
+        langs = ", ".join(LEXEME_NORM_LANGS)
+        logger.debug(Warnings.W033.format(model=component_name, langs=langs))
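Because `check_lexeme_norms` only needs a vocab and a component name, custom components can reuse it the same way the built-in ones above do. A hypothetical sketch (`MyPipe` and "my_pipe" are made-up names, not part of this commit):

# Hypothetical sketch: a custom component reusing the new helper in its
# initialize(), mirroring the tagger/morphologizer/senter pattern above.
from spacy import util
from spacy.training import validate_get_examples


class MyPipe:
    def __init__(self, vocab):
        self.vocab = vocab

    def initialize(self, get_examples, nlp=None):
        validate_get_examples(get_examples, "MyPipe.initialize")
        util.check_lexeme_norms(self.vocab, "my_pipe")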