mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	Merge pull request #5462 from adrianeboyd/feature/lemmatizer-all-upos
Extend lemmatizer rules for all UPOS tags
This commit is contained in:
		
						commit
						26cd6a0229
					
				|  | @ -6,6 +6,7 @@ from collections import OrderedDict | |||
| from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN | ||||
| from .errors import Errors | ||||
| from .lookups import Lookups | ||||
| from .parts_of_speech import NAMES as UPOS_NAMES | ||||
| 
 | ||||
| 
 | ||||
| class Lemmatizer(object): | ||||
|  | @ -43,17 +44,11 @@ class Lemmatizer(object): | |||
|         lookup_table = self.lookups.get_table("lemma_lookup", {}) | ||||
|         if "lemma_rules" not in self.lookups: | ||||
|             return [lookup_table.get(string, string)] | ||||
|         if univ_pos in (NOUN, "NOUN", "noun"): | ||||
|             univ_pos = "noun" | ||||
|         elif univ_pos in (VERB, "VERB", "verb"): | ||||
|             univ_pos = "verb" | ||||
|         elif univ_pos in (ADJ, "ADJ", "adj"): | ||||
|             univ_pos = "adj" | ||||
|         elif univ_pos in (PUNCT, "PUNCT", "punct"): | ||||
|             univ_pos = "punct" | ||||
|         elif univ_pos in (PROPN, "PROPN"): | ||||
|             return [string] | ||||
|         else: | ||||
|         if isinstance(univ_pos, int): | ||||
|             univ_pos = UPOS_NAMES.get(univ_pos, "X") | ||||
|         univ_pos = univ_pos.lower() | ||||
| 
 | ||||
|         if univ_pos in ("", "eol", "space"): | ||||
|             return [string.lower()] | ||||
|         # See Issue #435 for an example of where this logic is required. | ||||
|         if self.is_base_form(univ_pos, morphology): | ||||
|  | @ -61,6 +56,11 @@ class Lemmatizer(object): | |||
|         index_table = self.lookups.get_table("lemma_index", {}) | ||||
|         exc_table = self.lookups.get_table("lemma_exc", {}) | ||||
|         rules_table = self.lookups.get_table("lemma_rules", {}) | ||||
|         if not any((index_table.get(univ_pos), exc_table.get(univ_pos), rules_table.get(univ_pos))): | ||||
|             if univ_pos == "propn": | ||||
|                 return [string] | ||||
|             else: | ||||
|                 return [string.lower()] | ||||
|         lemmas = self.lemmatize( | ||||
|             string, | ||||
|             index_table.get(univ_pos, {}), | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user