Extend lemmatizer rules for all UPOS tags

This commit is contained in:
Adriane Boyd 2020-05-20 09:56:56 +02:00
parent 40e65d6f63
commit 4fa9670537

View File

@@ -6,6 +6,7 @@ from collections import OrderedDict
from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN
from .errors import Errors from .errors import Errors
from .lookups import Lookups from .lookups import Lookups
from .parts_of_speech import NAMES as UPOS_NAMES
class Lemmatizer(object): class Lemmatizer(object):
@@ -43,17 +44,11 @@ class Lemmatizer(object):
lookup_table = self.lookups.get_table("lemma_lookup", {}) lookup_table = self.lookups.get_table("lemma_lookup", {})
if "lemma_rules" not in self.lookups: if "lemma_rules" not in self.lookups:
return [lookup_table.get(string, string)] return [lookup_table.get(string, string)]
if univ_pos in (NOUN, "NOUN", "noun"): if isinstance(univ_pos, int):
univ_pos = "noun" univ_pos = UPOS_NAMES.get(univ_pos, "X")
elif univ_pos in (VERB, "VERB", "verb"): univ_pos = univ_pos.lower()
univ_pos = "verb"
elif univ_pos in (ADJ, "ADJ", "adj"): if univ_pos in ("", "eol", "space"):
univ_pos = "adj"
elif univ_pos in (PUNCT, "PUNCT", "punct"):
univ_pos = "punct"
elif univ_pos in (PROPN, "PROPN"):
return [string]
else:
return [string.lower()] return [string.lower()]
# See Issue #435 for an example of where this logic is required. # See Issue #435 for an example of where this logic is required.
if self.is_base_form(univ_pos, morphology): if self.is_base_form(univ_pos, morphology):