Extend lemmatizer rules for all UPOS tags

This commit is contained in:
Adriane Boyd 2020-05-20 09:56:56 +02:00
parent 40e65d6f63
commit 4fa9670537

View File

@ -6,6 +6,7 @@ from collections import OrderedDict
from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN
from .errors import Errors
from .lookups import Lookups
from .parts_of_speech import NAMES as UPOS_NAMES
class Lemmatizer(object):
@ -43,17 +44,11 @@ class Lemmatizer(object):
lookup_table = self.lookups.get_table("lemma_lookup", {})
if "lemma_rules" not in self.lookups:
return [lookup_table.get(string, string)]
if univ_pos in (NOUN, "NOUN", "noun"):
univ_pos = "noun"
elif univ_pos in (VERB, "VERB", "verb"):
univ_pos = "verb"
elif univ_pos in (ADJ, "ADJ", "adj"):
univ_pos = "adj"
elif univ_pos in (PUNCT, "PUNCT", "punct"):
univ_pos = "punct"
elif univ_pos in (PROPN, "PROPN"):
return [string]
else:
if isinstance(univ_pos, int):
univ_pos = UPOS_NAMES.get(univ_pos, "X")
univ_pos = univ_pos.lower()
if univ_pos in ("", "eol", "space"):
return [string.lower()]
# See Issue #435 for example of where this logic is requied.
if self.is_base_form(univ_pos, morphology):