2020-08-07 16:27:13 +03:00
|
|
|
from ...pipeline import Lemmatizer
|
|
|
|
from ...tokens import Token
|
2020-07-22 14:42:59 +03:00
|
|
|
|
|
|
|
|
2020-08-07 16:27:13 +03:00
|
|
|
class EnglishLemmatizer(Lemmatizer):
    """English lemmatizer. Only overrides is_base_form."""

    def is_base_form(self, token: Token) -> bool:
        """
        Decide whether a token is already in its uninflected (base) form, so
        that lemmatization can be skipped for it.

        The decision uses the token's lowercased coarse part-of-speech tag
        (`token.pos_`) and its morphological features
        (`token.morph.to_dict()`).

        token (Token): The token to inspect.
        RETURNS (bool): True if the token's features indicate a base form.
        """
        pos = token.pos_.lower()
        feats = token.morph.to_dict()
        verb_form = feats.get("VerbForm")
        number = feats.get("Number")

        # Checks that apply regardless of part-of-speech: infinitives and
        # positive-degree forms always count as base forms. These also cover
        # the POS-specific "verb + Inf" and "adj + Pos" cases.
        # NOTE(review): "None" below is the *string* "None", not the None
        # object -- a missing VerbForm feature does not match. Confirm this
        # is intended before changing it.
        if verb_form in ("Inf", "None") or feats.get("Degree") == "Pos":
            return True

        # Singular nouns are uninflected.
        if pos == "noun":
            return number == "Sing"

        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
        # morphology
        if pos == "verb":
            return (
                verb_form == "Fin"
                and feats.get("Tense") == "Pres"
                and number is None
            )

        return False
|