Merge pull request #5462 from adrianeboyd/feature/lemmatizer-all-upos

Extend lemmatizer rules for all UPOS tags
This commit is contained in:
Matthew Honnibal 2020-05-21 16:05:31 +02:00 committed by GitHub
commit 26cd6a0229
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -6,6 +6,7 @@ from collections import OrderedDict
from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN
from .errors import Errors from .errors import Errors
from .lookups import Lookups from .lookups import Lookups
from .parts_of_speech import NAMES as UPOS_NAMES
class Lemmatizer(object): class Lemmatizer(object):
@ -43,17 +44,11 @@ class Lemmatizer(object):
lookup_table = self.lookups.get_table("lemma_lookup", {}) lookup_table = self.lookups.get_table("lemma_lookup", {})
if "lemma_rules" not in self.lookups: if "lemma_rules" not in self.lookups:
return [lookup_table.get(string, string)] return [lookup_table.get(string, string)]
if univ_pos in (NOUN, "NOUN", "noun"): if isinstance(univ_pos, int):
univ_pos = "noun" univ_pos = UPOS_NAMES.get(univ_pos, "X")
elif univ_pos in (VERB, "VERB", "verb"): univ_pos = univ_pos.lower()
univ_pos = "verb"
elif univ_pos in (ADJ, "ADJ", "adj"): if univ_pos in ("", "eol", "space"):
univ_pos = "adj"
elif univ_pos in (PUNCT, "PUNCT", "punct"):
univ_pos = "punct"
elif univ_pos in (PROPN, "PROPN"):
return [string]
else:
return [string.lower()] return [string.lower()]
# See Issue #435 for example of where this logic is required. # See Issue #435 for example of where this logic is required.
if self.is_base_form(univ_pos, morphology): if self.is_base_form(univ_pos, morphology):
@ -61,6 +56,11 @@ class Lemmatizer(object):
index_table = self.lookups.get_table("lemma_index", {}) index_table = self.lookups.get_table("lemma_index", {})
exc_table = self.lookups.get_table("lemma_exc", {}) exc_table = self.lookups.get_table("lemma_exc", {})
rules_table = self.lookups.get_table("lemma_rules", {}) rules_table = self.lookups.get_table("lemma_rules", {})
if not any((index_table.get(univ_pos), exc_table.get(univ_pos), rules_table.get(univ_pos))):
if univ_pos == "propn":
return [string]
else:
return [string.lower()]
lemmas = self.lemmatize( lemmas = self.lemmatize(
string, string,
index_table.get(univ_pos, {}), index_table.get(univ_pos, {}),