diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 33908eecf..1f0f0da3f 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -6,6 +6,7 @@ from collections import OrderedDict from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN from .errors import Errors from .lookups import Lookups +from .parts_of_speech import NAMES as UPOS_NAMES class Lemmatizer(object): @@ -43,17 +44,11 @@ class Lemmatizer(object): lookup_table = self.lookups.get_table("lemma_lookup", {}) if "lemma_rules" not in self.lookups: return [lookup_table.get(string, string)] - if univ_pos in (NOUN, "NOUN", "noun"): - univ_pos = "noun" - elif univ_pos in (VERB, "VERB", "verb"): - univ_pos = "verb" - elif univ_pos in (ADJ, "ADJ", "adj"): - univ_pos = "adj" - elif univ_pos in (PUNCT, "PUNCT", "punct"): - univ_pos = "punct" - elif univ_pos in (PROPN, "PROPN"): - return [string] - else: + if isinstance(univ_pos, int): + univ_pos = UPOS_NAMES.get(univ_pos, "X") + univ_pos = univ_pos.lower() + + if univ_pos in ("", "eol", "space"): return [string.lower()] # See Issue #435 for example of where this logic is requied. if self.is_base_form(univ_pos, morphology): @@ -61,6 +56,11 @@ class Lemmatizer(object): index_table = self.lookups.get_table("lemma_index", {}) exc_table = self.lookups.get_table("lemma_exc", {}) rules_table = self.lookups.get_table("lemma_rules", {}) + if not any((index_table.get(univ_pos), exc_table.get(univ_pos), rules_table.get(univ_pos))): + if univ_pos == "propn": + return [string] + else: + return [string.lower()] lemmas = self.lemmatize( string, index_table.get(univ_pos, {}),