From 4fa96705379b10b761a7097b1adb12145402cb1f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 20 May 2020 09:56:56 +0200 Subject: [PATCH 1/2] Extend lemmatizer rules for all UPOS tags --- spacy/lemmatizer.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 33908eecf..a070574bb 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -6,6 +6,7 @@ from collections import OrderedDict from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN from .errors import Errors from .lookups import Lookups +from .parts_of_speech import NAMES as UPOS_NAMES class Lemmatizer(object): @@ -43,17 +44,11 @@ class Lemmatizer(object): lookup_table = self.lookups.get_table("lemma_lookup", {}) if "lemma_rules" not in self.lookups: return [lookup_table.get(string, string)] - if univ_pos in (NOUN, "NOUN", "noun"): - univ_pos = "noun" - elif univ_pos in (VERB, "VERB", "verb"): - univ_pos = "verb" - elif univ_pos in (ADJ, "ADJ", "adj"): - univ_pos = "adj" - elif univ_pos in (PUNCT, "PUNCT", "punct"): - univ_pos = "punct" - elif univ_pos in (PROPN, "PROPN"): - return [string] - else: + if isinstance(univ_pos, int): + univ_pos = UPOS_NAMES.get(univ_pos, "X") + univ_pos = univ_pos.lower() + + if univ_pos in ("", "eol", "space"): return [string.lower()] # See Issue #435 for example of where this logic is requied. if self.is_base_form(univ_pos, morphology): From 8cba0e41d8e2797763110e8dd1b3b2ec8a29e719 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 20 May 2020 15:35:08 +0200 Subject: [PATCH 2/2] Return lowercase form as default except for PROPN --- spacy/lemmatizer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index a070574bb..1f0f0da3f 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -56,6 +56,11 @@ class Lemmatizer(object): index_table = self.lookups.get_table("lemma_index", {}) exc_table = self.lookups.get_table("lemma_exc", {}) rules_table = self.lookups.get_table("lemma_rules", {}) + if not any((index_table.get(univ_pos), exc_table.get(univ_pos), rules_table.get(univ_pos))): + if univ_pos == "propn": + return [string] + else: + return [string.lower()] lemmas = self.lemmatize( string, index_table.get(univ_pos, {}),