Merge pull request #5462 from adrianeboyd/feature/lemmatizer-all-upos

Extend lemmatizer rules for all UPOS tags
2025-07-31 02:19:46 +03:00 · 2020-05-21 16:05:31 +02:00 · 2020-05-21 16:05:31 +02:00 · 26cd6a0229
commit 26cd6a0229
parent 1f572ce89b 8cba0e41d8
1 changed file with 11 additions and 11 deletions
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -6,6 +6,7 @@ from collections import OrderedDict
 from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN
 from .errors import Errors
 from .lookups import Lookups
+from .parts_of_speech import NAMES as UPOS_NAMES


 class Lemmatizer(object):
@@ -43,17 +44,11 @@ class Lemmatizer(object):
        lookup_table = self.lookups.get_table("lemma_lookup", {})
        if "lemma_rules" not in self.lookups:
            return [lookup_table.get(string, string)]
-        if univ_pos in (NOUN, "NOUN", "noun"):
-            univ_pos = "noun"
-        elif univ_pos in (VERB, "VERB", "verb"):
-            univ_pos = "verb"
-        elif univ_pos in (ADJ, "ADJ", "adj"):
-            univ_pos = "adj"
-        elif univ_pos in (PUNCT, "PUNCT", "punct"):
-            univ_pos = "punct"
-        elif univ_pos in (PROPN, "PROPN"):
-            return [string]
-        else:
+        if isinstance(univ_pos, int):
+            univ_pos = UPOS_NAMES.get(univ_pos, "X")
+        univ_pos = univ_pos.lower()
+
+        if univ_pos in ("", "eol", "space"):
            return [string.lower()]
        # See Issue #435 for example of where this logic is required.
        if self.is_base_form(univ_pos, morphology):
@@ -61,6 +56,11 @@ class Lemmatizer(object):
        index_table = self.lookups.get_table("lemma_index", {})
        exc_table = self.lookups.get_table("lemma_exc", {})
        rules_table = self.lookups.get_table("lemma_rules", {})
+        if not any((index_table.get(univ_pos), exc_table.get(univ_pos), rules_table.get(univ_pos))):
+            if univ_pos == "propn":
+                return [string]
+            else:
+                return [string.lower()]
        lemmas = self.lemmatize(
            string,
            index_table.get(univ_pos, {}),