Merge pull request #6503 from adrianeboyd/feature/lemmatizer-rule-warning-pos

Warn on empty POS for the rule-based lemmatizer
2025-12-13 13:14:32 +03:00 · 2020-12-09 11:34:16 +11:00 · 2020-12-09 11:34:16 +11:00 · dfaef27f90
commit dfaef27f90
parent cfc72c2995 d70950605c
3 changed files with 19 additions and 2 deletions
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -119,6 +119,10 @@ class Warnings:
            "call the {matcher} on each Doc object.")
    W107 = ("The property `Doc.{prop}` is deprecated. Use "
            "`Doc.has_annotation(\"{attr}\")` instead.")
    W108 = ("The rule-based lemmatizer did not find POS annotation for the "
            "token '{text}'. Check that your pipeline includes components that "
            "assign token.pos, typically 'tagger'+'attribute_ruler' or "
            "'morphologizer'.")
@add_codes
--- a/spacy/pipeline/lemmatizer.py
+++ b/spacy/pipeline/lemmatizer.py
@ -4,7 +4,7 @@ from thinc.api import Model
 from pathlib import Path
 from .pipe import Pipe
-from ..errors import Errors
+from ..errors import Errors, Warnings
 from ..language import Language
 from ..training import Example
 from ..lookups import Lookups, load_lookups
@ -197,6 +197,8 @@ class Lemmatizer(Pipe):
        string = token.text
        univ_pos = token.pos_.lower()
        if univ_pos in ("", "eol", "space"):
            if univ_pos == "":
                logger.warn(Warnings.W108.format(text=string))
            return [string.lower()]
        # See Issue #435 for example of where this logic is required.
        if self.is_base_form(token):
--- a/spacy/tests/pipeline/test_lemmatizer.py
+++ b/spacy/tests/pipeline/test_lemmatizer.py
@ -1,4 +1,6 @@
 import pytest
 import logging
 import mock
 from spacy import util, registry
 from spacy.lang.en import English
 from spacy.lookups import Lookups
@ -54,9 +56,18 @@ def test_lemmatizer_config(nlp):
    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
    nlp.initialize()
    # warning if no POS assigned
    doc = nlp.make_doc("coping")
    logger = logging.getLogger("spacy")
    with mock.patch.object(logger, "warn") as mock_warn:
        doc = lemmatizer(doc)
        mock_warn.assert_called_once()
    # works with POS
    doc = nlp.make_doc("coping")
    doc[0].pos_ = "VERB"
    assert doc[0].lemma_ == ""
    doc[0].pos_ = "VERB"
    doc = lemmatizer(doc)
    doc = lemmatizer(doc)
    assert doc[0].text == "coping"
    assert doc[0].lemma_ == "cope"