From d70950605c8623816282733414d7bbaf6c84a1b5 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 4 Dec 2020 11:46:15 +0100 Subject: [PATCH] Warn on empty POS for the rule-based lemmatizer Add a warning to the rule-based lemmatizer for any tokens without POS annotation. --- spacy/errors.py | 4 ++++ spacy/pipeline/lemmatizer.py | 4 +++- spacy/tests/pipeline/test_lemmatizer.py | 13 ++++++++++++- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index c2bb36b93..bfc8777b7 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -119,6 +119,10 @@ class Warnings: "call the {matcher} on each Doc object.") W107 = ("The property `Doc.{prop}` is deprecated. Use " "`Doc.has_annotation(\"{attr}\")` instead.") + W108 = ("The rule-based lemmatizer did not find POS annotation for the " + "token '{text}'. Check that your pipeline includes components that " + "assign token.pos, typically 'tagger'+'attribute_ruler' or " + "'morphologizer'.") @add_codes diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index 9be596868..70a224c0b 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -4,7 +4,7 @@ from thinc.api import Model from pathlib import Path from .pipe import Pipe -from ..errors import Errors +from ..errors import Errors, Warnings from ..language import Language from ..training import Example from ..lookups import Lookups, load_lookups @@ -197,6 +197,8 @@ class Lemmatizer(Pipe): string = token.text univ_pos = token.pos_.lower() if univ_pos in ("", "eol", "space"): + if univ_pos == "": + logger.warn(Warnings.W108.format(text=string)) return [string.lower()] # See Issue #435 for example of where this logic is requied. if self.is_base_form(token): diff --git a/spacy/tests/pipeline/test_lemmatizer.py b/spacy/tests/pipeline/test_lemmatizer.py index d37c87059..5f6975f88 100644 --- a/spacy/tests/pipeline/test_lemmatizer.py +++ b/spacy/tests/pipeline/test_lemmatizer.py @@ -1,4 +1,6 @@ import pytest +import logging +import mock from spacy import util, registry from spacy.lang.en import English from spacy.lookups import Lookups @@ -54,9 +56,18 @@ def test_lemmatizer_config(nlp): lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"}) nlp.initialize() + # warning if no POS assigned + doc = nlp.make_doc("coping") + logger = logging.getLogger("spacy") + with mock.patch.object(logger, "warn") as mock_warn: + doc = lemmatizer(doc) + mock_warn.assert_called_once() + + # works with POS doc = nlp.make_doc("coping") - doc[0].pos_ = "VERB" assert doc[0].lemma_ == "" + doc[0].pos_ = "VERB" + doc = lemmatizer(doc) doc = lemmatizer(doc) assert doc[0].text == "coping" assert doc[0].lemma_ == "cope"