Warn on empty POS for the rule-based lemmatizer

Add a warning to the rule-based lemmatizer for any tokens without POS
annotation.
This commit is contained in:
Adriane Boyd 2020-12-04 11:46:15 +01:00
parent 63f83e7034
commit d70950605c
3 changed files with 19 additions and 2 deletions

View File

@ -119,6 +119,10 @@ class Warnings:
"call the {matcher} on each Doc object.")
W107 = ("The property `Doc.{prop}` is deprecated. Use "
"`Doc.has_annotation(\"{attr}\")` instead.")
W108 = ("The rule-based lemmatizer did not find POS annotation for the "
"token '{text}'. Check that your pipeline includes components that "
"assign token.pos, typically 'tagger'+'attribute_ruler' or "
"'morphologizer'.")
@add_codes

View File

@ -4,7 +4,7 @@ from thinc.api import Model
from pathlib import Path
from .pipe import Pipe
from ..errors import Errors
from ..errors import Errors, Warnings
from ..language import Language
from ..training import Example
from ..lookups import Lookups, load_lookups
@ -197,6 +197,8 @@ class Lemmatizer(Pipe):
string = token.text
univ_pos = token.pos_.lower()
if univ_pos in ("", "eol", "space"):
if univ_pos == "":
logger.warn(Warnings.W108.format(text=string))
return [string.lower()]
# See Issue #435 for an example of where this logic is required.
if self.is_base_form(token):

View File

@ -1,4 +1,6 @@
import pytest
import logging
import mock
from spacy import util, registry
from spacy.lang.en import English
from spacy.lookups import Lookups
@ -54,9 +56,18 @@ def test_lemmatizer_config(nlp):
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
nlp.initialize()
# warning if no POS assigned
doc = nlp.make_doc("coping")
logger = logging.getLogger("spacy")
with mock.patch.object(logger, "warn") as mock_warn:
doc = lemmatizer(doc)
mock_warn.assert_called_once()
# works with POS
doc = nlp.make_doc("coping")
doc[0].pos_ = "VERB"
assert doc[0].lemma_ == ""
doc[0].pos_ = "VERB"
doc = lemmatizer(doc)
doc = lemmatizer(doc)
assert doc[0].text == "coping"
assert doc[0].lemma_ == "cope"