Merge pull request #6503 from adrianeboyd/feature/lemmatizer-rule-warning-pos

Warn on empty POS for the rule-based lemmatizer
This commit is contained in:
Ines Montani 2020-12-09 11:34:16 +11:00 committed by GitHub
commit dfaef27f90
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 19 additions and 2 deletions

View File

@ -119,6 +119,10 @@ class Warnings:
"call the {matcher} on each Doc object.")
W107 = ("The property `Doc.{prop}` is deprecated. Use "
"`Doc.has_annotation(\"{attr}\")` instead.")
W108 = ("The rule-based lemmatizer did not find POS annotation for the "
"token '{text}'. Check that your pipeline includes components that "
"assign token.pos, typically 'tagger'+'attribute_ruler' or "
"'morphologizer'.")
@add_codes

View File

@ -4,7 +4,7 @@ from thinc.api import Model
from pathlib import Path
from .pipe import Pipe
from ..errors import Errors
from ..errors import Errors, Warnings
from ..language import Language
from ..training import Example
from ..lookups import Lookups, load_lookups
@ -197,6 +197,8 @@ class Lemmatizer(Pipe):
string = token.text
univ_pos = token.pos_.lower()
if univ_pos in ("", "eol", "space"):
if univ_pos == "":
logger.warn(Warnings.W108.format(text=string))
return [string.lower()]
# See Issue #435 for example of where this logic is requied.
if self.is_base_form(token):

View File

@ -1,4 +1,6 @@
import pytest
import logging
import mock
from spacy import util, registry
from spacy.lang.en import English
from spacy.lookups import Lookups
@ -54,9 +56,18 @@ def test_lemmatizer_config(nlp):
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
nlp.initialize()
# warning if no POS assigned
doc = nlp.make_doc("coping")
logger = logging.getLogger("spacy")
with mock.patch.object(logger, "warn") as mock_warn:
doc = lemmatizer(doc)
mock_warn.assert_called_once()
# works with POS
doc = nlp.make_doc("coping")
doc[0].pos_ = "VERB"
assert doc[0].lemma_ == ""
doc[0].pos_ = "VERB"
doc = lemmatizer(doc)
doc = lemmatizer(doc)
assert doc[0].text == "coping"
assert doc[0].lemma_ == "cope"