mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Warn on empty POS for the rule-based lemmatizer
Add a warning to the rule-based lemmatizer for any tokens without POS annotation.
This commit is contained in:
parent
63f83e7034
commit
d70950605c
|
@ -119,6 +119,10 @@ class Warnings:
|
|||
"call the {matcher} on each Doc object.")
|
||||
W107 = ("The property `Doc.{prop}` is deprecated. Use "
|
||||
"`Doc.has_annotation(\"{attr}\")` instead.")
|
||||
W108 = ("The rule-based lemmatizer did not find POS annotation for the "
|
||||
"token '{text}'. Check that your pipeline includes components that "
|
||||
"assign token.pos, typically 'tagger'+'attribute_ruler' or "
|
||||
"'morphologizer'.")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
|
|
@ -4,7 +4,7 @@ from thinc.api import Model
|
|||
from pathlib import Path
|
||||
|
||||
from .pipe import Pipe
|
||||
from ..errors import Errors
|
||||
from ..errors import Errors, Warnings
|
||||
from ..language import Language
|
||||
from ..training import Example
|
||||
from ..lookups import Lookups, load_lookups
|
||||
|
@ -197,6 +197,8 @@ class Lemmatizer(Pipe):
|
|||
string = token.text
|
||||
univ_pos = token.pos_.lower()
|
||||
if univ_pos in ("", "eol", "space"):
|
||||
if univ_pos == "":
|
||||
logger.warn(Warnings.W108.format(text=string))
|
||||
return [string.lower()]
|
||||
# See Issue #435 for example of where this logic is required.
|
||||
if self.is_base_form(token):
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
import pytest
|
||||
import logging
|
||||
import mock
|
||||
from spacy import util, registry
|
||||
from spacy.lang.en import English
|
||||
from spacy.lookups import Lookups
|
||||
|
@ -54,9 +56,18 @@ def test_lemmatizer_config(nlp):
|
|||
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
|
||||
nlp.initialize()
|
||||
|
||||
# warning if no POS assigned
|
||||
doc = nlp.make_doc("coping")
|
||||
logger = logging.getLogger("spacy")
|
||||
with mock.patch.object(logger, "warn") as mock_warn:
|
||||
doc = lemmatizer(doc)
|
||||
mock_warn.assert_called_once()
|
||||
|
||||
# works with POS
|
||||
doc = nlp.make_doc("coping")
|
||||
doc[0].pos_ = "VERB"
|
||||
assert doc[0].lemma_ == ""
|
||||
doc[0].pos_ = "VERB"
|
||||
doc = lemmatizer(doc)
|
||||
doc = lemmatizer(doc)
|
||||
assert doc[0].text == "coping"
|
||||
assert doc[0].lemma_ == "cope"
|
||||
|
|
Loading…
Reference in New Issue
Block a user