mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-03 13:14:11 +03:00
Merge pull request #6503 from adrianeboyd/feature/lemmatizer-rule-warning-pos
Warn on empty POS for the rule-based lemmatizer
This commit is contained in:
commit
dfaef27f90
|
@ -119,6 +119,10 @@ class Warnings:
|
||||||
"call the {matcher} on each Doc object.")
|
"call the {matcher} on each Doc object.")
|
||||||
W107 = ("The property `Doc.{prop}` is deprecated. Use "
|
W107 = ("The property `Doc.{prop}` is deprecated. Use "
|
||||||
"`Doc.has_annotation(\"{attr}\")` instead.")
|
"`Doc.has_annotation(\"{attr}\")` instead.")
|
||||||
|
W108 = ("The rule-based lemmatizer did not find POS annotation for the "
|
||||||
|
"token '{text}'. Check that your pipeline includes components that "
|
||||||
|
"assign token.pos, typically 'tagger'+'attribute_ruler' or "
|
||||||
|
"'morphologizer'.")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
|
|
@ -4,7 +4,7 @@ from thinc.api import Model
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from .pipe import Pipe
|
from .pipe import Pipe
|
||||||
from ..errors import Errors
|
from ..errors import Errors, Warnings
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..training import Example
|
from ..training import Example
|
||||||
from ..lookups import Lookups, load_lookups
|
from ..lookups import Lookups, load_lookups
|
||||||
|
@ -197,6 +197,8 @@ class Lemmatizer(Pipe):
|
||||||
string = token.text
|
string = token.text
|
||||||
univ_pos = token.pos_.lower()
|
univ_pos = token.pos_.lower()
|
||||||
if univ_pos in ("", "eol", "space"):
|
if univ_pos in ("", "eol", "space"):
|
||||||
|
if univ_pos == "":
|
||||||
|
logger.warn(Warnings.W108.format(text=string))
|
||||||
return [string.lower()]
|
return [string.lower()]
|
||||||
# See Issue #435 for example of where this logic is required.
|
# See Issue #435 for example of where this logic is required.
|
||||||
if self.is_base_form(token):
|
if self.is_base_form(token):
|
||||||
|
|
|
@ -1,4 +1,6 @@
|
||||||
import pytest
|
import pytest
|
||||||
|
import logging
|
||||||
|
import mock
|
||||||
from spacy import util, registry
|
from spacy import util, registry
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.lookups import Lookups
|
from spacy.lookups import Lookups
|
||||||
|
@ -54,9 +56,18 @@ def test_lemmatizer_config(nlp):
|
||||||
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
|
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
|
||||||
nlp.initialize()
|
nlp.initialize()
|
||||||
|
|
||||||
|
# warning if no POS assigned
|
||||||
|
doc = nlp.make_doc("coping")
|
||||||
|
logger = logging.getLogger("spacy")
|
||||||
|
with mock.patch.object(logger, "warn") as mock_warn:
|
||||||
|
doc = lemmatizer(doc)
|
||||||
|
mock_warn.assert_called_once()
|
||||||
|
|
||||||
|
# works with POS
|
||||||
doc = nlp.make_doc("coping")
|
doc = nlp.make_doc("coping")
|
||||||
doc[0].pos_ = "VERB"
|
|
||||||
assert doc[0].lemma_ == ""
|
assert doc[0].lemma_ == ""
|
||||||
|
doc[0].pos_ = "VERB"
|
||||||
|
doc = lemmatizer(doc)
|
||||||
doc = lemmatizer(doc)
|
doc = lemmatizer(doc)
|
||||||
assert doc[0].text == "coping"
|
assert doc[0].text == "coping"
|
||||||
assert doc[0].lemma_ == "cope"
|
assert doc[0].lemma_ == "cope"
|
||||||
|
|
Loading…
Reference in New Issue
Block a user