spaCy/spacy/tests/pipeline/test_attributeruler.py
Adriane Boyd 63f5951f8b Add AttributeRuler for token attribute exceptions
Add the `AttributeRuler` to handle exceptions for token-level
attributes. The `AttributeRuler` uses `Matcher` patterns to identify
target spans and applies the specified attributes to the token at the
provided index in the matched span. A negative index can be used to
index from the end of the matched span. The retokenizer is used to
"merge" the individual tokens and assign them the provided attributes.

Helper functions can import existing tag maps and morph rules to the
corresponding `Matcher` patterns.

There is an additional minor bug fix for `MORPH` attributes in the
retokenizer to correctly normalize the values and to handle `MORPH`
alongside `_` in an attrs dict.
2020-07-30 09:10:59 +02:00

123 lines
4.0 KiB
Python

import pytest
import numpy
from spacy.lang.en import English
from spacy.pipeline import AttributeRuler
from spacy import util
from ..util import get_doc, make_tempdir
@pytest.fixture
def nlp():
return English()
@pytest.fixture
def tag_map():
return {
".": {"POS": "PUNCT", "PunctType": "peri"},
",": {"POS": "PUNCT", "PunctType": "comm"},
}
@pytest.fixture
def morph_rules():
return {"DT": {"the": {"POS": "DET", "LEMMA": "a", "Case": "Nom"}}}
def test_attributeruler_init(nlp):
a = AttributeRuler(nlp.vocab)
a = nlp.add_pipe("attribute_ruler")
a.add([[{"ORTH": "a"}]], {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"})
a.add([[{"ORTH": "test"}]], {"LEMMA": "cat", "MORPH": "Number=Sing|Case=Nom"})
a.add([[{"ORTH": "test"}]], {"LEMMA": "dog"})
doc = nlp("This is a test.")
assert doc[2].lemma_ == "the"
assert doc[2].morph_ == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing"
def test_attributeruler_tag_map(nlp, tag_map):
a = AttributeRuler(nlp.vocab)
a.load_from_tag_map(tag_map)
doc = get_doc(nlp.vocab, words=["This", "is", "a", "test", "."], tags=["DT", "VBZ", "DT", "NN", "."])
doc = a(doc)
for i in range(len(doc)):
if i == 4:
assert doc[i].pos_ == "PUNCT"
assert doc[i].morph_ == "PunctType=peri"
else:
assert doc[i].pos_ == ""
assert doc[i].morph_ == ""
def test_attributeruler_morph_rules(nlp, morph_rules):
a = AttributeRuler(nlp.vocab)
a.load_from_morph_rules(morph_rules)
doc = get_doc(nlp.vocab, words=["This", "is", "the", "test", "."], tags=["DT", "VBZ", "DT", "NN", "."])
doc = a(doc)
for i in range(len(doc)):
if i != 2:
assert doc[i].pos_ == ""
assert doc[i].morph_ == ""
else:
assert doc[2].pos_ == "DET"
assert doc[2].lemma_ == "a"
assert doc[2].morph_ == "Case=Nom"
def test_attributeruler_indices(nlp):
a = nlp.add_pipe("attribute_ruler")
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"}, index=0)
a.add([[{"ORTH": "This"}, {"ORTH": "is"}]], {"LEMMA": "was", "MORPH": "Case=Nom|Number=Sing"}, index=1)
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=-1)
text = "This is a test."
doc = nlp(text)
for i in range(len(doc)):
if i == 1:
assert doc[i].lemma_ == "was"
assert doc[i].morph_ == "Case=Nom|Number=Sing"
elif i == 2:
assert doc[i].lemma_ == "the"
assert doc[i].morph_ == "Case=Nom|Number=Plur"
elif i == 3:
assert doc[i].lemma_ == "cat"
else:
assert doc[i].morph_ == ""
# raises an error when trying to modify a token outside of the match
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2)
with pytest.raises(ValueError):
doc = nlp(text)
def test_attributeruler_serialize(nlp):
a = nlp.add_pipe("attribute_ruler")
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"}, index=0)
a.add([[{"ORTH": "This"}, {"ORTH": "is"}]], {"LEMMA": "was", "MORPH": "Case=Nom|Number=Sing"}, index=1)
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=-1)
text = "This is a test."
attrs = ["ORTH", "LEMMA", "MORPH"]
doc = nlp(text)
# bytes roundtrip
a_reloaded = AttributeRuler(nlp.vocab).from_bytes(a.to_bytes())
assert a.to_bytes() == a_reloaded.to_bytes()
doc1 = a_reloaded(nlp.make_doc(text))
numpy.array_equal(doc.to_array(attrs), doc1.to_array(attrs))
# disk roundtrip
with make_tempdir() as tmp_dir:
nlp.to_disk(tmp_dir)
nlp2 = util.load_model_from_path(tmp_dir)
doc2 = nlp2(text)
assert nlp2.get_pipe("attribute_ruler").to_bytes() == a.to_bytes()
assert numpy.array_equal(doc.to_array(attrs), doc2.to_array(attrs))