mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-02 21:36:36 +03:00
63f5951f8b
Add the `AttributeRuler` to handle exceptions for token-level attributes. The `AttributeRuler` uses `Matcher` patterns to identify target spans and applies the specified attributes to the token at the provided index in the matched span. A negative index can be used to index from the end of the matched span. The retokenizer is used to "merge" the individual tokens and assign them the provided attributes. Helper functions can import existing tag maps and morph rules to the corresponding `Matcher` patterns. There is an additional minor bug fix for `MORPH` attributes in the retokenizer to correctly normalize the values and to handle `MORPH` alongside `_` in an attrs dict.
123 lines
4.0 KiB
Python
123 lines
4.0 KiB
Python
import pytest
|
|
import numpy
|
|
from spacy.lang.en import English
|
|
from spacy.pipeline import AttributeRuler
|
|
from spacy import util
|
|
|
|
from ..util import get_doc, make_tempdir
|
|
|
|
|
|
@pytest.fixture
def nlp():
    """Provide a fresh blank English pipeline for each test."""
    pipeline = English()
    return pipeline
|
|
|
|
|
|
@pytest.fixture
def tag_map():
    """Minimal tag map: fine-grained punctuation tags mapped to POS + morph attrs."""
    punct = {"POS": "PUNCT"}
    return {
        ".": dict(punct, PunctType="peri"),
        ",": dict(punct, PunctType="comm"),
    }
|
|
|
|
|
|
@pytest.fixture
def morph_rules():
    """Morph rules keyed by tag, then by token text ("the" tagged DT)."""
    the_attrs = {"POS": "DET", "LEMMA": "a", "Case": "Nom"}
    return {"DT": {"the": the_attrs}}
|
|
|
|
|
|
def test_attributeruler_init(nlp):
    """Patterns added via `add` set the lemma and a normalized morph on matches."""
    # Direct construction from a vocab should work as well as add_pipe.
    ruler = AttributeRuler(nlp.vocab)

    ruler = nlp.add_pipe("attribute_ruler")
    ruler.add([[{"ORTH": "a"}]], {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"})
    ruler.add([[{"ORTH": "test"}]], {"LEMMA": "cat", "MORPH": "Number=Sing|Case=Nom"})
    # A second rule targets "test" with only a lemma; the assertions below pin
    # which rule's lemma ends up on the token.
    ruler.add([[{"ORTH": "test"}]], {"LEMMA": "dog"})

    doc = nlp("This is a test.")
    assert doc[2].lemma_ == "the"
    assert doc[2].morph_ == "Case=Nom|Number=Plur"
    assert doc[3].lemma_ == "cat"
    # MORPH values come back in normalized (sorted) feature order.
    assert doc[3].morph_ == "Case=Nom|Number=Sing"
|
|
|
|
|
|
def test_attributeruler_tag_map(nlp, tag_map):
    """Patterns imported from a tag map apply to tokens with matching tags."""
    ruler = AttributeRuler(nlp.vocab)
    ruler.load_from_tag_map(tag_map)
    words = ["This", "is", "a", "test", "."]
    tags = ["DT", "VBZ", "DT", "NN", "."]
    doc = ruler(get_doc(nlp.vocab, words=words, tags=tags))

    # Only the final token's tag (".") appears in the tag map; everything
    # else should be left untouched.
    for i, token in enumerate(doc):
        if i == 4:
            assert token.pos_ == "PUNCT"
            assert token.morph_ == "PunctType=peri"
        else:
            assert token.pos_ == ""
            assert token.morph_ == ""
|
|
|
|
|
|
def test_attributeruler_morph_rules(nlp, morph_rules):
    """Patterns imported from morph rules match on tag plus token text."""
    ruler = AttributeRuler(nlp.vocab)
    ruler.load_from_morph_rules(morph_rules)
    words = ["This", "is", "the", "test", "."]
    tags = ["DT", "VBZ", "DT", "NN", "."]
    doc = ruler(get_doc(nlp.vocab, words=words, tags=tags))

    # Only token 2 ("the" tagged DT) matches the DT/"the" rule; all other
    # tokens should be left untouched.
    for i, token in enumerate(doc):
        if i == 2:
            assert token.pos_ == "DET"
            assert token.lemma_ == "a"
            assert token.morph_ == "Case=Nom"
        else:
            assert token.pos_ == ""
            assert token.morph_ == ""
|
|
|
|
|
|
def test_attributeruler_indices(nlp):
    """`index` picks which token in a matched span receives the attrs;
    negative indices count back from the end of the span."""
    ruler = nlp.add_pipe("attribute_ruler")
    ruler.add(
        [[{"ORTH": "a"}, {"ORTH": "test"}]],
        {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
        index=0,
    )
    ruler.add(
        [[{"ORTH": "This"}, {"ORTH": "is"}]],
        {"LEMMA": "was", "MORPH": "Case=Nom|Number=Sing"},
        index=1,
    )
    ruler.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=-1)

    text = "This is a test."
    doc = nlp(text)

    for i, token in enumerate(doc):
        if i == 1:
            # "is": index=1 within the matched span "This is"
            assert token.lemma_ == "was"
            assert token.morph_ == "Case=Nom|Number=Sing"
        elif i == 2:
            # "a": index=0 within the matched span "a test"
            assert token.lemma_ == "the"
            assert token.morph_ == "Case=Nom|Number=Plur"
        elif i == 3:
            # "test": index=-1 within the matched span "a test"
            assert token.lemma_ == "cat"
        else:
            assert token.morph_ == ""

    # raises an error when trying to modify a token outside of the match
    ruler.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2)
    with pytest.raises(ValueError):
        doc = nlp(text)
|
|
|
|
def test_attributeruler_serialize(nlp):
    """Serialization roundtrips (bytes and disk) preserve patterns and output."""
    a = nlp.add_pipe("attribute_ruler")
    a.add(
        [[{"ORTH": "a"}, {"ORTH": "test"}]],
        {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
        index=0,
    )
    a.add(
        [[{"ORTH": "This"}, {"ORTH": "is"}]],
        {"LEMMA": "was", "MORPH": "Case=Nom|Number=Sing"},
        index=1,
    )
    a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=-1)

    text = "This is a test."
    attrs = ["ORTH", "LEMMA", "MORPH"]
    doc = nlp(text)

    # bytes roundtrip
    a_reloaded = AttributeRuler(nlp.vocab).from_bytes(a.to_bytes())
    assert a.to_bytes() == a_reloaded.to_bytes()
    doc1 = a_reloaded(nlp.make_doc(text))
    # Bug fix: the result of array_equal was previously discarded, so this
    # comparison never actually checked anything — assert it.
    assert numpy.array_equal(doc.to_array(attrs), doc1.to_array(attrs))

    # disk roundtrip
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(text)
        assert nlp2.get_pipe("attribute_ruler").to_bytes() == a.to_bytes()
        assert numpy.array_equal(doc.to_array(attrs), doc2.to_array(attrs))
|