spaCy/spacy/tests/pipeline/test_attributeruler.py

import pytest
import numpy
from spacy.lang.en import English
from spacy.pipeline import AttributeRuler
from spacy import util

from ..util import get_doc, make_tempdir


@pytest.fixture
def nlp():
    """Return a newly constructed `English` pipeline object for a test."""
    pipeline = English()
    return pipeline


@pytest.fixture
def tag_map():
    """Return a small tag map covering the "." and "," tags.

    Each tag maps to a dict holding a "POS" value plus one "PunctType"
    feature.
    """
    period_attrs = {"POS": "PUNCT", "PunctType": "peri"}
    comma_attrs = {"POS": "PUNCT", "PunctType": "comm"}
    return {".": period_attrs, ",": comma_attrs}


@pytest.fixture
def morph_rules():
    """Return morph rules with a single entry: the word "the" under tag "DT"."""
    the_attrs = {"POS": "DET", "LEMMA": "a", "Case": "Nom"}
    return {"DT": {"the": the_attrs}}


def test_attributeruler_init(nlp):
    """Check direct construction and basic rule application in a pipeline.

    The expected values below also pin two details: the MORPH string given
    as "Number=Sing|Case=Nom" is read back as "Case=Nom|Number=Sing", and
    the later "dog" rule for "test" does not replace the "cat" lemma.
    """
    # Constructing the component directly from a vocab must not raise.
    AttributeRuler(nlp.vocab)

    ruler = nlp.add_pipe("attribute_ruler")
    rules = [
        ("a", {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"}),
        ("test", {"LEMMA": "cat", "MORPH": "Number=Sing|Case=Nom"}),
        ("test", {"LEMMA": "dog"}),
    ]
    for orth, attrs in rules:
        ruler.add([[{"ORTH": orth}]], attrs)

    doc = nlp("This is a test.")

    token_a = doc[2]
    assert token_a.lemma_ == "the"
    assert token_a.morph_ == "Case=Nom|Number=Plur"

    token_test = doc[3]
    assert token_test.lemma_ == "cat"
    assert token_test.morph_ == "Case=Nom|Number=Sing"


def test_attributeruler_tag_map(nlp, tag_map):
    """Check that rules loaded from a tag map only affect the matching tag.

    Only the final "." token has a tag present in the tag map, so it is the
    only token expected to receive a POS and morphology.
    """
    ruler = AttributeRuler(nlp.vocab)
    ruler.load_from_tag_map(tag_map)

    words = ["This", "is", "a", "test", "."]
    tags = ["DT", "VBZ", "DT", "NN", "."]
    doc = ruler(get_doc(nlp.vocab, words=words, tags=tags))

    for position, token in enumerate(doc):
        if position == 4:
            expected_pos, expected_morph = "PUNCT", "PunctType=peri"
        else:
            expected_pos, expected_morph = "", ""
        assert token.pos_ == expected_pos
        assert token.morph_ == expected_morph


def test_attributeruler_morph_rules(nlp, morph_rules):
    """Check that rules loaded from morph rules only affect the matching token.

    The first token is also tagged "DT" but its text is "This", so only the
    "the" token at position 2 is expected to change.
    """
    ruler = AttributeRuler(nlp.vocab)
    ruler.load_from_morph_rules(morph_rules)

    words = ["This", "is", "the", "test", "."]
    tags = ["DT", "VBZ", "DT", "NN", "."]
    doc = ruler(get_doc(nlp.vocab, words=words, tags=tags))

    for position, token in enumerate(doc):
        if position == 2:
            assert token.pos_ == "DET"
            assert token.lemma_ == "a"
            assert token.morph_ == "Case=Nom"
            continue
        # Every other token must be left without POS or morphology.
        assert token.pos_ == ""
        assert token.morph_ == ""


def test_attributeruler_indices(nlp):
    """Check that `index` selects which token of a matched span is modified.

    Covers index 0, a positive index, and a negative index, then verifies
    that an index beyond the matched span raises `ValueError` when the
    pipeline is run.
    """

    def two_token_pattern(first, second):
        # Build a fresh pattern list per call so no list is shared between rules.
        return [[{"ORTH": first}, {"ORTH": second}]]

    ruler = nlp.add_pipe("attribute_ruler")
    ruler.add(
        two_token_pattern("a", "test"),
        {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
        index=0,
    )
    ruler.add(
        two_token_pattern("This", "is"),
        {"LEMMA": "was", "MORPH": "Case=Nom|Number=Sing"},
        index=1,
    )
    ruler.add(two_token_pattern("a", "test"), {"LEMMA": "cat"}, index=-1)

    text = "This is a test."
    doc = nlp(text)

    # Expected values keyed by token position; position 3 only has a lemma check.
    expected_lemmas = {1: "was", 2: "the", 3: "cat"}
    expected_morphs = {1: "Case=Nom|Number=Sing", 2: "Case=Nom|Number=Plur"}
    for position, token in enumerate(doc):
        if position not in expected_lemmas:
            assert token.morph_ == ""
            continue
        assert token.lemma_ == expected_lemmas[position]
        if position in expected_morphs:
            assert token.morph_ == expected_morphs[position]

    # raises an error when trying to modify a token outside of the match
    ruler.add(two_token_pattern("a", "test"), {"LEMMA": "cat"}, index=2)
    with pytest.raises(ValueError):
        nlp(text)

def test_attributeruler_serialize(nlp):
    """Check that the attribute ruler survives bytes and disk roundtrips.

    For both roundtrips the reloaded component must serialize to the same
    bytes as the original and must produce the same ORTH/LEMMA/MORPH values
    on the sample text.
    """
    a = nlp.add_pipe("attribute_ruler")
    a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"}, index=0)
    a.add([[{"ORTH": "This"}, {"ORTH": "is"}]], {"LEMMA": "was", "MORPH": "Case=Nom|Number=Sing"}, index=1)
    a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=-1)

    text = "This is a test."
    attrs = ["ORTH", "LEMMA", "MORPH"]
    # Reference output from the original pipeline, compared against below.
    doc = nlp(text)

    # bytes roundtrip
    a_reloaded = AttributeRuler(nlp.vocab).from_bytes(a.to_bytes())
    assert a.to_bytes() == a_reloaded.to_bytes()
    doc1 = a_reloaded(nlp.make_doc(text))
    # The comparison result was previously discarded; assert it so a
    # mismatch after the bytes roundtrip actually fails the test.
    assert numpy.array_equal(doc.to_array(attrs), doc1.to_array(attrs))

    # disk roundtrip
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(text)
        assert nlp2.get_pipe("attribute_ruler").to_bytes() == a.to_bytes()
        assert numpy.array_equal(doc.to_array(attrs), doc2.to_array(attrs))
Add AttributeRuler for token attribute exceptions

Add the `AttributeRuler` to handle exceptions for token-level attributes. The `AttributeRuler` uses `Matcher` patterns to identify target spans and applies the specified attributes to the token at the provided index in the matched span. A negative index can be used to index from the end of the matched span. The retokenizer is used to "merge" the individual tokens and assign them the provided attributes.

Helper functions can import existing tag maps and morph rules to the corresponding `Matcher` patterns.

There is an additional minor bug fix for `MORPH` attributes in the retokenizer to correctly normalize the values and to handle `MORPH` alongside `_` in an attrs dict.

2020-07-30 10:00:01 +03:00			`import pytest`
			`import numpy`
			`from spacy.lang.en import English`
			`from spacy.pipeline import AttributeRuler`
			`from spacy import util`

			`from ..util import get_doc, make_tempdir`


			`@pytest.fixture`
			`def nlp():`
			`return English()`


			`@pytest.fixture`
			`def tag_map():`
			`return {`
			`".": {"POS": "PUNCT", "PunctType": "peri"},`
			`",": {"POS": "PUNCT", "PunctType": "comm"},`
			`}`


			`@pytest.fixture`
			`def morph_rules():`
			`return {"DT": {"the": {"POS": "DET", "LEMMA": "a", "Case": "Nom"}}}`


			`def test_attributeruler_init(nlp):`
			`a = AttributeRuler(nlp.vocab)`

			`a = nlp.add_pipe("attribute_ruler")`
			`a.add([[{"ORTH": "a"}]], {"LEMMA": "the", "MORPH": "Case=Nom\|Number=Plur"})`
			`a.add([[{"ORTH": "test"}]], {"LEMMA": "cat", "MORPH": "Number=Sing\|Case=Nom"})`
			`a.add([[{"ORTH": "test"}]], {"LEMMA": "dog"})`

			`doc = nlp("This is a test.")`
			`assert doc[2].lemma_ == "the"`
			`assert doc[2].morph_ == "Case=Nom\|Number=Plur"`
			`assert doc[3].lemma_ == "cat"`
			`assert doc[3].morph_ == "Case=Nom\|Number=Sing"`


			`def test_attributeruler_tag_map(nlp, tag_map):`
			`a = AttributeRuler(nlp.vocab)`
			`a.load_from_tag_map(tag_map)`
			`doc = get_doc(nlp.vocab, words=["This", "is", "a", "test", "."], tags=["DT", "VBZ", "DT", "NN", "."])`
			`doc = a(doc)`

			`for i in range(len(doc)):`
			`if i == 4:`
			`assert doc[i].pos_ == "PUNCT"`
			`assert doc[i].morph_ == "PunctType=peri"`
			`else:`
			`assert doc[i].pos_ == ""`
			`assert doc[i].morph_ == ""`


			`def test_attributeruler_morph_rules(nlp, morph_rules):`
			`a = AttributeRuler(nlp.vocab)`
			`a.load_from_morph_rules(morph_rules)`
			`doc = get_doc(nlp.vocab, words=["This", "is", "the", "test", "."], tags=["DT", "VBZ", "DT", "NN", "."])`
			`doc = a(doc)`

			`for i in range(len(doc)):`
			`if i != 2:`
			`assert doc[i].pos_ == ""`
			`assert doc[i].morph_ == ""`
			`else:`
			`assert doc[2].pos_ == "DET"`
			`assert doc[2].lemma_ == "a"`
			`assert doc[2].morph_ == "Case=Nom"`


			`def test_attributeruler_indices(nlp):`
			`a = nlp.add_pipe("attribute_ruler")`
			`a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "the", "MORPH": "Case=Nom\|Number=Plur"}, index=0)`
			`a.add([[{"ORTH": "This"}, {"ORTH": "is"}]], {"LEMMA": "was", "MORPH": "Case=Nom\|Number=Sing"}, index=1)`
			`a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=-1)`

			`text = "This is a test."`
			`doc = nlp(text)`

			`for i in range(len(doc)):`
			`if i == 1:`
			`assert doc[i].lemma_ == "was"`
			`assert doc[i].morph_ == "Case=Nom\|Number=Sing"`
			`elif i == 2:`
			`assert doc[i].lemma_ == "the"`
			`assert doc[i].morph_ == "Case=Nom\|Number=Plur"`
			`elif i == 3:`
			`assert doc[i].lemma_ == "cat"`
			`else:`
			`assert doc[i].morph_ == ""`

			`# raises an error when trying to modify a token outside of the match`
			`a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2)`
			`with pytest.raises(ValueError):`
			`doc = nlp(text)`

			`def test_attributeruler_serialize(nlp):`
			`a = nlp.add_pipe("attribute_ruler")`
			`a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "the", "MORPH": "Case=Nom\|Number=Plur"}, index=0)`
			`a.add([[{"ORTH": "This"}, {"ORTH": "is"}]], {"LEMMA": "was", "MORPH": "Case=Nom\|Number=Sing"}, index=1)`
			`a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=-1)`

			`text = "This is a test."`
			`attrs = ["ORTH", "LEMMA", "MORPH"]`
			`doc = nlp(text)`

			`# bytes roundtrip`
			`a_reloaded = AttributeRuler(nlp.vocab).from_bytes(a.to_bytes())`
			`assert a.to_bytes() == a_reloaded.to_bytes()`
			`doc1 = a_reloaded(nlp.make_doc(text))`
			`numpy.array_equal(doc.to_array(attrs), doc1.to_array(attrs))`

			`# disk roundtrip`
			`with make_tempdir() as tmp_dir:`
			`nlp.to_disk(tmp_dir)`
			`nlp2 = util.load_model_from_path(tmp_dir)`
			`doc2 = nlp2(text)`
			`assert nlp2.get_pipe("attribute_ruler").to_bytes() == a.to_bytes()`
			`assert numpy.array_equal(doc.to_array(attrs), doc2.to_array(attrs))`