Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 09:57:26 +03:00)
* Refactor `Doc.is_` flags
* Add derived `Doc.has_annotation` method (usage sketch after the notes below)
  * `Doc.has_annotation(attr)` returns `True` for partial annotation
  * `Doc.has_annotation(attr, require_complete=True)` returns `True` for
    complete annotation
* Add deprecation warnings to `is_tagged`, `is_parsed`, `is_sentenced`
  and `is_nered`
* Add `Doc._get_array_attrs()`, which returns a full list of `Doc` attrs
  for use with `Doc.to_array`, `Doc.to_bytes` and `Doc.from_docs`. The
  list is the `DocBin` attributes list plus `SPACY` and `LENGTH`.
Notes on `Doc.has_annotation`:
* `HEAD` is converted to `DEP` because heads don't have an unset state
* Accept `IS_SENT_START` as a synonym of `SENT_START`
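A minimal usage sketch of the new check (not part of this PR's diff; the blank
English pipeline and the hand-set tag are illustrative assumptions):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("This is a test.")

# A freshly tokenized doc carries no tag annotation at all.
assert not doc.has_annotation("TAG")

# Tag a single token: the annotation is now partially present ...
doc[0].tag_ = "DT"
assert doc.has_annotation("TAG")
# ... but it is not set on every token, so the strict check still fails.
assert not doc.has_annotation("TAG", require_complete=True)
```

This derived check is what replaces the deprecated `doc.is_tagged` flag.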
Additional changes:
* Add `NORM`, `ENT_ID` and `SENT_START` to the default attributes for
  `DocBin` (see the sketch after this list)
* In `Doc.from_array()` the presence of `DEP` causes `HEAD` to override
`SENT_START`
* In `Doc.from_array()`, passing a custom `attrs` list (i.e., anything other
  than the default internal `Doc._get_array_attrs()`) that contains both
  `HEAD` and `SENT_START` shows a warning that `HEAD` will override
  `SENT_START`
* `set_children_from_heads` does not require dependency labels to set
sentence boundaries and sets `sent_start` for all non-sentence starts to
`-1`
* Fix call to `set_children_from_heads`
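A short sketch of the `DocBin` default-attrs change (illustrative only, not
code from this PR; the two-sentence text and the manually set sentence starts
are assumptions):

```python
from spacy.lang.en import English
from spacy.tokens import DocBin

nlp = English()
doc = nlp("This is a test. This is another.")
# Mark sentence starts by hand (normally a parser or senter would set these).
doc[0].is_sent_start = True
doc[5].is_sent_start = True

# With SENT_START in DocBin's default attribute list, the boundaries survive
# a serialization roundtrip without passing an explicit attrs list.
doc_bin = DocBin(docs=[doc])
doc2 = next(doc_bin.get_docs(nlp.vocab))
assert doc2.has_annotation("SENT_START")
assert [t.is_sent_start for t in doc2] == [t.is_sent_start for t in doc]
```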
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>

254 lines · 7.6 KiB · Python
import pytest
import numpy
from spacy.training import Example
from spacy.lang.en import English
from spacy.pipeline import AttributeRuler
from spacy import util, registry

from ..util import get_doc, make_tempdir


@pytest.fixture
def nlp():
    return English()


@pytest.fixture
def pattern_dicts():
    return [
        {
            "patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
            "attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
        },
        # one pattern sets the lemma
        {"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
        # another pattern sets the morphology
        {
            "patterns": [[{"ORTH": "test"}]],
            "attrs": {"MORPH": "Case=Nom|Number=Sing"},
            "index": 0,
        },
    ]


@registry.misc("attribute_ruler_patterns")
def attribute_ruler_patterns():
    return [
        {
            "patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
            "attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
        },
        # one pattern sets the lemma
        {"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
        # another pattern sets the morphology
        {
            "patterns": [[{"ORTH": "test"}]],
            "attrs": {"MORPH": "Case=Nom|Number=Sing"},
            "index": 0,
        },
    ]


@pytest.fixture
def tag_map():
    return {
        ".": {"POS": "PUNCT", "PunctType": "peri"},
        ",": {"POS": "PUNCT", "PunctType": "comm"},
    }


@pytest.fixture
def morph_rules():
    return {"DT": {"the": {"POS": "DET", "LEMMA": "a", "Case": "Nom"}}}


def test_attributeruler_init(nlp, pattern_dicts):
    a = nlp.add_pipe("attribute_ruler")
    for p in pattern_dicts:
        a.add(**p)

    doc = nlp("This is a test.")
    assert doc[2].lemma_ == "the"
    assert doc[2].morph_ == "Case=Nom|Number=Plur"
    assert doc[3].lemma_ == "cat"
    assert doc[3].morph_ == "Case=Nom|Number=Sing"
    assert doc.has_annotation("LEMMA")
    assert doc.has_annotation("MORPH")


def test_attributeruler_init_patterns(nlp, pattern_dicts):
    # initialize with patterns
    nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
    doc = nlp("This is a test.")
    assert doc[2].lemma_ == "the"
    assert doc[2].morph_ == "Case=Nom|Number=Plur"
    assert doc[3].lemma_ == "cat"
    assert doc[3].morph_ == "Case=Nom|Number=Sing"
    assert doc.has_annotation("LEMMA")
    assert doc.has_annotation("MORPH")
    nlp.remove_pipe("attribute_ruler")
    # initialize with patterns from asset
    nlp.add_pipe(
        "attribute_ruler",
        config={"pattern_dicts": {"@misc": "attribute_ruler_patterns"}},
    )
    doc = nlp("This is a test.")
    assert doc[2].lemma_ == "the"
    assert doc[2].morph_ == "Case=Nom|Number=Plur"
    assert doc[3].lemma_ == "cat"
    assert doc[3].morph_ == "Case=Nom|Number=Sing"
    assert doc.has_annotation("LEMMA")
    assert doc.has_annotation("MORPH")


def test_attributeruler_score(nlp, pattern_dicts):
    # initialize with patterns
    nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
    doc = nlp("This is a test.")
    assert doc[2].lemma_ == "the"
    assert doc[2].morph_ == "Case=Nom|Number=Plur"
    assert doc[3].lemma_ == "cat"
    assert doc[3].morph_ == "Case=Nom|Number=Sing"

    dev_examples = [
        Example.from_dict(
            nlp.make_doc("This is a test."), {"lemmas": ["this", "is", "a", "cat", "."]}
        )
    ]
    scores = nlp.evaluate(dev_examples)
    # "cat" is the only correct lemma
    assert scores["lemma_acc"] == pytest.approx(0.2)
    # the empty morphs are correct
    assert scores["morph_acc"] == pytest.approx(0.6)


def test_attributeruler_rule_order(nlp):
    a = AttributeRuler(nlp.vocab)
    patterns = [
        {"patterns": [[{"TAG": "VBZ"}]], "attrs": {"POS": "VERB"}},
        {"patterns": [[{"TAG": "VBZ"}]], "attrs": {"POS": "NOUN"}},
    ]
    a.add_patterns(patterns)
    doc = get_doc(
        nlp.vocab,
        words=["This", "is", "a", "test", "."],
        tags=["DT", "VBZ", "DT", "NN", "."],
    )
    doc = a(doc)
    assert doc[1].pos_ == "NOUN"


def test_attributeruler_tag_map(nlp, tag_map):
    a = AttributeRuler(nlp.vocab)
    a.load_from_tag_map(tag_map)
    doc = get_doc(
        nlp.vocab,
        words=["This", "is", "a", "test", "."],
        tags=["DT", "VBZ", "DT", "NN", "."],
    )
    doc = a(doc)

    for i in range(len(doc)):
        if i == 4:
            assert doc[i].pos_ == "PUNCT"
            assert doc[i].morph_ == "PunctType=peri"
        else:
            assert doc[i].pos_ == ""
            assert doc[i].morph_ == ""


def test_attributeruler_morph_rules(nlp, morph_rules):
    a = AttributeRuler(nlp.vocab)
    a.load_from_morph_rules(morph_rules)
    doc = get_doc(
        nlp.vocab,
        words=["This", "is", "the", "test", "."],
        tags=["DT", "VBZ", "DT", "NN", "."],
    )
    doc = a(doc)

    for i in range(len(doc)):
        if i != 2:
            assert doc[i].pos_ == ""
            assert doc[i].morph_ == ""
        else:
            assert doc[2].pos_ == "DET"
            assert doc[2].lemma_ == "a"
            assert doc[2].morph_ == "Case=Nom"


def test_attributeruler_indices(nlp):
    a = nlp.add_pipe("attribute_ruler")
    a.add(
        [[{"ORTH": "a"}, {"ORTH": "test"}]],
        {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
        index=0,
    )
    a.add(
        [[{"ORTH": "This"}, {"ORTH": "is"}]],
        {"LEMMA": "was", "MORPH": "Case=Nom|Number=Sing"},
        index=1,
    )
    a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=-1)

    text = "This is a test."
    doc = nlp(text)

    for i in range(len(doc)):
        if i == 1:
            assert doc[i].lemma_ == "was"
            assert doc[i].morph_ == "Case=Nom|Number=Sing"
        elif i == 2:
            assert doc[i].lemma_ == "the"
            assert doc[i].morph_ == "Case=Nom|Number=Plur"
        elif i == 3:
            assert doc[i].lemma_ == "cat"
        else:
            assert doc[i].morph_ == ""

    # raises an error when trying to modify a token outside of the match
    a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2)
    with pytest.raises(ValueError):
        doc = nlp(text)

    # raises an error when trying to modify a token outside of the match
    a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=10)
    with pytest.raises(ValueError):
        doc = nlp(text)


def test_attributeruler_patterns_prop(nlp, pattern_dicts):
    a = nlp.add_pipe("attribute_ruler")
    a.add_patterns(pattern_dicts)

    for p1, p2 in zip(pattern_dicts, a.patterns):
        assert p1["patterns"] == p2["patterns"]
        assert p1["attrs"] == p2["attrs"]
        if p1.get("index"):
            assert p1["index"] == p2["index"]


def test_attributeruler_serialize(nlp, pattern_dicts):
    a = nlp.add_pipe("attribute_ruler")
    a.add_patterns(pattern_dicts)

    text = "This is a test."
    attrs = ["ORTH", "LEMMA", "MORPH"]
    doc = nlp(text)

    # bytes roundtrip
    a_reloaded = AttributeRuler(nlp.vocab).from_bytes(a.to_bytes())
    assert a.to_bytes() == a_reloaded.to_bytes()
    doc1 = a_reloaded(nlp.make_doc(text))
    assert numpy.array_equal(doc.to_array(attrs), doc1.to_array(attrs))
    assert a.patterns == a_reloaded.patterns

    # disk roundtrip
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(text)
        assert nlp2.get_pipe("attribute_ruler").to_bytes() == a.to_bytes()
        assert numpy.array_equal(doc.to_array(attrs), doc2.to_array(attrs))
        assert a.patterns == nlp2.get_pipe("attribute_ruler").patterns