spaCy/spacy/tests/test_new_example.py

import pytest
from spacy.gold.example import Example
from spacy.tokens import Doc
from spacy.vocab import Vocab


def test_Example_init_requires_doc_objects():
    """Check that Example() raises TypeError when either argument is None.

    Three combinations are exercised: both arguments None, only the second
    None, and only the first None.
    """
    vocab = Vocab()
    # Each call is expected to raise, so the (never created) result is not
    # bound to a name.
    with pytest.raises(TypeError):
        Example(None, None)
    with pytest.raises(TypeError):
        Example(Doc(vocab, words=["hi"]), None)
    with pytest.raises(TypeError):
        Example(None, Doc(vocab, words=["hi"]))


def test_Example_from_dict_basic():
    """Check that from_dict with only "words" exposes Doc objects as x and y."""
    predicted = Doc(Vocab(), words=["hello", "world"])
    gold_annots = {"words": ["hello", "world"]}
    example = Example.from_dict(predicted, gold_annots)
    for attr_name in ("x", "y"):
        assert isinstance(getattr(example, attr_name), Doc)


@pytest.mark.parametrize(
    "annots",
    [
        {"words": ["ice", "cream"], "weirdannots": ["something", "such"]},
    ],
)
def test_Example_from_dict_invalid(annots):
    """Check that from_dict raises KeyError for the "weirdannots" key."""
    predicted = Doc(Vocab(), words=annots["words"])
    with pytest.raises(KeyError):
        Example.from_dict(predicted, annots)


@pytest.mark.parametrize(
    "pred_words",
    [
        ["ice", "cream"],
        ["icecream"],
        ["i", "ce", "cream"],
    ],
)
@pytest.mark.parametrize("annots", [{"words": ["icecream"], "tags": ["NN"]}])
def test_Example_from_dict_with_tags(pred_words, annots):
    """Check gold tags on the reference doc and their alignment to predictions.

    The gold annotation is the single token "icecream" tagged "NN"; for each
    predicted tokenization, every predicted token is expected to align to
    "NN".
    """
    predicted = Doc(Vocab(), words=pred_words)
    example = Example.from_dict(predicted, annots)
    # The reference doc must carry exactly the tags given in the annotations.
    for idx, gold_token in enumerate(example.reference):
        assert gold_token.tag_ == annots["tags"][idx]
    aligned = example.get_aligned("tag")
    expected = ["NN"] * len(predicted)
    assert aligned == expected


def test_aligned_tags():
    """Check get_aligned("tag") when predicted and gold tokenization differ.

    Expected outcome encoded below: "sunscreen" (gold: "sun" + "screen")
    aligns to None, while "can" and "not" (gold: "cannot") both receive
    "VERB".
    """
    predicted = Doc(
        Vocab(),
        words=["Apply", "some", "sunscreen", "unless", "you", "can", "not"],
    )
    gold = {
        "words": ["Apply", "some", "sun", "screen", "unless", "you", "cannot"],
        "tags": ["VERB", "DET", "NOUN", "NOUN", "SCONJ", "PRON", "VERB"],
    }
    example = Example.from_dict(predicted, gold)
    expected = ["VERB", "DET", None, "SCONJ", "PRON", "VERB", "VERB"]
    assert example.get_aligned("tag") == expected


def test_aligned_tags_multi():
    """Check get_aligned("tag") when token boundaries overlap on both sides.

    Expected outcome encoded below: "Applysome" and "sunscreen" have no
    one-to-one counterpart in the gold words and align to None; the remaining
    predicted tokens receive the gold tags.
    """
    predicted = Doc(
        Vocab(),
        words=["Applysome", "sunscreen", "unless", "you", "can", "not"],
    )
    gold = {
        "words": ["Apply", "somesun", "screen", "unless", "you", "cannot"],
        "tags": ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB"],
    }
    example = Example.from_dict(predicted, gold)
    expected = [None, None, "SCONJ", "PRON", "VERB", "VERB"]
    assert example.get_aligned("tag") == expected


@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["I", "like", "London", "and", "Berlin", "."],
            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
            "heads": [1, 1, 1, 2, 2, 1],
        }
    ],
)
def test_Example_from_dict_with_parse(annots):
    """Check that "deps" and "heads" annotations end up on the reference doc."""
    predicted = Doc(Vocab(), words=annots["words"])
    example = Example.from_dict(predicted, annots)
    for idx, gold_token in enumerate(example.reference):
        expected_dep = annots["deps"][idx]
        assert gold_token.dep_ == expected_dep
        expected_head = annots["heads"][idx]
        assert gold_token.head.i == expected_head


@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["Sarah", "'s", "sister", "flew"],
            "morphs": [
                "NounType=prop|Number=sing",
                "Poss=yes",
                "Number=sing",
                "Tense=past|VerbForm=fin",
            ],
        }
    ],
)
def test_Example_from_dict_with_morphology(annots):
    """Check that "morphs" annotations are readable via token.morph_."""
    predicted = Doc(Vocab(), words=annots["words"])
    example = Example.from_dict(predicted, annots)
    for idx, gold_token in enumerate(example.reference):
        expected_morph = annots["morphs"][idx]
        assert gold_token.morph_ == expected_morph


@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["This", "is", "one", "sentence", "this", "is", "another"],
            "sent_starts": [1, 0, 0, 0, 1, 0, 0],
        }
    ],
)
def test_Example_from_dict_with_sent_start(annots):
    """Check that "sent_starts" annotations yield two sentences on the reference."""
    predicted = Doc(Vocab(), words=annots["words"])
    example = Example.from_dict(predicted, annots)
    sentences = list(example.reference.sents)
    assert len(sentences) == 2
    for idx, gold_token in enumerate(example.reference):
        # Compare as booleans: only start / not-start matters here.
        is_start = bool(gold_token.is_sent_start)
        assert is_start == bool(annots["sent_starts"][idx])


@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["This", "is", "a", "sentence"],
            "cats": {"cat1": 1.0, "cat2": 0.0, "cat3": 0.5},
        }
    ],
)
def test_Example_from_dict_with_cats(annots):
    """Check that "cats" annotations are stored on the reference doc."""
    predicted = Doc(Vocab(), words=annots["words"])
    example = Example.from_dict(predicted, annots)
    assert len(list(example.reference.cats)) == 3
    expected_scores = (("cat1", 1.0), ("cat2", 0.0), ("cat3", 0.5))
    for label, score in expected_scores:
        assert example.reference.cats[label] == score


@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
        }
    ],
)
def test_Example_from_dict_with_entities(annots):
    """Check IOB codes and entity types produced from "entities" offsets."""
    predicted = Doc(Vocab(), words=annots["words"])
    example = Example.from_dict(predicted, annots)
    assert len(list(example.reference.ents)) == 2
    # One expected IOB code per token, in token order.
    expected_iob = ["O", "O", "B", "I", "O", "B", "O"]
    for idx, iob in enumerate(expected_iob):
        assert example.reference[idx].ent_iob_ == iob
    # Tokens 2-3 ("New York") and token 5 ("Berlin") carry the LOC label.
    for idx in (2, 3, 5):
        assert example.reference[idx].ent_type_ == "LOC"


@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
            "links": {
                (7, 15): {"Q60": 1.0, "Q64": 0.0},
                (20, 26): {"Q60": 0.0, "Q64": 1.0},
            },
        }
    ],
)
def test_Example_from_dict_with_links(annots):
    """Check that "links" annotations set ent_kb_id_ on the entity tokens."""
    predicted = Doc(Vocab(), words=annots["words"])
    example = Example.from_dict(predicted, annots)
    # One expected KB id per token; "" for tokens outside any linked entity.
    expected_kb_ids = ["", "", "Q60", "Q60", "", "Q64", ""]
    for idx, kb_id in enumerate(expected_kb_ids):
        assert example.reference[idx].ent_kb_id_ == kb_id


@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
            "links": {
                (0, 1): {"Q7381115": 1.0, "Q2146908": 0.0},
            },
        }
    ],
)
def test_Example_from_dict_with_links_invalid(annots):
    """Check that a link whose offsets match no listed entity raises ValueError."""
    predicted = Doc(Vocab(), words=annots["words"])
    with pytest.raises(ValueError):
        Example.from_dict(predicted, annots)
Start tests for new example class 2020-06-09 16:29:05 +03:00			`import pytest`
fixing language and scoring tests 2020-06-15 16:02:05 +03:00			`from spacy.gold.example import Example`
Start tests for new example class 2020-06-09 16:29:05 +03:00			`from spacy.tokens import Doc`
			`from spacy.vocab import Vocab`


Draft tests for new Example class 2020-06-09 16:43:08 +03:00			`def test_Example_init_requires_doc_objects():`
			`vocab = Vocab()`
Start tests for new example class 2020-06-09 16:29:05 +03:00			`with pytest.raises(TypeError):`
fixing language and scoring tests 2020-06-15 16:02:05 +03:00			`example = Example(None, None)`
Start tests for new example class 2020-06-09 16:29:05 +03:00			`with pytest.raises(TypeError):`
fixing language and scoring tests 2020-06-15 16:02:05 +03:00			`example = Example(Doc(vocab, words=["hi"]), None)`
Start tests for new example class 2020-06-09 16:29:05 +03:00			`with pytest.raises(TypeError):`
fixing language and scoring tests 2020-06-15 16:02:05 +03:00			`example = Example(None, Doc(vocab, words=["hi"]))`
Start tests for new example class 2020-06-09 16:29:05 +03:00

Draft tests for new Example class 2020-06-09 16:43:08 +03:00			`def test_Example_from_dict_basic():`
fixing language and scoring tests 2020-06-15 16:02:05 +03:00			`example = Example.from_dict(`
adding tests for new example class (some still failing - WIP) 2020-06-11 18:43:40 +03:00			`Doc(Vocab(), words=["hello", "world"]), {"words": ["hello", "world"]}`
Start tests for new example class 2020-06-09 16:29:05 +03:00			`)`
fixing language and scoring tests 2020-06-15 16:02:05 +03:00			`assert isinstance(example.x, Doc)`
			`assert isinstance(example.y, Doc)`
Draft tests for new Example class 2020-06-09 16:43:08 +03:00

adding tests for new example class (some still failing - WIP) 2020-06-11 18:43:40 +03:00			`@pytest.mark.parametrize(`
			`"annots", [{"words": ["ice", "cream"], "weirdannots": ["something", "such"]}]`
			`)`
			`def test_Example_from_dict_invalid(annots):`
			`vocab = Vocab()`
			`predicted = Doc(vocab, words=annots["words"])`
start testing get_aligned 2020-06-15 18:16:01 +03:00			`with pytest.raises(KeyError):`
fix ENT_IOB conversion and enable unit test 2020-06-12 12:30:24 +03:00			`Example.from_dict(predicted, annots)`
adding tests for new example class (some still failing - WIP) 2020-06-11 18:43:40 +03:00

additional tests for new get_aligned function 2020-06-15 18:42:40 +03:00			`@pytest.mark.parametrize("pred_words", [["ice", "cream"], ["icecream"], ["i", "ce", "cream"]])`
start testing get_aligned 2020-06-15 18:16:01 +03:00			`@pytest.mark.parametrize("annots", [{"words": ["icecream"], "tags": ["NN"]}])`
additional tests for new get_aligned function 2020-06-15 18:42:40 +03:00			`def test_Example_from_dict_with_tags(pred_words, annots):`
Draft tests for new Example class 2020-06-09 16:43:08 +03:00			`vocab = Vocab()`
additional tests for new get_aligned function 2020-06-15 18:42:40 +03:00			`predicted = Doc(vocab, words=pred_words)`
fixing language and scoring tests 2020-06-15 16:02:05 +03:00			`example = Example.from_dict(predicted, annots)`
			`for i, token in enumerate(example.reference):`
Draft tests for new Example class 2020-06-09 16:43:08 +03:00			`assert token.tag_ == annots["tags"][i]`
start testing get_aligned 2020-06-15 18:16:01 +03:00			`aligned_tags = example.get_aligned("tag")`
			`assert aligned_tags == ["NN" for _ in predicted]`
Draft tests for new Example class 2020-06-09 16:43:08 +03:00

additional tests for new get_aligned function 2020-06-15 18:42:40 +03:00			`def test_aligned_tags():`
			`pred_words = ["Apply", "some", "sunscreen", "unless", "you", "can", "not"]`
			`gold_words = ["Apply", "some", "sun", "screen", "unless", "you", "cannot"]`
			`gold_tags = ["VERB", "DET", "NOUN", "NOUN", "SCONJ", "PRON", "VERB"]`
			`annots = {"words": gold_words, "tags": gold_tags}`
			`vocab = Vocab()`
			`predicted = Doc(vocab, words=pred_words)`
			`example = Example.from_dict(predicted, annots)`
			`aligned_tags = example.get_aligned("tag")`
			`assert aligned_tags == ["VERB", "DET", None, "SCONJ", "PRON", "VERB", "VERB"]`


			`def test_aligned_tags_multi():`
			`pred_words = ["Applysome", "sunscreen", "unless", "you", "can", "not"]`
			`gold_words = ["Apply", "somesun", "screen", "unless", "you", "cannot"]`
			`gold_tags = ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB"]`
			`annots = {"words": gold_words, "tags": gold_tags}`
			`vocab = Vocab()`
			`predicted = Doc(vocab, words=pred_words)`
			`example = Example.from_dict(predicted, annots)`
			`aligned_tags = example.get_aligned("tag")`
			`assert aligned_tags == [None, None, "SCONJ", "PRON", "VERB", "VERB"]`


adding tests for new example class (some still failing - WIP) 2020-06-11 18:43:40 +03:00			`@pytest.mark.parametrize(`
			`"annots",`
			`[`
			`{`
			`"words": ["I", "like", "London", "and", "Berlin", "."],`
			`"deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],`
			`"heads": [1, 1, 1, 2, 2, 1],`
			`}`
			`],`
			`)`
Update test stubs 2020-06-09 16:49:04 +03:00			`def test_Example_from_dict_with_parse(annots):`
adding tests for new example class (some still failing - WIP) 2020-06-11 18:43:40 +03:00			`vocab = Vocab()`
			`predicted = Doc(vocab, words=annots["words"])`
fixing language and scoring tests 2020-06-15 16:02:05 +03:00			`example = Example.from_dict(predicted, annots)`
			`for i, token in enumerate(example.reference):`
adding tests for new example class (some still failing - WIP) 2020-06-11 18:43:40 +03:00			`assert token.dep_ == annots["deps"][i]`
			`assert token.head.i == annots["heads"][i]`

Draft tests for new Example class 2020-06-09 16:43:08 +03:00
adding tests for new example class (some still failing - WIP) 2020-06-11 18:43:40 +03:00			`@pytest.mark.parametrize(`
			`"annots",`
			`[`
			`{`
			`"words": ["Sarah", "'s", "sister", "flew"],`
			`"morphs": [`
			`"NounType=prop\|Number=sing",`
			`"Poss=yes",`
			`"Number=sing",`
			`"Tense=past\|VerbForm=fin",`
			`],`
			`}`
			`],`
			`)`
Update test stubs 2020-06-09 16:49:04 +03:00			`def test_Example_from_dict_with_morphology(annots):`
adding tests for new example class (some still failing - WIP) 2020-06-11 18:43:40 +03:00			`vocab = Vocab()`
			`predicted = Doc(vocab, words=annots["words"])`
fixing language and scoring tests 2020-06-15 16:02:05 +03:00			`example = Example.from_dict(predicted, annots)`
			`for i, token in enumerate(example.reference):`
adding tests for new example class (some still failing - WIP) 2020-06-11 18:43:40 +03:00			`assert token.morph_ == annots["morphs"][i]`
Draft tests for new Example class 2020-06-09 16:43:08 +03:00
adding tests for new example class (some still failing - WIP) 2020-06-11 18:43:40 +03:00
			`@pytest.mark.parametrize(`
			`"annots",`
			`[`
			`{`
			`"words": ["This", "is", "one", "sentence", "this", "is", "another"],`
			`"sent_starts": [1, 0, 0, 0, 1, 0, 0],`
			`}`
			`],`
			`)`
Update test stubs 2020-06-09 16:49:04 +03:00			`def test_Example_from_dict_with_sent_start(annots):`
adding tests for new example class (some still failing - WIP) 2020-06-11 18:43:40 +03:00			`vocab = Vocab()`
			`predicted = Doc(vocab, words=annots["words"])`
fixing language and scoring tests 2020-06-15 16:02:05 +03:00			`example = Example.from_dict(predicted, annots)`
			`assert len(list(example.reference.sents)) == 2`
			`for i, token in enumerate(example.reference):`
adding tests for new example class (some still failing - WIP) 2020-06-11 18:43:40 +03:00			`assert bool(token.is_sent_start) == bool(annots["sent_starts"][i])`

Draft tests for new Example class 2020-06-09 16:43:08 +03:00
adding tests for new example class (some still failing - WIP) 2020-06-11 18:43:40 +03:00			`@pytest.mark.parametrize(`
			`"annots",`
			`[`
			`{`
			`"words": ["This", "is", "a", "sentence"],`
			`"cats": {"cat1": 1.0, "cat2": 0.0, "cat3": 0.5},`
			`}`
			`],`
			`)`
Update test stubs 2020-06-09 16:49:04 +03:00			`def test_Example_from_dict_with_cats(annots):`
adding tests for new example class (some still failing - WIP) 2020-06-11 18:43:40 +03:00			`vocab = Vocab()`
			`predicted = Doc(vocab, words=annots["words"])`
fixing language and scoring tests 2020-06-15 16:02:05 +03:00			`example = Example.from_dict(predicted, annots)`
			`assert len(list(example.reference.cats)) == 3`
			`assert example.reference.cats["cat1"] == 1.0`
			`assert example.reference.cats["cat2"] == 0.0`
			`assert example.reference.cats["cat3"] == 0.5`
adding tests for new example class (some still failing - WIP) 2020-06-11 18:43:40 +03:00
Draft tests for new Example class 2020-06-09 16:43:08 +03:00
adding tests for new example class (some still failing - WIP) 2020-06-11 18:43:40 +03:00			`@pytest.mark.parametrize(`
			`"annots",`
			`[`
			`{`
entities on doc_annotation, parse links and check their offsets against the entities. unit test works 2020-06-12 16:47:20 +03:00			`"words": ["I", "like", "New", "York", "and", "Berlin", "."],`
			`"entities": [(7, 15, "LOC"), (20, 26, "LOC")],`
			`}`
			`],`
			`)`
			`def test_Example_from_dict_with_entities(annots):`
			`vocab = Vocab()`
			`predicted = Doc(vocab, words=annots["words"])`
fixing language and scoring tests 2020-06-15 16:02:05 +03:00			`example = Example.from_dict(predicted, annots)`
			`assert len(list(example.reference.ents)) == 2`
			`assert example.reference[0].ent_iob_ == "O"`
			`assert example.reference[1].ent_iob_ == "O"`
			`assert example.reference[2].ent_iob_ == "B"`
			`assert example.reference[3].ent_iob_ == "I"`
			`assert example.reference[4].ent_iob_ == "O"`
			`assert example.reference[5].ent_iob_ == "B"`
			`assert example.reference[6].ent_iob_ == "O"`
			`assert example.reference[2].ent_type_ == "LOC"`
			`assert example.reference[3].ent_type_ == "LOC"`
			`assert example.reference[5].ent_type_ == "LOC"`
entities on doc_annotation, parse links and check their offsets against the entities. unit test works 2020-06-12 16:47:20 +03:00

			`@pytest.mark.parametrize(`
			`"annots",`
			`[`
			`{`
			`"words": ["I", "like", "New", "York", "and", "Berlin", "."],`
			`"entities": [(7, 15, "LOC"), (20, 26, "LOC")],`
			`"links": {(7, 15): {"Q60": 1.0, "Q64": 0.0}, (20, 26): {"Q60": 0.0, "Q64": 1.0}},`
adding tests for new example class (some still failing - WIP) 2020-06-11 18:43:40 +03:00			`}`
			`],`
			`)`
Update test stubs 2020-06-09 16:49:04 +03:00			`def test_Example_from_dict_with_links(annots):`
adding tests for new example class (some still failing - WIP) 2020-06-11 18:43:40 +03:00			`vocab = Vocab()`
			`predicted = Doc(vocab, words=annots["words"])`
fixing language and scoring tests 2020-06-15 16:02:05 +03:00			`example = Example.from_dict(predicted, annots)`
			`assert example.reference[0].ent_kb_id_ == ""`
			`assert example.reference[1].ent_kb_id_ == ""`
			`assert example.reference[2].ent_kb_id_ == "Q60"`
			`assert example.reference[3].ent_kb_id_ == "Q60"`
			`assert example.reference[4].ent_kb_id_ == ""`
			`assert example.reference[5].ent_kb_id_ == "Q64"`
			`assert example.reference[6].ent_kb_id_ == ""`
entities on doc_annotation, parse links and check their offsets against the entities. unit test works 2020-06-12 16:47:20 +03:00

			`@pytest.mark.parametrize(`
			`"annots",`
			`[`
			`{`
			`"words": ["I", "like", "New", "York", "and", "Berlin", "."],`
			`"entities": [(7, 15, "LOC"), (20, 26, "LOC")],`
			`"links": {(0, 1): {"Q7381115": 1.0, "Q2146908": 0.0}},`
			`}`
			`],`
			`)`
			`def test_Example_from_dict_with_links_invalid(annots):`
			`vocab = Vocab()`
			`predicted = Doc(vocab, words=annots["words"])`
			`with pytest.raises(ValueError):`
			`Example.from_dict(predicted, annots)`