spaCy/spacy/tests/test_new_example.py

214 lines
6.9 KiB
Python
Raw Normal View History

2020-06-09 16:29:05 +03:00
import pytest
2020-06-15 16:02:05 +03:00
from spacy.gold.example import Example
2020-06-09 16:29:05 +03:00
from spacy.tokens import Doc
from spacy.vocab import Vocab
2020-06-09 16:43:08 +03:00
def test_Example_init_requires_doc_objects():
    """Check that Example() raises TypeError unless both arguments are Docs.

    Three combinations are exercised: both arguments ``None``, only the
    second ``None``, and only the first ``None``.
    """
    vocab = Vocab()
    # The constructor is expected to raise before it returns, so its result
    # is deliberately not bound to a name (the binding would be dead code).
    with pytest.raises(TypeError):
        Example(None, None)
    with pytest.raises(TypeError):
        Example(Doc(vocab, words=["hi"]), None)
    with pytest.raises(TypeError):
        Example(None, Doc(vocab, words=["hi"]))
2020-06-09 16:29:05 +03:00
2020-06-09 16:43:08 +03:00
def test_Example_from_dict_basic():
    """Check that from_dict exposes Doc objects as both ``x`` and ``y``."""
    predicted = Doc(Vocab(), words=["hello", "world"])
    annotations = {"words": ["hello", "world"]}
    built = Example.from_dict(predicted, annotations)
    assert isinstance(built.x, Doc)
    assert isinstance(built.y, Doc)
2020-06-09 16:43:08 +03:00
@pytest.mark.parametrize(
    "annots",
    [
        {"words": ["ice", "cream"], "weirdannots": ["something", "such"]},
    ],
)
def test_Example_from_dict_invalid(annots):
    """Check that annotations carrying the "weirdannots" key raise KeyError."""
    predicted = Doc(Vocab(), words=annots["words"])
    with pytest.raises(KeyError):
        Example.from_dict(predicted, annots)
@pytest.mark.parametrize(
    "pred_words", [["ice", "cream"], ["icecream"], ["i", "ce", "cream"]]
)
@pytest.mark.parametrize("annots", [{"words": ["icecream"], "tags": ["NN"]}])
def test_Example_from_dict_with_tags(pred_words, annots):
    """Check reference tags and their alignment onto the predicted tokens.

    The reference is the single token "icecream" tagged "NN"; the predicted
    Doc splits the same text in three different ways. Every predicted token
    is expected to receive the aligned tag "NN".
    """
    predicted = Doc(Vocab(), words=pred_words)
    example = Example.from_dict(predicted, annots)

    # Each reference token carries the tag given at the same position.
    gold_tags = annots["tags"]
    for position, ref_token in enumerate(example.reference):
        assert ref_token.tag_ == gold_tags[position]

    expected_aligned = ["NN"] * len(predicted)
    assert example.get_aligned("tag") == expected_aligned
2020-06-09 16:43:08 +03:00
def test_aligned_tags():
    """Check get_aligned("tag") when the two tokenizations disagree.

    The expected list has one entry per predicted token. ``None`` is expected
    for "sunscreen", which the reference splits into "sun" / "screen", while
    "can" and "not" are both expected to get the tag of the reference token
    "cannot".
    """
    predicted = Doc(
        Vocab(),
        words=["Apply", "some", "sunscreen", "unless", "you", "can", "not"],
    )
    reference_annots = {
        "words": ["Apply", "some", "sun", "screen", "unless", "you", "cannot"],
        "tags": ["VERB", "DET", "NOUN", "NOUN", "SCONJ", "PRON", "VERB"],
    }
    example = Example.from_dict(predicted, reference_annots)
    expected = ["VERB", "DET", None, "SCONJ", "PRON", "VERB", "VERB"]
    assert example.get_aligned("tag") == expected
def test_aligned_tags_multi():
    """Check get_aligned("tag") when token boundaries overlap on both sides.

    "Applysome" / "sunscreen" in the predicted Doc straddle the reference
    tokens "Apply" / "somesun" / "screen"; both predicted tokens are expected
    to align to ``None``.
    """
    predicted = Doc(
        Vocab(),
        words=["Applysome", "sunscreen", "unless", "you", "can", "not"],
    )
    reference_annots = {
        "words": ["Apply", "somesun", "screen", "unless", "you", "cannot"],
        "tags": ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB"],
    }
    example = Example.from_dict(predicted, reference_annots)
    expected = [None, None, "SCONJ", "PRON", "VERB", "VERB"]
    assert example.get_aligned("tag") == expected
@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["I", "like", "London", "and", "Berlin", "."],
            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
            "heads": [1, 1, 1, 2, 2, 1],
        }
    ],
)
def test_Example_from_dict_with_parse(annots):
    """Check that "deps" and "heads" annotations land on the reference Doc."""
    predicted = Doc(Vocab(), words=annots["words"])
    reference = Example.from_dict(predicted, annots).reference
    gold_deps = annots["deps"]
    gold_heads = annots["heads"]
    for position, ref_token in enumerate(reference):
        assert ref_token.dep_ == gold_deps[position]
        assert ref_token.head.i == gold_heads[position]
2020-06-09 16:43:08 +03:00
@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["Sarah", "'s", "sister", "flew"],
            "morphs": [
                "NounType=prop|Number=sing",
                "Poss=yes",
                "Number=sing",
                "Tense=past|VerbForm=fin",
            ],
        }
    ],
)
def test_Example_from_dict_with_morphology(annots):
    """Check that "morphs" annotations land on the reference tokens."""
    predicted = Doc(Vocab(), words=annots["words"])
    reference = Example.from_dict(predicted, annots).reference
    gold_morphs = annots["morphs"]
    for position, ref_token in enumerate(reference):
        assert ref_token.morph_ == gold_morphs[position]
2020-06-09 16:43:08 +03:00
@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["This", "is", "one", "sentence", "this", "is", "another"],
            "sent_starts": [1, 0, 0, 0, 1, 0, 0],
        }
    ],
)
def test_Example_from_dict_with_sent_start(annots):
    """Check that "sent_starts" annotations define the reference sentences.

    Two sentence starts are annotated, so two sentences are expected, and
    each token's ``is_sent_start`` must agree (as a boolean) with its flag.
    """
    predicted = Doc(Vocab(), words=annots["words"])
    reference = Example.from_dict(predicted, annots).reference
    sentences = list(reference.sents)
    assert len(sentences) == 2
    start_flags = annots["sent_starts"]
    for position, ref_token in enumerate(reference):
        assert bool(ref_token.is_sent_start) == bool(start_flags[position])
2020-06-09 16:43:08 +03:00
@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["This", "is", "a", "sentence"],
            "cats": {"cat1": 1.0, "cat2": 0.0, "cat3": 0.5},
        }
    ],
)
def test_Example_from_dict_with_cats(annots):
    """Check that "cats" annotations are stored on the reference Doc."""
    predicted = Doc(Vocab(), words=annots["words"])
    example = Example.from_dict(predicted, annots)
    assert len(list(example.reference.cats)) == 3
    expected_scores = (("cat1", 1.0), ("cat2", 0.0), ("cat3", 0.5))
    for label, score in expected_scores:
        assert example.reference.cats[label] == score
2020-06-09 16:43:08 +03:00
@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
        }
    ],
)
def test_Example_from_dict_with_entities(annots):
    """Check that "entities" offsets become entity spans on the reference.

    Two entities are annotated; the expected IOB codes mark tokens 2-3 as
    one entity ("B", "I") and token 5 as another ("B"), all typed "LOC".
    """
    predicted = Doc(Vocab(), words=annots["words"])
    example = Example.from_dict(predicted, annots)
    assert len(list(example.reference.ents)) == 2

    expected_iob = ["O", "O", "B", "I", "O", "B", "O"]
    for position, iob in enumerate(expected_iob):
        assert example.reference[position].ent_iob_ == iob

    # Only the tokens inside an entity are checked for their label.
    for position in (2, 3, 5):
        assert example.reference[position].ent_type_ == "LOC"
@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
            "links": {
                (7, 15): {"Q60": 1.0, "Q64": 0.0},
                (20, 26): {"Q60": 0.0, "Q64": 1.0},
            },
        }
    ],
)
def test_Example_from_dict_with_links(annots):
    """Check that "links" annotations set KB ids on the reference tokens.

    Each entity span is expected to carry the id whose score is 1.0 in its
    "links" entry; tokens outside any entity keep an empty id.
    """
    predicted = Doc(Vocab(), words=annots["words"])
    example = Example.from_dict(predicted, annots)
    expected_kb_ids = ["", "", "Q60", "Q60", "", "Q64", ""]
    for position, kb_id in enumerate(expected_kb_ids):
        assert example.reference[position].ent_kb_id_ == kb_id
@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
            "links": {
                (0, 1): {"Q7381115": 1.0, "Q2146908": 0.0},
            },
        }
    ],
)
def test_Example_from_dict_with_links_invalid(annots):
    """Check that a link on a non-entity span raises ValueError.

    The link is keyed on offsets (0, 1), which match neither of the
    annotated entity spans.
    """
    predicted = Doc(Vocab(), words=annots["words"])
    with pytest.raises(ValueError):
        Example.from_dict(predicted, annots)