spaCy/spacy/tests/test_scorer.py
Adriane Boyd a4b32b9552
Handle missing reference values in scorer (#6286)
* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
2020-11-03 15:47:18 +01:00

406 lines
13 KiB
Python

from numpy.testing import assert_almost_equal, assert_array_almost_equal
import pytest
from pytest import approx
from spacy.training import Example
from spacy.training.iob_utils import offsets_to_biluo_tags
from spacy.scorer import Scorer, ROCAUCScore
from spacy.scorer import _roc_auc_score, _roc_curve
from spacy.lang.en import English
from spacy.tokens import Doc
test_las_apple = [
[
"Apple is looking at buying U.K. startup for $ 1 billion",
{
"heads": [2, 2, 2, 2, 3, 6, 4, 4, 10, 10, 7],
"deps": [
"nsubj",
"aux",
"ROOT",
"prep",
"pcomp",
"compound",
"dobj",
"prep",
"quantmod",
"compound",
"pobj",
],
},
]
]
test_ner_cardinal = [
["100 - 200", {"entities": [[0, 3, "CARDINAL"], [6, 9, "CARDINAL"]]}]
]
test_ner_apple = [
[
"Apple is looking at buying U.K. startup for $1 billion",
{"entities": [(0, 5, "ORG"), (27, 31, "GPE"), (44, 54, "MONEY")]},
]
]
@pytest.fixture
def tagged_doc():
text = "Sarah's sister flew to Silicon Valley via London."
tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
pos = [
"PROPN",
"PART",
"NOUN",
"VERB",
"ADP",
"PROPN",
"PROPN",
"ADP",
"PROPN",
"PUNCT",
]
morphs = [
"NounType=prop|Number=sing",
"Poss=yes",
"Number=sing",
"Tense=past|VerbForm=fin",
"",
"NounType=prop|Number=sing",
"NounType=prop|Number=sing",
"",
"NounType=prop|Number=sing",
"PunctType=peri",
]
nlp = English()
doc = nlp(text)
for i in range(len(tags)):
doc[i].tag_ = tags[i]
doc[i].pos_ = pos[i]
doc[i].set_morph(morphs[i])
if i > 0:
doc[i].is_sent_start = False
return doc
@pytest.fixture
def sented_doc():
text = "One sentence. Two sentences. Three sentences."
nlp = English()
doc = nlp(text)
for i in range(len(doc)):
if i % 3 == 0:
doc[i].is_sent_start = True
else:
doc[i].is_sent_start = False
return doc
def test_tokenization(sented_doc):
scorer = Scorer()
gold = {"sent_starts": [t.sent_start for t in sented_doc]}
example = Example.from_dict(sented_doc, gold)
scores = scorer.score([example])
assert scores["token_acc"] == 1.0
nlp = English()
example.predicted = Doc(
nlp.vocab,
words=["One", "sentence.", "Two", "sentences.", "Three", "sentences."],
spaces=[True, True, True, True, True, False],
)
example.predicted[1].is_sent_start = False
scores = scorer.score([example])
assert scores["token_acc"] == approx(0.66666666)
assert scores["token_p"] == 0.5
assert scores["token_r"] == approx(0.33333333)
assert scores["token_f"] == 0.4
def test_sents(sented_doc):
scorer = Scorer()
gold = {"sent_starts": [t.sent_start for t in sented_doc]}
example = Example.from_dict(sented_doc, gold)
scores = scorer.score([example])
assert scores["sents_f"] == 1.0
# One sentence start is moved
gold["sent_starts"][3] = 0
gold["sent_starts"][4] = 1
example = Example.from_dict(sented_doc, gold)
scores = scorer.score([example])
assert scores["sents_f"] == approx(0.3333333)
def test_las_per_type(en_vocab):
# Gold and Doc are identical
scorer = Scorer()
examples = []
for input_, annot in test_las_apple:
doc = Doc(
en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"]
)
gold = {"heads": annot["heads"], "deps": annot["deps"]}
example = Example.from_dict(doc, gold)
examples.append(example)
results = scorer.score(examples)
assert results["dep_uas"] == 1.0
assert results["dep_las"] == 1.0
assert results["dep_las_per_type"]["nsubj"]["p"] == 1.0
assert results["dep_las_per_type"]["nsubj"]["r"] == 1.0
assert results["dep_las_per_type"]["nsubj"]["f"] == 1.0
assert results["dep_las_per_type"]["compound"]["p"] == 1.0
assert results["dep_las_per_type"]["compound"]["r"] == 1.0
assert results["dep_las_per_type"]["compound"]["f"] == 1.0
# One dep is incorrect in Doc
scorer = Scorer()
examples = []
for input_, annot in test_las_apple:
doc = Doc(
en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"]
)
gold = {"heads": annot["heads"], "deps": annot["deps"]}
doc[0].dep_ = "compound"
example = Example.from_dict(doc, gold)
examples.append(example)
results = scorer.score(examples)
assert results["dep_uas"] == 1.0
assert_almost_equal(results["dep_las"], 0.9090909)
assert results["dep_las_per_type"]["nsubj"]["p"] == 0
assert results["dep_las_per_type"]["nsubj"]["r"] == 0
assert results["dep_las_per_type"]["nsubj"]["f"] == 0
assert_almost_equal(results["dep_las_per_type"]["compound"]["p"], 0.666666666)
assert results["dep_las_per_type"]["compound"]["r"] == 1.0
assert results["dep_las_per_type"]["compound"]["f"] == 0.8
def test_ner_per_type(en_vocab):
# Gold and Doc are identical
scorer = Scorer()
examples = []
for input_, annot in test_ner_cardinal:
doc = Doc(
en_vocab, words=input_.split(" "), ents=["B-CARDINAL", "O", "B-CARDINAL"]
)
entities = offsets_to_biluo_tags(doc, annot["entities"])
example = Example.from_dict(doc, {"entities": entities})
# a hack for sentence boundaries
example.predicted[1].is_sent_start = False
example.reference[1].is_sent_start = False
examples.append(example)
results = scorer.score(examples)
assert results["ents_p"] == 1.0
assert results["ents_r"] == 1.0
assert results["ents_f"] == 1.0
assert results["ents_per_type"]["CARDINAL"]["p"] == 1.0
assert results["ents_per_type"]["CARDINAL"]["r"] == 1.0
assert results["ents_per_type"]["CARDINAL"]["f"] == 1.0
# Doc has one missing and one extra entity
# Entity type MONEY is not present in Doc
scorer = Scorer()
examples = []
for input_, annot in test_ner_apple:
doc = Doc(
en_vocab,
words=input_.split(" "),
ents=["B-ORG", "O", "O", "O", "O", "B-GPE", "B-ORG", "O", "O", "O"],
)
entities = offsets_to_biluo_tags(doc, annot["entities"])
example = Example.from_dict(doc, {"entities": entities})
# a hack for sentence boundaries
example.predicted[1].is_sent_start = False
example.reference[1].is_sent_start = False
examples.append(example)
results = scorer.score(examples)
assert results["ents_p"] == approx(0.6666666)
assert results["ents_r"] == approx(0.6666666)
assert results["ents_f"] == approx(0.6666666)
assert "GPE" in results["ents_per_type"]
assert "MONEY" in results["ents_per_type"]
assert "ORG" in results["ents_per_type"]
assert results["ents_per_type"]["GPE"]["p"] == 1.0
assert results["ents_per_type"]["GPE"]["r"] == 1.0
assert results["ents_per_type"]["GPE"]["f"] == 1.0
assert results["ents_per_type"]["MONEY"]["p"] == 0
assert results["ents_per_type"]["MONEY"]["r"] == 0
assert results["ents_per_type"]["MONEY"]["f"] == 0
assert results["ents_per_type"]["ORG"]["p"] == 0.5
assert results["ents_per_type"]["ORG"]["r"] == 1.0
assert results["ents_per_type"]["ORG"]["f"] == approx(0.6666666)
def test_tag_score(tagged_doc):
# Gold and Doc are identical
scorer = Scorer()
gold = {
"tags": [t.tag_ for t in tagged_doc],
"pos": [t.pos_ for t in tagged_doc],
"morphs": [str(t.morph) for t in tagged_doc],
"sent_starts": [1 if t.is_sent_start else -1 for t in tagged_doc],
}
example = Example.from_dict(tagged_doc, gold)
results = scorer.score([example])
assert results["tag_acc"] == 1.0
assert results["pos_acc"] == 1.0
assert results["morph_acc"] == 1.0
assert results["morph_per_feat"]["NounType"]["f"] == 1.0
# Gold annotation is modified
scorer = Scorer()
tags = [t.tag_ for t in tagged_doc]
tags[0] = "NN"
pos = [t.pos_ for t in tagged_doc]
pos[1] = "X"
morphs = [str(t.morph) for t in tagged_doc]
morphs[1] = "Number=sing"
morphs[2] = "Number=plur"
gold = {
"tags": tags,
"pos": pos,
"morphs": morphs,
"sent_starts": gold["sent_starts"],
}
example = Example.from_dict(tagged_doc, gold)
results = scorer.score([example])
assert results["tag_acc"] == 0.9
assert results["pos_acc"] == 0.9
assert results["morph_acc"] == approx(0.8)
assert results["morph_per_feat"]["NounType"]["f"] == 1.0
assert results["morph_per_feat"]["Poss"]["f"] == 0.0
assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272)
def test_partial_annotation(en_tokenizer):
pred_doc = en_tokenizer("a b c d e")
pred_doc[0].tag_ = "A"
pred_doc[0].pos_ = "X"
pred_doc[0].set_morph("Feat=Val")
pred_doc[0].dep_ = "dep"
# unannotated reference
ref_doc = en_tokenizer("a b c d e")
ref_doc.has_unknown_spaces = True
example = Example(pred_doc, ref_doc)
scorer = Scorer()
scores = scorer.score([example])
for key in scores:
# cats doesn't have an unset state
if key.startswith("cats"):
continue
assert scores[key] == None
# partially annotated reference, not overlapping with predicted annotation
ref_doc = en_tokenizer("a b c d e")
ref_doc.has_unknown_spaces = True
ref_doc[1].tag_ = "A"
ref_doc[1].pos_ = "X"
ref_doc[1].set_morph("Feat=Val")
ref_doc[1].dep_ = "dep"
example = Example(pred_doc, ref_doc)
scorer = Scorer()
scores = scorer.score([example])
assert scores["token_acc"] == None
assert scores["tag_acc"] == 0.0
assert scores["pos_acc"] == 0.0
assert scores["morph_acc"] == 0.0
assert scores["dep_uas"] == 1.0
assert scores["dep_las"] == 0.0
assert scores["sents_f"] == None
# partially annotated reference, overlapping with predicted annotation
ref_doc = en_tokenizer("a b c d e")
ref_doc.has_unknown_spaces = True
ref_doc[0].tag_ = "A"
ref_doc[0].pos_ = "X"
ref_doc[1].set_morph("Feat=Val")
ref_doc[1].dep_ = "dep"
example = Example(pred_doc, ref_doc)
scorer = Scorer()
scores = scorer.score([example])
assert scores["token_acc"] == None
assert scores["tag_acc"] == 1.0
assert scores["pos_acc"] == 1.0
assert scores["morph_acc"] == 0.0
assert scores["dep_uas"] == 1.0
assert scores["dep_las"] == 0.0
assert scores["sents_f"] == None
def test_roc_auc_score():
# Binary classification, toy tests from scikit-learn test suite
y_true = [0, 1]
y_score = [0, 1]
tpr, fpr, _ = _roc_curve(y_true, y_score)
roc_auc = _roc_auc_score(y_true, y_score)
assert_array_almost_equal(tpr, [0, 0, 1])
assert_array_almost_equal(fpr, [0, 1, 1])
assert_almost_equal(roc_auc, 1.0)
y_true = [0, 1]
y_score = [1, 0]
tpr, fpr, _ = _roc_curve(y_true, y_score)
roc_auc = _roc_auc_score(y_true, y_score)
assert_array_almost_equal(tpr, [0, 1, 1])
assert_array_almost_equal(fpr, [0, 0, 1])
assert_almost_equal(roc_auc, 0.0)
y_true = [1, 0]
y_score = [1, 1]
tpr, fpr, _ = _roc_curve(y_true, y_score)
roc_auc = _roc_auc_score(y_true, y_score)
assert_array_almost_equal(tpr, [0, 1])
assert_array_almost_equal(fpr, [0, 1])
assert_almost_equal(roc_auc, 0.5)
y_true = [1, 0]
y_score = [1, 0]
tpr, fpr, _ = _roc_curve(y_true, y_score)
roc_auc = _roc_auc_score(y_true, y_score)
assert_array_almost_equal(tpr, [0, 0, 1])
assert_array_almost_equal(fpr, [0, 1, 1])
assert_almost_equal(roc_auc, 1.0)
y_true = [1, 0]
y_score = [0.5, 0.5]
tpr, fpr, _ = _roc_curve(y_true, y_score)
roc_auc = _roc_auc_score(y_true, y_score)
assert_array_almost_equal(tpr, [0, 1])
assert_array_almost_equal(fpr, [0, 1])
assert_almost_equal(roc_auc, 0.5)
# same result as above with ROCAUCScore wrapper
score = ROCAUCScore()
score.score_set(0.5, 1)
score.score_set(0.5, 0)
assert_almost_equal(score.score, 0.5)
# check that errors are raised in undefined cases and score is -inf
y_true = [0, 0]
y_score = [0.25, 0.75]
with pytest.raises(ValueError):
_roc_auc_score(y_true, y_score)
score = ROCAUCScore()
score.score_set(0.25, 0)
score.score_set(0.75, 0)
with pytest.raises(ValueError):
s = score.score
y_true = [1, 1]
y_score = [0.25, 0.75]
with pytest.raises(ValueError):
_roc_auc_score(y_true, y_score)
score = ROCAUCScore()
score.score_set(0.25, 1)
score.score_set(0.75, 1)
with pytest.raises(ValueError):
s = score.score