spaCy/spacy/tests/test_scorer.py
Adriane Boyd 9e0322de1a
Restore v2 token_acc score implementation (#12073)
In the v3 scorer refactoring, `token_acc` was implemented incorrectly.
It should use `precision` instead of `fscore` for the measure of
correctly aligned tokens / number of predicted tokens.

Fix the docs to reflect that the measure uses the number of predicted
tokens rather than the number of gold tokens.
2023-01-11 08:01:47 +01:00

524 lines
16 KiB
Python

from numpy.testing import assert_almost_equal, assert_array_almost_equal
import pytest
from pytest import approx
from spacy.training import Example
from spacy.training.iob_utils import offsets_to_biluo_tags
from spacy.scorer import Scorer, ROCAUCScore, PRFScore
from spacy.scorer import _roc_auc_score, _roc_curve
from spacy.lang.en import English
from spacy.tokens import Doc, Span
test_las_apple = [
[
"Apple is looking at buying U.K. startup for $ 1 billion",
{
"heads": [2, 2, 2, 2, 3, 6, 4, 4, 10, 10, 7],
"deps": [
"nsubj",
"aux",
"ROOT",
"prep",
"pcomp",
"compound",
"dobj",
"prep",
"quantmod",
"compound",
"pobj",
],
},
]
]
test_ner_cardinal = [
["100 - 200", {"entities": [[0, 3, "CARDINAL"], [6, 9, "CARDINAL"]]}]
]
test_ner_apple = [
[
"Apple is looking at buying U.K. startup for $1 billion",
{"entities": [(0, 5, "ORG"), (27, 31, "GPE"), (44, 54, "MONEY")]},
]
]
@pytest.fixture
def tagged_doc():
text = "Sarah's sister flew to Silicon Valley via London."
tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
pos = [
"PROPN",
"PART",
"NOUN",
"VERB",
"ADP",
"PROPN",
"PROPN",
"ADP",
"PROPN",
"PUNCT",
]
morphs = [
"NounType=prop|Number=sing",
"Poss=yes",
"Number=sing",
"Tense=past|VerbForm=fin",
"",
"NounType=prop|Number=sing",
"NounType=prop|Number=sing",
"",
"NounType=prop|Number=sing",
"PunctType=peri",
]
nlp = English()
doc = nlp(text)
for i in range(len(tags)):
doc[i].tag_ = tags[i]
doc[i].pos_ = pos[i]
doc[i].set_morph(morphs[i])
if i > 0:
doc[i].is_sent_start = False
return doc
@pytest.fixture
def sented_doc():
text = "One sentence. Two sentences. Three sentences."
nlp = English()
doc = nlp(text)
for i in range(len(doc)):
if i % 3 == 0:
doc[i].is_sent_start = True
else:
doc[i].is_sent_start = False
return doc
def test_tokenization(sented_doc):
scorer = Scorer()
gold = {"sent_starts": [t.sent_start for t in sented_doc]}
example = Example.from_dict(sented_doc, gold)
scores = scorer.score([example])
assert scores["token_acc"] == 1.0
nlp = English()
example.predicted = Doc(
nlp.vocab,
words=["One", "sentence.", "Two", "sentences.", "Three", "sentences."],
spaces=[True, True, True, True, True, False],
)
example.predicted[1].is_sent_start = False
scores = scorer.score([example])
assert scores["token_acc"] == 0.5
assert scores["token_p"] == 0.5
assert scores["token_r"] == approx(0.33333333)
assert scores["token_f"] == 0.4
def test_sents(sented_doc):
scorer = Scorer()
gold = {"sent_starts": [t.sent_start for t in sented_doc]}
example = Example.from_dict(sented_doc, gold)
scores = scorer.score([example])
assert scores["sents_f"] == 1.0
# One sentence start is moved
gold["sent_starts"][3] = 0
gold["sent_starts"][4] = 1
example = Example.from_dict(sented_doc, gold)
scores = scorer.score([example])
assert scores["sents_f"] == approx(0.3333333)
def test_las_per_type(en_vocab):
# Gold and Doc are identical
scorer = Scorer()
examples = []
for input_, annot in test_las_apple:
doc = Doc(
en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"]
)
gold = {"heads": annot["heads"], "deps": annot["deps"]}
example = Example.from_dict(doc, gold)
examples.append(example)
results = scorer.score(examples)
assert results["dep_uas"] == 1.0
assert results["dep_las"] == 1.0
assert results["dep_las_per_type"]["nsubj"]["p"] == 1.0
assert results["dep_las_per_type"]["nsubj"]["r"] == 1.0
assert results["dep_las_per_type"]["nsubj"]["f"] == 1.0
assert results["dep_las_per_type"]["compound"]["p"] == 1.0
assert results["dep_las_per_type"]["compound"]["r"] == 1.0
assert results["dep_las_per_type"]["compound"]["f"] == 1.0
# One dep is incorrect in Doc
scorer = Scorer()
examples = []
for input_, annot in test_las_apple:
doc = Doc(
en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"]
)
gold = {"heads": annot["heads"], "deps": annot["deps"]}
doc[0].dep_ = "compound"
example = Example.from_dict(doc, gold)
examples.append(example)
results = scorer.score(examples)
assert results["dep_uas"] == 1.0
assert_almost_equal(results["dep_las"], 0.9090909)
assert results["dep_las_per_type"]["nsubj"]["p"] == 0
assert results["dep_las_per_type"]["nsubj"]["r"] == 0
assert results["dep_las_per_type"]["nsubj"]["f"] == 0
assert_almost_equal(results["dep_las_per_type"]["compound"]["p"], 0.666666666)
assert results["dep_las_per_type"]["compound"]["r"] == 1.0
assert results["dep_las_per_type"]["compound"]["f"] == 0.8
def test_ner_per_type(en_vocab):
# Gold and Doc are identical
scorer = Scorer()
examples = []
for input_, annot in test_ner_cardinal:
doc = Doc(
en_vocab, words=input_.split(" "), ents=["B-CARDINAL", "O", "B-CARDINAL"]
)
entities = offsets_to_biluo_tags(doc, annot["entities"])
example = Example.from_dict(doc, {"entities": entities})
# a hack for sentence boundaries
example.predicted[1].is_sent_start = False
example.reference[1].is_sent_start = False
examples.append(example)
results = scorer.score(examples)
assert results["ents_p"] == 1.0
assert results["ents_r"] == 1.0
assert results["ents_f"] == 1.0
assert results["ents_per_type"]["CARDINAL"]["p"] == 1.0
assert results["ents_per_type"]["CARDINAL"]["r"] == 1.0
assert results["ents_per_type"]["CARDINAL"]["f"] == 1.0
# Doc has one missing and one extra entity
# Entity type MONEY is not present in Doc
scorer = Scorer()
examples = []
for input_, annot in test_ner_apple:
doc = Doc(
en_vocab,
words=input_.split(" "),
ents=["B-ORG", "O", "O", "O", "O", "B-GPE", "B-ORG", "O", "O", "O"],
)
entities = offsets_to_biluo_tags(doc, annot["entities"])
example = Example.from_dict(doc, {"entities": entities})
# a hack for sentence boundaries
example.predicted[1].is_sent_start = False
example.reference[1].is_sent_start = False
examples.append(example)
results = scorer.score(examples)
assert results["ents_p"] == approx(0.6666666)
assert results["ents_r"] == approx(0.6666666)
assert results["ents_f"] == approx(0.6666666)
assert "GPE" in results["ents_per_type"]
assert "MONEY" in results["ents_per_type"]
assert "ORG" in results["ents_per_type"]
assert results["ents_per_type"]["GPE"]["p"] == 1.0
assert results["ents_per_type"]["GPE"]["r"] == 1.0
assert results["ents_per_type"]["GPE"]["f"] == 1.0
assert results["ents_per_type"]["MONEY"]["p"] == 0
assert results["ents_per_type"]["MONEY"]["r"] == 0
assert results["ents_per_type"]["MONEY"]["f"] == 0
assert results["ents_per_type"]["ORG"]["p"] == 0.5
assert results["ents_per_type"]["ORG"]["r"] == 1.0
assert results["ents_per_type"]["ORG"]["f"] == approx(0.6666666)
def test_tag_score(tagged_doc):
# Gold and Doc are identical
scorer = Scorer()
gold = {
"tags": [t.tag_ for t in tagged_doc],
"pos": [t.pos_ for t in tagged_doc],
"morphs": [str(t.morph) for t in tagged_doc],
"sent_starts": [1 if t.is_sent_start else -1 for t in tagged_doc],
}
example = Example.from_dict(tagged_doc, gold)
results = scorer.score([example])
assert results["tag_acc"] == 1.0
assert results["pos_acc"] == 1.0
assert results["morph_acc"] == 1.0
assert results["morph_micro_f"] == 1.0
assert results["morph_per_feat"]["NounType"]["f"] == 1.0
# Gold annotation is modified
scorer = Scorer()
tags = [t.tag_ for t in tagged_doc]
tags[0] = "NN"
pos = [t.pos_ for t in tagged_doc]
pos[1] = "X"
morphs = [str(t.morph) for t in tagged_doc]
morphs[1] = "Number=sing"
morphs[2] = "Number=plur"
gold = {
"tags": tags,
"pos": pos,
"morphs": morphs,
"sent_starts": gold["sent_starts"],
}
example = Example.from_dict(tagged_doc, gold)
results = scorer.score([example])
assert results["tag_acc"] == 0.9
assert results["pos_acc"] == 0.9
assert results["morph_acc"] == approx(0.8)
assert results["morph_micro_f"] == approx(0.8461538)
assert results["morph_per_feat"]["NounType"]["f"] == 1.0
assert results["morph_per_feat"]["Poss"]["f"] == 0.0
assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272)
def test_partial_annotation(en_tokenizer):
pred_doc = en_tokenizer("a b c d e")
pred_doc[0].tag_ = "A"
pred_doc[0].pos_ = "X"
pred_doc[0].set_morph("Feat=Val")
pred_doc[0].dep_ = "dep"
# unannotated reference
ref_doc = en_tokenizer("a b c d e")
ref_doc.has_unknown_spaces = True
example = Example(pred_doc, ref_doc)
scorer = Scorer()
scores = scorer.score([example])
for key in scores:
# cats doesn't have an unset state
if key.startswith("cats"):
continue
assert scores[key] is None
# partially annotated reference, not overlapping with predicted annotation
ref_doc = en_tokenizer("a b c d e")
ref_doc.has_unknown_spaces = True
ref_doc[1].tag_ = "A"
ref_doc[1].pos_ = "X"
ref_doc[1].set_morph("Feat=Val")
ref_doc[1].dep_ = "dep"
example = Example(pred_doc, ref_doc)
scorer = Scorer()
scores = scorer.score([example])
assert scores["token_acc"] is None
assert scores["tag_acc"] == 0.0
assert scores["pos_acc"] == 0.0
assert scores["morph_acc"] == 0.0
assert scores["dep_uas"] == 1.0
assert scores["dep_las"] == 0.0
assert scores["sents_f"] is None
# partially annotated reference, overlapping with predicted annotation
ref_doc = en_tokenizer("a b c d e")
ref_doc.has_unknown_spaces = True
ref_doc[0].tag_ = "A"
ref_doc[0].pos_ = "X"
ref_doc[1].set_morph("Feat=Val")
ref_doc[1].dep_ = "dep"
example = Example(pred_doc, ref_doc)
scorer = Scorer()
scores = scorer.score([example])
assert scores["token_acc"] is None
assert scores["tag_acc"] == 1.0
assert scores["pos_acc"] == 1.0
assert scores["morph_acc"] == 0.0
assert scores["dep_uas"] == 1.0
assert scores["dep_las"] == 0.0
assert scores["sents_f"] is None
def test_roc_auc_score():
# Binary classification, toy tests from scikit-learn test suite
y_true = [0, 1]
y_score = [0, 1]
tpr, fpr, _ = _roc_curve(y_true, y_score)
roc_auc = _roc_auc_score(y_true, y_score)
assert_array_almost_equal(tpr, [0, 0, 1])
assert_array_almost_equal(fpr, [0, 1, 1])
assert_almost_equal(roc_auc, 1.0)
y_true = [0, 1]
y_score = [1, 0]
tpr, fpr, _ = _roc_curve(y_true, y_score)
roc_auc = _roc_auc_score(y_true, y_score)
assert_array_almost_equal(tpr, [0, 1, 1])
assert_array_almost_equal(fpr, [0, 0, 1])
assert_almost_equal(roc_auc, 0.0)
y_true = [1, 0]
y_score = [1, 1]
tpr, fpr, _ = _roc_curve(y_true, y_score)
roc_auc = _roc_auc_score(y_true, y_score)
assert_array_almost_equal(tpr, [0, 1])
assert_array_almost_equal(fpr, [0, 1])
assert_almost_equal(roc_auc, 0.5)
y_true = [1, 0]
y_score = [1, 0]
tpr, fpr, _ = _roc_curve(y_true, y_score)
roc_auc = _roc_auc_score(y_true, y_score)
assert_array_almost_equal(tpr, [0, 0, 1])
assert_array_almost_equal(fpr, [0, 1, 1])
assert_almost_equal(roc_auc, 1.0)
y_true = [1, 0]
y_score = [0.5, 0.5]
tpr, fpr, _ = _roc_curve(y_true, y_score)
roc_auc = _roc_auc_score(y_true, y_score)
assert_array_almost_equal(tpr, [0, 1])
assert_array_almost_equal(fpr, [0, 1])
assert_almost_equal(roc_auc, 0.5)
# same result as above with ROCAUCScore wrapper
score = ROCAUCScore()
score.score_set(0.5, 1)
score.score_set(0.5, 0)
assert_almost_equal(score.score, 0.5)
# check that errors are raised in undefined cases and score is -inf
y_true = [0, 0]
y_score = [0.25, 0.75]
with pytest.raises(ValueError):
_roc_auc_score(y_true, y_score)
score = ROCAUCScore()
score.score_set(0.25, 0)
score.score_set(0.75, 0)
with pytest.raises(ValueError):
_ = score.score # noqa: F841
y_true = [1, 1]
y_score = [0.25, 0.75]
with pytest.raises(ValueError):
_roc_auc_score(y_true, y_score)
score = ROCAUCScore()
score.score_set(0.25, 1)
score.score_set(0.75, 1)
with pytest.raises(ValueError):
_ = score.score # noqa: F841
def test_score_spans():
nlp = English()
text = "This is just a random sentence."
key = "my_spans"
gold = nlp.make_doc(text)
pred = nlp.make_doc(text)
spans = []
spans.append(gold.char_span(0, 4, label="PERSON"))
spans.append(gold.char_span(0, 7, label="ORG"))
spans.append(gold.char_span(8, 12, label="ORG"))
gold.spans[key] = spans
def span_getter(doc, span_key):
return doc.spans[span_key]
# Predict exactly the same, but overlapping spans will be discarded
pred.spans[key] = spans
eg = Example(pred, gold)
scores = Scorer.score_spans([eg], attr=key, getter=span_getter)
assert scores[f"{key}_p"] == 1.0
assert scores[f"{key}_r"] < 1.0
# Allow overlapping, now both precision and recall should be 100%
pred.spans[key] = spans
eg = Example(pred, gold)
scores = Scorer.score_spans([eg], attr=key, getter=span_getter, allow_overlap=True)
assert scores[f"{key}_p"] == 1.0
assert scores[f"{key}_r"] == 1.0
# Change the predicted labels
new_spans = [Span(pred, span.start, span.end, label="WRONG") for span in spans]
pred.spans[key] = new_spans
eg = Example(pred, gold)
scores = Scorer.score_spans([eg], attr=key, getter=span_getter, allow_overlap=True)
assert scores[f"{key}_p"] == 0.0
assert scores[f"{key}_r"] == 0.0
assert f"{key}_per_type" in scores
# Discard labels from the evaluation
scores = Scorer.score_spans(
[eg], attr=key, getter=span_getter, allow_overlap=True, labeled=False
)
assert scores[f"{key}_p"] == 1.0
assert scores[f"{key}_r"] == 1.0
assert f"{key}_per_type" not in scores
def test_prf_score():
cand = {"hi", "ho"}
gold1 = {"yo", "hi"}
gold2 = set()
a = PRFScore()
a.score_set(cand=cand, gold=gold1)
assert (a.precision, a.recall, a.fscore) == approx((0.5, 0.5, 0.5))
b = PRFScore()
b.score_set(cand=cand, gold=gold2)
assert (b.precision, b.recall, b.fscore) == approx((0.0, 0.0, 0.0))
c = a + b
assert (c.precision, c.recall, c.fscore) == approx((0.25, 0.5, 0.33333333))
a += b
assert (a.precision, a.recall, a.fscore) == approx(
(c.precision, c.recall, c.fscore)
)
def test_score_cats(en_tokenizer):
text = "some text"
gold_doc = en_tokenizer(text)
gold_doc.cats = {"POSITIVE": 1.0, "NEGATIVE": 0.0}
pred_doc = en_tokenizer(text)
pred_doc.cats = {"POSITIVE": 0.75, "NEGATIVE": 0.25}
example = Example(pred_doc, gold_doc)
# threshold is ignored for multi_label=False
scores1 = Scorer.score_cats(
[example],
"cats",
labels=list(gold_doc.cats.keys()),
multi_label=False,
positive_label="POSITIVE",
threshold=0.1,
)
scores2 = Scorer.score_cats(
[example],
"cats",
labels=list(gold_doc.cats.keys()),
multi_label=False,
positive_label="POSITIVE",
threshold=0.9,
)
assert scores1["cats_score"] == 1.0
assert scores2["cats_score"] == 1.0
assert scores1 == scores2
# threshold is relevant for multi_label=True
scores = Scorer.score_cats(
[example],
"cats",
labels=list(gold_doc.cats.keys()),
multi_label=True,
threshold=0.9,
)
assert scores["cats_macro_f"] == 0.0
# threshold is relevant for multi_label=True
scores = Scorer.score_cats(
[example],
"cats",
labels=list(gold_doc.cats.keys()),
multi_label=True,
threshold=0.1,
)
assert scores["cats_macro_f"] == 0.5