spaCy/spacy/tests/test_scorer.py
Adriane Boyd c4112a1da3
Require that all SpanGroup spans are from the current doc (#12569)
* Require that all SpanGroup spans are from the current doc

The restriction on only adding spans from the current doc was already
implemented for all operations except `SpanGroup.__init__`.

Initialize copied spans for `SpanGroup.copy` with `Doc.char_span` in
order to validate the character offsets and to make it possible to copy
spans between documents with differing tokenization. There is currently
no validation that the document texts are identical, but each span's
character offsets must form a valid span in the target doc, which
prevents you from ending up with completely invalid spans.
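
An illustrative sketch of the new behavior (the texts, offsets and span
key here are made up for the example, not taken from the diff):

    import spacy

    nlp = spacy.blank("en")
    doc = nlp("Apple is looking at buying U.K. startup")
    other = nlp("Apple is looking at buying U.K. startup")
    doc.spans["key"] = [doc.char_span(0, 5, label="ORG")]
    # copy() re-creates each span via Doc.char_span, so the character
    # offsets must form valid spans in the target doc
    copied = doc.spans["key"].copy(doc=other)
    # Constructing a group from another doc's spans is now rejected:
    # doc.spans["key"] = [other.char_span(0, 5)]  # raises an error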

* Undo change in test_beam_overfitting_IO
2023-06-01 19:19:17 +02:00


from numpy.testing import assert_almost_equal, assert_array_almost_equal
import pytest
from pytest import approx
from spacy.training import Example
from spacy.training.iob_utils import offsets_to_biluo_tags
from spacy.scorer import Scorer, ROCAUCScore, PRFScore
from spacy.scorer import _roc_auc_score, _roc_curve
from spacy.lang.en import English
from spacy.tokens import Doc, Span
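

# Gold annotations used by the dependency and NER tests below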
test_las_apple = [
[
"Apple is looking at buying U.K. startup for $ 1 billion",
{
"heads": [2, 2, 2, 2, 3, 6, 4, 4, 10, 10, 7],
"deps": [
"nsubj",
"aux",
"ROOT",
"prep",
"pcomp",
"compound",
"dobj",
"prep",
"quantmod",
"compound",
"pobj",
],
},
]
]


test_ner_cardinal = [
["100 - 200", {"entities": [[0, 3, "CARDINAL"], [6, 9, "CARDINAL"]]}]
]


test_ner_apple = [
[
"Apple is looking at buying U.K. startup for $1 billion",
{"entities": [(0, 5, "ORG"), (27, 31, "GPE"), (44, 54, "MONEY")]},
]
]


@pytest.fixture
def tagged_doc():
text = "Sarah's sister flew to Silicon Valley via London."
tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
pos = [
"PROPN",
"PART",
"NOUN",
"VERB",
"ADP",
"PROPN",
"PROPN",
"ADP",
"PROPN",
"PUNCT",
]
morphs = [
"NounType=prop|Number=sing",
"Poss=yes",
"Number=sing",
"Tense=past|VerbForm=fin",
"",
"NounType=prop|Number=sing",
"NounType=prop|Number=sing",
"",
"NounType=prop|Number=sing",
"PunctType=peri",
]
nlp = English()
doc = nlp(text)
for i in range(len(tags)):
doc[i].tag_ = tags[i]
doc[i].pos_ = pos[i]
doc[i].set_morph(morphs[i])
if i > 0:
doc[i].is_sent_start = False
    return doc


@pytest.fixture
def sented_doc():
text = "One sentence. Two sentences. Three sentences."
nlp = English()
doc = nlp(text)
for i in range(len(doc)):
if i % 3 == 0:
doc[i].is_sent_start = True
else:
doc[i].is_sent_start = False
    return doc


def test_tokenization(sented_doc):
scorer = Scorer()
gold = {"sent_starts": [t.sent_start for t in sented_doc]}
example = Example.from_dict(sented_doc, gold)
scores = scorer.score([example])
assert scores["token_acc"] == 1.0
nlp = English()
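    # Re-tokenize the prediction more coarsely: only "One", "Two" and "Three"
    # align with the nine reference tokens, so p = 3/6, r = 3/9 and f = 0.4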
example.predicted = Doc(
nlp.vocab,
words=["One", "sentence.", "Two", "sentences.", "Three", "sentences."],
spaces=[True, True, True, True, True, False],
)
example.predicted[1].is_sent_start = False
scores = scorer.score([example])
assert scores["token_acc"] == 0.5
assert scores["token_p"] == 0.5
assert scores["token_r"] == approx(0.33333333)
assert scores["token_f"] == 0.4
# per-component scoring
scorer = Scorer()
scores = scorer.score([example], per_component=True)
assert scores["tokenizer"]["token_acc"] == 0.5
assert scores["tokenizer"]["token_p"] == 0.5
assert scores["tokenizer"]["token_r"] == approx(0.33333333)
assert scores["tokenizer"]["token_f"] == 0.4
def test_sents(sented_doc):
scorer = Scorer()
gold = {"sent_starts": [t.sent_start for t in sented_doc]}
example = Example.from_dict(sented_doc, gold)
scores = scorer.score([example])
assert scores["sents_f"] == 1.0
    # One sentence start is moved: only the final sentence still matches
    # exactly, so p = r = f = 1/3
gold["sent_starts"][3] = 0
gold["sent_starts"][4] = 1
example = Example.from_dict(sented_doc, gold)
scores = scorer.score([example])
assert scores["sents_f"] == approx(0.3333333)
def test_las_per_type(en_vocab):
# Gold and Doc are identical
scorer = Scorer()
examples = []
for input_, annot in test_las_apple:
doc = Doc(
en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"]
)
gold = {"heads": annot["heads"], "deps": annot["deps"]}
example = Example.from_dict(doc, gold)
examples.append(example)
results = scorer.score(examples)
assert results["dep_uas"] == 1.0
assert results["dep_las"] == 1.0
assert results["dep_las_per_type"]["nsubj"]["p"] == 1.0
assert results["dep_las_per_type"]["nsubj"]["r"] == 1.0
assert results["dep_las_per_type"]["nsubj"]["f"] == 1.0
assert results["dep_las_per_type"]["compound"]["p"] == 1.0
assert results["dep_las_per_type"]["compound"]["r"] == 1.0
assert results["dep_las_per_type"]["compound"]["f"] == 1.0
    # One dep label is changed in the Doc; the heads are untouched, so UAS
    # stays 1.0 while LAS drops
scorer = Scorer()
examples = []
for input_, annot in test_las_apple:
doc = Doc(
en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"]
)
gold = {"heads": annot["heads"], "deps": annot["deps"]}
doc[0].dep_ = "compound"
example = Example.from_dict(doc, gold)
examples.append(example)
results = scorer.score(examples)
assert results["dep_uas"] == 1.0
assert_almost_equal(results["dep_las"], 0.9090909)
assert results["dep_las_per_type"]["nsubj"]["p"] == 0
assert results["dep_las_per_type"]["nsubj"]["r"] == 0
assert results["dep_las_per_type"]["nsubj"]["f"] == 0
assert_almost_equal(results["dep_las_per_type"]["compound"]["p"], 0.666666666)
assert results["dep_las_per_type"]["compound"]["r"] == 1.0
assert results["dep_las_per_type"]["compound"]["f"] == 0.8
def test_ner_per_type(en_vocab):
# Gold and Doc are identical
scorer = Scorer()
examples = []
for input_, annot in test_ner_cardinal:
doc = Doc(
en_vocab, words=input_.split(" "), ents=["B-CARDINAL", "O", "B-CARDINAL"]
)
entities = offsets_to_biluo_tags(doc, annot["entities"])
example = Example.from_dict(doc, {"entities": entities})
# a hack for sentence boundaries
example.predicted[1].is_sent_start = False
example.reference[1].is_sent_start = False
examples.append(example)
results = scorer.score(examples)
assert results["ents_p"] == 1.0
assert results["ents_r"] == 1.0
assert results["ents_f"] == 1.0
assert results["ents_per_type"]["CARDINAL"]["p"] == 1.0
assert results["ents_per_type"]["CARDINAL"]["r"] == 1.0
assert results["ents_per_type"]["CARDINAL"]["f"] == 1.0
# Doc has one missing and one extra entity
# Entity type MONEY is not present in Doc
scorer = Scorer()
examples = []
for input_, annot in test_ner_apple:
doc = Doc(
en_vocab,
words=input_.split(" "),
ents=["B-ORG", "O", "O", "O", "O", "B-GPE", "B-ORG", "O", "O", "O"],
)
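        # Predicted: ORG "Apple", GPE "U.K.", ORG "startup"; the gold MONEY
        # span "$1 billion" is missed entirely, so overall p = r = 2/3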
entities = offsets_to_biluo_tags(doc, annot["entities"])
example = Example.from_dict(doc, {"entities": entities})
# a hack for sentence boundaries
example.predicted[1].is_sent_start = False
example.reference[1].is_sent_start = False
examples.append(example)
results = scorer.score(examples)
assert results["ents_p"] == approx(0.6666666)
assert results["ents_r"] == approx(0.6666666)
assert results["ents_f"] == approx(0.6666666)
assert "GPE" in results["ents_per_type"]
assert "MONEY" in results["ents_per_type"]
assert "ORG" in results["ents_per_type"]
assert results["ents_per_type"]["GPE"]["p"] == 1.0
assert results["ents_per_type"]["GPE"]["r"] == 1.0
assert results["ents_per_type"]["GPE"]["f"] == 1.0
assert results["ents_per_type"]["MONEY"]["p"] == 0
assert results["ents_per_type"]["MONEY"]["r"] == 0
assert results["ents_per_type"]["MONEY"]["f"] == 0
assert results["ents_per_type"]["ORG"]["p"] == 0.5
assert results["ents_per_type"]["ORG"]["r"] == 1.0
assert results["ents_per_type"]["ORG"]["f"] == approx(0.6666666)
def test_tag_score(tagged_doc):
# Gold and Doc are identical
scorer = Scorer()
gold = {
"tags": [t.tag_ for t in tagged_doc],
"pos": [t.pos_ for t in tagged_doc],
"morphs": [str(t.morph) for t in tagged_doc],
"sent_starts": [1 if t.is_sent_start else -1 for t in tagged_doc],
}
example = Example.from_dict(tagged_doc, gold)
results = scorer.score([example])
assert results["tag_acc"] == 1.0
assert results["pos_acc"] == 1.0
assert results["morph_acc"] == 1.0
assert results["morph_micro_f"] == 1.0
assert results["morph_per_feat"]["NounType"]["f"] == 1.0
# Gold annotation is modified
scorer = Scorer()
tags = [t.tag_ for t in tagged_doc]
tags[0] = "NN"
pos = [t.pos_ for t in tagged_doc]
pos[1] = "X"
morphs = [str(t.morph) for t in tagged_doc]
morphs[1] = "Number=sing"
morphs[2] = "Number=plur"
gold = {
"tags": tags,
"pos": pos,
"morphs": morphs,
"sent_starts": gold["sent_starts"],
}
example = Example.from_dict(tagged_doc, gold)
results = scorer.score([example])
assert results["tag_acc"] == 0.9
assert results["pos_acc"] == 0.9
assert results["morph_acc"] == approx(0.8)
assert results["morph_micro_f"] == approx(0.8461538)
assert results["morph_per_feat"]["NounType"]["f"] == 1.0
assert results["morph_per_feat"]["Poss"]["f"] == 0.0
assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272)
# per-component scoring
scorer = Scorer()
results = scorer.score([example], per_component=True)
assert results["tagger"]["tag_acc"] == 0.9
assert results["morphologizer"]["pos_acc"] == 0.9
assert results["morphologizer"]["morph_acc"] == approx(0.8)
def test_partial_annotation(en_tokenizer):
pred_doc = en_tokenizer("a b c d e")
pred_doc[0].tag_ = "A"
pred_doc[0].pos_ = "X"
pred_doc[0].set_morph("Feat=Val")
pred_doc[0].dep_ = "dep"
# unannotated reference
ref_doc = en_tokenizer("a b c d e")
ref_doc.has_unknown_spaces = True
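    # Unknown spaces mark the tokenization as unannotated, so token_acc
    # will be None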
example = Example(pred_doc, ref_doc)
scorer = Scorer()
scores = scorer.score([example])
for key in scores:
# cats doesn't have an unset state
if key.startswith("cats"):
continue
assert scores[key] is None
# partially annotated reference, not overlapping with predicted annotation
ref_doc = en_tokenizer("a b c d e")
ref_doc.has_unknown_spaces = True
ref_doc[1].tag_ = "A"
ref_doc[1].pos_ = "X"
ref_doc[1].set_morph("Feat=Val")
ref_doc[1].dep_ = "dep"
example = Example(pred_doc, ref_doc)
scorer = Scorer()
scores = scorer.score([example])
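    # Token-level scores only consider tokens annotated in the reference,
    # and the predicted annotation on token 0 doesn't overlap with token 1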
assert scores["token_acc"] is None
assert scores["tag_acc"] == 0.0
assert scores["pos_acc"] == 0.0
assert scores["morph_acc"] == 0.0
assert scores["dep_uas"] == 1.0
assert scores["dep_las"] == 0.0
assert scores["sents_f"] is None
# partially annotated reference, overlapping with predicted annotation
ref_doc = en_tokenizer("a b c d e")
ref_doc.has_unknown_spaces = True
ref_doc[0].tag_ = "A"
ref_doc[0].pos_ = "X"
ref_doc[1].set_morph("Feat=Val")
ref_doc[1].dep_ = "dep"
example = Example(pred_doc, ref_doc)
scorer = Scorer()
scores = scorer.score([example])
assert scores["token_acc"] is None
assert scores["tag_acc"] == 1.0
assert scores["pos_acc"] == 1.0
assert scores["morph_acc"] == 0.0
assert scores["dep_uas"] == 1.0
assert scores["dep_las"] == 0.0
assert scores["sents_f"] is None
def test_roc_auc_score():
# Binary classification, toy tests from scikit-learn test suite
y_true = [0, 1]
y_score = [0, 1]
    # _roc_curve mirrors sklearn's roc_curve and returns (fpr, tpr, thresholds)
    fpr, tpr, _ = _roc_curve(y_true, y_score)
    roc_auc = _roc_auc_score(y_true, y_score)
    assert_array_almost_equal(fpr, [0, 0, 1])
    assert_array_almost_equal(tpr, [0, 1, 1])
    assert_almost_equal(roc_auc, 1.0)

y_true = [0, 1]
y_score = [1, 0]
    fpr, tpr, _ = _roc_curve(y_true, y_score)
    roc_auc = _roc_auc_score(y_true, y_score)
    assert_array_almost_equal(fpr, [0, 1, 1])
    assert_array_almost_equal(tpr, [0, 0, 1])
    assert_almost_equal(roc_auc, 0.0)

y_true = [1, 0]
y_score = [1, 1]
    fpr, tpr, _ = _roc_curve(y_true, y_score)
    roc_auc = _roc_auc_score(y_true, y_score)
    assert_array_almost_equal(fpr, [0, 1])
    assert_array_almost_equal(tpr, [0, 1])
    assert_almost_equal(roc_auc, 0.5)

y_true = [1, 0]
y_score = [1, 0]
    fpr, tpr, _ = _roc_curve(y_true, y_score)
    roc_auc = _roc_auc_score(y_true, y_score)
    assert_array_almost_equal(fpr, [0, 0, 1])
    assert_array_almost_equal(tpr, [0, 1, 1])
    assert_almost_equal(roc_auc, 1.0)

y_true = [1, 0]
y_score = [0.5, 0.5]
    fpr, tpr, _ = _roc_curve(y_true, y_score)
    roc_auc = _roc_auc_score(y_true, y_score)
    assert_array_almost_equal(fpr, [0, 1])
    assert_array_almost_equal(tpr, [0, 1])
    assert_almost_equal(roc_auc, 0.5)

# same result as above with ROCAUCScore wrapper
score = ROCAUCScore()
score.score_set(0.5, 1)
score.score_set(0.5, 0)
assert_almost_equal(score.score, 0.5)
# check that errors are raised in undefined cases and score is -inf
y_true = [0, 0]
y_score = [0.25, 0.75]
with pytest.raises(ValueError):
_roc_auc_score(y_true, y_score)
score = ROCAUCScore()
score.score_set(0.25, 0)
score.score_set(0.75, 0)
with pytest.raises(ValueError):
_ = score.score # noqa: F841
y_true = [1, 1]
y_score = [0.25, 0.75]
with pytest.raises(ValueError):
_roc_auc_score(y_true, y_score)
score = ROCAUCScore()
score.score_set(0.25, 1)
score.score_set(0.75, 1)
with pytest.raises(ValueError):
        _ = score.score  # noqa: F841


def test_score_spans():
nlp = English()
text = "This is just a random sentence."
key = "my_spans"
gold = nlp.make_doc(text)
pred = nlp.make_doc(text)
spans = []
spans.append(gold.char_span(0, 4, label="PERSON"))
spans.append(gold.char_span(0, 7, label="ORG"))
spans.append(gold.char_span(8, 12, label="ORG"))
    gold.spans[key] = spans

    def span_getter(doc, span_key):
        return doc.spans[span_key]

    # Predict exactly the same, but overlapping spans will be discarded from
    # the prediction: "This" (0, 4) overlaps "This is" (0, 7), so recall
    # drops below 1.0
pred.spans[key] = gold.spans[key].copy(doc=pred)
eg = Example(pred, gold)
scores = Scorer.score_spans([eg], attr=key, getter=span_getter)
assert scores[f"{key}_p"] == 1.0
assert scores[f"{key}_r"] < 1.0
# Allow overlapping, now both precision and recall should be 100%
pred.spans[key] = gold.spans[key].copy(doc=pred)
eg = Example(pred, gold)
scores = Scorer.score_spans([eg], attr=key, getter=span_getter, allow_overlap=True)
assert scores[f"{key}_p"] == 1.0
assert scores[f"{key}_r"] == 1.0
# Change the predicted labels
new_spans = [Span(pred, span.start, span.end, label="WRONG") for span in spans]
pred.spans[key] = new_spans
eg = Example(pred, gold)
scores = Scorer.score_spans([eg], attr=key, getter=span_getter, allow_overlap=True)
assert scores[f"{key}_p"] == 0.0
assert scores[f"{key}_r"] == 0.0
assert f"{key}_per_type" in scores
# Discard labels from the evaluation
scores = Scorer.score_spans(
[eg], attr=key, getter=span_getter, allow_overlap=True, labeled=False
)
assert scores[f"{key}_p"] == 1.0
assert scores[f"{key}_r"] == 1.0
assert f"{key}_per_type" not in scores
def test_prf_score():
cand = {"hi", "ho"}
gold1 = {"yo", "hi"}
gold2 = set()
a = PRFScore()
a.score_set(cand=cand, gold=gold1)
assert (a.precision, a.recall, a.fscore) == approx((0.5, 0.5, 0.5))
b = PRFScore()
b.score_set(cand=cand, gold=gold2)
assert (b.precision, b.recall, b.fscore) == approx((0.0, 0.0, 0.0))
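    # Adding PRFScores accumulates their tp/fp/fn counts (a micro-average):
    # combined tp=1, fp=3, fn=1 gives p = 0.25 and r = 0.5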
c = a + b
assert (c.precision, c.recall, c.fscore) == approx((0.25, 0.5, 0.33333333))
a += b
assert (a.precision, a.recall, a.fscore) == approx(
(c.precision, c.recall, c.fscore)
    )


def test_score_cats(en_tokenizer):
text = "some text"
gold_doc = en_tokenizer(text)
gold_doc.cats = {"POSITIVE": 1.0, "NEGATIVE": 0.0}
pred_doc = en_tokenizer(text)
pred_doc.cats = {"POSITIVE": 0.75, "NEGATIVE": 0.25}
example = Example(pred_doc, gold_doc)
    # threshold is ignored for multi_label=False: mutually exclusive classes
    # are scored on the highest-scoring label
scores1 = Scorer.score_cats(
[example],
"cats",
labels=list(gold_doc.cats.keys()),
multi_label=False,
positive_label="POSITIVE",
threshold=0.1,
)
scores2 = Scorer.score_cats(
[example],
"cats",
labels=list(gold_doc.cats.keys()),
multi_label=False,
positive_label="POSITIVE",
threshold=0.9,
)
assert scores1["cats_score"] == 1.0
assert scores2["cats_score"] == 1.0
assert scores1 == scores2
    # threshold is relevant for multi_label=True: at 0.9 neither score clears
    # the threshold, so no label is predicted and macro F is 0.0
scores = Scorer.score_cats(
[example],
"cats",
labels=list(gold_doc.cats.keys()),
multi_label=True,
threshold=0.9,
)
assert scores["cats_macro_f"] == 0.0
    # threshold is relevant for multi_label=True: at 0.1 both labels are
    # predicted, making POSITIVE a true positive and NEGATIVE a false positive
scores = Scorer.score_cats(
[example],
"cats",
labels=list(gold_doc.cats.keys()),
multi_label=True,
threshold=0.1,
)
assert scores["cats_macro_f"] == 0.5