mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Be less choosy about reporting textcat scores (#5879)
* Set textcat scores more consistently
* Refactor textcat scores
* Fixes to scorer
* Add comments
* Add threshold
* Rename just 'f' to micro_f in textcat scorer
* Fix textcat score for two-class
* Fix syntax
* Fix textcat score
* Fix docstring
This commit is contained in:
parent
5e683a6e46
commit
d4525816ef
141
spacy/scorer.py
141
spacy/scorer.py
|
@ -319,6 +319,7 @@ class Scorer:
|
||||||
labels: Iterable[str] = tuple(),
|
labels: Iterable[str] = tuple(),
|
||||||
multi_label: bool = True,
|
multi_label: bool = True,
|
||||||
positive_label: Optional[str] = None,
|
positive_label: Optional[str] = None,
|
||||||
|
threshold: Optional[float] = None,
|
||||||
**cfg,
|
**cfg,
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
"""Returns PRF and ROC AUC scores for a doc-level attribute with a
|
"""Returns PRF and ROC AUC scores for a doc-level attribute with a
|
||||||
|
@ -334,94 +335,104 @@ class Scorer:
|
||||||
Defaults to True.
|
Defaults to True.
|
||||||
positive_label (str): The positive label for a binary task with
|
positive_label (str): The positive label for a binary task with
|
||||||
exclusive classes. Defaults to None.
|
exclusive classes. Defaults to None.
|
||||||
|
threshold (float): Cutoff to consider a prediction "positive". Defaults
|
||||||
|
to 0.5 for multi-label, and 0.0 (i.e. whatever's highest scoring)
|
||||||
|
otherwise.
|
||||||
RETURNS (Dict[str, Any]): A dictionary containing the scores, with
|
RETURNS (Dict[str, Any]): A dictionary containing the scores, with
|
||||||
inapplicable scores as None:
|
inapplicable scores as None:
|
||||||
for all:
|
for all:
|
||||||
attr_score (one of attr_f / attr_macro_f / attr_macro_auc),
|
attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc),
|
||||||
attr_score_desc (text description of the overall score),
|
attr_score_desc (text description of the overall score),
|
||||||
|
attr_micro_f,
|
||||||
|
attr_macro_f,
|
||||||
|
attr_auc,
|
||||||
attr_f_per_type,
|
attr_f_per_type,
|
||||||
attr_auc_per_type
|
attr_auc_per_type
|
||||||
for binary exclusive with positive label: attr_p/r/f
|
|
||||||
for 3+ exclusive classes, macro-averaged fscore: attr_macro_f
|
|
||||||
for multilabel, macro-averaged AUC: attr_macro_auc
|
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/scorer#score_cats
|
DOCS: https://spacy.io/api/scorer#score_cats
|
||||||
"""
|
"""
|
||||||
score = PRFScore()
|
if threshold is None:
|
||||||
f_per_type = dict()
|
threshold = 0.5 if multi_label else 0.0
|
||||||
auc_per_type = dict()
|
f_per_type = {label: PRFScore() for label in labels}
|
||||||
for label in labels:
|
auc_per_type = {label: ROCAUCScore() for label in labels}
|
||||||
f_per_type[label] = PRFScore()
|
labels = set(labels)
|
||||||
auc_per_type[label] = ROCAUCScore()
|
if labels:
|
||||||
|
for eg in examples:
|
||||||
|
labels.update(eg.predicted.cats.keys())
|
||||||
|
labels.update(eg.reference.cats.keys())
|
||||||
for example in examples:
|
for example in examples:
|
||||||
gold_doc = example.reference
|
# Through this loop, None in the gold_cats indicates missing label.
|
||||||
pred_doc = example.predicted
|
pred_cats = getter(example.predicted, attr)
|
||||||
gold_values = getter(gold_doc, attr)
|
gold_cats = getter(example.reference, attr)
|
||||||
pred_values = getter(pred_doc, attr)
|
|
||||||
if (
|
# I think the AUC metric is applicable regardless of whether we're
|
||||||
len(gold_values) > 0
|
# doing multi-label classification? Unsure. If not, move this into
|
||||||
and set(f_per_type) == set(auc_per_type) == set(gold_values)
|
# the elif pred_cats and gold_cats block below.
|
||||||
and set(gold_values) == set(pred_values)
|
for label in labels:
|
||||||
):
|
pred_score = pred_cats.get(label, 0.0)
|
||||||
gold_val = max(gold_values, key=gold_values.get)
|
gold_score = gold_cats.get(label, 0.0)
|
||||||
pred_val = max(pred_values, key=pred_values.get)
|
if gold_score is not None:
|
||||||
if positive_label:
|
auc_per_type[label].score_set(pred_score, gold_score)
|
||||||
score.score_set(
|
if multi_label:
|
||||||
set([positive_label]) & set([pred_val]),
|
for label in labels:
|
||||||
set([positive_label]) & set([gold_val]),
|
pred_score = pred_cats.get(label, 0.0)
|
||||||
)
|
gold_score = gold_cats.get(label, 0.0)
|
||||||
for label in set(gold_values):
|
if gold_score is not None:
|
||||||
auc_per_type[label].score_set(
|
if pred_score >= threshold and gold_score > 0:
|
||||||
pred_values[label], gold_values[label]
|
f_per_type[label].tp += 1
|
||||||
)
|
elif pred_score >= threshold and gold_score == 0:
|
||||||
f_per_type[label].score_set(
|
f_per_type[label].fp += 1
|
||||||
set([label]) & set([pred_val]), set([label]) & set([gold_val])
|
elif pred_score < threshold and gold_score > 0:
|
||||||
)
|
f_per_type[label].fn += 1
|
||||||
elif len(f_per_type) > 0:
|
elif pred_cats and gold_cats:
|
||||||
model_labels = set(f_per_type)
|
# Get the highest-scoring for each.
|
||||||
eval_labels = set(gold_values)
|
pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
|
||||||
raise ValueError(
|
gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
|
||||||
Errors.E162.format(
|
if gold_score is not None:
|
||||||
model_labels=model_labels, eval_labels=eval_labels
|
if pred_label == gold_label and pred_score >= threshold:
|
||||||
)
|
f_per_type[pred_label].tp += 1
|
||||||
)
|
else:
|
||||||
elif len(auc_per_type) > 0:
|
f_per_type[gold_label].fn += 1
|
||||||
model_labels = set(auc_per_type)
|
if pred_score >= threshold:
|
||||||
eval_labels = set(gold_values)
|
f_per_type[pred_label].fp += 1
|
||||||
raise ValueError(
|
elif gold_cats:
|
||||||
Errors.E162.format(
|
gold_label, gold_score = max(gold_cats, key=lambda it: it[1])
|
||||||
model_labels=model_labels, eval_labels=eval_labels
|
if gold_score is not None and gold_score > 0:
|
||||||
)
|
f_per_type[gold_label].fn += 1
|
||||||
)
|
else:
|
||||||
|
pred_label, pred_score = max(pred_cats, key=lambda it: it[1])
|
||||||
|
if pred_score >= threshold:
|
||||||
|
f_per_type[pred_label].fp += 1
|
||||||
|
micro_prf = PRFScore()
|
||||||
|
for label_prf in f_per_type.values():
|
||||||
|
micro_prf.tp = label_prf.tp
|
||||||
|
micro_prf.fn = label_prf.fn
|
||||||
|
micro_prf.fp = label_prf.fp
|
||||||
|
n_cats = len(f_per_type) + 1e-100
|
||||||
|
macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
|
||||||
|
macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
|
||||||
|
macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats
|
||||||
results = {
|
results = {
|
||||||
f"{attr}_score": None,
|
f"{attr}_score": None,
|
||||||
f"{attr}_score_desc": None,
|
f"{attr}_score_desc": None,
|
||||||
f"{attr}_p": None,
|
f"{attr}_micro_p": micro_prf.precision,
|
||||||
f"{attr}_r": None,
|
f"{attr}_micro_r": micro_prf.recall,
|
||||||
f"{attr}_f": None,
|
f"{attr}_micro_f": micro_prf.fscore,
|
||||||
f"{attr}_macro_f": None,
|
f"{attr}_macro_p": macro_p,
|
||||||
|
f"{attr}_macro_r": macro_r,
|
||||||
|
f"{attr}_macro_f": macro_f,
|
||||||
f"{attr}_macro_auc": None,
|
f"{attr}_macro_auc": None,
|
||||||
f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
|
f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
|
||||||
f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
|
f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
|
||||||
}
|
}
|
||||||
if len(labels) == 2 and not multi_label and positive_label:
|
if len(labels) == 2 and not multi_label and positive_label:
|
||||||
results[f"{attr}_p"] = score.precision
|
positive_label_f = results[f"{attr}_f_per_type"][positive_label]['f']
|
||||||
results[f"{attr}_r"] = score.recall
|
results[f"{attr}_score"] = positive_label_f
|
||||||
results[f"{attr}_f"] = score.fscore
|
|
||||||
results[f"{attr}_score"] = results[f"{attr}_f"]
|
|
||||||
results[f"{attr}_score_desc"] = f"F ({positive_label})"
|
results[f"{attr}_score_desc"] = f"F ({positive_label})"
|
||||||
elif not multi_label:
|
elif not multi_label:
|
||||||
results[f"{attr}_macro_f"] = sum(
|
|
||||||
[score.fscore for label, score in f_per_type.items()]
|
|
||||||
) / (len(f_per_type) + 1e-100)
|
|
||||||
results[f"{attr}_score"] = results[f"{attr}_macro_f"]
|
results[f"{attr}_score"] = results[f"{attr}_macro_f"]
|
||||||
results[f"{attr}_score_desc"] = "macro F"
|
results[f"{attr}_score_desc"] = "macro F"
|
||||||
else:
|
else:
|
||||||
results[f"{attr}_macro_auc"] = max(
|
|
||||||
sum([score.score for label, score in auc_per_type.items()])
|
|
||||||
/ (len(auc_per_type) + 1e-100),
|
|
||||||
-1,
|
|
||||||
)
|
|
||||||
results[f"{attr}_score"] = results[f"{attr}_macro_auc"]
|
results[f"{attr}_score"] = results[f"{attr}_macro_auc"]
|
||||||
results[f"{attr}_score_desc"] = "macro AUC"
|
results[f"{attr}_score_desc"] = "macro AUC"
|
||||||
return results
|
return results
|
||||||
|
|
|
@ -117,8 +117,10 @@ def test_overfitting_IO():
|
||||||
assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
|
assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
|
||||||
|
|
||||||
# Test scoring
|
# Test scoring
|
||||||
scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})
|
scores = nlp.evaluate(
|
||||||
assert scores["cats_f"] == 1.0
|
train_examples, scorer_cfg={"positive_label": "POSITIVE"}
|
||||||
|
)
|
||||||
|
assert scores["cats_micro_f"] == 1.0
|
||||||
assert scores["cats_score"] == 1.0
|
assert scores["cats_score"] == 1.0
|
||||||
assert "cats_score_desc" in scores
|
assert "cats_score_desc" in scores
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user