Be less choosy about reporting textcat scores (#5879)

* Set textcat scores more consistently

* Refactor textcat scores

* Fixes to scorer

* Add comments

* Add threshold

* Rename just 'f' to micro_f in textcat scorer

* Fix textcat score for two-class

* Fix syntax

* Fix textcat score

* Fix docstring
Matthew Honnibal 2020-08-06 16:24:13 +02:00 committed by GitHub
parent 5e683a6e46
commit d4525816ef
2 changed files with 80 additions and 67 deletions
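
A hypothetical usage sketch of the renamed score keys after this change: the trained `nlp` pipeline and the `examples` list are assumed to already exist and are not part of this commit, while the key names and the `scorer_cfg` usage come from the diff below.

    # Hypothetical fragment: `nlp` and `examples` are assumed to exist already.
    scores = nlp.evaluate(examples, scorer_cfg={"positive_label": "POSITIVE"})
    print(scores["cats_score"], scores["cats_score_desc"])  # overall score and its description
    print(scores["cats_micro_f"])     # reported as "cats_f" before this commit
    print(scores["cats_macro_f"])     # macro-averaged F over all labels
    print(scores["cats_f_per_type"])  # per-label precision/recall/F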

View File

@@ -319,6 +319,7 @@ class Scorer:
         labels: Iterable[str] = tuple(),
         multi_label: bool = True,
         positive_label: Optional[str] = None,
+        threshold: Optional[float] = None,
         **cfg,
     ) -> Dict[str, Any]:
         """Returns PRF and ROC AUC scores for a doc-level attribute with a
@@ -334,94 +335,104 @@ class Scorer:
             Defaults to True.
         positive_label (str): The positive label for a binary task with
             exclusive classes. Defaults to None.
+        threshold (float): Cutoff to consider a prediction "positive". Defaults
+            to 0.5 for multi-label, and 0.0 (i.e. whatever's highest scoring)
+            otherwise.
         RETURNS (Dict[str, Any]): A dictionary containing the scores, with
             inapplicable scores as None:
             for all:
-                attr_score (one of attr_f / attr_macro_f / attr_macro_auc),
+                attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc),
                 attr_score_desc (text description of the overall score),
+                attr_micro_f,
+                attr_macro_f,
+                attr_auc,
                 attr_f_per_type,
                 attr_auc_per_type
-            for binary exclusive with positive label: attr_p/r/f
-            for 3+ exclusive classes, macro-averaged fscore: attr_macro_f
-            for multilabel, macro-averaged AUC: attr_macro_auc
         DOCS: https://spacy.io/api/scorer#score_cats
         """
-        score = PRFScore()
-        f_per_type = dict()
-        auc_per_type = dict()
-        for label in labels:
-            f_per_type[label] = PRFScore()
-            auc_per_type[label] = ROCAUCScore()
+        if threshold is None:
+            threshold = 0.5 if multi_label else 0.0
+        f_per_type = {label: PRFScore() for label in labels}
+        auc_per_type = {label: ROCAUCScore() for label in labels}
+        labels = set(labels)
+        if labels:
+            for eg in examples:
+                labels.update(eg.predicted.cats.keys())
+                labels.update(eg.reference.cats.keys())
         for example in examples:
-            gold_doc = example.reference
-            pred_doc = example.predicted
-            gold_values = getter(gold_doc, attr)
-            pred_values = getter(pred_doc, attr)
-            if (
-                len(gold_values) > 0
-                and set(f_per_type) == set(auc_per_type) == set(gold_values)
-                and set(gold_values) == set(pred_values)
-            ):
-                gold_val = max(gold_values, key=gold_values.get)
-                pred_val = max(pred_values, key=pred_values.get)
-                if positive_label:
-                    score.score_set(
-                        set([positive_label]) & set([pred_val]),
-                        set([positive_label]) & set([gold_val]),
-                    )
-                for label in set(gold_values):
-                    auc_per_type[label].score_set(
-                        pred_values[label], gold_values[label]
-                    )
-                    f_per_type[label].score_set(
-                        set([label]) & set([pred_val]), set([label]) & set([gold_val])
-                    )
-            elif len(f_per_type) > 0:
-                model_labels = set(f_per_type)
-                eval_labels = set(gold_values)
-                raise ValueError(
-                    Errors.E162.format(
-                        model_labels=model_labels, eval_labels=eval_labels
-                    )
-                )
-            elif len(auc_per_type) > 0:
-                model_labels = set(auc_per_type)
-                eval_labels = set(gold_values)
-                raise ValueError(
-                    Errors.E162.format(
-                        model_labels=model_labels, eval_labels=eval_labels
-                    )
-                )
+            # Through this loop, None in the gold_cats indicates missing label.
+            pred_cats = getter(example.predicted, attr)
+            gold_cats = getter(example.reference, attr)
+            # I think the AUC metric is applicable regardless of whether we're
+            # doing multi-label classification? Unsure. If not, move this into
+            # the elif pred_cats and gold_cats block below.
+            for label in labels:
+                pred_score = pred_cats.get(label, 0.0)
+                gold_score = gold_cats.get(label, 0.0)
+                if gold_score is not None:
+                    auc_per_type[label].score_set(pred_score, gold_score)
+            if multi_label:
+                for label in labels:
+                    pred_score = pred_cats.get(label, 0.0)
+                    gold_score = gold_cats.get(label, 0.0)
+                    if gold_score is not None:
+                        if pred_score >= threshold and gold_score > 0:
+                            f_per_type[label].tp += 1
+                        elif pred_score >= threshold and gold_score == 0:
+                            f_per_type[label].fp += 1
+                        elif pred_score < threshold and gold_score > 0:
+                            f_per_type[label].fn += 1
+            elif pred_cats and gold_cats:
+                # Get the highest-scoring for each.
+                pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
+                gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
+                if gold_score is not None:
+                    if pred_label == gold_label and pred_score >= threshold:
+                        f_per_type[pred_label].tp += 1
+                    else:
+                        f_per_type[gold_label].fn += 1
+                        if pred_score >= threshold:
+                            f_per_type[pred_label].fp += 1
+            elif gold_cats:
+                gold_label, gold_score = max(gold_cats, key=lambda it: it[1])
+                if gold_score is not None and gold_score > 0:
+                    f_per_type[gold_label].fn += 1
+            else:
+                pred_label, pred_score = max(pred_cats, key=lambda it: it[1])
+                if pred_score >= threshold:
+                    f_per_type[pred_label].fp += 1
+        micro_prf = PRFScore()
+        for label_prf in f_per_type.values():
+            micro_prf.tp = label_prf.tp
+            micro_prf.fn = label_prf.fn
+            micro_prf.fp = label_prf.fp
+        n_cats = len(f_per_type) + 1e-100
+        macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
+        macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
+        macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats
         results = {
             f"{attr}_score": None,
             f"{attr}_score_desc": None,
-            f"{attr}_p": None,
-            f"{attr}_r": None,
-            f"{attr}_f": None,
-            f"{attr}_macro_f": None,
+            f"{attr}_micro_p": micro_prf.precision,
+            f"{attr}_micro_r": micro_prf.recall,
+            f"{attr}_micro_f": micro_prf.fscore,
+            f"{attr}_macro_p": macro_p,
+            f"{attr}_macro_r": macro_r,
+            f"{attr}_macro_f": macro_f,
             f"{attr}_macro_auc": None,
             f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
             f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
         }
         if len(labels) == 2 and not multi_label and positive_label:
-            results[f"{attr}_p"] = score.precision
-            results[f"{attr}_r"] = score.recall
-            results[f"{attr}_f"] = score.fscore
-            results[f"{attr}_score"] = results[f"{attr}_f"]
+            positive_label_f = results[f"{attr}_f_per_type"][positive_label]['f']
+            results[f"{attr}_score"] = positive_label_f
             results[f"{attr}_score_desc"] = f"F ({positive_label})"
         elif not multi_label:
-            results[f"{attr}_macro_f"] = sum(
-                [score.fscore for label, score in f_per_type.items()]
-            ) / (len(f_per_type) + 1e-100)
             results[f"{attr}_score"] = results[f"{attr}_macro_f"]
             results[f"{attr}_score_desc"] = "macro F"
         else:
-            results[f"{attr}_macro_auc"] = max(
-                sum([score.score for label, score in auc_per_type.items()])
-                / (len(auc_per_type) + 1e-100),
-                -1,
-            )
             results[f"{attr}_score"] = results[f"{attr}_macro_auc"]
             results[f"{attr}_score_desc"] = "macro AUC"
         return results
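
To make the new bookkeeping concrete, here is a standalone sketch (plain Python, not the spaCy implementation) of the threshold default and the per-label multi-label counting in the hunk above, followed by micro- and macro-averaged F. The labels and scores are invented for illustration; micro-averaging is shown here as pooling the counts across labels before computing PRF once.

    # Standalone illustration of the counting above; labels/scores are invented.
    def prf(tp, fp, fn):
        p = tp / (tp + fp) if tp + fp else 0.0
        r = tp / (tp + fn) if tp + fn else 0.0
        f = 2 * p * r / (p + r) if p + r else 0.0
        return p, r, f

    multi_label = True
    threshold = 0.5 if multi_label else 0.0   # same default rule as the new code
    gold = {"SPORTS": 1.0, "POLITICS": 0.0}   # invented gold category scores
    pred = {"SPORTS": 0.9, "POLITICS": 0.7}   # invented predicted scores

    counts = {label: {"tp": 0, "fp": 0, "fn": 0} for label in gold}
    for label, gold_score in gold.items():
        pred_score = pred.get(label, 0.0)
        if pred_score >= threshold and gold_score > 0:
            counts[label]["tp"] += 1
        elif pred_score >= threshold and gold_score == 0:
            counts[label]["fp"] += 1
        elif pred_score < threshold and gold_score > 0:
            counts[label]["fn"] += 1

    # Micro-average: pool the counts across labels, then compute PRF once.
    micro = prf(
        sum(c["tp"] for c in counts.values()),
        sum(c["fp"] for c in counts.values()),
        sum(c["fn"] for c in counts.values()),
    )
    # Macro-average: compute F per label, then take the unweighted mean.
    macro_f = sum(prf(**c)[2] for c in counts.values()) / len(counts)
    print("micro P/R/F:", micro)  # SPORTS is a true positive, POLITICS a false positive
    print("macro F:", macro_f)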

View File

@@ -117,8 +117,10 @@ def test_overfitting_IO():
     assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
     # Test scoring
-    scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})
-    assert scores["cats_f"] == 1.0
+    scores = nlp.evaluate(
+        train_examples, scorer_cfg={"positive_label": "POSITIVE"}
+    )
+    assert scores["cats_micro_f"] == 1.0
     assert scores["cats_score"] == 1.0
     assert "cats_score_desc" in scores