Update cats scoring to provide overall score

* Provide top-level score as `attr_score`
* Provide a description of the score as `attr_score_desc`
* Provide all potential score keys, setting unused keys to `None`
* Update CLI evaluate accordingly
Adriane Boyd 2020-07-27 11:17:52 +02:00
parent f8cf378be9
commit baf19fd652
3 changed files with 37 additions and 15 deletions
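
As a rough illustration of what the bullet points above aim for, downstream code can read the single `cats_score` / `cats_score_desc` pair instead of choosing between the mode-specific textcat metrics. This is only a sketch; the scores dict below is invented:

# Hypothetical downstream consumer of the new keys (values are invented).
scores = {"cats_score": 0.87, "cats_score_desc": "macro AUC", "cats_macro_f": None}

print(f"Textcat ({scores['cats_score_desc']}): {scores['cats_score'] * 100:.2f}")
# Textcat (macro AUC): 87.00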


@@ -82,8 +82,7 @@ def evaluate(
         "NER P": "ents_p",
         "NER R": "ents_r",
         "NER F": "ents_f",
-        "Textcat AUC": "textcat_macro_auc",
-        "Textcat F": "textcat_macro_f",
+        "Textcat": "cats_score",
         "Sent P": "sents_p",
         "Sent R": "sents_r",
         "Sent F": "sents_f",
@@ -91,6 +90,8 @@ def evaluate(
     results = {}
     for metric, key in metrics.items():
         if key in scores:
+            if key == "cats_score":
+                metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
             results[metric] = f"{scores[key]*100:.2f}"

     data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()}
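
For illustration, a self-contained sketch of how the relabelling above behaves. The scores and metrics dicts are hypothetical stand-ins for what evaluate() builds; only the key names follow the diff:

# Hypothetical scores dict for a multilabel textcat model (values invented).
scores = {"cats_score": 0.8245, "cats_score_desc": "macro AUC", "ents_f": 0.91}

metrics = {"NER F": "ents_f", "Textcat": "cats_score"}
results = {}
for metric, key in metrics.items():
    if key in scores:
        # Append the description so the table row reads e.g. "Textcat (macro AUC)".
        if key == "cats_score":
            metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
        results[metric] = f"{scores[key]*100:.2f}"

print(results)  # {'NER F': '91.00', 'Textcat (macro AUC)': '82.45'}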
@@ -99,12 +100,12 @@ def evaluate(
     if "ents_per_type" in scores:
         if scores["ents_per_type"]:
             print_ents_per_type(msg, scores["ents_per_type"])
-    if "textcat_f_per_cat" in scores:
-        if scores["textcat_f_per_cat"]:
-            print_textcats_f_per_cat(msg, scores["textcat_f_per_cat"])
-    if "textcat_auc_per_cat" in scores:
-        if scores["textcat_auc_per_cat"]:
-            print_textcats_auc_per_cat(msg, scores["textcat_auc_per_cat"])
+    if "cats_f_per_type" in scores:
+        if scores["cats_f_per_type"]:
+            print_textcats_f_per_cat(msg, scores["cats_f_per_type"])
+    if "cats_auc_per_type" in scores:
+        if scores["cats_auc_per_type"]:
+            print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"])

     if displacy_path:
         factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
@@ -170,7 +171,7 @@ def print_textcats_f_per_cat(msg: Printer, scores: Dict[str, Dict[str, float]])
         data,
         header=("", "P", "R", "F"),
         aligns=("l", "r", "r", "r"),
-        title="Textcat F (per type)",
+        title="Textcat F (per label)",
     )


@@ -298,7 +298,8 @@ class Scorer:
         **cfg
     ):
         """Returns PRF and ROC AUC scores for a doc-level attribute with a
-        dict with scores for each label like Doc.cats.
+        dict with scores for each label like Doc.cats. The reported overall
+        score depends on the scorer settings.

         examples (Iterable[Example]): Examples to score
         attr (str): The attribute to score.
@@ -309,11 +310,16 @@ class Scorer:
             Defaults to True.
         positive_label (str): The positive label for a binary task with
             exclusive classes. Defaults to None.
-        RETURNS (dict): A dictionary containing the scores:
-            for binary exclusive with positive label: attr_p/r/f,
-            for 3+ exclusive classes, macro-averaged fscore: attr_macro_f,
-            for multilabel, macro-averaged AUC: attr_macro_auc,
-            for all: attr_f_per_type, attr_auc_per_type
+        RETURNS (dict): A dictionary containing the scores, with inapplicable
+            scores as None:
+            for all:
+                attr_score (one of attr_f / attr_macro_f / attr_macro_auc),
+                attr_score_desc (text description of the overall score),
+                attr_f_per_type,
+                attr_auc_per_type
+            for binary exclusive with positive label: attr_p/r/f
+            for 3+ exclusive classes, macro-averaged fscore: attr_macro_f
+            for multilabel, macro-averaged AUC: attr_macro_auc
         """
         score = PRFScore()
         f_per_type = dict()
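
To make the updated docstring concrete, here is a rough sketch of the key layout score_cats would produce for attr="cats" under each setting. All numbers, labels, and the per-type sub-dicts are invented; only the key names follow the diff:

# Binary exclusive task with positive_label="POSITIVE":
# the overall score is the F-score for the positive label.
binary_scores = {
    "cats_score": 0.93, "cats_score_desc": "F (POSITIVE)",
    "cats_p": 0.95, "cats_r": 0.91, "cats_f": 0.93,
    "cats_macro_f": None, "cats_macro_auc": None,
    "cats_f_per_type": {"POSITIVE": {"p": 0.95, "r": 0.91, "f": 0.93}},
    "cats_auc_per_type": {"POSITIVE": 0.97},
}

# 3+ exclusive classes: the overall score is the macro-averaged F-score,
# so the plain P/R/F keys stay None.
multiclass_scores = {
    "cats_score": 0.81, "cats_score_desc": "macro F",
    "cats_p": None, "cats_r": None, "cats_f": None,
    "cats_macro_f": 0.81, "cats_macro_auc": None,
    "cats_f_per_type": {"SPORT": {"p": 0.80, "r": 0.90, "f": 0.85}},
    "cats_auc_per_type": {"SPORT": 0.90},
}

# Multilabel: the overall score is the macro-averaged ROC AUC.
multilabel_scores = {
    "cats_score": 0.88, "cats_score_desc": "macro AUC",
    "cats_p": None, "cats_r": None, "cats_f": None,
    "cats_macro_f": None, "cats_macro_auc": 0.88,
    "cats_f_per_type": {"SPORT": {"p": 0.80, "r": 0.90, "f": 0.85}},
    "cats_auc_per_type": {"SPORT": 0.88},
}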
@@ -362,6 +368,13 @@ class Scorer:
                 )
             )
         results = {
+            attr + "_score": None,
+            attr + "_score_desc": None,
+            attr + "_p": None,
+            attr + "_r": None,
+            attr + "_f": None,
+            attr + "_macro_f": None,
+            attr + "_macro_auc": None,
             attr + "_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
             attr + "_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
         }
@@ -369,16 +382,22 @@ class Scorer:
             results[attr + "_p"] = score.precision
             results[attr + "_r"] = score.recall
             results[attr + "_f"] = score.fscore
+            results[attr + "_score"] = results[attr + "_f"]
+            results[attr + "_score_desc"] = "F (" + positive_label + ")"
         elif not multi_label:
             results[attr + "_macro_f"] = sum(
                 [score.fscore for label, score in f_per_type.items()]
             ) / (len(f_per_type) + 1e-100)
+            results[attr + "_score"] = results[attr + "_macro_f"]
+            results[attr + "_score_desc"] = "macro F"
         else:
             results[attr + "_macro_auc"] = max(
                 sum([score.score for label, score in auc_per_type.items()])
                 / (len(auc_per_type) + 1e-100),
                 -1,
             )
+            results[attr + "_score"] = results[attr + "_macro_auc"]
+            results[attr + "_score_desc"] = "macro AUC"
         return results

     @staticmethod
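
For readers skimming the hunk above, the score selection boils down to the following standalone restatement. This is not spaCy's actual API: the helper name, the simplified condition for the positive-label branch, and the plain-dict inputs are invented for illustration:

def pick_overall_score(attr, f_per_type, auc_per_type, multi_label, positive_label, prf=None):
    """Mimic the branching above on plain dicts of per-label scores."""
    results = {attr + "_score": None, attr + "_score_desc": None}
    if positive_label and not multi_label and prf is not None:
        # Binary task with a designated positive label: report its F-score.
        results[attr + "_score"] = prf["f"]
        results[attr + "_score_desc"] = "F (" + positive_label + ")"
    elif not multi_label:
        # Mutually exclusive classes: macro-average the per-label F-scores.
        results[attr + "_score"] = sum(f_per_type.values()) / (len(f_per_type) + 1e-100)
        results[attr + "_score_desc"] = "macro F"
    else:
        # Multilabel: macro-average the per-label ROC AUC scores.
        results[attr + "_score"] = sum(auc_per_type.values()) / (len(auc_per_type) + 1e-100)
        results[attr + "_score_desc"] = "macro AUC"
    return results

print(pick_overall_score("cats", {"A": 0.5, "B": 0.75}, {}, multi_label=False, positive_label=None))
# {'cats_score': 0.625, 'cats_score_desc': 'macro F'}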


@@ -121,6 +121,8 @@ def test_overfitting_IO():
         train_examples, component_cfg={"scorer": {"positive_label": "POSITIVE"}}
     )
     assert scores["cats_f"] == 1.0
+    assert scores["cats_score"] == 1.0
+    assert "cats_score_desc" in scores
     # fmt: off