Evaluation of NER model per entity type, closes #3490 (#3911)

* Evaluation of NER model per entity type, closes #3490

The score for each entity type is now tracked individually, so that each type has its own precision, recall and F1 score.

* Keep track of each entity individually using dicts

* Improving how to compute the scores for each entity

* Fixed bug computing scores for ents

* Formatting with black

* Added key ents_per_type to the scores function

The key `ents_per_type` contains the precision, recall and F1 score for each entity type individually.
This commit is contained in:
Alejandro Alcalde 2019-07-09 20:54:59 +02:00 committed by Ines Montani
parent 2eb925bd05
commit 6d577f0b92

View File

@ -52,6 +52,7 @@ class Scorer(object):
self.labelled = PRFScore() self.labelled = PRFScore()
self.tags = PRFScore() self.tags = PRFScore()
self.ner = PRFScore() self.ner = PRFScore()
self.ner_per_ents = dict()
self.eval_punct = eval_punct self.eval_punct = eval_punct
@property @property
@ -104,6 +105,15 @@ class Scorer(object):
"ents_f": self.ents_f, "ents_f": self.ents_f,
"tags_acc": self.tags_acc, "tags_acc": self.tags_acc,
"token_acc": self.token_acc, "token_acc": self.token_acc,
"ents_per_type": self.__scores_per_ents(),
}
def __scores_per_ents(self):
"""RETURNS (dict): Scores per NER entity
"""
return {
k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
for k, v in self.ner_per_ents.items()
} }
def score(self, doc, gold, verbose=False, punct_labels=("p", "punct")): def score(self, doc, gold, verbose=False, punct_labels=("p", "punct")):
@ -149,13 +159,31 @@ class Scorer(object):
cand_deps.add((gold_i, gold_head, token.dep_.lower())) cand_deps.add((gold_i, gold_head, token.dep_.lower()))
if "-" not in [token[-1] for token in gold.orig_annot]: if "-" not in [token[-1] for token in gold.orig_annot]:
cand_ents = set() cand_ents = set()
current_ent = {k.label_: set() for k in doc.ents}
current_gold = {k.label_: set() for k in doc.ents}
for ent in doc.ents: for ent in doc.ents:
if ent.label_ not in self.ner_per_ents:
self.ner_per_ents[ent.label_] = PRFScore()
first = gold.cand_to_gold[ent.start] first = gold.cand_to_gold[ent.start]
last = gold.cand_to_gold[ent.end - 1] last = gold.cand_to_gold[ent.end - 1]
if first is None or last is None: if first is None or last is None:
self.ner.fp += 1 self.ner.fp += 1
self.ner_per_ents[ent.label_].fp += 1
else: else:
cand_ents.add((ent.label_, first, last)) cand_ents.add((ent.label_, first, last))
current_ent[ent.label_].add(
tuple(x for x in cand_ents if x[0] == ent.label_)
)
current_gold[ent.label_].add(
tuple(x for x in gold_ents if x[0] == ent.label_)
)
# Scores per ent
[
v.score_set(current_ent[k], current_gold[k])
for k, v in self.ner_per_ents.items()
if k in current_ent
]
# Score for all ents
self.ner.score_set(cand_ents, gold_ents) self.ner.score_set(cand_ents, gold_ents)
self.tags.score_set(cand_tags, gold_tags) self.tags.score_set(cand_tags, gold_tags)
self.labelled.score_set(cand_deps, gold_deps) self.labelled.score_set(cand_deps, gold_deps)