Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in the reference doc during scoring, for
attributes where an unset state can be detected. If no reference
docs contain the annotation, `None` is returned instead of a score.
`spacy evaluate` displays `-` for missing scores, and missing scores
are saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_spans` to replace the
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
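
As an illustration of the new behavior, a minimal sketch (assuming spaCy v3 and a blank English pipeline; the words and tags are made up):

```python
import spacy
from spacy.tokens import Doc
from spacy.training import Example
from spacy.scorer import Scorer

nlp = spacy.blank("en")
words = ["This", "is", "a", "test"]
# Predicted doc with POS annotation, reference doc with no POS set at all
pred = Doc(nlp.vocab, words=words, pos=["PRON", "AUX", "DET", "NOUN"])
ref = Doc(nlp.vocab, words=words)
scores = Scorer.score_token_attr([Example(pred, ref)], "pos")
# No reference doc contains "pos" annotation, so the score is None, not 0.0
assert scores["pos_acc"] is None
```
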
Commit a4b32b9552 (parent 5d2cb86c34)
Adriane Boyd, 2020-11-03 15:47:18 +01:00, committed by GitHub
14 changed files with 294 additions and 158 deletions


@@ -98,10 +98,13 @@ def evaluate(
         if key in scores:
             if key == "cats_score":
                 metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
-            if key == "speed":
-                results[metric] = f"{scores[key]:.0f}"
+            if isinstance(scores[key], (int, float)):
+                if key == "speed":
+                    results[metric] = f"{scores[key]:.0f}"
+                else:
+                    results[metric] = f"{scores[key]*100:.2f}"
             else:
-                results[metric] = f"{scores[key]*100:.2f}"
+                results[metric] = "-"
             data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
     msg.table(results, title="Results")
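
The effect of the display change, as a standalone sketch with a made-up scores dict:

```python
def format_score(key, value):
    # Numeric scores are formatted as before; missing (None) scores print as "-"
    if isinstance(value, (int, float)):
        if key == "speed":
            return f"{value:.0f}"
        return f"{value*100:.2f}"
    return "-"

scores = {"tag_acc": 0.9731, "sents_f": None, "speed": 12489.3}
for key, value in scores.items():
    print(key, format_score(key, value))
# tag_acc 97.31
# sents_f -
# speed 12489
```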


@@ -226,6 +226,9 @@ class AttributeRuler(Pipe):
         DOCS: https://nightly.spacy.io/api/tagger#score
         """
+        def morph_key_getter(token, attr):
+            return getattr(token, attr).key
+
         validate_examples(examples, "AttributeRuler.score")
         results = {}
         attrs = set()
@@ -237,7 +240,8 @@ class AttributeRuler(Pipe):
             elif attr == POS:
                 results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
             elif attr == MORPH:
-                results.update(Scorer.score_token_attr(examples, "morph", **kwargs))
+                results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
+                results.update(Scorer.score_token_attr_per_feat(examples, "morph", getter=morph_key_getter, **kwargs))
             elif attr == LEMMA:
                 results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
         return results
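
`token.morph` is a `MorphAnalysis` object, so the new `morph_key_getter` scores its `key` (the hash of the UFEATS string in the `StringStore`) rather than the object itself. A token with no morphological analysis set has key `0`, which is in `MISSING_VALUES`, so unannotated reference tokens can now be detected and skipped. A small sketch of the idea (assuming a blank English pipeline):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("a b")
doc[0].set_morph("Feat=Val")

def morph_key_getter(token, attr):
    return getattr(token, attr).key

assert morph_key_getter(doc[0], "morph") != 0  # annotated token
assert morph_key_getter(doc[1], "morph") == 0  # unset morph -> treated as missing
```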


@@ -155,13 +155,16 @@ cdef class DependencyParser(Parser):
        DOCS: https://nightly.spacy.io/api/dependencyparser#score
        """
+       def has_sents(doc):
+           return doc.has_annotation("SENT_START")
+
        validate_examples(examples, "DependencyParser.score")
        def dep_getter(token, attr):
            dep = getattr(token, attr)
            dep = token.vocab.strings.as_string(dep).lower()
            return dep
        results = {}
-       results.update(Scorer.score_spans(examples, "sents", **kwargs))
+       results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
        kwargs.setdefault("getter", dep_getter)
        kwargs.setdefault("ignore_labels", ("p", "punct"))
        results.update(Scorer.score_deps(examples, "dep", **kwargs))
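
The `has_annotation` callback replaces the try/except hack inside `Scorer.score_spans` (see the scorer diff below): reference docs without the annotation are now skipped explicitly. A minimal sketch of the effect, assuming a blank English pipeline:

```python
import spacy
from spacy.tokens import Doc
from spacy.training import Example
from spacy.scorer import Scorer

nlp = spacy.blank("en")
words = ["a", "b", "c"]
pred = Doc(nlp.vocab, words=words, sent_starts=[True, False, True])
ref = Doc(nlp.vocab, words=words)  # no sentence boundaries set

def has_sents(doc):
    return doc.has_annotation("SENT_START")

scores = Scorer.score_spans([Example(pred, ref)], "sents", has_annotation=has_sents)
# The only reference doc has no sentence annotation, so no score is returned
assert scores["sents_f"] is None
```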


@@ -10,7 +10,7 @@ from ..errors import Errors
 from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
 from ..tokens import Doc, Span
 from ..matcher import Matcher, PhraseMatcher
-from ..scorer import Scorer
+from ..scorer import get_ner_prf
 from ..training import validate_examples

@@ -340,7 +340,7 @@ class EntityRuler(Pipe):
     def score(self, examples, **kwargs):
         validate_examples(examples, "EntityRuler.score")
-        return Scorer.score_spans(examples, "ents", **kwargs)
+        return get_ner_prf(examples)

     def from_bytes(
         self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()


@@ -251,10 +251,13 @@ class Morphologizer(Tagger):
        DOCS: https://nightly.spacy.io/api/morphologizer#score
        """
+       def morph_key_getter(token, attr):
+           return getattr(token, attr).key
+
        validate_examples(examples, "Morphologizer.score")
        results = {}
        results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
-       results.update(Scorer.score_token_attr(examples, "morph", **kwargs))
-       results.update(Scorer.score_token_attr_per_feat(examples,
-           "morph", **kwargs))
+       results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
+       results.update(Scorer.score_token_attr_per_feat(examples,
+           "morph", getter=morph_key_getter, **kwargs))
        return results


@@ -122,13 +122,4 @@ cdef class EntityRecognizer(Parser):
        DOCS: https://nightly.spacy.io/api/entityrecognizer#score
        """
        validate_examples(examples, "EntityRecognizer.score")
-       score_per_type = get_ner_prf(examples)
-       totals = PRFScore()
-       for prf in score_per_type.values():
-           totals += prf
-       return {
-           "ents_p": totals.precision,
-           "ents_r": totals.recall,
-           "ents_f": totals.fscore,
-           "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
-       }
+       return get_ner_prf(examples)


@@ -155,8 +155,11 @@ class Sentencizer(Pipe):
        DOCS: https://nightly.spacy.io/api/sentencizer#score
        """
+       def has_sents(doc):
+           return doc.has_annotation("SENT_START")
+
        validate_examples(examples, "Sentencizer.score")
-       results = Scorer.score_spans(examples, "sents", **kwargs)
+       results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
        del results["sents_per_type"]
        return results


@@ -160,7 +160,10 @@ class SentenceRecognizer(Tagger):
        RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.

        DOCS: https://nightly.spacy.io/api/sentencerecognizer#score
        """
+       def has_sents(doc):
+           return doc.has_annotation("SENT_START")
+
        validate_examples(examples, "SentenceRecognizer.score")
-       results = Scorer.score_spans(examples, "sents", **kwargs)
+       results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
        del results["sents_per_type"]
        return results


@@ -1,9 +1,9 @@
-from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING
+from typing import Optional, Iterable, Dict, Set, Any, Callable, TYPE_CHECKING
 import numpy as np
 from collections import defaultdict

 from .training import Example
-from .tokens import Token, Doc, Span
+from .tokens import Token, Doc, Span, MorphAnalysis
 from .errors import Errors
 from .util import get_lang_class, SimpleFrozenList
 from .morphology import Morphology
@@ -13,7 +13,8 @@ if TYPE_CHECKING:
     from .language import Language  # noqa: F401

-DEFAULT_PIPELINE = ["senter", "tagger", "morphologizer", "parser", "ner", "textcat"]
+DEFAULT_PIPELINE = ("senter", "tagger", "morphologizer", "parser", "ner", "textcat")
+MISSING_VALUES = frozenset([None, 0, ""])


 class PRFScore:
@@ -24,6 +25,9 @@ class PRFScore:
         self.fp = 0
         self.fn = 0

+    def __len__(self) -> int:
+        return self.tp + self.fp + self.fn
+
     def __iadd__(self, other):
         self.tp += other.tp
         self.fp += other.fp
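
`PRFScore.__len__` gives every scorer a uniform way to ask whether anything was actually scored: if `tp + fp + fn == 0`, no reference annotation was seen, so the scorer reports `None` instead of a misleading `0.0`. A standalone sketch of the pattern:

```python
class PRFScore:
    # Minimal re-implementation for illustration
    def __init__(self):
        self.tp = 0
        self.fp = 0
        self.fn = 0

    def __len__(self):
        return self.tp + self.fp + self.fn

    @property
    def fscore(self):
        p = self.tp / (self.tp + self.fp + 1e-100)
        r = self.tp / (self.tp + self.fn + 1e-100)
        return 2 * ((p * r) / (p + r + 1e-100))

score = PRFScore()
# Nothing was scored: report a missing score rather than 0.0
result = score.fscore if len(score) > 0 else None
assert result is None
```
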
@@ -94,7 +98,7 @@ class Scorer:
         self,
         nlp: Optional["Language"] = None,
         default_lang: str = "xx",
-        default_pipeline=DEFAULT_PIPELINE,
+        default_pipeline: Iterable[str] = DEFAULT_PIPELINE,
         **cfg,
     ) -> None:
         """Initialize the Scorer.
@@ -126,13 +130,13 @@ class Scorer:
         return scores

     @staticmethod
-    def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, float]:
+    def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, Any]:
         """Returns accuracy and PRF scores for tokenization.

         * token_acc: # correct tokens / # gold tokens
         * token_p/r/f: PRF for token character spans

         examples (Iterable[Example]): Examples to score
-        RETURNS (Dict[str, float]): A dictionary containing the scores
+        RETURNS (Dict[str, Any]): A dictionary containing the scores
             token_acc/p/r/f.

         DOCS: https://nightly.spacy.io/api/scorer#score_tokenization
@@ -142,6 +146,8 @@ class Scorer:
         for example in examples:
             gold_doc = example.reference
             pred_doc = example.predicted
+            if gold_doc.has_unknown_spaces:
+                continue
             align = example.alignment
             gold_spans = set()
             pred_spans = set()
@@ -158,12 +164,20 @@ class Scorer:
                 else:
                     acc_score.tp += 1
             prf_score.score_set(pred_spans, gold_spans)
-        return {
-            "token_acc": acc_score.fscore,
-            "token_p": prf_score.precision,
-            "token_r": prf_score.recall,
-            "token_f": prf_score.fscore,
-        }
+        if len(acc_score) > 0:
+            return {
+                "token_acc": acc_score.fscore,
+                "token_p": prf_score.precision,
+                "token_r": prf_score.recall,
+                "token_f": prf_score.fscore,
+            }
+        else:
+            return {
+                "token_acc": None,
+                "token_p": None,
+                "token_r": None,
+                "token_f": None
+            }

     @staticmethod
     def score_token_attr(
@@ -171,8 +185,9 @@ class Scorer:
         attr: str,
         *,
         getter: Callable[[Token, str], Any] = getattr,
+        missing_values: Set[Any] = MISSING_VALUES,
         **cfg,
-    ) -> Dict[str, float]:
+    ) -> Dict[str, Any]:
         """Returns an accuracy score for a token-level attribute.

         examples (Iterable[Example]): Examples to score
@@ -180,7 +195,7 @@ class Scorer:
         getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
             getter(token, attr) should return the value of the attribute for an
             individual token.
-        RETURNS (Dict[str, float]): A dictionary containing the accuracy score
+        RETURNS (Dict[str, Any]): A dictionary containing the accuracy score
             under the key attr_acc.

         DOCS: https://nightly.spacy.io/api/scorer#score_token_attr
@@ -191,17 +206,27 @@ class Scorer:
             pred_doc = example.predicted
             align = example.alignment
             gold_tags = set()
+            missing_indices = set()
             for gold_i, token in enumerate(gold_doc):
-                gold_tags.add((gold_i, getter(token, attr)))
+                value = getter(token, attr)
+                if value not in missing_values:
+                    gold_tags.add((gold_i, getter(token, attr)))
+                else:
+                    missing_indices.add(gold_i)
             pred_tags = set()
             for token in pred_doc:
                 if token.orth_.isspace():
                     continue
                 if align.x2y.lengths[token.i] == 1:
                     gold_i = align.x2y[token.i].dataXd[0, 0]
-                    pred_tags.add((gold_i, getter(token, attr)))
+                    if gold_i not in missing_indices:
+                        pred_tags.add((gold_i, getter(token, attr)))
             tag_score.score_set(pred_tags, gold_tags)
-        return {f"{attr}_acc": tag_score.fscore}
+        score_key = f"{attr}_acc"
+        if len(tag_score) == 0:
+            return {score_key: None}
+        else:
+            return {score_key: tag_score.fscore}
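
The same pattern with partial annotation: reference tokens whose value is in `missing_values` go into `missing_indices`, and aligned predicted tokens at those positions are excluded as well, so the model is neither rewarded nor penalized for them. A sketch, assuming a blank English pipeline:

```python
import spacy
from spacy.tokens import Doc
from spacy.training import Example
from spacy.scorer import Scorer

nlp = spacy.blank("en")
pred = Doc(nlp.vocab, words=["a", "b", "c"])
for token, tag in zip(pred, ["A", "B", "C"]):
    token.tag_ = tag
# Reference with only the first token annotated; an unset tag is 0,
# which is in MISSING_VALUES
ref = Doc(nlp.vocab, words=["a", "b", "c"])
ref[0].tag_ = "A"
scores = Scorer.score_token_attr([Example(pred, ref)], "tag")
# Tokens 1 and 2 are skipped; only the correct tag on token 0 is scored
assert scores["tag_acc"] == 1.0
```
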
     @staticmethod
     def score_token_attr_per_feat(
@@ -209,8 +234,9 @@ class Scorer:
         attr: str,
         *,
         getter: Callable[[Token, str], Any] = getattr,
+        missing_values: Set[Any] = MISSING_VALUES,
         **cfg,
-    ):
+    ) -> Dict[str, Any]:
         """Return PRF scores per feat for a token attribute in UFEATS format.

         examples (Iterable[Example]): Examples to score
@@ -218,7 +244,7 @@ class Scorer:
         getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
             getter(token, attr) should return the value of the attribute for an
             individual token.
-        RETURNS (dict): A dictionary containing the per-feat PRF scores unders
+        RETURNS (dict): A dictionary containing the per-feat PRF scores under
             the key attr_per_feat.
         """
         per_feat = {}
@@ -227,9 +253,11 @@ class Scorer:
             gold_doc = example.reference
             align = example.alignment
             gold_per_feat = {}
+            missing_indices = set()
             for gold_i, token in enumerate(gold_doc):
-                morph = str(getter(token, attr))
-                if morph:
+                value = getter(token, attr)
+                morph = gold_doc.vocab.strings[value]
+                if value not in missing_values and morph != Morphology.EMPTY_MORPH:
                     for feat in morph.split(Morphology.FEATURE_SEP):
                         field, values = feat.split(Morphology.FIELD_SEP)
                         if field not in per_feat:
@@ -237,27 +265,35 @@ class Scorer:
                         if field not in gold_per_feat:
                             gold_per_feat[field] = set()
                         gold_per_feat[field].add((gold_i, feat))
+                else:
+                    missing_indices.add(gold_i)
             pred_per_feat = {}
             for token in pred_doc:
                 if token.orth_.isspace():
                     continue
                 if align.x2y.lengths[token.i] == 1:
                     gold_i = align.x2y[token.i].dataXd[0, 0]
-                    morph = str(getter(token, attr))
-                    if morph:
-                        for feat in morph.split("|"):
-                            field, values = feat.split("=")
-                            if field not in per_feat:
-                                per_feat[field] = PRFScore()
-                            if field not in pred_per_feat:
-                                pred_per_feat[field] = set()
-                            pred_per_feat[field].add((gold_i, feat))
+                    if gold_i not in missing_indices:
+                        value = getter(token, attr)
+                        morph = gold_doc.vocab.strings[value]
+                        if value not in missing_values and morph != Morphology.EMPTY_MORPH:
+                            for feat in morph.split(Morphology.FEATURE_SEP):
+                                field, values = feat.split(Morphology.FIELD_SEP)
+                                if field not in per_feat:
+                                    per_feat[field] = PRFScore()
+                                if field not in pred_per_feat:
+                                    pred_per_feat[field] = set()
+                                pred_per_feat[field].add((gold_i, feat))
         for field in per_feat:
             per_feat[field].score_set(
                 pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
             )
-        result = {k: v.to_dict() for k, v in per_feat.items()}
-        return {f"{attr}_per_feat": result}
+        score_key = f"{attr}_per_feat"
+        if any([len(v) for v in per_feat.values()]):
+            result = {k: v.to_dict() for k, v in per_feat.items()}
+            return {score_key: result}
+        else:
+            return {score_key: None}
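
The per-feature scoring splits the UFEATS string into fields so that each morphological field gets its own PRF score. A standalone sketch of the splitting, using the literal separators that `Morphology.FEATURE_SEP` (`|`) and `Morphology.FIELD_SEP` (`=`) stand for:

```python
morph = "Case=Nom|Number=Plur"
per_field = {}
for feat in morph.split("|"):        # Morphology.FEATURE_SEP
    field, values = feat.split("=")  # Morphology.FIELD_SEP
    per_field.setdefault(field, set()).add(feat)
print(per_field)
# {'Case': {'Case=Nom'}, 'Number': {'Number=Plur'}}
```
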
     @staticmethod
     def score_spans(
@@ -265,6 +301,7 @@ class Scorer:
         attr: str,
         *,
         getter: Callable[[Doc, str], Iterable[Span]] = getattr,
+        has_annotation: Optional[Callable[[Doc], bool]] = None,
         **cfg,
     ) -> Dict[str, Any]:
         """Returns PRF scores for labeled spans.
@@ -284,18 +321,10 @@ class Scorer:
         for example in examples:
             pred_doc = example.predicted
             gold_doc = example.reference
-            # TODO
-            # This is a temporary hack to work around the problem that the scorer
-            # fails if you have examples that are not fully annotated for all
-            # the tasks in your pipeline. For instance, you might have a corpus
-            # of NER annotations that does not set sentence boundaries, but the
-            # pipeline includes a parser or senter, and then the score_weights
-            # are used to evaluate that component. When the scorer attempts
-            # to read the sentences from the gold document, it fails.
-            try:
-                list(getter(gold_doc, attr))
-            except ValueError:
-                continue
+            # Option to handle docs without sents
+            if has_annotation is not None:
+                if not has_annotation(gold_doc):
+                    continue
             # Find all labels in gold and doc
             labels = set(
                 [k.label_ for k in getter(gold_doc, attr)]
@@ -323,13 +352,21 @@ class Scorer:
                 v.score_set(pred_per_type[k], gold_per_type[k])
             # Score for all labels
             score.score_set(pred_spans, gold_spans)
-        results = {
-            f"{attr}_p": score.precision,
-            f"{attr}_r": score.recall,
-            f"{attr}_f": score.fscore,
-            f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
-        }
-        return results
+        if len(score) > 0:
+            return {
+                f"{attr}_p": score.precision,
+                f"{attr}_r": score.recall,
+                f"{attr}_f": score.fscore,
+                f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
+            }
+        else:
+            return {
+                f"{attr}_p": None,
+                f"{attr}_r": None,
+                f"{attr}_f": None,
+                f"{attr}_per_type": None,
+            }

     @staticmethod
     def score_cats(
@@ -390,9 +427,6 @@ class Scorer:
             pred_cats = getter(example.predicted, attr)
             gold_cats = getter(example.reference, attr)

-            # I think the AUC metric is applicable regardless of whether we're
-            # doing multi-label classification? Unsure. If not, move this into
-            # the elif pred_cats and gold_cats block below.
             for label in labels:
                 pred_score = pred_cats.get(label, 0.0)
                 gold_score = gold_cats.get(label, 0.0)
@@ -542,6 +576,7 @@ class Scorer:
         head_attr: str = "head",
         head_getter: Callable[[Token, str], Token] = getattr,
         ignore_labels: Iterable[str] = SimpleFrozenList(),
+        missing_values: Set[Any] = MISSING_VALUES,
         **cfg,
     ) -> Dict[str, Any]:
         """Returns the UAS, LAS, and LAS per type scores for dependency
@@ -566,6 +601,7 @@ class Scorer:
         unlabelled = PRFScore()
         labelled = PRFScore()
         labelled_per_dep = dict()
+        missing_indices = set()
         for example in examples:
             gold_doc = example.reference
             pred_doc = example.predicted
@@ -575,13 +611,16 @@ class Scorer:
             for gold_i, token in enumerate(gold_doc):
                 dep = getter(token, attr)
                 head = head_getter(token, head_attr)
-                if dep not in ignore_labels:
-                    gold_deps.add((gold_i, head.i, dep))
-                    if dep not in labelled_per_dep:
-                        labelled_per_dep[dep] = PRFScore()
-                    if dep not in gold_deps_per_dep:
-                        gold_deps_per_dep[dep] = set()
-                    gold_deps_per_dep[dep].add((gold_i, head.i, dep))
+                if dep not in missing_values:
+                    if dep not in ignore_labels:
+                        gold_deps.add((gold_i, head.i, dep))
+                        if dep not in labelled_per_dep:
+                            labelled_per_dep[dep] = PRFScore()
+                        if dep not in gold_deps_per_dep:
+                            gold_deps_per_dep[dep] = set()
+                        gold_deps_per_dep[dep].add((gold_i, head.i, dep))
+                else:
+                    missing_indices.add(gold_i)
             pred_deps = set()
             pred_deps_per_dep = {}
             for token in pred_doc:
@@ -591,25 +630,26 @@ class Scorer:
                     gold_i = None
                 else:
                     gold_i = align.x2y[token.i].dataXd[0, 0]
-                dep = getter(token, attr)
-                head = head_getter(token, head_attr)
-                if dep not in ignore_labels and token.orth_.strip():
-                    if align.x2y.lengths[head.i] == 1:
-                        gold_head = align.x2y[head.i].dataXd[0, 0]
-                    else:
-                        gold_head = None
-                    # None is indistinct, so we can't just add it to the set
-                    # Multiple (None, None) deps are possible
-                    if gold_i is None or gold_head is None:
-                        unlabelled.fp += 1
-                        labelled.fp += 1
-                    else:
-                        pred_deps.add((gold_i, gold_head, dep))
-                        if dep not in labelled_per_dep:
-                            labelled_per_dep[dep] = PRFScore()
-                        if dep not in pred_deps_per_dep:
-                            pred_deps_per_dep[dep] = set()
-                        pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
+                if gold_i not in missing_indices:
+                    dep = getter(token, attr)
+                    head = head_getter(token, head_attr)
+                    if dep not in ignore_labels and token.orth_.strip():
+                        if align.x2y.lengths[head.i] == 1:
+                            gold_head = align.x2y[head.i].dataXd[0, 0]
+                        else:
+                            gold_head = None
+                        # None is indistinct, so we can't just add it to the set
+                        # Multiple (None, None) deps are possible
+                        if gold_i is None or gold_head is None:
+                            unlabelled.fp += 1
+                            labelled.fp += 1
+                        else:
+                            pred_deps.add((gold_i, gold_head, dep))
+                            if dep not in labelled_per_dep:
+                                labelled_per_dep[dep] = PRFScore()
+                            if dep not in pred_deps_per_dep:
+                                pred_deps_per_dep[dep] = set()
+                            pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
             labelled.score_set(pred_deps, gold_deps)
             for dep in labelled_per_dep:
                 labelled_per_dep[dep].score_set(
@@ -618,29 +658,34 @@ class Scorer:
             unlabelled.score_set(
                 set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps)
             )
-        return {
-            f"{attr}_uas": unlabelled.fscore,
-            f"{attr}_las": labelled.fscore,
-            f"{attr}_las_per_type": {
-                k: v.to_dict() for k, v in labelled_per_dep.items()
-            },
-        }
+        if len(unlabelled) > 0:
+            return {
+                f"{attr}_uas": unlabelled.fscore,
+                f"{attr}_las": labelled.fscore,
+                f"{attr}_las_per_type": {
+                    k: v.to_dict() for k, v in labelled_per_dep.items()
+                },
+            }
+        else:
+            return {
+                f"{attr}_uas": None,
+                f"{attr}_las": None,
+                f"{attr}_las_per_type": None,
+            }


-def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
-    """Compute per-entity PRFScore objects for a sequence of examples. The
-    results are returned as a dictionary keyed by the entity type. You can
-    add the PRFScore objects to get micro-averaged total.
+def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]:
+    """Compute micro-PRF and per-entity PRF scores for a sequence of examples.
     """
-    scores = defaultdict(PRFScore)
+    score_per_type = defaultdict(PRFScore)
     for eg in examples:
         if not eg.y.has_annotation("ENT_IOB"):
             continue
         golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
         align_x2y = eg.alignment.x2y
         for pred_ent in eg.x.ents:
-            if pred_ent.label_ not in scores:
-                scores[pred_ent.label_] = PRFScore()
+            if pred_ent.label_ not in score_per_type:
+                score_per_type[pred_ent.label_] = PRFScore()
             indices = align_x2y[pred_ent.start : pred_ent.end].dataXd.ravel()
             if len(indices):
                 g_span = eg.y[indices[0] : indices[-1] + 1]
@@ -650,13 +695,29 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
                 if all(token.ent_iob != 0 for token in g_span):
                     key = (pred_ent.label_, indices[0], indices[-1] + 1)
                     if key in golds:
-                        scores[pred_ent.label_].tp += 1
+                        score_per_type[pred_ent.label_].tp += 1
                         golds.remove(key)
                     else:
-                        scores[pred_ent.label_].fp += 1
+                        score_per_type[pred_ent.label_].fp += 1
         for label, start, end in golds:
-            scores[label].fn += 1
-    return scores
+            score_per_type[label].fn += 1
+    totals = PRFScore()
+    for prf in score_per_type.values():
+        totals += prf
+    if len(totals) > 0:
+        return {
+            "ents_p": totals.precision,
+            "ents_r": totals.recall,
+            "ents_f": totals.fscore,
+            "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
+        }
+    else:
+        return {
+            "ents_p": None,
+            "ents_r": None,
+            "ents_f": None,
+            "ents_per_type": None,
+        }
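
`get_ner_prf` now returns the final score dict directly (micro-averaged `ents_p`/`ents_r`/`ents_f` plus `ents_per_type`), which is why `EntityRecognizer.score` and `EntityRuler.score` above shrink to a single call. A minimal usage sketch, assuming a blank English pipeline:

```python
import spacy
from spacy.tokens import Doc, Span
from spacy.training import Example
from spacy.scorer import get_ner_prf

nlp = spacy.blank("en")
words = ["Apple", "is", "a", "company"]
pred = Doc(nlp.vocab, words=words)
pred.ents = [Span(pred, 0, 1, label="ORG")]
ref = Doc(nlp.vocab, words=words)
ref.ents = [Span(ref, 0, 1, label="ORG")]
scores = get_ner_prf([Example(pred, ref)])
assert scores["ents_f"] == 1.0

# With no entity annotation in any reference doc, the scores are None
ref_empty = Doc(nlp.vocab, words=words)
scores = get_ner_prf([Example(pred, ref_empty)])
assert scores["ents_f"] is None
```
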
#############################################################################


@@ -160,8 +160,8 @@ def test_attributeruler_score(nlp, pattern_dicts):
     scores = nlp.evaluate(dev_examples)
     # "cat" is the only correct lemma
     assert scores["lemma_acc"] == pytest.approx(0.2)
-    # the empty morphs are correct
-    assert scores["morph_acc"] == pytest.approx(0.6)
+    # no morphs are set
+    assert scores["morph_acc"] == None


 def test_attributeruler_rule_order(nlp):


@@ -277,6 +277,62 @@ def test_tag_score(tagged_doc):
     assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272)


+def test_partial_annotation(en_tokenizer):
+    pred_doc = en_tokenizer("a b c d e")
+    pred_doc[0].tag_ = "A"
+    pred_doc[0].pos_ = "X"
+    pred_doc[0].set_morph("Feat=Val")
+    pred_doc[0].dep_ = "dep"
+    # unannotated reference
+    ref_doc = en_tokenizer("a b c d e")
+    ref_doc.has_unknown_spaces = True
+    example = Example(pred_doc, ref_doc)
+    scorer = Scorer()
+    scores = scorer.score([example])
+    for key in scores:
+        # cats doesn't have an unset state
+        if key.startswith("cats"):
+            continue
+        assert scores[key] == None
+    # partially annotated reference, not overlapping with predicted annotation
+    ref_doc = en_tokenizer("a b c d e")
+    ref_doc.has_unknown_spaces = True
+    ref_doc[1].tag_ = "A"
+    ref_doc[1].pos_ = "X"
+    ref_doc[1].set_morph("Feat=Val")
+    ref_doc[1].dep_ = "dep"
+    example = Example(pred_doc, ref_doc)
+    scorer = Scorer()
+    scores = scorer.score([example])
+    assert scores["token_acc"] == None
+    assert scores["tag_acc"] == 0.0
+    assert scores["pos_acc"] == 0.0
+    assert scores["morph_acc"] == 0.0
+    assert scores["dep_uas"] == 1.0
+    assert scores["dep_las"] == 0.0
+    assert scores["sents_f"] == None
+    # partially annotated reference, overlapping with predicted annotation
+    ref_doc = en_tokenizer("a b c d e")
+    ref_doc.has_unknown_spaces = True
+    ref_doc[0].tag_ = "A"
+    ref_doc[0].pos_ = "X"
+    ref_doc[1].set_morph("Feat=Val")
+    ref_doc[1].dep_ = "dep"
+    example = Example(pred_doc, ref_doc)
+    scorer = Scorer()
+    scores = scorer.score([example])
+    assert scores["token_acc"] == None
+    assert scores["tag_acc"] == 1.0
+    assert scores["pos_acc"] == 1.0
+    assert scores["morph_acc"] == 0.0
+    assert scores["dep_uas"] == 1.0
+    assert scores["dep_las"] == 0.0
+    assert scores["sents_f"] == None


 def test_roc_auc_score():
     # Binary classification, toy tests from scikit-learn test suite
     y_true = [0, 1]


@@ -399,14 +399,13 @@ cdef class Doc:
            return True
        cdef int i
        cdef int range_start = 0
+       if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]:
+           attr = SENT_START
        attr = intify_attr(attr)
        # adjust attributes
        if attr == HEAD:
            # HEAD does not have an unset state, so rely on DEP
            attr = DEP
-       elif attr == self.vocab.strings["IS_SENT_START"]:
-           # as in Matcher, allow IS_SENT_START as an alias of SENT_START
-           attr = SENT_START
        # special cases for sentence boundaries
        if attr == SENT_START:
            if "sents" in self.user_hooks:


@@ -683,6 +683,7 @@ The L2 norm of the document's vector representation.
 | `user_hooks`         | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
 | `user_token_hooks`   | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
 | `user_span_hooks`    | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
+| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ |
 | `_`                  | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |

 ## Serialization fields {#serialization-fields}


@@ -68,6 +68,8 @@ Scores the tokenization:
 - `token_p`, `token_r`, `token_f`: precision, recall and F-score for token
   character spans

+Docs with `has_unknown_spaces` are skipped during scoring.
+
 > #### Example
 >
 > ```python
@@ -81,7 +83,8 @@
 ## Scorer.score_token_attr {#score_token_attr tag="staticmethod" new="3"}

-Scores a single token attribute.
+Scores a single token attribute. Tokens with missing values in the reference doc
+are skipped during scoring.

 > #### Example
 >
@@ -90,20 +93,22 @@ Scores a single token attribute.
 > print(scores["pos_acc"])
 > ```

 | Name             | Description |
-| --------------   | ----------- |
+| ---------------- | ----------- |
 | `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
 | `attr`           | The attribute to score. ~~str~~ |
 | _keyword-only_   | |
 | `getter`         | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
+| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ |
 | **RETURNS**      | A dictionary containing the score `{attr}_acc`. ~~Dict[str, float]~~ |

 ## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod" new="3"}

 Scores a single token attribute per feature for a token attribute in the
 Universal Dependencies
 [FEATS](https://universaldependencies.org/format.html#morphological-annotation)
-format.
+format. Tokens with missing values in the reference doc are skipped during
+scoring.

 > #### Example
 >
@@ -112,13 +117,14 @@ format.
 > print(scores["morph_per_feat"])
 > ```

 | Name             | Description |
-| --------------   | ----------- |
+| ---------------- | ----------- |
 | `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
 | `attr`           | The attribute to score. ~~str~~ |
 | _keyword-only_   | |
 | `getter`         | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
+| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ |
 | **RETURNS**      | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~ |

 ## Scorer.score_spans {#score_spans tag="staticmethod" new="3"}

@@ -131,17 +137,19 @@ Returns PRF scores for labeled or unlabeled spans.
 > print(scores["ents_f"])
 > ```

 | Name             | Description |
-| --------------   | ----------- |
+| ---------------- | ----------- |
 | `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
 | `attr`           | The attribute to score. ~~str~~ |
 | _keyword-only_   | |
 | `getter`         | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~ |
+| `has_annotation` | Defaults to `None`. If provided, `has_annotation(doc)` should return whether a `Doc` has annotation for this `attr`. Docs without annotation are skipped for scoring purposes. ~~Optional[Callable[[Doc], bool]]~~ |
 | **RETURNS**      | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |

 ## Scorer.score_deps {#score_deps tag="staticmethod" new="3"}

-Calculate the UAS, LAS, and LAS per type scores for dependency parses.
+Calculate the UAS, LAS, and LAS per type scores for dependency parses. Tokens
+with missing values for the `attr` (typically `dep`) are skipped during scoring.

 > #### Example
 >
@@ -160,16 +168,17 @@ Calculate the UAS, LAS, and LAS per type scores for dependency parses.
 > print(scores["dep_uas"], scores["dep_las"])
 > ```

 | Name             | Description |
-| ---------------  | ----------- |
+| ---------------- | ----------- |
 | `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
 | `attr`           | The attribute to score. ~~str~~ |
 | _keyword-only_   | |
 | `getter`         | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
 | `head_attr`      | The attribute containing the head token. ~~str~~ |
 | `head_getter`    | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. ~~Callable[[Doc, str], Token]~~ |
 | `ignore_labels`  | Labels to ignore while scoring (e.g. `"punct"`). ~~Iterable[str]~~ |
+| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ |
 | **RETURNS**      | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |

 ## Scorer.score_cats {#score_cats tag="staticmethod" new="3"}