Merge pull request #5833 from explosion/feature/scorer-adjustments

This commit is contained in:
Ines Montani 2020-07-31 14:00:39 +02:00 committed by GitHub
commit 6365837ca9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 264 additions and 154 deletions

View File

@ -1099,6 +1099,7 @@ class Language:
batch_size: int = 256, batch_size: int = 256,
scorer: Optional[Scorer] = None, scorer: Optional[Scorer] = None,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
scorer_cfg: Optional[Dict[str, Any]] = None,
) -> Dict[str, Union[float, dict]]: ) -> Dict[str, Union[float, dict]]:
"""Evaluate a model's pipeline components. """Evaluate a model's pipeline components.
@ -1109,6 +1110,8 @@ class Language:
will be created. will be created.
component_cfg (dict): An optional dictionary with extra keyword component_cfg (dict): An optional dictionary with extra keyword
arguments for specific components. arguments for specific components.
scorer_cfg (dict): An optional dictionary with extra keyword arguments
for the scorer.
RETURNS (Dict[str, Union[float, dict]]): A dictionary of evaluation scores. RETURNS (Dict[str, Union[float, dict]]): A dictionary of evaluation scores.
DOCS: https://spacy.io/api/language#evaluate DOCS: https://spacy.io/api/language#evaluate
@ -1126,8 +1129,10 @@ class Language:
raise TypeError(err) raise TypeError(err)
if component_cfg is None: if component_cfg is None:
component_cfg = {} component_cfg = {}
if scorer_cfg is None:
scorer_cfg = {}
if scorer is None: if scorer is None:
kwargs = component_cfg.get("scorer", {}) kwargs = dict(scorer_cfg)
kwargs.setdefault("verbose", verbose) kwargs.setdefault("verbose", verbose)
kwargs.setdefault("nlp", self) kwargs.setdefault("nlp", self)
scorer = Scorer(**kwargs) scorer = Scorer(**kwargs)

View File

@ -34,7 +34,7 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory( @Language.factory(
"parser", "parser",
assigns=["token.dep", "token.is_sent_start", "doc.sents"], assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
default_config={ default_config={
"moves": None, "moves": None,
"update_with_oracle_cut_size": 100, "update_with_oracle_cut_size": 100,
@ -120,7 +120,8 @@ cdef class DependencyParser(Parser):
return dep return dep
results = {} results = {}
results.update(Scorer.score_spans(examples, "sents", **kwargs)) results.update(Scorer.score_spans(examples, "sents", **kwargs))
results.update(Scorer.score_deps(examples, "dep", getter=dep_getter, kwargs.setdefault("getter", dep_getter)
ignore_labels=("p", "punct"), **kwargs)) kwargs.setdefault("ignore_label", ("p", "punct"))
results.update(Scorer.score_deps(examples, "dep", **kwargs))
del results["sents_per_type"] del results["sents_per_type"]
return results return results

View File

@ -1,55 +1,61 @@
from typing import Optional, Iterable, Dict, Any, Callable, Tuple, TYPE_CHECKING
import numpy as np import numpy as np
from .gold import Example
from .tokens import Token, Doc
from .errors import Errors from .errors import Errors
from .util import get_lang_class from .util import get_lang_class
from .morphology import Morphology from .morphology import Morphology
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
from .language import Language # noqa: F401
DEFAULT_PIPELINE = ["senter", "tagger", "morphologizer", "parser", "ner", "textcat"]
class PRFScore: class PRFScore:
""" """A precision / recall / F score."""
A precision / recall / F score
"""
def __init__(self): def __init__(self) -> None:
self.tp = 0 self.tp = 0
self.fp = 0 self.fp = 0
self.fn = 0 self.fn = 0
def score_set(self, cand, gold): def score_set(self, cand: set, gold: set) -> None:
self.tp += len(cand.intersection(gold)) self.tp += len(cand.intersection(gold))
self.fp += len(cand - gold) self.fp += len(cand - gold)
self.fn += len(gold - cand) self.fn += len(gold - cand)
@property @property
def precision(self): def precision(self) -> float:
return self.tp / (self.tp + self.fp + 1e-100) return self.tp / (self.tp + self.fp + 1e-100)
@property @property
def recall(self): def recall(self) -> float:
return self.tp / (self.tp + self.fn + 1e-100) return self.tp / (self.tp + self.fn + 1e-100)
@property @property
def fscore(self): def fscore(self) -> float:
p = self.precision p = self.precision
r = self.recall r = self.recall
return 2 * ((p * r) / (p + r + 1e-100)) return 2 * ((p * r) / (p + r + 1e-100))
def to_dict(self): def to_dict(self) -> Dict[str, float]:
return {"p": self.precision, "r": self.recall, "f": self.fscore} return {"p": self.precision, "r": self.recall, "f": self.fscore}
class ROCAUCScore: class ROCAUCScore:
""" """An AUC ROC score."""
An AUC ROC score.
"""
def __init__(self): def __init__(self) -> None:
self.golds = [] self.golds = []
self.cands = [] self.cands = []
self.saved_score = 0.0 self.saved_score = 0.0
self.saved_score_at_len = 0 self.saved_score_at_len = 0
def score_set(self, cand, gold): def score_set(self, cand, gold) -> None:
self.cands.append(cand) self.cands.append(cand)
self.golds.append(gold) self.golds.append(gold)
@ -70,51 +76,52 @@ class ROCAUCScore:
class Scorer: class Scorer:
"""Compute evaluation scores.""" """Compute evaluation scores."""
def __init__(self, nlp=None, **cfg): def __init__(
self,
nlp: Optional["Language"] = None,
default_lang: str = "xx",
default_pipeline=DEFAULT_PIPELINE,
**cfg,
) -> None:
"""Initialize the Scorer. """Initialize the Scorer.
DOCS: https://spacy.io/api/scorer#init DOCS: https://spacy.io/api/scorer#init
""" """
self.nlp = nlp self.nlp = nlp
self.cfg = cfg self.cfg = cfg
if not nlp: if not nlp:
# create a default pipeline nlp = get_lang_class(default_lang)()
nlp = get_lang_class("xx")() for pipe in default_pipeline:
nlp.add_pipe("senter") nlp.add_pipe(pipe)
nlp.add_pipe("tagger")
nlp.add_pipe("morphologizer")
nlp.add_pipe("parser")
nlp.add_pipe("ner")
nlp.add_pipe("textcat")
self.nlp = nlp self.nlp = nlp
def score(self, examples): def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
"""Evaluate a list of Examples. """Evaluate a list of Examples.
examples (Iterable[Example]): The predicted annotations + correct annotations. examples (Iterable[Example]): The predicted annotations + correct annotations.
RETURNS (Dict): A dictionary of scores. RETURNS (Dict): A dictionary of scores.
DOCS: https://spacy.io/api/scorer#score DOCS: https://spacy.io/api/scorer#score
""" """
scores = {} scores = {}
if hasattr(self.nlp.tokenizer, "score"): if hasattr(self.nlp.tokenizer, "score"):
scores.update(self.nlp.tokenizer.score(examples, **self.cfg)) scores.update(self.nlp.tokenizer.score(examples, **self.cfg))
for name, component in self.nlp.pipeline: for name, component in self.nlp.pipeline:
if hasattr(component, "score"): if hasattr(component, "score"):
scores.update(component.score(examples, **self.cfg)) scores.update(component.score(examples, **self.cfg))
return scores return scores
@staticmethod @staticmethod
def score_tokenization(examples, **cfg): def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, float]:
"""Returns accuracy and PRF scores for tokenization. """Returns accuracy and PRF scores for tokenization.
* token_acc: # correct tokens / # gold tokens * token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for token character spans * token_p/r/f: PRF for token character spans
examples (Iterable[Example]): Examples to score examples (Iterable[Example]): Examples to score
RETURNS (dict): A dictionary containing the scores token_acc/p/r/f. RETURNS (Dict[str, float]): A dictionary containing the scores
token_acc/p/r/f.
DOCS: https://spacy.io/api/scorer#score_tokenization
""" """
acc_score = PRFScore() acc_score = PRFScore()
prf_score = PRFScore() prf_score = PRFScore()
@ -145,16 +152,24 @@ class Scorer:
} }
@staticmethod @staticmethod
def score_token_attr(examples, attr, getter=getattr, **cfg): def score_token_attr(
examples: Iterable[Example],
attr: str,
*,
getter: Callable[[Token, str], Any] = getattr,
**cfg,
) -> Dict[str, float]:
"""Returns an accuracy score for a token-level attribute. """Returns an accuracy score for a token-level attribute.
examples (Iterable[Example]): Examples to score examples (Iterable[Example]): Examples to score
attr (str): The attribute to score. attr (str): The attribute to score.
getter (callable): Defaults to getattr. If provided, getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
getter(token, attr) should return the value of the attribute for an getter(token, attr) should return the value of the attribute for an
individual token. individual token.
RETURNS (dict): A dictionary containing the accuracy score under the RETURNS (Dict[str, float]): A dictionary containing the accuracy score
key attr_acc. under the key attr_acc.
DOCS: https://spacy.io/api/scorer#score_token_attr
""" """
tag_score = PRFScore() tag_score = PRFScore()
for example in examples: for example in examples:
@ -172,17 +187,21 @@ class Scorer:
gold_i = align.x2y[token.i].dataXd[0, 0] gold_i = align.x2y[token.i].dataXd[0, 0]
pred_tags.add((gold_i, getter(token, attr))) pred_tags.add((gold_i, getter(token, attr)))
tag_score.score_set(pred_tags, gold_tags) tag_score.score_set(pred_tags, gold_tags)
return { return {f"{attr}_acc": tag_score.fscore}
attr + "_acc": tag_score.fscore,
}
@staticmethod @staticmethod
def score_token_attr_per_feat(examples, attr, getter=getattr, **cfg): def score_token_attr_per_feat(
examples: Iterable[Example],
attr: str,
*,
getter: Callable[[Token, str], Any] = getattr,
**cfg,
):
"""Return PRF scores per feat for a token attribute in UFEATS format. """Return PRF scores per feat for a token attribute in UFEATS format.
examples (Iterable[Example]): Examples to score examples (Iterable[Example]): Examples to score
attr (str): The attribute to score. attr (str): The attribute to score.
getter (callable): Defaults to getattr. If provided, getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
getter(token, attr) should return the value of the attribute for an getter(token, attr) should return the value of the attribute for an
individual token. individual token.
RETURNS (dict): A dictionary containing the per-feat PRF scores under RETURNS (dict): A dictionary containing the per-feat PRF scores under
@ -223,20 +242,26 @@ class Scorer:
per_feat[field].score_set( per_feat[field].score_set(
pred_per_feat.get(field, set()), gold_per_feat.get(field, set()), pred_per_feat.get(field, set()), gold_per_feat.get(field, set()),
) )
return { return {f"{attr}_per_feat": per_feat}
attr + "_per_feat": per_feat,
}
@staticmethod @staticmethod
def score_spans(examples, attr, getter=getattr, **cfg): def score_spans(
examples: Iterable[Example],
attr: str,
*,
getter: Callable[[Doc, str], Any] = getattr,
**cfg,
) -> Dict[str, Any]:
"""Returns PRF scores for labeled spans. """Returns PRF scores for labeled spans.
examples (Iterable[Example]): Examples to score examples (Iterable[Example]): Examples to score
attr (str): The attribute to score. attr (str): The attribute to score.
getter (callable): Defaults to getattr. If provided, getter (Callable[[Doc, str], Any]): Defaults to getattr. If provided,
getter(doc, attr) should return the spans for the individual doc. getter(doc, attr) should return the spans for the individual doc.
RETURNS (dict): A dictionary containing the PRF scores under the RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under
keys attr_p/r/f and the per-type PRF scores under attr_per_type. the keys attr_p/r/f and the per-type PRF scores under attr_per_type.
DOCS: https://spacy.io/api/scorer#score_spans
""" """
score = PRFScore() score = PRFScore()
score_per_type = dict() score_per_type = dict()
@ -256,14 +281,12 @@ class Scorer:
# Find all predidate labels, for all and per type # Find all predidate labels, for all and per type
gold_spans = set() gold_spans = set()
pred_spans = set() pred_spans = set()
# Special case for ents: # Special case for ents:
# If we have missing values in the gold, we can't easily tell # If we have missing values in the gold, we can't easily tell
# whether our NER predictions are true. # whether our NER predictions are true.
# It seems bad but it's what we've always done. # It seems bad but it's what we've always done.
if attr == "ents" and not all(token.ent_iob != 0 for token in gold_doc): if attr == "ents" and not all(token.ent_iob != 0 for token in gold_doc):
continue continue
for span in getter(gold_doc, attr): for span in getter(gold_doc, attr):
gold_span = (span.label_, span.start, span.end - 1) gold_span = (span.label_, span.start, span.end - 1)
gold_spans.add(gold_span) gold_spans.add(gold_span)
@ -279,38 +302,39 @@ class Scorer:
# Score for all labels # Score for all labels
score.score_set(pred_spans, gold_spans) score.score_set(pred_spans, gold_spans)
results = { results = {
attr + "_p": score.precision, f"{attr}_p": score.precision,
attr + "_r": score.recall, f"{attr}_r": score.recall,
attr + "_f": score.fscore, f"{attr}_f": score.fscore,
attr + "_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
} }
return results return results
@staticmethod @staticmethod
def score_cats( def score_cats(
examples, examples: Iterable[Example],
attr, attr: str,
getter=getattr, *,
labels=[], getter: Callable[[Doc, str], Any] = getattr,
multi_label=True, labels: Iterable[str] = tuple(),
positive_label=None, multi_label: bool = True,
**cfg positive_label: Optional[str] = None,
): **cfg,
) -> Dict[str, Any]:
"""Returns PRF and ROC AUC scores for a doc-level attribute with a """Returns PRF and ROC AUC scores for a doc-level attribute with a
dict with scores for each label like Doc.cats. The reported overall dict with scores for each label like Doc.cats. The reported overall
score depends on the scorer settings. score depends on the scorer settings.
examples (Iterable[Example]): Examples to score examples (Iterable[Example]): Examples to score
attr (str): The attribute to score. attr (str): The attribute to score.
getter (callable): Defaults to getattr. If provided, getter (Callable[[Doc, str], Any]): Defaults to getattr. If provided,
getter(doc, attr) should return the values for the individual doc. getter(doc, attr) should return the values for the individual doc.
labels (Iterable[str]): The set of possible labels. Defaults to []. labels (Iterable[str]): The set of possible labels. Defaults to [].
multi_label (bool): Whether the attribute allows multiple labels. multi_label (bool): Whether the attribute allows multiple labels.
Defaults to True. Defaults to True.
positive_label (str): The positive label for a binary task with positive_label (str): The positive label for a binary task with
exclusive classes. Defaults to None. exclusive classes. Defaults to None.
RETURNS (dict): A dictionary containing the scores, with inapplicable RETURNS (Dict[str, Any]): A dictionary containing the scores, with
scores as None: inapplicable scores as None:
for all: for all:
attr_score (one of attr_f / attr_macro_f / attr_macro_auc), attr_score (one of attr_f / attr_macro_f / attr_macro_auc),
attr_score_desc (text description of the overall score), attr_score_desc (text description of the overall score),
@ -319,6 +343,8 @@ class Scorer:
for binary exclusive with positive label: attr_p/r/f for binary exclusive with positive label: attr_p/r/f
for 3+ exclusive classes, macro-averaged fscore: attr_macro_f for 3+ exclusive classes, macro-averaged fscore: attr_macro_f
for multilabel, macro-averaged AUC: attr_macro_auc for multilabel, macro-averaged AUC: attr_macro_auc
DOCS: https://spacy.io/api/scorer#score_cats
""" """
score = PRFScore() score = PRFScore()
f_per_type = dict() f_per_type = dict()
@ -367,64 +393,67 @@ class Scorer:
) )
) )
results = { results = {
attr + "_score": None, f"{attr}_score": None,
attr + "_score_desc": None, f"{attr}_score_desc": None,
attr + "_p": None, f"{attr}_p": None,
attr + "_r": None, f"{attr}_r": None,
attr + "_f": None, f"{attr}_f": None,
attr + "_macro_f": None, f"{attr}_macro_f": None,
attr + "_macro_auc": None, f"{attr}_macro_auc": None,
attr + "_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()}, f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
attr + "_auc_per_type": {k: v.score for k, v in auc_per_type.items()}, f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
} }
if len(labels) == 2 and not multi_label and positive_label: if len(labels) == 2 and not multi_label and positive_label:
results[attr + "_p"] = score.precision results[f"{attr}_p"] = score.precision
results[attr + "_r"] = score.recall results[f"{attr}_r"] = score.recall
results[attr + "_f"] = score.fscore results[f"{attr}_f"] = score.fscore
results[attr + "_score"] = results[attr + "_f"] results[f"{attr}_score"] = results[f"{attr}_f"]
results[attr + "_score_desc"] = "F (" + positive_label + ")" results[f"{attr}_score_desc"] = f"F ({positive_label})"
elif not multi_label: elif not multi_label:
results[attr + "_macro_f"] = sum( results[f"{attr}_macro_f"] = sum(
[score.fscore for label, score in f_per_type.items()] [score.fscore for label, score in f_per_type.items()]
) / (len(f_per_type) + 1e-100) ) / (len(f_per_type) + 1e-100)
results[attr + "_score"] = results[attr + "_macro_f"] results[f"{attr}_score"] = results[f"{attr}_macro_f"]
results[attr + "_score_desc"] = "macro F" results[f"{attr}_score_desc"] = "macro F"
else: else:
results[attr + "_macro_auc"] = max( results[f"{attr}_macro_auc"] = max(
sum([score.score for label, score in auc_per_type.items()]) sum([score.score for label, score in auc_per_type.items()])
/ (len(auc_per_type) + 1e-100), / (len(auc_per_type) + 1e-100),
-1, -1,
) )
results[attr + "_score"] = results[attr + "_macro_auc"] results[f"{attr}_score"] = results[f"{attr}_macro_auc"]
results[attr + "_score_desc"] = "macro AUC" results[f"{attr}_score_desc"] = "macro AUC"
return results return results
@staticmethod @staticmethod
def score_deps( def score_deps(
examples, examples: Iterable[Example],
attr, attr: str,
getter=getattr, *,
head_attr="head", getter: Callable[[Token, str], Any] = getattr,
head_getter=getattr, head_attr: str = "head",
ignore_labels=tuple(), head_getter: Callable[[Token, str], Any] = getattr,
**cfg ignore_labels: Tuple[str] = tuple(),
): **cfg,
) -> Dict[str, Any]:
"""Returns the UAS, LAS, and LAS per type scores for dependency """Returns the UAS, LAS, and LAS per type scores for dependency
parses. parses.
examples (Iterable[Example]): Examples to score examples (Iterable[Example]): Examples to score
attr (str): The attribute containing the dependency label. attr (str): The attribute containing the dependency label.
getter (callable): Defaults to getattr. If provided, getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
getter(token, attr) should return the value of the attribute for an getter(token, attr) should return the value of the attribute for an
individual token. individual token.
head_attr (str): The attribute containing the head token. Defaults to head_attr (str): The attribute containing the head token. Defaults to
'head'. 'head'.
head_getter (callable): Defaults to getattr. If provided, head_getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
head_getter(token, attr) should return the value of the head for an head_getter(token, attr) should return the value of the head for an
individual token. individual token.
ignore_labels (Tuple): Labels to ignore while scoring (e.g., punct). ignore_labels (Tuple): Labels to ignore while scoring (e.g., punct).
RETURNS (dict): A dictionary containing the scores: RETURNS (Dict[str, Any]): A dictionary containing the scores:
attr_uas, attr_las, and attr_las_per_type. attr_uas, attr_las, and attr_las_per_type.
DOCS: https://spacy.io/api/scorer#score_deps
""" """
unlabelled = PRFScore() unlabelled = PRFScore()
labelled = PRFScore() labelled = PRFScore()
@ -482,10 +511,11 @@ class Scorer:
set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps) set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps)
) )
return { return {
attr + "_uas": unlabelled.fscore, f"{attr}_uas": unlabelled.fscore,
attr + "_las": labelled.fscore, f"{attr}_las": labelled.fscore,
attr f"{attr}_las_per_type": {
+ "_las_per_type": {k: v.to_dict() for k, v in labelled_per_dep.items()}, k: v.to_dict() for k, v in labelled_per_dep.items()
},
} }

View File

@ -118,7 +118,7 @@ def test_overfitting_IO():
# Test scoring # Test scoring
scores = nlp.evaluate( scores = nlp.evaluate(
train_examples, component_cfg={"scorer": {"positive_label": "POSITIVE"}} train_examples, scorer_cfg={"positive_label": "POSITIVE"}
) )
assert scores["cats_f"] == 1.0 assert scores["cats_f"] == 1.0
assert scores["cats_score"] == 1.0 assert scores["cats_score"] == 1.0

View File

@ -302,6 +302,7 @@ Evaluate a model's pipeline components.
| `batch_size` | int | The batch size to use. | | `batch_size` | int | The batch size to use. |
| `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. | | `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. |
| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. | | `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. |
| `scorer_cfg` | `Dict[str, Any]` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. |
| **RETURNS** | `Dict[str, Union[float, dict]]` | A dictionary of evaluation scores. | | **RETURNS** | `Dict[str, Union[float, dict]]` | A dictionary of evaluation scores. |
## Language.use_params {#use_params tag="contextmanager, method"} ## Language.use_params {#use_params tag="contextmanager, method"}

View File

@ -6,10 +6,9 @@ source: spacy/scorer.py
--- ---
The `Scorer` computes evaluation scores. It's typically created by The `Scorer` computes evaluation scores. It's typically created by
[`Language.evaluate`](/api/language#evaluate). [`Language.evaluate`](/api/language#evaluate). In addition, the `Scorer`
provides a number of evaluation methods for evaluating [`Token`](/api/token) and
In addition, the `Scorer` provides a number of evaluation methods for evaluating [`Doc`](/api/doc) attributes.
`Token` and `Doc` attributes.
## Scorer.\_\_init\_\_ {#init tag="method"} ## Scorer.\_\_init\_\_ {#init tag="method"}
@ -20,10 +19,10 @@ Create a new `Scorer`.
> ```python > ```python
> from spacy.scorer import Scorer > from spacy.scorer import Scorer
> >
> # default scoring pipeline > # Default scoring pipeline
> scorer = Scorer() > scorer = Scorer()
> >
> # provided scoring pipeline > # Provided scoring pipeline
> nlp = spacy.load("en_core_web_sm") > nlp = spacy.load("en_core_web_sm")
> scorer = Scorer(nlp) > scorer = Scorer(nlp)
> ``` > ```
@ -40,16 +39,20 @@ scoring methods provided by the components in the pipeline.
The returned `Dict` contains the scores provided by the individual pipeline The returned `Dict` contains the scores provided by the individual pipeline
components. For the scoring methods provided by the `Scorer` and used by the core components. For the scoring methods provided by the `Scorer` and used by the core
pipeline components, the individual score names start with the `Token` or `Doc` pipeline components, the individual score names start with the `Token` or `Doc`
attribute being scored: `token_acc`, `token_p/r/f`, `sents_p/r/f`, `tag_acc`, attribute being scored:
`pos_acc`, `morph_acc`, `morph_per_feat`, `lemma_acc`, `dep_uas`, `dep_las`,
`dep_las_per_type`, `ents_p/r/f`, `ents_per_type`, `textcat_macro_auc`, - `token_acc`, `token_p`, `token_r`, `token_f`
`textcat_macro_f`. - `sents_p`, `sents_r`, `sents_f`
- `tag_acc`, `pos_acc`, `morph_acc`, `morph_per_feat`, `lemma_acc`
- `dep_uas`, `dep_las`, `dep_las_per_type`
- `ents_p`, `ents_r`, `ents_f`, `ents_per_type`
- `textcat_macro_auc`, `textcat_macro_f`
> #### Example > #### Example
> >
> ```python > ```python
> scorer = Scorer() > scorer = Scorer()
> scorer.score(examples) > scores = scorer.score(examples)
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -57,78 +60,148 @@ attribute being scored: `token_acc`, `token_p/r/f`, `sents_p/r/f`, `tag_acc`,
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | | `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
| **RETURNS** | `Dict` | A dictionary of scores. | | **RETURNS** | `Dict` | A dictionary of scores. |
## Scorer.score_tokenization {#score_tokenization tag="staticmethod"} ## Scorer.score_tokenization {#score_tokenization tag="staticmethod" new="3"}
Scores the tokenization: Scores the tokenization:
- `token_acc`: # correct tokens / # gold tokens - `token_acc`: number of correct tokens / number of gold tokens
- `token_p/r/f`: PRF for token character spans - `token_p`, `token_r`, `token_f`: precision, recall and F-score for token
character spans
> #### Example
>
> ```python
> scores = Scorer.score_tokenization(examples)
> ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------------------- | --------------------------------------------------------------------------------------------- | | ----------- | ------------------- | --------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | | `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
| **RETURNS** | `Dict` | A dictionary containing the scores `token_acc/p/r/f`. | | **RETURNS** | `Dict` | A dictionary containing the scores `token_acc`, `token_p`, `token_r`, `token_f`. |
## Scorer.score_token_attr {#score_token_attr tag="staticmethod"} ## Scorer.score_token_attr {#score_token_attr tag="staticmethod" new="3"}
Scores a single token attribute. Scores a single token attribute.
| Name | Type | Description | > #### Example
| ----------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- | >
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | > ```python
| `attr` | `str` | The attribute to score. | > scores = Scorer.score_token_attr(examples, "pos")
| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. | > print(scores["pos_acc"])
| **RETURNS** | `Dict` | A dictionary containing the score `attr_acc`. | > ```
## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod"} | Name | Type | Description |
| -------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
| `attr` | `str` | The attribute to score. |
| _keyword-only_ | | |
| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
| **RETURNS** | `Dict[str, float]` | A dictionary containing the score `{attr}_acc`. |
Scores a single token attribute per feature for a token attribute in UFEATS ## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod" new="3"}
Scores a single token attribute per feature for a token attribute in
[UFEATS](https://universaldependencies.org/format.html#morphological-annotation)
format. format.
| Name | Type | Description | > #### Example
| ----------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- | >
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | > ```python
| `attr` | `str` | The attribute to score. | > scores = Scorer.score_token_attr_per_feat(examples, "morph")
| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. | > print(scores["morph_per_feat"])
| **RETURNS** | `Dict` | A dictionary containing the per-feature PRF scores under the key `attr_per_feat`. | > ```
## Scorer.score_spans {#score_spans tag="staticmethod"} | Name | Type | Description |
| -------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
| `attr` | `str` | The attribute to score. |
| _keyword-only_ | | |
| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
| **RETURNS** | `Dict` | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. |
## Scorer.score_spans {#score_spans tag="staticmethod" new="3"}
Returns PRF scores for labeled or unlabeled spans. Returns PRF scores for labeled or unlabeled spans.
| Name | Type | Description | > #### Example
| ----------- | ------------------- | --------------------------------------------------------------------------------------------------------------------- | >
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | > ```python
| `attr` | `str` | The attribute to score. | > scores = Scorer.score_spans(examples, "ents")
| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. | > print(scores["ents_f"])
| **RETURNS** | `Dict` | A dictionary containing the PRF scores under the keys `attr_p/r/f` and the per-type PRF scores under `attr_per_type`. | > ```
## Scorer.score_deps {#score_deps tag="staticmethod"} | Name | Type | Description |
| -------------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
| `attr` | `str` | The attribute to score. |
| _keyword-only_ | | |
| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. |
| **RETURNS** | `Dict` | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. |
## Scorer.score_deps {#score_deps tag="staticmethod" new="3"}
Calculate the UAS, LAS, and LAS per type scores for dependency parses. Calculate the UAS, LAS, and LAS per type scores for dependency parses.
> #### Example
>
> ```python
> def dep_getter(token, attr):
> dep = getattr(token, attr)
> dep = token.vocab.strings.as_string(dep).lower()
> return dep
>
> scores = Scorer.score_deps(
> examples,
> "dep",
> getter=dep_getter,
> ignore_labels=("p", "punct")
> )
> print(scores["dep_uas"], scores["dep_las"])
> ```
| Name | Type | Description | | Name | Type | Description |
| --------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- | | --------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | | `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
| `attr` | `str` | The attribute containing the dependency label. | | `attr` | `str` | The attribute containing the dependency label. |
| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. | | _keyword-only_ | | |
| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
| `head_attr` | `str` | The attribute containing the head token. | | `head_attr` | `str` | The attribute containing the head token. |
| `head_getter` | `callable` | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. | | `head_getter` | `Callable` | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. |
| `ignore_labels` | `Tuple` | Labels to ignore while scoring (e.g., `punct`). | | `ignore_labels` | `Tuple` | Labels to ignore while scoring (e.g., `punct`). |
| **RETURNS** | `Dict` | A dictionary containing the scores: `attr_uas`, `attr_las`, and `attr_las_per_type`. | | **RETURNS** | `Dict` | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. |
## Scorer.score_cats {#score_cats tag="staticmethod"} ## Scorer.score_cats {#score_cats tag="staticmethod" new="3"}
Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict
containing scores for each label like `Doc.cats`. The reported overall score containing scores for each label like `Doc.cats`. The reported overall score
depends on the scorer settings. depends on the scorer settings:
| Name | Type | Description | 1. **all:** `{attr}_score` (one of `{attr}_f` / `{attr}_macro_f` /
| ---------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | `{attr}_macro_auc`), `{attr}_score_desc` (text description of the overall
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | score), `{attr}_f_per_type`, `{attr}_auc_per_type`
| `attr` | `str` | The attribute to score. | 2. **binary exclusive with positive label:** `{attr}_p`, `{attr}_r`, `{attr}_f`
| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. | 3. **3+ exclusive classes**, macro-averaged F-score: `{attr}_macro_f`;
| `labels` | `Iterable[str]` | The set of possible labels. Defaults to `[]`. | 3. **3+ exclusive classes**, macro-averaged F-score: `{attr}_macro_f`
| `multi_label` | `bool` | Whether the attribute allows multiple labels. Defaults to `True`. |
| `positive_label` | `str` | The positive label for a binary task with exclusive classes. Defaults to `None`. | > #### Example
| **RETURNS** | `Dict` | A dictionary containing the scores, with inapplicable scores as `None`: 1) for all: `attr_score` (one of `attr_f` / `attr_macro_f` / `attr_macro_auc`), `attr_score_desc` (text description of the overall score), `attr_f_per_type`, `attr_auc_per_type`; 2) for binary exclusive with positive label: `attr_p/r/f`; 3) for 3+ exclusive classes, macro-averaged fscore: `attr_macro_f`; 4) for multilabel, macro-averaged AUC: `attr_macro_auc` | >
> ```python
> labels = ["LABEL_A", "LABEL_B", "LABEL_C"]
> scores = Scorer.score_cats(
> examples,
> "cats",
> labels=labels
> )
> print(scores["cats_macro_auc"])
> ```
| Name | Type | Description |
| ---------------- | ------------------- | ------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
| `attr` | `str` | The attribute to score. |
| _keyword-only_ | | |
| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. |
| `labels` | `Iterable[str]` | The set of possible labels. Defaults to `[]`. |
| `multi_label` | `bool` | Whether the attribute allows multiple labels. Defaults to `True`. |
| `positive_label` | `str` | The positive label for a binary task with exclusive classes. Defaults to `None`. |
| **RETURNS** | `Dict` | A dictionary containing the scores, with inapplicable scores as `None`. |