Small adjustments to Scorer and docs

parent 256b24b720
commit ac24adec73

Changed files: spacy/scorer.py (224)
spacy/scorer.py

@@ -1,47 +1,53 @@
+from typing import Optional, Iterable, Dict, Any, Callable, Tuple, TYPE_CHECKING
 import numpy as np
 
+from .gold import Example
+from .tokens import Token
 from .errors import Errors
 from .util import get_lang_class
 from .morphology import Morphology
 
+if TYPE_CHECKING:
+    # This lets us add type hints for mypy etc. without causing circular imports
+    from .language import Language  # noqa: F401
+
+
+DEFAULT_PIPELINE = ["senter", "tagger", "morphologizer", "parser", "ner", "textcat"]
+
 
 class PRFScore:
-    """
-    A precision / recall / F score
-    """
+    """A precision / recall / F score."""
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.tp = 0
         self.fp = 0
         self.fn = 0
 
-    def score_set(self, cand, gold):
+    def score_set(self, cand: set, gold: set) -> None:
         self.tp += len(cand.intersection(gold))
         self.fp += len(cand - gold)
         self.fn += len(gold - cand)
 
     @property
-    def precision(self):
+    def precision(self) -> float:
         return self.tp / (self.tp + self.fp + 1e-100)
 
     @property
-    def recall(self):
+    def recall(self) -> float:
         return self.tp / (self.tp + self.fn + 1e-100)
 
     @property
-    def fscore(self):
+    def fscore(self) -> float:
         p = self.precision
         r = self.recall
         return 2 * ((p * r) / (p + r + 1e-100))
 
-    def to_dict(self):
+    def to_dict(self) -> Dict[str, float]:
         return {"p": self.precision, "r": self.recall, "f": self.fscore}
 
 
 class ROCAUCScore:
-    """
-    An AUC ROC score.
-    """
+    """An AUC ROC score."""
 
     def __init__(self):
         self.golds = []
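For illustration (not part of the commit), a minimal sketch of how `PRFScore` accumulates counts over candidate/gold sets and reports smoothed precision, recall and F-score:

```python
from spacy.scorer import PRFScore

score = PRFScore()
# Each call adds true positives (overlap), false positives (candidate only)
# and false negatives (gold only) to the running counts.
score.score_set({("PERSON", 0, 1), ("ORG", 3, 4)}, {("PERSON", 0, 1)})
score.score_set(set(), {("GPE", 7, 8)})

# tp=1, fp=1, fn=1; the 1e-100 terms only guard against division by zero.
print(score.to_dict())  # {'p': 0.5, 'r': 0.5, 'f': 0.5}
```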
@@ -49,7 +55,7 @@ class ROCAUCScore:
         self.saved_score = 0.0
         self.saved_score_at_len = 0
 
-    def score_set(self, cand, gold):
+    def score_set(self, cand, gold) -> None:
         self.cands.append(cand)
         self.golds.append(gold)
 
@ -70,7 +76,13 @@ class ROCAUCScore:
|
||||||
class Scorer:
|
class Scorer:
|
||||||
"""Compute evaluation scores."""
|
"""Compute evaluation scores."""
|
||||||
|
|
||||||
def __init__(self, nlp=None, **cfg):
|
def __init__(
|
||||||
|
self,
|
||||||
|
nlp: Optional["Language"] = None,
|
||||||
|
default_lang: str = "xx",
|
||||||
|
default_pipeline=DEFAULT_PIPELINE,
|
||||||
|
**cfg,
|
||||||
|
) -> None:
|
||||||
"""Initialize the Scorer.
|
"""Initialize the Scorer.
|
||||||
RETURNS (Scorer): The newly created object.
|
RETURNS (Scorer): The newly created object.
|
||||||
|
|
||||||
|
@@ -78,44 +90,39 @@ class Scorer:
         """
         self.nlp = nlp
         self.cfg = cfg
 
         if not nlp:
-            # create a default pipeline
-            nlp = get_lang_class("xx")()
-            nlp.add_pipe("senter")
-            nlp.add_pipe("tagger")
-            nlp.add_pipe("morphologizer")
-            nlp.add_pipe("parser")
-            nlp.add_pipe("ner")
-            nlp.add_pipe("textcat")
+            nlp = get_lang_class(default_lang)()
+            for pipe in default_pipeline:
+                nlp.add_pipe(pipe)
             self.nlp = nlp
 
-    def score(self, examples):
+    def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
         """Evaluate a list of Examples.
 
         examples (Iterable[Example]): The predicted annotations + correct annotations.
         RETURNS (Dict): A dictionary of scores.
 
         DOCS: https://spacy.io/api/scorer#score
         """
         scores = {}
 
         if hasattr(self.nlp.tokenizer, "score"):
             scores.update(self.nlp.tokenizer.score(examples, **self.cfg))
         for name, component in self.nlp.pipeline:
             if hasattr(component, "score"):
                 scores.update(component.score(examples, **self.cfg))
 
         return scores
 
     @staticmethod
-    def score_tokenization(examples, **cfg):
+    def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, float]:
         """Returns accuracy and PRF scores for tokenization.
 
         * token_acc: # correct tokens / # gold tokens
         * token_p/r/f: PRF for token character spans
 
         examples (Iterable[Example]): Examples to score
-        RETURNS (dict): A dictionary containing the scores token_acc/p/r/f.
+        RETURNS (Dict[str, float]): A dictionary containing the scores
+            token_acc/p/r/f.
+
+        DOCS: https://spacy.io/api/scorer#score_tokenization
         """
         acc_score = PRFScore()
         prf_score = PRFScore()
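As a quick illustration of the constructor change above (a sketch, assuming the keyword arguments shown in this diff), the previously hard-coded default pipeline is now configurable:

```python
from spacy.scorer import Scorer

# Default: a blank multi-language ("xx") pipeline with the DEFAULT_PIPELINE components
scorer = Scorer()

# Hypothetical override: score against a blank English pipeline with fewer components
scorer = Scorer(default_lang="en", default_pipeline=["senter", "tagger", "parser"])
```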
@@ -146,16 +153,24 @@
         }
 
     @staticmethod
-    def score_token_attr(examples, attr, getter=getattr, **cfg):
+    def score_token_attr(
+        examples: Iterable[Example],
+        attr: str,
+        *,
+        getter: Callable[[Token, str], Any] = getattr,
+        **cfg,
+    ) -> Dict[str, float]:
         """Returns an accuracy score for a token-level attribute.
 
         examples (Iterable[Example]): Examples to score
         attr (str): The attribute to score.
-        getter (callable): Defaults to getattr. If provided,
+        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
             getter(token, attr) should return the value of the attribute for an
             individual token.
-        RETURNS (dict): A dictionary containing the accuracy score under the
-            key attr_acc.
+        RETURNS (Dict[str, float]): A dictionary containing the accuracy score
+            under the key attr_acc.
+
+        DOCS: https://spacy.io/api/scorer#score_token_attr
         """
         tag_score = PRFScore()
         for example in examples:
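For example (a hypothetical getter, not part of the commit; `examples` is assumed to be an iterable of `Example` objects), the now keyword-only `getter` argument can normalize an attribute before it is compared:

```python
from spacy.scorer import Scorer

def lower_tag(token, attr):
    # Hypothetical getter: compare fine-grained tags case-insensitively
    return token.tag_.lower()

scores = Scorer.score_token_attr(examples, "tag", getter=lower_tag)
print(scores["tag_acc"])
```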
@@ -173,17 +188,21 @@
                     gold_i = align.x2y[token.i].dataXd[0, 0]
                     pred_tags.add((gold_i, getter(token, attr)))
             tag_score.score_set(pred_tags, gold_tags)
-        return {
-            attr + "_acc": tag_score.fscore,
-        }
+        return {f"{attr}_acc": tag_score.fscore}
 
     @staticmethod
-    def score_token_attr_per_feat(examples, attr, getter=getattr, **cfg):
+    def score_token_attr_per_feat(
+        examples: Iterable[Example],
+        attr: str,
+        *,
+        getter: Callable[[Token, str], Any] = getattr,
+        **cfg,
+    ):
         """Return PRF scores per feat for a token attribute in UFEATS format.
 
         examples (Iterable[Example]): Examples to score
         attr (str): The attribute to score.
-        getter (callable): Defaults to getattr. If provided,
+        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
             getter(token, attr) should return the value of the attribute for an
             individual token.
         RETURNS (dict): A dictionary containing the per-feat PRF scores unders
@@ -224,20 +243,26 @@
                 per_feat[field].score_set(
                     pred_per_feat.get(field, set()), gold_per_feat.get(field, set()),
                 )
-        return {
-            attr + "_per_feat": per_feat,
-        }
+        return {f"{attr}_per_feat": per_feat}
 
     @staticmethod
-    def score_spans(examples, attr, getter=getattr, **cfg):
+    def score_spans(
+        examples: Iterable[Example],
+        attr: str,
+        *,
+        getter: Callable[[Token, str], Any] = getattr,
+        **cfg,
+    ) -> Dict[str, Any]:
         """Returns PRF scores for labeled spans.
 
         examples (Iterable[Example]): Examples to score
         attr (str): The attribute to score.
-        getter (callable): Defaults to getattr. If provided,
+        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
             getter(doc, attr) should return the spans for the individual doc.
-        RETURNS (dict): A dictionary containing the PRF scores under the
-            keys attr_p/r/f and the per-type PRF scores under attr_per_type.
+        RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under
+            the keys attr_p/r/f and the per-type PRF scores under attr_per_type.
+
+        DOCS: https://spacy.io/api/scorer#score_spans
         """
         score = PRFScore()
         score_per_type = dict()
@ -257,14 +282,12 @@ class Scorer:
|
||||||
# Find all predidate labels, for all and per type
|
# Find all predidate labels, for all and per type
|
||||||
gold_spans = set()
|
gold_spans = set()
|
||||||
pred_spans = set()
|
pred_spans = set()
|
||||||
|
|
||||||
# Special case for ents:
|
# Special case for ents:
|
||||||
# If we have missing values in the gold, we can't easily tell
|
# If we have missing values in the gold, we can't easily tell
|
||||||
# whether our NER predictions are true.
|
# whether our NER predictions are true.
|
||||||
# It seems bad but it's what we've always done.
|
# It seems bad but it's what we've always done.
|
||||||
if attr == "ents" and not all(token.ent_iob != 0 for token in gold_doc):
|
if attr == "ents" and not all(token.ent_iob != 0 for token in gold_doc):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
for span in getter(gold_doc, attr):
|
for span in getter(gold_doc, attr):
|
||||||
gold_span = (span.label_, span.start, span.end - 1)
|
gold_span = (span.label_, span.start, span.end - 1)
|
||||||
gold_spans.add(gold_span)
|
gold_spans.add(gold_span)
|
||||||
|
@@ -280,38 +303,39 @@
         # Score for all labels
         score.score_set(pred_spans, gold_spans)
         results = {
-            attr + "_p": score.precision,
-            attr + "_r": score.recall,
-            attr + "_f": score.fscore,
-            attr + "_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
+            f"{attr}_p": score.precision,
+            f"{attr}_r": score.recall,
+            f"{attr}_f": score.fscore,
+            f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
         }
         return results
 
     @staticmethod
     def score_cats(
-        examples,
-        attr,
-        getter=getattr,
-        labels=[],
-        multi_label=True,
-        positive_label=None,
-        **cfg
-    ):
+        examples: Iterable[Example],
+        attr: str,
+        *,
+        getter: Callable[[Token, str], Any] = getattr,
+        labels: Iterable[str] = tuple(),
+        multi_label: bool = True,
+        positive_label: Optional[str] = None,
+        **cfg,
+    ) -> Dict[str, Any]:
         """Returns PRF and ROC AUC scores for a doc-level attribute with a
         dict with scores for each label like Doc.cats. The reported overall
         score depends on the scorer settings.
 
         examples (Iterable[Example]): Examples to score
         attr (str): The attribute to score.
-        getter (callable): Defaults to getattr. If provided,
+        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
             getter(doc, attr) should return the values for the individual doc.
         labels (Iterable[str]): The set of possible labels. Defaults to [].
         multi_label (bool): Whether the attribute allows multiple labels.
             Defaults to True.
         positive_label (str): The positive label for a binary task with
             exclusive classes. Defaults to None.
-        RETURNS (dict): A dictionary containing the scores, with inapplicable
-            scores as None:
+        RETURNS (Dict[str, Any]): A dictionary containing the scores, with
+            inapplicable scores as None:
             for all:
                 attr_score (one of attr_f / attr_macro_f / attr_macro_auc),
                 attr_score_desc (text description of the overall score),
@@ -320,6 +344,8 @@
             for binary exclusive with positive label: attr_p/r/f
             for 3+ exclusive classes, macro-averaged fscore: attr_macro_f
             for multilabel, macro-averaged AUC: attr_macro_auc
+
+        DOCS: https://spacy.io/api/scorer#score_cats
         """
         score = PRFScore()
         f_per_type = dict()
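To illustrate the binary exclusive case described in the docstring above (a hypothetical two-label textcat setup, not taken from the commit), passing `positive_label` makes the F-score of that label the overall score:

```python
from spacy.scorer import Scorer

scores = Scorer.score_cats(
    examples,
    "cats",
    labels=["POSITIVE", "NEGATIVE"],
    multi_label=False,
    positive_label="POSITIVE",
)
print(scores["cats_p"], scores["cats_r"], scores["cats_f"])
print(scores["cats_score_desc"])  # "F (POSITIVE)"
```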
@@ -368,64 +394,67 @@
                 )
             )
         results = {
-            attr + "_score": None,
-            attr + "_score_desc": None,
-            attr + "_p": None,
-            attr + "_r": None,
-            attr + "_f": None,
-            attr + "_macro_f": None,
-            attr + "_macro_auc": None,
-            attr + "_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
-            attr + "_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
+            f"{attr}_score": None,
+            f"{attr}_score_desc": None,
+            f"{attr}_p": None,
+            f"{attr}_r": None,
+            f"{attr}_f": None,
+            f"{attr}_macro_f": None,
+            f"{attr}_macro_auc": None,
+            f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
+            f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
         }
         if len(labels) == 2 and not multi_label and positive_label:
-            results[attr + "_p"] = score.precision
-            results[attr + "_r"] = score.recall
-            results[attr + "_f"] = score.fscore
-            results[attr + "_score"] = results[attr + "_f"]
-            results[attr + "_score_desc"] = "F (" + positive_label + ")"
+            results[f"{attr}_p"] = score.precision
+            results[f"{attr}_r"] = score.recall
+            results[f"{attr}_f"] = score.fscore
+            results[f"{attr}_score"] = results[f"{attr}_f"]
+            results[f"{attr}_score_desc"] = f"F ({positive_label})"
         elif not multi_label:
-            results[attr + "_macro_f"] = sum(
+            results[f"{attr}_macro_f"] = sum(
                 [score.fscore for label, score in f_per_type.items()]
             ) / (len(f_per_type) + 1e-100)
-            results[attr + "_score"] = results[attr + "_macro_f"]
-            results[attr + "_score_desc"] = "macro F"
+            results[f"{attr}_score"] = results[f"{attr}_macro_f"]
+            results[f"{attr}_score_desc"] = "macro F"
         else:
-            results[attr + "_macro_auc"] = max(
+            results[f"{attr}_macro_auc"] = max(
                 sum([score.score for label, score in auc_per_type.items()])
                 / (len(auc_per_type) + 1e-100),
                 -1,
             )
-            results[attr + "_score"] = results[attr + "_macro_auc"]
-            results[attr + "_score_desc"] = "macro AUC"
+            results[f"{attr}_score"] = results[f"{attr}_macro_auc"]
+            results[f"{attr}_score_desc"] = "macro AUC"
         return results
 
     @staticmethod
     def score_deps(
-        examples,
-        attr,
-        getter=getattr,
-        head_attr="head",
-        head_getter=getattr,
-        ignore_labels=tuple(),
-        **cfg
-    ):
+        examples: Iterable[Example],
+        attr: str,
+        *,
+        getter: Callable[[Token, str], Any] = getattr,
+        head_attr: str = "head",
+        head_getter: Callable[[Token, str], Any] = getattr,
+        ignore_labels: Tuple[str] = tuple(),
+        **cfg,
+    ) -> Dict[str, Any]:
         """Returns the UAS, LAS, and LAS per type scores for dependency
         parses.
 
         examples (Iterable[Example]): Examples to score
         attr (str): The attribute containing the dependency label.
-        getter (callable): Defaults to getattr. If provided,
+        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
             getter(token, attr) should return the value of the attribute for an
             individual token.
         head_attr (str): The attribute containing the head token. Defaults to
             'head'.
-        head_getter (callable): Defaults to getattr. If provided,
+        head_getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
             head_getter(token, attr) should return the value of the head for an
             individual token.
         ignore_labels (Tuple): Labels to ignore while scoring (e.g., punct).
-        RETURNS (dict): A dictionary containing the scores:
+        RETURNS (Dict[str, Any]): A dictionary containing the scores:
             attr_uas, attr_las, and attr_las_per_type.
+
+        DOCS: https://spacy.io/api/scorer#score_deps
         """
         unlabelled = PRFScore()
         labelled = PRFScore()
@@ -483,10 +512,11 @@
             set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps)
         )
         return {
-            attr + "_uas": unlabelled.fscore,
-            attr + "_las": labelled.fscore,
-            attr
-            + "_las_per_type": {k: v.to_dict() for k, v in labelled_per_dep.items()},
+            f"{attr}_uas": unlabelled.fscore,
+            f"{attr}_las": labelled.fscore,
+            f"{attr}_las_per_type": {
+                k: v.to_dict() for k, v in labelled_per_dep.items()
+            },
         }
 
 
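Taken together, the switch to f-strings leaves the score names themselves unchanged: a full evaluation still produces a flat dictionary keyed by the scored attribute. A rough sketch (assuming `nlp` and `examples` as in the docs below):

```python
from spacy.scorer import Scorer

scorer = Scorer(nlp)
scores = scorer.score(examples)

# Keys follow the f"{attr}_..." pattern used above, e.g.:
print(scores["ents_f"])             # overall NER F-score
print(scores["dep_las_per_type"])   # per-label LAS breakdown
print(scores["textcat_macro_auc"])  # macro-averaged ROC AUC for the textcat
```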
Scorer API docs page:

@@ -6,10 +6,9 @@ source: spacy/scorer.py
 ---
 
 The `Scorer` computes evaluation scores. It's typically created by
-[`Language.evaluate`](/api/language#evaluate).
-
-In addition, the `Scorer` provides a number of evaluation methods for evaluating
-`Token` and `Doc` attributes.
+[`Language.evaluate`](/api/language#evaluate). In addition, the `Scorer`
+provides a number of evaluation methods for evaluating [`Token`](/api/token) and
+[`Doc`](/api/doc) attributes.
 
 ## Scorer.\_\_init\_\_ {#init tag="method"}
 
@@ -20,10 +19,10 @@ Create a new `Scorer`.
 > ```python
 > from spacy.scorer import Scorer
 >
-> # default scoring pipeline
+> # Default scoring pipeline
 > scorer = Scorer()
 >
-> # provided scoring pipeline
+> # Provided scoring pipeline
 > nlp = spacy.load("en_core_web_sm")
 > scorer = Scorer(nlp)
 > ```
@@ -41,16 +40,20 @@ scoring methods provided by the components in the pipeline.
 The returned `Dict` contains the scores provided by the individual pipeline
 components. For the scoring methods provided by the `Scorer` and use by the core
 pipeline components, the individual score names start with the `Token` or `Doc`
-attribute being scored: `token_acc`, `token_p/r/f`, `sents_p/r/f`, `tag_acc`,
-`pos_acc`, `morph_acc`, `morph_per_feat`, `lemma_acc`, `dep_uas`, `dep_las`,
-`dep_las_per_type`, `ents_p/r/f`, `ents_per_type`, `textcat_macro_auc`,
-`textcat_macro_f`.
+attribute being scored:
+
+- `token_acc`, `token_p`, `token_r`, `token_f`,
+- `sents_p`, `sents_r`, `sents_f`
+- `tag_acc`, `pos_acc`, `morph_acc`, `morph_per_feat`, `lemma_acc`
+- `dep_uas`, `dep_las`, `dep_las_per_type`
+- `ents_p`, `ents_r` `ents_f`, `ents_per_type`
+- `textcat_macro_auc`, `textcat_macro_f`
 
 > #### Example
 >
 > ```python
 > scorer = Scorer()
-> scorer.score(examples)
+> scores = scorer.score(examples)
 > ```
 
 | Name | Type | Description |
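In practice these scores are usually obtained via [`Language.evaluate`](/api/language#evaluate), which builds the `Scorer` internally; a rough sketch (the exact call details are an assumption here):

```python
# Hypothetical evaluation run; `nlp` and `examples` as in the examples above
scores = nlp.evaluate(examples)
print(scores["tag_acc"], scores["dep_uas"], scores["ents_f"])
```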
@@ -58,78 +61,148 @@ attribute being scored: `token_acc`, `token_p/r/f`, `sents_p/r/f`, `tag_acc`,
 | `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
 | **RETURNS** | `Dict` | A dictionary of scores. |
 
-## Scorer.score_tokenization {#score_tokenization tag="staticmethod"}
+## Scorer.score_tokenization {#score_tokenization tag="staticmethod" new="3"}
 
 Scores the tokenization:
 
-- `token_acc`: # correct tokens / # gold tokens
-- `token_p/r/f`: PRF for token character spans
+- `token_acc`: number of correct tokens / number of gold tokens
+- `token_p`, `token_r`, `token_f`: precision, recall and F-score for token
+  character spans
 
+> #### Example
+>
+> ```python
+> scores = Scorer.score_tokenization(examples)
+> ```
+
 | Name | Type | Description |
 | ----------- | ------------------- | --------------------------------------------------------------------------------------------- |
 | `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
-| **RETURNS** | `Dict` | A dictionary containing the scores `token_acc/p/r/f`. |
+| **RETURNS** | `Dict` | A dictionary containing the scores `token_acc`, `token_p`, `token_r`, `token_f`. |
 
-## Scorer.score_token_attr {#score_token_attr tag="staticmethod"}
+## Scorer.score_token_attr {#score_token_attr tag="staticmethod" new="3"}
 
 Scores a single token attribute.
 
+> #### Example
+>
+> ```python
+> scores = Scorer.score_token_attr(examples, "pos")
+> print(scores["pos_acc"])
+> ```
+
 | Name | Type | Description |
-| ----------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
 | `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
 | `attr` | `str` | The attribute to score. |
-| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
-| **RETURNS** | `Dict` | A dictionary containing the score `attr_acc`. |
+| _keyword-only_ | | |
+| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
+| **RETURNS** | `Dict[str, float]` | A dictionary containing the score `{attr}_acc`. |
 
-## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod"}
+## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod" new="3"}
 
-Scores a single token attribute per feature for a token attribute in UFEATS
+Scores a single token attribute per feature for a token attribute in
+[UFEATS](https://universaldependencies.org/format.html#morphological-annotation)
 format.
 
+> #### Example
+>
+> ```python
+> scores = Scorer.score_token_attr_per_feat(examples, "morph")
+> print(scores["morph_per_feat"])
+> ```
+
 | Name | Type | Description |
-| ----------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
 | `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
 | `attr` | `str` | The attribute to score. |
-| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
-| **RETURNS** | `Dict` | A dictionary containing the per-feature PRF scores unders the key `attr_per_feat`. |
+| _keyword-only_ | | |
+| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
+| **RETURNS** | `Dict` | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. |
 
-## Scorer.score_spans {#score_spans tag="staticmethod"}
+## Scorer.score_spans {#score_spans tag="staticmethod" new="3"}
 
 Returns PRF scores for labeled or unlabeled spans.
 
+> #### Example
+>
+> ```python
+> scores = Scorer.score_spans(examples, "ents")
+> print(scores["ents_f"])
+> ```
+
 | Name | Type | Description |
-| ----------- | ------------------- | --------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
 | `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
 | `attr` | `str` | The attribute to score. |
-| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. |
-| **RETURNS** | `Dict` | A dictionary containing the PRF scores under the keys `attr_p/r/f` and the per-type PRF scores under `attr_per_type`. |
+| _keyword-only_ | | |
+| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. |
+| **RETURNS** | `Dict` | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. |
 
-## Scorer.score_deps {#score_deps tag="staticmethod"}
+## Scorer.score_deps {#score_deps tag="staticmethod" new="3"}
 
 Calculate the UAS, LAS, and LAS per type scores for dependency parses.
 
+> #### Example
+>
+> ```python
+> def dep_getter(token, attr):
+>     dep = getattr(token, attr)
+>     dep = token.vocab.strings.as_string(dep).lower()
+>     return dep
+>
+> scores = Scorer.score_deps(
+>     examples,
+>     "dep",
+>     getter=dep_getter,
+>     ignore_labels=("p", "punct")
+> )
+> print(scores["dep_uas"], scores["dep_las"])
+> ```
+
 | Name | Type | Description |
 | --------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
 | `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
 | `attr` | `str` | The attribute containing the dependency label. |
-| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
+| _keyword-only_ | | |
+| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
 | `head_attr` | `str` | The attribute containing the head token. |
 | `head_getter` | `callable` | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. |
 | `ignore_labels` | `Tuple` | Labels to ignore while scoring (e.g., `punct`). |
-| **RETURNS** | `Dict` | A dictionary containing the scores: `attr_uas`, `attr_las`, and `attr_las_per_type`. |
+| **RETURNS** | `Dict` | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. |
 
-## Scorer.score_cats {#score_cats tag="staticmethod"}
+## Scorer.score_cats {#score_cats tag="staticmethod" new="3"}
 
 Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict
 containing scores for each label like `Doc.cats`. The reported overall score
-depends on the scorer settings.
+depends on the scorer settings:
 
+1. **all:** `{attr}_score` (one of `{attr}_f` / `{attr}_macro_f` /
+   `{attr}_macro_auc`), `{attr}_score_desc` (text description of the overall
+   score), `{attr}_f_per_type`, `{attr}_auc_per_type`
+2. **binary exclusive with positive label:** `{attr}_p`, `{attr}_r`, `{attr}_f`
+3. **3+ exclusive classes**, macro-averaged F-score: `{attr}_macro_f`;
+4. **multilabel**, macro-averaged AUC: `{attr}_macro_auc`
+
+> #### Example
+>
+> ```python
+> labels = ["LABEL_A", "LABEL_B", "LABEL_C"]
+> scores = Scorer.score_cats(
+>     examples,
+>     "cats",
+>     labels=labels
+> )
+> print(scores["cats_macro_auc"])
+> ```
+
 | Name | Type | Description |
-| ---------------- | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ---------------- | ------------------- | ------------------------------------------------------------------------------------------------------- |
 | `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
 | `attr` | `str` | The attribute to score. |
-| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. |
+| _keyword-only_ | | |
+| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. |
 | labels | `Iterable[str]` | The set of possible labels. Defaults to `[]`. |
 | `multi_label` | `bool` | Whether the attribute allows multiple labels. Defaults to `True`. |
 | `positive_label` | `str` | The positive label for a binary task with exclusive classes. Defaults to `None`. |
-| **RETURNS** | `Dict` | A dictionary containing the scores, with inapplicable scores as `None`: 1) for all: `attr_score` (one of `attr_f` / `attr_macro_f` / `attr_macro_auc`), `attr_score_desc` (text description of the overall score), `attr_f_per_type`, `attr_auc_per_type`; 2) for binary exclusive with positive label: `attr_p/r/f`; 3) for 3+ exclusive classes, macro-averaged fscore: `attr_macro_f`; 4) for multilabel, macro-averaged AUC: `attr_macro_auc` |
+| **RETURNS** | `Dict` | A dictionary containing the scores, with inapplicable scores as `None`. |
 