Mirror of https://github.com/explosion/spaCy.git, synced 2025-08-08 06:04:57 +03:00
Add beta parameter to Scorer and PRFScore.
This commit is contained in:
parent 51863cd267
commit ea9737a664
spacy/cli/find_threshold.py

@@ -7,6 +7,8 @@ from typing import Optional, Tuple, Any, Dict, List
 import numpy
 import wasabi.tables
 
+from ..pipeline import TrainablePipe, Pipe
+from ..errors import Errors
 from ..training import Corpus
 from ._util import app, Arg, Opt, import_code, setup_gpu
 from .. import util
@@ -47,7 +49,7 @@ def find_threshold_cli(
     threshold_key (str): Key of threshold attribute in component's configuration.
     scores_key (str): Name of score to metric to optimize.
     n_trials (int): Number of trials to determine optimal thresholds
-    beta (float): Beta for F1 calculation.
+    beta (float): Beta for F-score calculation.
     code_path (Optional[Path]): Path to Python file with additional code (registered functions) to be imported.
     use_gpu (int): GPU ID or -1 for CPU.
     gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the
@@ -79,10 +81,10 @@ def find_threshold(
     threshold_key: str,
     scores_key: str,
     *,
-    n_trials: int = _DEFAULTS["n_trials"],
-    beta: float = _DEFAULTS["beta"],
-    use_gpu: int = _DEFAULTS["use_gpu"],
-    gold_preproc: bool = _DEFAULTS["gold_preproc"],
+    n_trials: int = _DEFAULTS["n_trials"],  # type: ignore
+    beta: float = _DEFAULTS["beta"],  # type: ignore
+    use_gpu: int = _DEFAULTS["use_gpu"],  # type: ignore
+    gold_preproc: bool = _DEFAULTS["gold_preproc"],  # type: ignore
     silent: bool = True,
 ) -> Tuple[float, float]:
     """
@@ -93,7 +95,7 @@ def find_threshold(
     threshold_key (str): Key of threshold attribute in component's configuration.
     scores_key (str): Name of score to metric to optimize.
     n_trials (int): Number of trials to determine optimal thresholds.
-    beta (float): Beta for F1 calculation.
+    beta (float): Beta for F-score calculation.
     use_gpu (int): GPU ID or -1 for CPU.
     gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the
     tokenization, and may result in sequences of more consistent length. However, it may reduce runtime accuracy due
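For orientation, a programmatic call with the new parameter might look like the sketch below. It is only a sketch: the model and data paths, the pipe name, and the score key are placeholders, and the return value is assumed to be the best threshold together with its score, as the Tuple[float, float] annotation suggests.

    from pathlib import Path

    from spacy.cli.find_threshold import find_threshold

    # Placeholder paths and keys; beta > 1 weights recall over precision in the
    # F-score that the sweep optimizes.
    best_threshold, best_score = find_threshold(
        model="training/model-best",
        data_path=Path("corpus/dev.spacy"),
        pipe_name="spancat",
        threshold_key="threshold",
        scores_key="spans_sc_f",
        beta=2.0,
        n_trials=11,
        silent=False,
    )
    print(best_threshold, best_score)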
@@ -108,10 +110,13 @@ def find_threshold(
         wasabi.msg.fail("Evaluation data not found", data_path, exits=1)
     nlp = util.load_model(model)
 
+    pipe: Optional[Pipe] = None
     try:
         pipe = nlp.get_pipe(pipe_name)
     except KeyError as err:
         wasabi.msg.fail(title=str(err), exits=1)
+    if not isinstance(pipe, TrainablePipe):
+        raise TypeError(Errors.E1044)
 
     if not silent:
         wasabi.msg.info(
@@ -140,7 +145,9 @@ def find_threshold(
     scores: Dict[float, float] = {}
     for threshold in numpy.linspace(0, 1, n_trials):
         pipe.cfg = set_nested_item(pipe.cfg, config_keys, threshold)
-        scores[threshold] = nlp.evaluate(dev_dataset)[scores_key]
+        scores[threshold] = nlp.evaluate(dev_dataset, scorer_cfg={"beta": beta})[
+            scores_key
+        ]
         if not (
             isinstance(scores[threshold], float) or isinstance(scores[threshold], int)
         ):
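The substantive change in this hunk is that every trial now forwards beta to the scorer via scorer_cfg, so the sweep optimizes an F-beta score rather than plain F1. The same mechanism works outside find_threshold; a minimal sketch, assuming a trained pipeline and dev data at the placeholder paths:

    import spacy
    from spacy.training import Corpus

    nlp = spacy.load("training/model-best")              # placeholder pipeline path
    dev_dataset = list(Corpus("corpus/dev.spacy")(nlp))  # placeholder .spacy file

    # scorer_cfg is handed through to the Scorer, which passes beta on to every
    # PRFScore it creates, so F-type metrics such as "cats_macro_f" become F-beta.
    scores = nlp.evaluate(dev_dataset, scorer_cfg={"beta": 2.0})
    print(scores.get("cats_macro_f"))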
spacy/errors.py

@@ -939,6 +939,7 @@ class Errors(metaclass=ErrorsWithCodes):
              "`{arg2}`={arg2_values} but these arguments are conflicting.")
     E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got "
              "{value}.")
+    E1044 = ("Only components of type `TrainablePipe` are supported by `find_threshold()`.")
 
 
     # Deprecated model shortcuts, only used in errors and warnings
spacy/scorer.py

@@ -22,15 +22,12 @@ class PRFScore:
     """A precision / recall / F score."""
 
     def __init__(
-        self,
-        *,
-        tp: int = 0,
-        fp: int = 0,
-        fn: int = 0,
+        self, *, tp: int = 0, fp: int = 0, fn: int = 0, beta: float = 1
     ) -> None:
         self.tp = tp
         self.fp = fp
         self.fn = fn
+        self.beta = beta
 
     def __len__(self) -> int:
         return self.tp + self.fp + self.fn
@@ -42,8 +39,12 @@ class PRFScore:
         return self
 
     def __add__(self, other):
+        assert self.beta == other.beta
         return PRFScore(
-            tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
+            tp=self.tp + other.tp,
+            fp=self.fp + other.fp,
+            fn=self.fn + other.fn,
+            beta=self.beta,
         )
 
     def score_set(self, cand: set, gold: set) -> None:
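Summed counts are only meaningful for a single beta, so the new assert requires both operands to agree and the result now carries beta forward. A small illustration, assuming this commit's PRFScore:

    from spacy.scorer import PRFScore

    a = PRFScore(tp=3, fp=1, fn=2, beta=2)
    b = PRFScore(tp=1, fp=0, fn=1, beta=2)
    combined = a + b
    print(combined.tp, combined.fp, combined.fn, combined.beta)  # 4 1 3 2

    # Mixing betas would trip the new assertion:
    # PRFScore(beta=1) + a  -> AssertionError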
@@ -63,7 +64,7 @@ class PRFScore:
     def fscore(self) -> float:
         p = self.precision
         r = self.recall
-        return 2 * ((p * r) / (p + r + 1e-100))
+        return (1 + self.beta**2) * ((p * r) / ((self.beta**2 * p) + r + 1e-100))
 
     def to_dict(self) -> Dict[str, float]:
         return {"p": self.precision, "r": self.recall, "f": self.fscore}
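The new return statement is the standard F-beta formula, F_beta = (1 + beta^2) * p * r / (beta^2 * p + r), which reduces to the old 2 * p * r / (p + r) when beta == 1. A quick numeric check, with counts chosen purely for illustration:

    from spacy.scorer import PRFScore

    score = PRFScore(tp=6, fp=2, fn=4, beta=2)  # precision 0.75, recall 0.6
    p, r, beta = score.precision, score.recall, score.beta
    expected = (1 + beta**2) * (p * r) / (beta**2 * p + r)
    assert abs(score.fscore - expected) < 1e-9  # F2 = 0.625, pulled towards recall
    print(score.to_dict())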
@@ -101,6 +102,8 @@ class ROCAUCScore:
 class Scorer:
     """Compute evaluation scores."""
 
+    BETA = 1
+
     def __init__(
         self,
         nlp: Optional["Language"] = None,
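BETA is the class-level default, so every scoring method can fall back to plain F1 when no beta is supplied. The lookup pattern repeated in the methods below boils down to this sketch (make_prf is only an illustrative helper, not part of spaCy; the cfg dict stands in for each method's **cfg keyword arguments):

    from spacy.scorer import PRFScore, Scorer

    def make_prf(**cfg):
        # An explicit "beta" in the scorer config wins; otherwise Scorer.BETA (1).
        beta = cfg.get("beta", Scorer.BETA)
        return PRFScore(beta=beta)

    print(make_prf().beta)          # 1   -> unchanged F1 behaviour
    print(make_prf(beta=0.5).beta)  # 0.5 -> F0.5, favouring precision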
@@ -149,8 +152,9 @@ class Scorer:
 
         DOCS: https://spacy.io/api/scorer#score_tokenization
         """
-        acc_score = PRFScore()
-        prf_score = PRFScore()
+        beta = cfg.get("beta", Scorer.BETA)
+        acc_score = PRFScore(beta=beta)
+        prf_score = PRFScore(beta=beta)
         for example in examples:
             gold_doc = example.reference
             pred_doc = example.predicted
@@ -210,7 +214,7 @@ class Scorer:
 
         DOCS: https://spacy.io/api/scorer#score_token_attr
         """
-        tag_score = PRFScore()
+        tag_score = PRFScore(beta=cfg.get("beta", Scorer.BETA))
         for example in examples:
             gold_doc = example.reference
             pred_doc = example.predicted
@@ -261,7 +265,8 @@ class Scorer:
             key attr_micro_p/r/f and the per-feat PRF scores under
             attr_per_feat.
         """
-        micro_score = PRFScore()
+        beta = cfg.get("beta", Scorer.BETA)
+        micro_score = PRFScore(beta=beta)
         per_feat = {}
         for example in examples:
             pred_doc = example.predicted
@@ -276,7 +281,7 @@ class Scorer:
                 for feat in morph.split(Morphology.FEATURE_SEP):
                     field, values = feat.split(Morphology.FIELD_SEP)
                     if field not in per_feat:
-                        per_feat[field] = PRFScore()
+                        per_feat[field] = PRFScore(beta=beta)
                     if field not in gold_per_feat:
                         gold_per_feat[field] = set()
                     gold_per_feat[field].add((gold_i, feat))
@@ -298,7 +303,7 @@ class Scorer:
                     for feat in morph.split(Morphology.FEATURE_SEP):
                         field, values = feat.split(Morphology.FIELD_SEP)
                         if field not in per_feat:
-                            per_feat[field] = PRFScore()
+                            per_feat[field] = PRFScore(beta=beta)
                         if field not in pred_per_feat:
                             pred_per_feat[field] = set()
                         pred_per_feat[field].add((gold_i, feat))
@@ -353,7 +358,8 @@ class Scorer:
 
         DOCS: https://spacy.io/api/scorer#score_spans
         """
-        score = PRFScore()
+        beta = cfg.get("beta", Scorer.BETA)
+        score = PRFScore(beta=beta)
         score_per_type = dict()
         for example in examples:
             pred_doc = example.predicted
@@ -372,7 +378,7 @@ class Scorer:
             gold_per_type: Dict[str, Set] = {label: set() for label in labels}
             for label in labels:
                 if label not in score_per_type:
-                    score_per_type[label] = PRFScore()
+                    score_per_type[label] = PRFScore(beta=beta)
             # Find all predidate labels, for all and per type
             gold_spans = set()
             pred_spans = set()
@@ -469,9 +475,10 @@ class Scorer:
 
         DOCS: https://spacy.io/api/scorer#score_cats
         """
+        beta = cfg.get("beta", Scorer.BETA)
         if threshold is None:
             threshold = 0.5 if multi_label else 0.0
-        f_per_type = {label: PRFScore() for label in labels}
+        f_per_type = {label: PRFScore(beta=beta) for label in labels}
         auc_per_type = {label: ROCAUCScore() for label in labels}
         labels = set(labels)
         if labels:
@@ -519,7 +526,7 @@ class Scorer:
                 pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
                 if pred_score >= threshold:
                     f_per_type[pred_label].fp += 1
-        micro_prf = PRFScore()
+        micro_prf = PRFScore(beta=beta)
         for label_prf in f_per_type.values():
             micro_prf.tp += label_prf.tp
             micro_prf.fn += label_prf.fn
@@ -576,6 +583,7 @@ class Scorer:
 
         DOCS: https://spacy.io/api/scorer#score_links
         """
+        beta = cfg.get("beta", Scorer.BETA)
         f_per_type = {}
         for example in examples:
             gold_ent_by_offset = {}
@@ -589,7 +597,7 @@ class Scorer:
                 if gold_span is not None:
                     label = gold_span.label_
                     if label not in f_per_type:
-                        f_per_type[label] = PRFScore()
+                        f_per_type[label] = PRFScore(beta=beta)
                     gold = gold_span.kb_id_
                     # only evaluating entities that overlap between gold and pred,
                     # to disentangle the performance of the NEL from the NER
@@ -608,7 +616,7 @@ class Scorer:
                         # a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN
                         f_per_type[label].fp += 1
                         f_per_type[label].fn += 1
-        micro_prf = PRFScore()
+        micro_prf = PRFScore(beta=beta)
         for label_prf in f_per_type.values():
             micro_prf.tp += label_prf.tp
             micro_prf.fn += label_prf.fn
@@ -663,8 +671,9 @@ class Scorer:
 
         DOCS: https://spacy.io/api/scorer#score_deps
         """
-        unlabelled = PRFScore()
-        labelled = PRFScore()
+        beta = cfg.get("beta", Scorer.BETA)
+        unlabelled = PRFScore(beta=beta)
+        labelled = PRFScore(beta=beta)
         labelled_per_dep = dict()
         missing_indices = set()
         for example in examples:
@@ -680,7 +689,7 @@ class Scorer:
                 if dep not in ignore_labels:
                     gold_deps.add((gold_i, head.i, dep))
                     if dep not in labelled_per_dep:
-                        labelled_per_dep[dep] = PRFScore()
+                        labelled_per_dep[dep] = PRFScore(beta=beta)
                     if dep not in gold_deps_per_dep:
                         gold_deps_per_dep[dep] = set()
                     gold_deps_per_dep[dep].add((gold_i, head.i, dep))
@@ -711,7 +720,7 @@ class Scorer:
                     else:
                         pred_deps.add((gold_i, gold_head, dep))
                         if dep not in labelled_per_dep:
-                            labelled_per_dep[dep] = PRFScore()
+                            labelled_per_dep[dep] = PRFScore(beta=beta)
                         if dep not in pred_deps_per_dep:
                             pred_deps_per_dep[dep] = set()
                         pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
@@ -742,6 +751,7 @@ class Scorer:
 def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
     """Compute micro-PRF and per-entity PRF scores for a sequence of examples."""
     score_per_type = defaultdict(PRFScore)
+    beta = kwargs.get("beta", Scorer.BETA)
     for eg in examples:
         if not eg.y.has_annotation("ENT_IOB"):
             continue
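get_ner_prf reads beta from its keyword arguments in the same way. A hedged end-to-end sketch with hand-built toy examples (texts, spans and labels are made up for illustration):

    import spacy
    from spacy.scorer import get_ner_prf
    from spacy.tokens import Span
    from spacy.training import Example

    nlp = spacy.blank("en")

    def make_example(text, pred_ents, gold_ents):
        # pred_ents / gold_ents are (start_token, end_token, label) tuples.
        pred, ref = nlp(text), nlp(text)
        pred.ents = [Span(pred, s, e, label=label) for s, e, label in pred_ents]
        ref.ents = [Span(ref, s, e, label=label) for s, e, label in gold_ents]
        return Example(pred, ref)

    examples = [
        make_example(
            "Apple hired Alice",
            pred_ents=[(0, 1, "ORG")],
            gold_ents=[(0, 1, "ORG"), (2, 3, "PERSON")],
        )
    ]
    # beta=2 weights recall over precision in ents_f and the per-type scores.
    print(get_ner_prf(examples, beta=2.0))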
@@ -749,7 +759,7 @@ def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
         align_x2y = eg.alignment.x2y
         for pred_ent in eg.x.ents:
             if pred_ent.label_ not in score_per_type:
-                score_per_type[pred_ent.label_] = PRFScore()
+                score_per_type[pred_ent.label_] = PRFScore(beta=beta)
             indices = align_x2y[pred_ent.start : pred_ent.end]
             if len(indices):
                 g_span = eg.y[indices[0] : indices[-1] + 1]
@@ -765,7 +775,7 @@ def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
                 score_per_type[pred_ent.label_].fp += 1
         for label, start, end in golds:
             score_per_type[label].fn += 1
-    totals = PRFScore()
+    totals = PRFScore(beta=beta)
     for prf in score_per_type.values():
         totals += prf
     if len(totals) > 0: