mirror of https://github.com/explosion/spaCy.git
synced 2025-08-08 06:04:57 +03:00

Add beta parameter to Scorer and PRFScore.

parent 51863cd267
commit ea9737a664
--- a/spacy/cli/find_threshold.py
+++ b/spacy/cli/find_threshold.py
@@ -7,6 +7,8 @@ from typing import Optional, Tuple, Any, Dict, List
 import numpy
 import wasabi.tables

+from ..pipeline import TrainablePipe, Pipe
+from ..errors import Errors
 from ..training import Corpus
 from ._util import app, Arg, Opt, import_code, setup_gpu
 from .. import util
@@ -47,7 +49,7 @@ def find_threshold_cli(
     threshold_key (str): Key of threshold attribute in component's configuration.
     scores_key (str): Name of score metric to optimize.
     n_trials (int): Number of trials to determine optimal thresholds.
-    beta (float): Beta for F1 calculation.
+    beta (float): Beta for F-score calculation.
     code_path (Optional[Path]): Path to Python file with additional code (registered functions) to be imported.
     use_gpu (int): GPU ID or -1 for CPU.
     gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the
@@ -79,10 +81,10 @@ def find_threshold(
     threshold_key: str,
     scores_key: str,
     *,
-    n_trials: int = _DEFAULTS["n_trials"],
-    beta: float = _DEFAULTS["beta"],
-    use_gpu: int = _DEFAULTS["use_gpu"],
-    gold_preproc: bool = _DEFAULTS["gold_preproc"],
+    n_trials: int = _DEFAULTS["n_trials"],  # type: ignore
+    beta: float = _DEFAULTS["beta"],  # type: ignore
+    use_gpu: int = _DEFAULTS["use_gpu"],  # type: ignore
+    gold_preproc: bool = _DEFAULTS["gold_preproc"],  # type: ignore
     silent: bool = True,
 ) -> Tuple[float, float]:
     """
@@ -93,7 +95,7 @@ def find_threshold(
     threshold_key (str): Key of threshold attribute in component's configuration.
     scores_key (str): Name of score metric to optimize.
     n_trials (int): Number of trials to determine optimal thresholds.
-    beta (float): Beta for F1 calculation.
+    beta (float): Beta for F-score calculation.
     use_gpu (int): GPU ID or -1 for CPU.
     gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the
     tokenization, and may result in sequences of more consistent length. However, it may reduce runtime accuracy due
@@ -108,10 +110,13 @@ def find_threshold(
         wasabi.msg.fail("Evaluation data not found", data_path, exits=1)
     nlp = util.load_model(model)

+    pipe: Optional[Pipe] = None
     try:
         pipe = nlp.get_pipe(pipe_name)
     except KeyError as err:
         wasabi.msg.fail(title=str(err), exits=1)
+    if not isinstance(pipe, TrainablePipe):
+        raise TypeError(Errors.E1044)

     if not silent:
         wasabi.msg.info(
@@ -140,7 +145,9 @@ def find_threshold(
     scores: Dict[float, float] = {}
     for threshold in numpy.linspace(0, 1, n_trials):
         pipe.cfg = set_nested_item(pipe.cfg, config_keys, threshold)
-        scores[threshold] = nlp.evaluate(dev_dataset)[scores_key]
+        scores[threshold] = nlp.evaluate(dev_dataset, scorer_cfg={"beta": beta})[
+            scores_key
+        ]
         if not (
             isinstance(scores[threshold], float) or isinstance(scores[threshold], int)
         ):
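Note: the sweep above now forwards `beta` to the scorer, so the metric being optimized is an F-beta rather than a fixed F1. Below is a minimal usage sketch, not part of the commit; the model name, component, and data path are hypothetical placeholders, and the import path assumes this file lives at `spacy/cli/find_threshold.py`:

```python
# Hedged usage sketch: tune a textcat threshold while optimizing a
# recall-weighted F2 score (beta=2.0). All concrete values are placeholders.
from pathlib import Path

from spacy.cli.find_threshold import find_threshold

best_threshold, best_score = find_threshold(
    model="my_model",                # placeholder: trained pipeline path or name
    data_path=Path("dev.spacy"),     # placeholder: evaluation DocBin
    pipe_name="textcat_multilabel",  # placeholder: component whose threshold is tuned
    threshold_key="threshold",       # key in the component's cfg to sweep
    scores_key="cats_macro_f",       # metric read from nlp.evaluate() output
    n_trials=11,                     # evaluates thresholds 0.0, 0.1, ..., 1.0
    beta=2.0,                        # weight recall twice as high as precision
)
print(f"best threshold: {best_threshold:.2f} (score: {best_score:.3f})")
```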
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -939,6 +939,7 @@ class Errors(metaclass=ErrorsWithCodes):
             "`{arg2}`={arg2_values} but these arguments are conflicting.")
     E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got "
             "{value}.")
+    E1044 = ("Only components of type `TrainablePipe` are supported by `find_threshold()`.")


 # Deprecated model shortcuts, only used in errors and warnings
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -22,15 +22,12 @@ class PRFScore:
     """A precision / recall / F score."""

     def __init__(
-        self,
-        *,
-        tp: int = 0,
-        fp: int = 0,
-        fn: int = 0,
+        self, *, tp: int = 0, fp: int = 0, fn: int = 0, beta: float = 1
     ) -> None:
         self.tp = tp
         self.fp = fp
         self.fn = fn
+        self.beta = beta

     def __len__(self) -> int:
         return self.tp + self.fp + self.fn
@@ -42,8 +39,12 @@ class PRFScore:
         return self

     def __add__(self, other):
+        assert self.beta == other.beta
         return PRFScore(
-            tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
+            tp=self.tp + other.tp,
+            fp=self.fp + other.fp,
+            fn=self.fn + other.fn,
+            beta=self.beta,
         )

     def score_set(self, cand: set, gold: set) -> None:
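The new `assert` makes `beta` an invariant of aggregation: two scores can only be summed when they weight precision and recall identically, and the pooled result keeps that beta. An illustrative check, not part of the commit:

```python
from spacy.scorer import PRFScore

a = PRFScore(tp=8, fp=2, fn=4, beta=2.0)
b = PRFScore(tp=3, fp=1, fn=1, beta=2.0)

c = a + b  # counts are pooled and beta carries over
assert (c.tp, c.fp, c.fn, c.beta) == (11, 3, 5, 2.0)

try:
    a + PRFScore(beta=1.0)  # mismatched betas now fail fast
except AssertionError:
    print("cannot sum PRFScores with different betas")
```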
@@ -63,7 +64,7 @@ class PRFScore:
     def fscore(self) -> float:
         p = self.precision
         r = self.recall
-        return 2 * ((p * r) / (p + r + 1e-100))
+        return (1 + self.beta**2) * ((p * r) / ((self.beta**2 * p) + r + 1e-100))

     def to_dict(self) -> Dict[str, float]:
         return {"p": self.precision, "r": self.recall, "f": self.fscore}
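The rewritten return line is the standard F-beta score, F_beta = (1 + beta^2) * p * r / (beta^2 * p + r). With the default beta = 1 it reduces exactly to the previous 2 * p * r / (p + r), so existing F1 numbers are unaffected; beta > 1 weights recall more heavily, beta < 1 weights precision more heavily. A quick numeric sanity check (illustrative, not from the commit):

```python
from spacy.scorer import PRFScore

s = PRFScore(tp=6, fp=2, fn=4)  # precision 0.75, recall 0.6
assert abs(s.fscore - 2 * 0.75 * 0.6 / (0.75 + 0.6)) < 1e-6  # beta=1 -> plain F1

s2 = PRFScore(tp=6, fp=2, fn=4, beta=2.0)
# F2 = 5 * p * r / (4 * p + r) = 2.25 / 3.6 = 0.625
assert abs(s2.fscore - 0.625) < 1e-6
```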
@@ -101,6 +102,8 @@ class ROCAUCScore:
 class Scorer:
     """Compute evaluation scores."""

+    BETA = 1
+
     def __init__(
         self,
         nlp: Optional["Language"] = None,
@@ -149,8 +152,9 @@ class Scorer:

         DOCS: https://spacy.io/api/scorer#score_tokenization
         """
-        acc_score = PRFScore()
-        prf_score = PRFScore()
+        beta = cfg.get("beta", Scorer.BETA)
+        acc_score = PRFScore(beta=beta)
+        prf_score = PRFScore(beta=beta)
         for example in examples:
             gold_doc = example.reference
             pred_doc = example.predicted
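Each scoring method now reads `beta` from its `**cfg` kwargs, falling back to the class-level `Scorer.BETA` default of 1, so behavior is unchanged unless a caller opts in. Since `Language.evaluate` forwards `scorer_cfg` to the `Scorer` (as the `find_threshold` hunk above relies on), the natural entry point looks like this hedged sketch, where `nlp` and `dev_dataset` are assumed to exist:

```python
# Illustrative only: evaluate with a precision-weighted F0.5 across PRF metrics.
scores = nlp.evaluate(dev_dataset, scorer_cfg={"beta": 0.5})
print(scores["token_f"])  # tokenization F-score, now computed with beta=0.5
```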
@@ -210,7 +214,7 @@ class Scorer:

         DOCS: https://spacy.io/api/scorer#score_token_attr
         """
-        tag_score = PRFScore()
+        tag_score = PRFScore(beta=cfg.get("beta", Scorer.BETA))
         for example in examples:
             gold_doc = example.reference
             pred_doc = example.predicted
@@ -261,7 +265,8 @@ class Scorer:
         key attr_micro_p/r/f and the per-feat PRF scores under
         attr_per_feat.
         """
-        micro_score = PRFScore()
+        beta = cfg.get("beta", Scorer.BETA)
+        micro_score = PRFScore(beta=beta)
         per_feat = {}
         for example in examples:
             pred_doc = example.predicted
@@ -276,7 +281,7 @@ class Scorer:
                 for feat in morph.split(Morphology.FEATURE_SEP):
                     field, values = feat.split(Morphology.FIELD_SEP)
                     if field not in per_feat:
-                        per_feat[field] = PRFScore()
+                        per_feat[field] = PRFScore(beta=beta)
                     if field not in gold_per_feat:
                         gold_per_feat[field] = set()
                     gold_per_feat[field].add((gold_i, feat))
@@ -298,7 +303,7 @@ class Scorer:
                     for feat in morph.split(Morphology.FEATURE_SEP):
                         field, values = feat.split(Morphology.FIELD_SEP)
                         if field not in per_feat:
-                            per_feat[field] = PRFScore()
+                            per_feat[field] = PRFScore(beta=beta)
                         if field not in pred_per_feat:
                             pred_per_feat[field] = set()
                         pred_per_feat[field].add((gold_i, feat))
@@ -353,7 +358,8 @@ class Scorer:

         DOCS: https://spacy.io/api/scorer#score_spans
         """
-        score = PRFScore()
+        beta = cfg.get("beta", Scorer.BETA)
+        score = PRFScore(beta=beta)
         score_per_type = dict()
         for example in examples:
             pred_doc = example.predicted
@@ -372,7 +378,7 @@ class Scorer:
             gold_per_type: Dict[str, Set] = {label: set() for label in labels}
             for label in labels:
                 if label not in score_per_type:
-                    score_per_type[label] = PRFScore()
+                    score_per_type[label] = PRFScore(beta=beta)
             # Find all predidate labels, for all and per type
             gold_spans = set()
             pred_spans = set()
@@ -469,9 +475,10 @@ class Scorer:

         DOCS: https://spacy.io/api/scorer#score_cats
         """
+        beta = cfg.get("beta", Scorer.BETA)
         if threshold is None:
             threshold = 0.5 if multi_label else 0.0
-        f_per_type = {label: PRFScore() for label in labels}
+        f_per_type = {label: PRFScore(beta=beta) for label in labels}
         auc_per_type = {label: ROCAUCScore() for label in labels}
         labels = set(labels)
         if labels:
@@ -519,7 +526,7 @@ class Scorer:
                 pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
                 if pred_score >= threshold:
                     f_per_type[pred_label].fp += 1
-        micro_prf = PRFScore()
+        micro_prf = PRFScore(beta=beta)
         for label_prf in f_per_type.values():
             micro_prf.tp += label_prf.tp
             micro_prf.fn += label_prf.fn
@@ -576,6 +583,7 @@ class Scorer:

         DOCS: https://spacy.io/api/scorer#score_links
         """
+        beta = cfg.get("beta", Scorer.BETA)
         f_per_type = {}
         for example in examples:
             gold_ent_by_offset = {}
@@ -589,7 +597,7 @@ class Scorer:
                 if gold_span is not None:
                     label = gold_span.label_
                     if label not in f_per_type:
-                        f_per_type[label] = PRFScore()
+                        f_per_type[label] = PRFScore(beta=beta)
                     gold = gold_span.kb_id_
                     # only evaluating entities that overlap between gold and pred,
                     # to disentangle the performance of the NEL from the NER
@@ -608,7 +616,7 @@ class Scorer:
                            # a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN
                            f_per_type[label].fp += 1
                            f_per_type[label].fn += 1
-        micro_prf = PRFScore()
+        micro_prf = PRFScore(beta=beta)
         for label_prf in f_per_type.values():
             micro_prf.tp += label_prf.tp
             micro_prf.fn += label_prf.fn
@@ -663,8 +671,9 @@ class Scorer:

         DOCS: https://spacy.io/api/scorer#score_deps
         """
-        unlabelled = PRFScore()
-        labelled = PRFScore()
+        beta = cfg.get("beta", Scorer.BETA)
+        unlabelled = PRFScore(beta=beta)
+        labelled = PRFScore(beta=beta)
         labelled_per_dep = dict()
         missing_indices = set()
         for example in examples:
@@ -680,7 +689,7 @@ class Scorer:
                 if dep not in ignore_labels:
                     gold_deps.add((gold_i, head.i, dep))
                     if dep not in labelled_per_dep:
-                        labelled_per_dep[dep] = PRFScore()
+                        labelled_per_dep[dep] = PRFScore(beta=beta)
                     if dep not in gold_deps_per_dep:
                         gold_deps_per_dep[dep] = set()
                     gold_deps_per_dep[dep].add((gold_i, head.i, dep))
@@ -711,7 +720,7 @@ class Scorer:
                    else:
                        pred_deps.add((gold_i, gold_head, dep))
                        if dep not in labelled_per_dep:
-                           labelled_per_dep[dep] = PRFScore()
+                           labelled_per_dep[dep] = PRFScore(beta=beta)
                        if dep not in pred_deps_per_dep:
                            pred_deps_per_dep[dep] = set()
                        pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
@@ -742,6 +751,7 @@
 def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
     """Compute micro-PRF and per-entity PRF scores for a sequence of examples."""
     score_per_type = defaultdict(PRFScore)
+    beta = kwargs.get("beta", Scorer.BETA)
     for eg in examples:
         if not eg.y.has_annotation("ENT_IOB"):
             continue
@@ -749,7 +759,7 @@ def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
         align_x2y = eg.alignment.x2y
         for pred_ent in eg.x.ents:
             if pred_ent.label_ not in score_per_type:
                 score_per_type[pred_ent.label_] = PRFScore(beta=beta)
-                score_per_type[pred_ent.label_] = PRFScore()
+                score_per_type[pred_ent.label_] = PRFScore(beta=beta)
             indices = align_x2y[pred_ent.start : pred_ent.end]
             if len(indices):
                 g_span = eg.y[indices[0] : indices[-1] + 1]
@@ -765,7 +775,7 @@ def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
                score_per_type[pred_ent.label_].fp += 1
        for label, start, end in golds:
            score_per_type[label].fn += 1
-    totals = PRFScore()
+    totals = PRFScore(beta=beta)
     for prf in score_per_type.values():
         totals += prf
     if len(totals) > 0:
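`get_ner_prf` picks `beta` out of `**kwargs` the same way, so entity scores follow the configured F-beta as well. A hedged sketch; `examples` is assumed to be an iterable of `spacy.training.Example` objects:

```python
from spacy.scorer import get_ner_prf

results = get_ner_prf(examples, beta=2.0)  # recall-weighted NER scores
print(results["ents_f"], results["ents_per_type"])
```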