Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-26 09:14:32 +03:00)
Refactor the Scorer to improve flexibility (#5731)
* Refactor the Scorer to improve flexibility

  Refactor the `Scorer` to improve flexibility for arbitrary pipeline components.

  * Individual pipeline components provide their own `evaluate` methods that score a list of `Example`s and return a dictionary of scores
  * `Scorer` is initialized either:
    * with a provided pipeline containing components to be scored
    * with a default pipeline containing the built-in statistical components (senter, tagger, morphologizer, parser, ner)
  * `Scorer.score` evaluates a list of `Example`s and returns a dictionary of scores referring to the scores provided by the components in the pipeline

  Significant differences:

  * `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc` and the new `morph_acc`, `pos_acc`, and `lemma_acc`
  * Scoring is no longer cumulative: `Scorer.score` scores a list of examples rather than a single example and does not retain any state about previously scored examples
  * PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

  * Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is used to key the returned scores

  Naming differences:

  * `uas`, `las`, and `las_per_type` in the scores dict are renamed to `dep_uas`, `dep_las`, and `dep_las_per_type`

  Scoring differences:

  * `Doc.sents` is now scored as spans rather than on sentence-initial token positions so that `Doc.sents` and `Doc.ents` can be scored with the same method (this lowers scores since a single incorrect sentence start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

  * Add hasattr check to tokenizer scoring
  * Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

  Reset the Example alignment if either doc is set, in case the tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

  Add PRF scores for tokens as character spans. The scores are:

  * token_acc: # correct tokens / # gold tokens
  * token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
This commit is contained in: parent 656574a01a, commit 2bcceb80c4
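The refactor described above is a protocol more than a single API: each component exposes a `score(examples, **kwargs)` method that returns a flat dict of scores, and `Scorer.score` merges whatever the components report. A minimal sketch of that contract, with a hypothetical `LemmaChecker` component and a hand-rolled `aggregate` helper standing in for the real `Scorer.score` loop that appears later in this diff:

```python
# Hedged sketch of the scoring protocol this commit introduces. "LemmaChecker"
# and the `aggregate` helper are hypothetical; only the
# score(examples, **kwargs) -> dict contract is taken from the diff.
from spacy.scorer import Scorer


class LemmaChecker:
    """Toy component whose score() reuses one of the new Scorer static methods."""

    def __call__(self, doc):
        return doc  # no-op prediction step, enough for the sketch

    def score(self, examples, **kwargs):
        # Keyed by the attribute name, so this contributes "lemma_acc".
        return Scorer.score_token_attr(examples, "lemma", **kwargs)


def aggregate(pipeline, examples):
    # Mirrors what Scorer.score does: collect whatever each component reports.
    scores = {}
    for name, component in pipeline:
        if hasattr(component, "score"):
            scores.update(component.score(examples))
    return scores
```

Because the returned keys are plain strings, a custom component can contribute scores without touching the `Scorer` itself.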
|
@ -68,41 +68,43 @@ def evaluate(
|
|||
nlp = util.load_model(model)
|
||||
dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc))
|
||||
begin = timer()
|
||||
scorer = nlp.evaluate(dev_dataset, verbose=False)
|
||||
scores = nlp.evaluate(dev_dataset, verbose=False)
|
||||
end = timer()
|
||||
nwords = sum(len(ex.predicted) for ex in dev_dataset)
|
||||
results = {
|
||||
"Time": f"{end - begin:.2f} s",
|
||||
"Words": nwords,
|
||||
"Words/s": f"{nwords / (end - begin):.0f}",
|
||||
"TOK": f"{scorer.token_acc:.2f}",
|
||||
"TAG": f"{scorer.tags_acc:.2f}",
|
||||
"POS": f"{scorer.pos_acc:.2f}",
|
||||
"MORPH": f"{scorer.morphs_acc:.2f}",
|
||||
"UAS": f"{scorer.uas:.2f}",
|
||||
"LAS": f"{scorer.las:.2f}",
|
||||
"NER P": f"{scorer.ents_p:.2f}",
|
||||
"NER R": f"{scorer.ents_r:.2f}",
|
||||
"NER F": f"{scorer.ents_f:.2f}",
|
||||
"Textcat AUC": f"{scorer.textcat_auc:.2f}",
|
||||
"Textcat F": f"{scorer.textcat_f:.2f}",
|
||||
"Sent P": f"{scorer.sent_p:.2f}",
|
||||
"Sent R": f"{scorer.sent_r:.2f}",
|
||||
"Sent F": f"{scorer.sent_f:.2f}",
|
||||
metrics = {
|
||||
"TOK": "token_acc",
|
||||
"TAG": "tag_acc",
|
||||
"POS": "pos_acc",
|
||||
"MORPH": "morph_acc",
|
||||
"LEMMA": "lemma_acc",
|
||||
"UAS": "dep_uas",
|
||||
"LAS": "dep_las",
|
||||
"NER P": "ents_p",
|
||||
"NER R": "ents_r",
|
||||
"NER F": "ents_f",
|
||||
"Textcat AUC": 'textcat_macro_auc',
|
||||
"Textcat F": 'textcat_macro_f',
|
||||
"Sent P": 'sents_p',
|
||||
"Sent R": 'sents_r',
|
||||
"Sent F": 'sents_f',
|
||||
}
|
||||
results = {}
|
||||
for metric, key in metrics.items():
|
||||
if key in scores:
|
||||
results[metric] = f"{scores[key]*100:.2f}"
|
||||
data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()}
|
||||
|
||||
msg.table(results, title="Results")
|
||||
|
||||
if scorer.ents_per_type:
|
||||
data["ents_per_type"] = scorer.ents_per_type
|
||||
print_ents_per_type(msg, scorer.ents_per_type)
|
||||
if scorer.textcats_f_per_cat:
|
||||
data["textcats_f_per_cat"] = scorer.textcats_f_per_cat
|
||||
print_textcats_f_per_cat(msg, scorer.textcats_f_per_cat)
|
||||
if scorer.textcats_auc_per_cat:
|
||||
data["textcats_auc_per_cat"] = scorer.textcats_auc_per_cat
|
||||
print_textcats_auc_per_cat(msg, scorer.textcats_auc_per_cat)
|
||||
if "ents_per_type" in scores:
|
||||
if scores["ents_per_type"]:
|
||||
print_ents_per_type(msg, scores["ents_per_type"])
|
||||
if "textcat_f_per_cat" in scores:
|
||||
if scores["textcat_f_per_cat"]:
|
||||
print_textcats_f_per_cat(msg, scores["textcat_f_per_cat"])
|
||||
if "textcat_auc_per_cat" in scores:
|
||||
if scores["textcat_auc_per_cat"]:
|
||||
print_textcats_auc_per_cat(msg, scores["textcat_auc_per_cat"])
|
||||
|
||||
if displacy_path:
|
||||
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
|
||||
|
@ -148,7 +150,7 @@ def render_parses(
|
|||
|
||||
def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None:
|
||||
data = [
|
||||
(k, f"{v['p']:.2f}", f"{v['r']:.2f}", f"{v['f']:.2f}")
|
||||
(k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}")
|
||||
for k, v in scores.items()
|
||||
]
|
||||
msg.table(
|
||||
|
@ -161,7 +163,7 @@ def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> No
|
|||
|
||||
def print_textcats_f_per_cat(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None:
|
||||
data = [
|
||||
(k, f"{v['p']:.2f}", f"{v['r']:.2f}", f"{v['f']:.2f}")
|
||||
(k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}")
|
||||
for k, v in scores.items()
|
||||
]
|
||||
msg.table(
|
||||
|
@ -176,7 +178,7 @@ def print_textcats_auc_per_cat(
|
|||
msg: Printer, scores: Dict[str, Dict[str, float]]
|
||||
) -> None:
|
||||
msg.table(
|
||||
[(k, f"{v['roc_auc_score']:.2f}") for k, v in scores.items()],
|
||||
[(k, f"{v:.2f}") for k, v in scores.items()],
|
||||
header=("", "ROC AUC"),
|
||||
aligns=("l", "r"),
|
||||
title="Textcat ROC AUC (per label)",
|
||||
|
|
|
@ -179,6 +179,7 @@ def train(
|
|||
progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
|
||||
except Exception as e:
|
||||
if output_path is not None:
|
||||
raise e
|
||||
msg.warn(
|
||||
f"Aborting and saving the final best model. "
|
||||
f"Encountered exception: {str(e)}",
|
||||
|
@ -259,12 +260,11 @@ def create_evaluation_callback(
|
|||
start_time = timer()
|
||||
if optimizer.averages:
|
||||
with nlp.use_params(optimizer.averages):
|
||||
scorer = nlp.evaluate(dev_examples, batch_size=batch_size)
|
||||
scores = nlp.evaluate(dev_examples, batch_size=batch_size)
|
||||
else:
|
||||
scorer = nlp.evaluate(dev_examples, batch_size=batch_size)
|
||||
scores = nlp.evaluate(dev_examples, batch_size=batch_size)
|
||||
end_time = timer()
|
||||
wps = n_words / (end_time - start_time)
|
||||
scores = scorer.scores
|
||||
# Calculate a weighted sum based on score_weights for the main score
|
||||
weights = cfg["score_weights"]
|
||||
try:
|
||||
|
|
|
@ -40,8 +40,8 @@ seed = 0
 accumulate_gradient = 1
 use_pytorch_for_gpu_memory = false
 # Control how scores are printed and checkpoints are evaluated.
-scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
-score_weights = {"tags_acc": 0.2, "las": 0.4, "ents_f": 0.4}
+scores = ["speed", "tag_acc", "dep_uas", "dep_las", "ents_f"]
+score_weights = {"tag_acc": 0.2, "dep_las": 0.4, "ents_f": 0.4}
 # These settings are invalid for the transformer models.
 init_tok2vec = null
 discard_oversize = false
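For orientation, `score_weights` is what the training loop's evaluation callback folds into a single checkpoint metric (the `create_evaluation_callback` hunk above comments that it calculates "a weighted sum based on score_weights for the main score"). A rough sketch of that weighting under the renamed keys; the helper name is made up:

```python
# Hedged sketch: how score_weights could be folded into one "main" score.
# The helper name is invented; the keys match the updated config above.
def weighted_main_score(scores, score_weights):
    # e.g. score_weights = {"tag_acc": 0.2, "dep_las": 0.4, "ents_f": 0.4};
    # after this commit the score values are fractions in [0, 1], not percentages.
    return sum(scores.get(key, 0.0) * weight for key, weight in score_weights.items())
```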
@ -45,6 +45,7 @@ cdef class Example:
|
|||
|
||||
def __set__(self, doc):
|
||||
self.x = doc
|
||||
self._alignment = None
|
||||
|
||||
property reference:
|
||||
def __get__(self):
|
||||
|
@ -52,6 +53,7 @@ cdef class Example:
|
|||
|
||||
def __set__(self, doc):
|
||||
self.y = doc
|
||||
self._alignment = None
|
||||
|
||||
def copy(self):
|
||||
return Example(
|
||||
|
|
|
@ -1011,10 +1011,13 @@ class Language:
                 name="language", method="evaluate", types=wrong_types
             )
             raise TypeError(err)
-        if scorer is None:
-            scorer = Scorer(pipeline=self.pipeline)
         if component_cfg is None:
             component_cfg = {}
+        if scorer is None:
+            kwargs = component_cfg.get("scorer", {})
+            kwargs.setdefault("verbose", verbose)
+            kwargs.setdefault("nlp", self)
+            scorer = Scorer(**kwargs)
         docs = list(eg.predicted for eg in examples)
         for name, pipe in self.pipeline:
             kwargs = component_cfg.get(name, {})
@ -1027,10 +1030,7 @@ class Language:
             if verbose:
                 print(doc)
             eg.predicted = doc
-            kwargs = component_cfg.get("scorer", {})
-            kwargs.setdefault("verbose", verbose)
-            scorer.score(eg, **kwargs)
-        return scorer
+        return scorer.score(examples)

     @contextmanager
     def use_params(self, params: dict, **cfg):
@ -8,6 +8,7 @@ from ..syntax.arc_eager cimport ArcEager
 from .functions import merge_subtokens
 from ..language import Language
 from ..syntax import nonproj
+from ..scorer import Scorer


 default_model_config = """
@ -102,3 +103,14 @@ cdef class DependencyParser(Parser):
                 label = label.split("||")[1]
             labels.add(label)
         return tuple(sorted(labels))
+
+    def score(self, examples, **kwargs):
+        def dep_getter(token, attr):
+            dep = getattr(token, attr)
+            dep = token.vocab.strings.as_string(dep).lower()
+            return dep
+        results = {}
+        results.update(Scorer.score_spans(examples, "sents", **kwargs))
+        results.update(Scorer.score_deps(examples, "dep", getter=dep_getter,
+                                         ignore_labels=("p", "punct"), **kwargs))
+        return results
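Combined with the generalized helpers added to `Scorer` later in this diff, the parser's `score()` contributes roughly the following keys (the numbers are placeholders, and after this change they are fractions rather than percentages):

```python
# Illustrative shape only, not real output, of DependencyParser.score():
parser_scores = {
    "sents_p": 0.95, "sents_r": 0.93, "sents_f": 0.94,   # Doc.sents scored as spans
    "dep_uas": 0.91,
    "dep_las": 0.89,
    "dep_las_per_type": {"nsubj": {"p": 0.90, "r": 0.90, "f": 0.90}},
}
```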
@ -14,6 +14,7 @@ from ..errors import Errors
 from .pipe import deserialize_config
 from .tagger import Tagger
 from .. import util
+from ..scorer import Scorer


 default_model_config = """
@ -162,6 +163,14 @@ class Morphologizer(Tagger):
             raise ValueError("nan value when computing loss")
         return float(loss), d_scores

+    def score(self, examples, **kwargs):
+        results = {}
+        results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
+        results.update(Scorer.score_token_attr(examples, "morph", **kwargs))
+        results.update(Scorer.score_token_attr_per_feat(examples,
+            "morph", **kwargs))
+        return results
+
     def to_bytes(self, exclude=tuple()):
         serialize = {}
         serialize["model"] = self.model.to_bytes
@ -6,6 +6,7 @@ from ..syntax.nn_parser cimport Parser
 from ..syntax.ner cimport BiluoPushDown

 from ..language import Language
+from ..scorer import Scorer


 default_model_config = """
@ -88,3 +89,6 @@ cdef class EntityRecognizer(Parser):
         labels = set(move.split("-")[1] for move in self.move_names
                      if move[0] in ("B", "I", "L", "U"))
         return tuple(sorted(labels))
+
+    def score(self, examples, **kwargs):
+        return Scorer.score_spans(examples, "ents", **kwargs)
@ -117,6 +117,9 @@ class Pipe:
         with self.model.use_params(params):
             yield

+    def score(self, examples, **kwargs):
+        return {}
+
     def to_bytes(self, exclude=tuple()):
         """Serialize the pipe to a bytestring.

spacy/pipeline/pipes.pyx: 1519 changed lines (file diff suppressed because it is too large)
@ -6,6 +6,7 @@ from ..tokens.doc cimport Doc

 from .pipe import Pipe
 from ..language import Language
+from ..scorer import Scorer
 from .. import util


@ -130,6 +131,9 @@ class Sentencizer(Pipe):
             else:
                 doc.c[j].sent_start = -1

+    def score(self, examples, **kwargs):
+        return Scorer.score_spans(examples, "sents", **kwargs)
+
     def to_bytes(self, exclude=tuple()):
         """Serialize the sentencizer to a bytestring.

@ -8,6 +8,7 @@ from .pipe import deserialize_config
 from .tagger import Tagger
 from ..language import Language
 from ..errors import Errors
+from ..scorer import Scorer
 from .. import util


@ -104,6 +105,9 @@ class SentenceRecognizer(Tagger):
     def add_label(self, label, values=None):
         raise NotImplementedError

+    def score(self, examples, **kwargs):
+        return Scorer.score_spans(examples, "sents", **kwargs)
+
     def to_bytes(self, exclude=tuple()):
         serialize = {}
         serialize["model"] = self.model.to_bytes
@ -14,6 +14,7 @@ from ..language import Language
 from ..attrs import POS, ID
 from ..parts_of_speech import X
 from ..errors import Errors, TempErrors, Warnings
+from ..scorer import Scorer
 from .. import util


@ -250,6 +251,13 @@ class Tagger(Pipe):
         with self.model.use_params(params):
             yield

+    def score(self, examples, **kwargs):
+        scores = {}
+        scores.update(Scorer.score_token_attr(examples, "tag", **kwargs))
+        scores.update(Scorer.score_token_attr(examples, "pos", **kwargs))
+        scores.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
+        return scores
+
     def to_bytes(self, exclude=tuple()):
         serialize = {}
         serialize["model"] = self.model.to_bytes
@ -6,6 +6,7 @@ from .pipe import Pipe
 from ..language import Language
 from ..gold import Example
 from ..errors import Errors
+from ..scorer import Scorer
 from .. import util
 from ..tokens import Doc
 from ..vocab import Vocab
@ -250,3 +251,9 @@ class TextCategorizer(Pipe):
         if sgd is None:
             sgd = self.create_optimizer()
         return sgd
+
+    def score(self, examples, positive_label=None, **kwargs):
+        return Scorer.score_cats(examples, "cats", labels=self.labels,
+            multi_label=self.model.attrs["multi_label"],
+            positive_label=positive_label, **kwargs
+        )
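How `positive_label` reaches `TextCategorizer.score` in practice is shown by the updated textcat test further down; a condensed sketch, assuming an `nlp` pipeline with a binary textcat and a list of `dev_examples`:

```python
# Hedged sketch: forward scorer kwargs through nlp.evaluate's component_cfg,
# mirroring the updated textcat test later in this diff. `nlp` and
# `dev_examples` are assumed to exist.
scores = nlp.evaluate(
    dev_examples,
    component_cfg={"scorer": {"positive_label": "POSITIVE"}},
)
# With two exclusive labels and a positive_label, score_cats reports PRF for
# the positive class, keyed by the attribute name ("cats"):
print(scores["cats_p"], scores["cats_r"], scores["cats_f"])
```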
spacy/scorer.py: 757 changed lines
@ -1,6 +1,8 @@
|
|||
import numpy as np
|
||||
|
||||
from .errors import Errors
|
||||
from .util import get_lang_class
|
||||
from .morphology import Morphology
|
||||
|
||||
|
||||
class PRFScore:
|
||||
|
@ -32,6 +34,9 @@ class PRFScore:
|
|||
r = self.recall
|
||||
return 2 * ((p * r) / (p + r + 1e-100))
|
||||
|
||||
def to_dict(self):
|
||||
return {"p": self.precision, "r": self.recall, "f": self.fscore}
|
||||
|
||||
|
||||
class ROCAUCScore:
|
||||
"""
|
||||
|
@ -65,391 +70,405 @@ class ROCAUCScore:
|
|||
class Scorer:
|
||||
"""Compute evaluation scores."""
|
||||
|
||||
def __init__(self, eval_punct=False, pipeline=None):
|
||||
def __init__(self, nlp=None, **cfg):
|
||||
"""Initialize the Scorer.
|
||||
|
||||
eval_punct (bool): Evaluate the dependency attachments to and from
|
||||
punctuation.
|
||||
RETURNS (Scorer): The newly created object.
|
||||
|
||||
DOCS: https://spacy.io/api/scorer#init
|
||||
"""
|
||||
self.tokens = PRFScore()
|
||||
self.sbd = PRFScore()
|
||||
self.unlabelled = PRFScore()
|
||||
self.labelled = PRFScore()
|
||||
self.labelled_per_dep = dict()
|
||||
self.tags = PRFScore()
|
||||
self.pos = PRFScore()
|
||||
self.morphs = PRFScore()
|
||||
self.morphs_per_feat = dict()
|
||||
self.sent_starts = PRFScore()
|
||||
self.ner = PRFScore()
|
||||
self.ner_per_ents = dict()
|
||||
self.eval_punct = eval_punct
|
||||
self.textcat = PRFScore()
|
||||
self.textcat_f_per_cat = dict()
|
||||
self.textcat_auc_per_cat = dict()
|
||||
self.textcat_positive_label = None
|
||||
self.textcat_multilabel = False
|
||||
self.nlp = nlp
|
||||
self.cfg = cfg
|
||||
|
||||
if pipeline:
|
||||
for name, component in pipeline:
|
||||
if name == "textcat":
|
||||
self.textcat_multilabel = component.model.attrs["multi_label"]
|
||||
self.textcat_positive_label = component.cfg.get(
|
||||
"positive_label", None
|
||||
)
|
||||
for label in component.cfg.get("labels", []):
|
||||
self.textcat_auc_per_cat[label] = ROCAUCScore()
|
||||
self.textcat_f_per_cat[label] = PRFScore()
|
||||
if not nlp:
|
||||
# create a default pipeline
|
||||
nlp = get_lang_class("xx")()
|
||||
nlp.add_pipe("senter")
|
||||
nlp.add_pipe("tagger")
|
||||
nlp.add_pipe("morphologizer")
|
||||
nlp.add_pipe("parser")
|
||||
nlp.add_pipe("ner")
|
||||
nlp.add_pipe("textcat")
|
||||
self.nlp = nlp
|
||||
|
||||
@property
|
||||
def tags_acc(self):
|
||||
"""RETURNS (float): Part-of-speech tag accuracy (fine grained tags,
|
||||
i.e. `Token.tag`).
|
||||
"""
|
||||
return self.tags.fscore * 100
|
||||
|
||||
@property
|
||||
def pos_acc(self):
|
||||
"""RETURNS (float): Part-of-speech tag accuracy (coarse grained pos,
|
||||
i.e. `Token.pos`).
|
||||
"""
|
||||
return self.pos.fscore * 100
|
||||
|
||||
@property
|
||||
def morphs_acc(self):
|
||||
"""RETURNS (float): Morph tag accuracy (morphological features,
|
||||
i.e. `Token.morph`).
|
||||
"""
|
||||
return self.morphs.fscore * 100
|
||||
|
||||
@property
|
||||
def morphs_per_type(self):
|
||||
"""RETURNS (dict): Scores per dependency label.
|
||||
"""
|
||||
return {
|
||||
k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
|
||||
for k, v in self.morphs_per_feat.items()
|
||||
}
|
||||
|
||||
@property
|
||||
def sent_p(self):
|
||||
"""RETURNS (float): F-score for identification of sentence starts.
|
||||
i.e. `Token.is_sent_start`).
|
||||
"""
|
||||
return self.sent_starts.precision * 100
|
||||
|
||||
@property
|
||||
def sent_r(self):
|
||||
"""RETURNS (float): F-score for identification of sentence starts.
|
||||
i.e. `Token.is_sent_start`).
|
||||
"""
|
||||
return self.sent_starts.recall * 100
|
||||
|
||||
@property
|
||||
def sent_f(self):
|
||||
"""RETURNS (float): F-score for identification of sentence starts.
|
||||
i.e. `Token.is_sent_start`).
|
||||
"""
|
||||
return self.sent_starts.fscore * 100
|
||||
|
||||
@property
|
||||
def token_acc(self):
|
||||
"""RETURNS (float): Tokenization accuracy."""
|
||||
return self.tokens.precision * 100
|
||||
|
||||
@property
|
||||
def uas(self):
|
||||
"""RETURNS (float): Unlabelled dependency score."""
|
||||
return self.unlabelled.fscore * 100
|
||||
|
||||
@property
|
||||
def las(self):
|
||||
"""RETURNS (float): Labelled dependency score."""
|
||||
return self.labelled.fscore * 100
|
||||
|
||||
@property
|
||||
def las_per_type(self):
|
||||
"""RETURNS (dict): Scores per dependency label.
|
||||
"""
|
||||
return {
|
||||
k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
|
||||
for k, v in self.labelled_per_dep.items()
|
||||
}
|
||||
|
||||
@property
|
||||
def ents_p(self):
|
||||
"""RETURNS (float): Named entity accuracy (precision)."""
|
||||
return self.ner.precision * 100
|
||||
|
||||
@property
|
||||
def ents_r(self):
|
||||
"""RETURNS (float): Named entity accuracy (recall)."""
|
||||
return self.ner.recall * 100
|
||||
|
||||
@property
|
||||
def ents_f(self):
|
||||
"""RETURNS (float): Named entity accuracy (F-score)."""
|
||||
return self.ner.fscore * 100
|
||||
|
||||
@property
|
||||
def ents_per_type(self):
|
||||
"""RETURNS (dict): Scores per entity label.
|
||||
"""
|
||||
return {
|
||||
k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
|
||||
for k, v in self.ner_per_ents.items()
|
||||
}
|
||||
|
||||
@property
|
||||
def textcat_f(self):
|
||||
"""RETURNS (float): f-score on positive label for binary classification,
|
||||
macro-averaged f-score for multilabel classification
|
||||
"""
|
||||
if not self.textcat_multilabel:
|
||||
if self.textcat_positive_label:
|
||||
# binary classification
|
||||
return self.textcat.fscore * 100
|
||||
# multi-class and/or multi-label
|
||||
return (
|
||||
sum([score.fscore for label, score in self.textcat_f_per_cat.items()])
|
||||
/ (len(self.textcat_f_per_cat) + 1e-100)
|
||||
* 100
|
||||
)
|
||||
|
||||
@property
|
||||
def textcat_auc(self):
|
||||
"""RETURNS (float): macro-averaged AUC ROC score for multilabel classification (-1 if undefined)
|
||||
"""
|
||||
return max(
|
||||
sum([score.score for label, score in self.textcat_auc_per_cat.items()])
|
||||
/ (len(self.textcat_auc_per_cat) + 1e-100),
|
||||
-1,
|
||||
)
|
||||
|
||||
@property
|
||||
def textcats_auc_per_cat(self):
|
||||
"""RETURNS (dict): AUC ROC Scores per textcat label.
|
||||
"""
|
||||
return {
|
||||
k: {"roc_auc_score": max(v.score, -1)}
|
||||
for k, v in self.textcat_auc_per_cat.items()
|
||||
}
|
||||
|
||||
@property
|
||||
def textcats_f_per_cat(self):
|
||||
"""RETURNS (dict): F-scores per textcat label.
|
||||
"""
|
||||
return {
|
||||
k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
|
||||
for k, v in self.textcat_f_per_cat.items()
|
||||
}
|
||||
|
||||
@property
|
||||
def scores(self):
|
||||
"""RETURNS (dict): All scores mapped by key.
|
||||
"""
|
||||
return {
|
||||
"uas": self.uas,
|
||||
"las": self.las,
|
||||
"las_per_type": self.las_per_type,
|
||||
"ents_p": self.ents_p,
|
||||
"ents_r": self.ents_r,
|
||||
"ents_f": self.ents_f,
|
||||
"ents_per_type": self.ents_per_type,
|
||||
"tags_acc": self.tags_acc,
|
||||
"pos_acc": self.pos_acc,
|
||||
"morphs_acc": self.morphs_acc,
|
||||
"morphs_per_type": self.morphs_per_type,
|
||||
"sent_p": self.sent_p,
|
||||
"sent_r": self.sent_r,
|
||||
"sent_f": self.sent_f,
|
||||
"token_acc": self.token_acc,
|
||||
"textcat_f": self.textcat_f,
|
||||
"textcat_auc": self.textcat_auc,
|
||||
"textcats_f_per_cat": self.textcats_f_per_cat,
|
||||
"textcats_auc_per_cat": self.textcats_auc_per_cat,
|
||||
}
|
||||
|
||||
def score(self, example, verbose=False, punct_labels=("p", "punct")):
|
||||
"""Update the evaluation scores from a single Example.
|
||||
|
||||
example (Example): The predicted annotations + correct annotations.
|
||||
verbose (bool): Print debugging information.
|
||||
punct_labels (tuple): Dependency labels for punctuation. Used to
|
||||
evaluate dependency attachments to punctuation if `eval_punct` is
|
||||
`True`.
|
||||
def score(self, examples):
|
||||
"""Evaluate a list of Examples.
|
||||
|
||||
examples (Iterable[Example]): The predicted annotations + correct annotations.
|
||||
RETURNS (Dict): A dictionary of scores.
|
||||
DOCS: https://spacy.io/api/scorer#score
|
||||
"""
|
||||
doc = example.predicted
|
||||
gold_doc = example.reference
|
||||
align = example.alignment
|
||||
gold_deps = set()
|
||||
gold_deps_per_dep = {}
|
||||
gold_tags = set()
|
||||
gold_pos = set()
|
||||
gold_morphs = set()
|
||||
gold_morphs_per_feat = {}
|
||||
gold_sent_starts = set()
|
||||
for gold_i, token in enumerate(gold_doc):
|
||||
gold_tags.add((gold_i, token.tag_))
|
||||
gold_pos.add((gold_i, token.pos_))
|
||||
gold_morphs.add((gold_i, token.morph_))
|
||||
if token.morph_:
|
||||
for feat in token.morph_.split("|"):
|
||||
field, values = feat.split("=")
|
||||
if field not in self.morphs_per_feat:
|
||||
self.morphs_per_feat[field] = PRFScore()
|
||||
if field not in gold_morphs_per_feat:
|
||||
gold_morphs_per_feat[field] = set()
|
||||
gold_morphs_per_feat[field].add((gold_i, feat))
|
||||
if token.sent_start:
|
||||
gold_sent_starts.add(gold_i)
|
||||
dep = token.dep_.lower()
|
||||
if dep not in punct_labels:
|
||||
gold_deps.add((gold_i, token.head.i, dep))
|
||||
if dep not in self.labelled_per_dep:
|
||||
self.labelled_per_dep[dep] = PRFScore()
|
||||
if dep not in gold_deps_per_dep:
|
||||
gold_deps_per_dep[dep] = set()
|
||||
gold_deps_per_dep[dep].add((gold_i, token.head.i, dep))
|
||||
cand_deps = set()
|
||||
cand_deps_per_dep = {}
|
||||
cand_tags = set()
|
||||
cand_pos = set()
|
||||
cand_morphs = set()
|
||||
cand_morphs_per_feat = {}
|
||||
cand_sent_starts = set()
|
||||
for token in doc:
|
||||
if token.orth_.isspace():
|
||||
scores = {}
|
||||
|
||||
if hasattr(self.nlp.tokenizer, "score"):
|
||||
scores.update(self.nlp.tokenizer.score(examples, **self.cfg))
|
||||
for name, component in self.nlp.pipeline:
|
||||
if hasattr(component, "score"):
|
||||
scores.update(component.score(examples, **self.cfg))
|
||||
|
||||
return scores
|
||||
|
||||
@staticmethod
|
||||
def score_tokenization(examples, **cfg):
|
||||
"""Returns accuracy and PRF scores for tokenization.
|
||||
|
||||
* token_acc: # correct tokens / # gold tokens
|
||||
* token_p/r/f: PRF for token character spans
|
||||
|
||||
examples (Iterable[Example]): Examples to score
|
||||
RETURNS (dict): A dictionary containing the scores token_acc/p/r/f.
|
||||
"""
|
||||
acc_score = PRFScore()
|
||||
prf_score = PRFScore()
|
||||
for example in examples:
|
||||
gold_doc = example.reference
|
||||
pred_doc = example.predicted
|
||||
align = example.alignment
|
||||
gold_spans = set()
|
||||
pred_spans = set()
|
||||
for token in gold_doc:
|
||||
if token.orth_.isspace():
|
||||
continue
|
||||
gold_spans.add((token.idx, token.idx + len(token)))
|
||||
for token in pred_doc:
|
||||
if token.orth_.isspace():
|
||||
continue
|
||||
pred_spans.add((token.idx, token.idx + len(token)))
|
||||
if align.x2y.lengths[token.i] != 1:
|
||||
acc_score.fp += 1
|
||||
else:
|
||||
acc_score.tp += 1
|
||||
prf_score.score_set(pred_spans, gold_spans)
|
||||
return {
|
||||
"token_acc": acc_score.fscore,
|
||||
"token_p": prf_score.precision,
|
||||
"token_r": prf_score.recall,
|
||||
"token_f": prf_score.fscore,
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def score_token_attr(examples, attr, getter=getattr, **cfg):
|
||||
"""Returns an accuracy score for a token-level attribute.
|
||||
|
||||
examples (Iterable[Example]): Examples to score
|
||||
attr (str): The attribute to score.
|
||||
getter (callable): Defaults to getattr. If provided,
|
||||
getter(token, attr) should return the value of the attribute for an
|
||||
individual token.
|
||||
RETURNS (dict): A dictionary containing the accuracy score under the
|
||||
key attr_acc.
|
||||
"""
|
||||
tag_score = PRFScore()
|
||||
for example in examples:
|
||||
gold_doc = example.reference
|
||||
pred_doc = example.predicted
|
||||
align = example.alignment
|
||||
gold_tags = set()
|
||||
for gold_i, token in enumerate(gold_doc):
|
||||
gold_tags.add((gold_i, getter(token, attr)))
|
||||
pred_tags = set()
|
||||
for token in pred_doc:
|
||||
if token.orth_.isspace():
|
||||
continue
|
||||
if align.x2y.lengths[token.i] == 1:
|
||||
gold_i = align.x2y[token.i].dataXd[0, 0]
|
||||
pred_tags.add((gold_i, getter(token, attr)))
|
||||
tag_score.score_set(pred_tags, gold_tags)
|
||||
return {
|
||||
attr + "_acc": tag_score.fscore,
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def score_token_attr_per_feat(examples, attr, getter=getattr, **cfg):
|
||||
"""Return PRF scores per feat for a token attribute in UFEATS format.
|
||||
|
||||
examples (Iterable[Example]): Examples to score
|
||||
attr (str): The attribute to score.
|
||||
getter (callable): Defaults to getattr. If provided,
|
||||
getter(token, attr) should return the value of the attribute for an
|
||||
individual token.
|
||||
RETURNS (dict): A dictionary containing the per-feat PRF scores under
the key attr_per_feat.
|
||||
"""
|
||||
per_feat = {}
|
||||
for example in examples:
|
||||
pred_doc = example.predicted
|
||||
gold_doc = example.reference
|
||||
align = example.alignment
|
||||
gold_per_feat = {}
|
||||
for gold_i, token in enumerate(gold_doc):
|
||||
morph = str(getter(token, attr))
|
||||
if morph:
|
||||
for feat in morph.split(Morphology.FEATURE_SEP):
|
||||
field, values = feat.split(Morphology.FIELD_SEP)
|
||||
if field not in per_feat:
|
||||
per_feat[field] = PRFScore()
|
||||
if field not in gold_per_feat:
|
||||
gold_per_feat[field] = set()
|
||||
gold_per_feat[field].add((gold_i, feat))
|
||||
pred_per_feat = {}
|
||||
for token in pred_doc:
|
||||
if token.orth_.isspace():
|
||||
continue
|
||||
if align.x2y.lengths[token.i] == 1:
|
||||
gold_i = align.x2y[token.i].dataXd[0, 0]
|
||||
morph = str(getter(token, attr))
|
||||
if morph:
|
||||
for feat in morph.split("|"):
|
||||
field, values = feat.split("=")
|
||||
if field not in per_feat:
|
||||
per_feat[field] = PRFScore()
|
||||
if field not in pred_per_feat:
|
||||
pred_per_feat[field] = set()
|
||||
pred_per_feat[field].add((gold_i, feat))
|
||||
for field in per_feat:
|
||||
per_feat[field].score_set(
|
||||
pred_per_feat.get(field, set()), gold_per_feat.get(field, set()),
|
||||
)
|
||||
return {
|
||||
attr + "_per_feat": per_feat,
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def score_spans(examples, attr, getter=getattr, **cfg):
|
||||
"""Returns PRF scores for labeled spans.
|
||||
|
||||
examples (Iterable[Example]): Examples to score
|
||||
attr (str): The attribute to score.
|
||||
getter (callable): Defaults to getattr. If provided,
|
||||
getter(doc, attr) should return the spans for the individual doc.
|
||||
RETURNS (dict): A dictionary containing the PRF scores under the
|
||||
keys attr_p/r/f and the per-type PRF scores under attr_per_type.
|
||||
"""
|
||||
score = PRFScore()
|
||||
score_per_type = dict()
|
||||
for example in examples:
|
||||
pred_doc = example.predicted
|
||||
gold_doc = example.reference
|
||||
# Find all labels in gold and doc
|
||||
labels = set(
|
||||
[k.label_ for k in getter(gold_doc, attr)]
|
||||
+ [k.label_ for k in getter(pred_doc, attr)]
|
||||
)
|
||||
# Set up all labels for per type scoring and prepare gold per type
|
||||
gold_per_type = {label: set() for label in labels}
|
||||
for label in labels:
|
||||
if label not in score_per_type:
|
||||
score_per_type[label] = PRFScore()
|
||||
# Find all predicted labels, for all and per type
|
||||
gold_spans = set()
|
||||
pred_spans = set()
|
||||
|
||||
# Special case for ents:
|
||||
# If we have missing values in the gold, we can't easily tell
|
||||
# whether our NER predictions are true.
|
||||
# It seems bad but it's what we've always done.
|
||||
if attr == "ents" and not all(token.ent_iob != 0 for token in gold_doc):
|
||||
continue
|
||||
if align.x2y.lengths[token.i] != 1:
|
||||
self.tokens.fp += 1
|
||||
gold_i = None
|
||||
else:
|
||||
gold_i = align.x2y[token.i].dataXd[0, 0]
|
||||
self.tokens.tp += 1
|
||||
cand_tags.add((gold_i, token.tag_))
|
||||
cand_pos.add((gold_i, token.pos_))
|
||||
cand_morphs.add((gold_i, token.morph_))
|
||||
if token.morph_:
|
||||
for feat in token.morph_.split("|"):
|
||||
field, values = feat.split("=")
|
||||
if field not in self.morphs_per_feat:
|
||||
self.morphs_per_feat[field] = PRFScore()
|
||||
if field not in cand_morphs_per_feat:
|
||||
cand_morphs_per_feat[field] = set()
|
||||
cand_morphs_per_feat[field].add((gold_i, feat))
|
||||
if token.is_sent_start:
|
||||
cand_sent_starts.add(gold_i)
|
||||
if token.dep_.lower() not in punct_labels and token.orth_.strip():
|
||||
if align.x2y.lengths[token.head.i] == 1:
|
||||
gold_head = align.x2y[token.head.i].dataXd[0, 0]
|
||||
else:
|
||||
gold_head = None
|
||||
# None is indistinct, so we can't just add it to the set
|
||||
# Multiple (None, None) deps are possible
|
||||
if gold_i is None or gold_head is None:
|
||||
self.unlabelled.fp += 1
|
||||
self.labelled.fp += 1
|
||||
else:
|
||||
cand_deps.add((gold_i, gold_head, token.dep_.lower()))
|
||||
if token.dep_.lower() not in self.labelled_per_dep:
|
||||
self.labelled_per_dep[token.dep_.lower()] = PRFScore()
|
||||
if token.dep_.lower() not in cand_deps_per_dep:
|
||||
cand_deps_per_dep[token.dep_.lower()] = set()
|
||||
cand_deps_per_dep[token.dep_.lower()].add(
|
||||
(gold_i, gold_head, token.dep_.lower())
|
||||
|
||||
for span in getter(gold_doc, attr):
|
||||
gold_span = (span.label_, span.start, span.end - 1)
|
||||
gold_spans.add(gold_span)
|
||||
gold_per_type[span.label_].add((span.label_, span.start, span.end - 1))
|
||||
pred_per_type = {label: set() for label in labels}
|
||||
for span in example.get_aligned_spans_x2y(getter(pred_doc, attr)):
|
||||
pred_spans.add((span.label_, span.start, span.end - 1))
|
||||
pred_per_type[span.label_].add((span.label_, span.start, span.end - 1))
|
||||
# Scores per label
|
||||
for k, v in score_per_type.items():
|
||||
if k in pred_per_type:
|
||||
v.score_set(pred_per_type[k], gold_per_type[k])
|
||||
# Score for all labels
|
||||
score.score_set(pred_spans, gold_spans)
|
||||
results = {
|
||||
attr + "_p": score.precision,
|
||||
attr + "_r": score.recall,
|
||||
attr + "_f": score.fscore,
|
||||
attr + "_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
|
||||
}
|
||||
return results
|
||||
|
||||
@staticmethod
|
||||
def score_cats(
|
||||
examples,
|
||||
attr,
|
||||
getter=getattr,
|
||||
labels=[],
|
||||
multi_label=True,
|
||||
positive_label=None,
|
||||
**cfg
|
||||
):
|
||||
"""Returns PRF and ROC AUC scores for a doc-level attribute with a
|
||||
dict with scores for each label like Doc.cats.
|
||||
|
||||
examples (Iterable[Example]): Examples to score
|
||||
attr (str): The attribute to score.
|
||||
getter (callable): Defaults to getattr. If provided,
|
||||
getter(doc, attr) should return the values for the individual doc.
|
||||
labels (Iterable[str]): The set of possible labels. Defaults to [].
|
||||
multi_label (bool): Whether the attribute allows multiple labels.
|
||||
Defaults to True.
|
||||
positive_label (str): The positive label for a binary task with
|
||||
exclusive classes. Defaults to None.
|
||||
RETURNS (dict): A dictionary containing the scores:
|
||||
for binary exclusive with positive label: attr_p/r/f,
|
||||
for 3+ exclusive classes, macro-averaged fscore: attr_macro_f,
|
||||
for multilabel, macro-averaged AUC: attr_macro_auc,
|
||||
for all: attr_f_per_type, attr_auc_per_type
|
||||
"""
|
||||
score = PRFScore()
|
||||
f_per_type = dict()
|
||||
auc_per_type = dict()
|
||||
for label in labels:
|
||||
f_per_type[label] = PRFScore()
|
||||
auc_per_type[label] = ROCAUCScore()
|
||||
for example in examples:
|
||||
gold_doc = example.reference
|
||||
pred_doc = example.predicted
|
||||
gold_values = getter(gold_doc, attr)
|
||||
pred_values = getter(pred_doc, attr)
|
||||
if (
|
||||
len(gold_values) > 0
|
||||
and set(f_per_type) == set(auc_per_type) == set(gold_values)
|
||||
and set(gold_values) == set(pred_values)
|
||||
):
|
||||
gold_val = max(gold_values, key=gold_values.get)
|
||||
pred_val = max(pred_values, key=pred_values.get)
|
||||
if positive_label:
|
||||
score.score_set(
|
||||
set([positive_label]) & set([pred_val]),
|
||||
set([positive_label]) & set([gold_val]),
|
||||
)
|
||||
for label in set(gold_values):
|
||||
auc_per_type[label].score_set(
|
||||
pred_values[label], gold_values[label]
|
||||
)
|
||||
f_per_type[label].score_set(
|
||||
set([label]) & set([pred_val]), set([label]) & set([gold_val])
|
||||
)
|
||||
elif len(f_per_type) > 0:
|
||||
model_labels = set(f_per_type)
|
||||
eval_labels = set(gold_values)
|
||||
raise ValueError(
|
||||
Errors.E162.format(
|
||||
model_labels=model_labels, eval_labels=eval_labels
|
||||
)
|
||||
# Find all NER labels in gold and doc
|
||||
ent_labels = set(
|
||||
[k.label_ for k in gold_doc.ents] + [k.label_ for k in doc.ents]
|
||||
)
|
||||
# Set up all labels for per type scoring and prepare gold per type
|
||||
gold_per_ents = {ent_label: set() for ent_label in ent_labels}
|
||||
for ent_label in ent_labels:
|
||||
if ent_label not in self.ner_per_ents:
|
||||
self.ner_per_ents[ent_label] = PRFScore()
|
||||
# Find all candidate labels, for all and per type
|
||||
gold_ents = set()
|
||||
cand_ents = set()
|
||||
# If we have missing values in the gold, we can't easily tell whether
|
||||
# our NER predictions are true.
|
||||
# It seems bad but it's what we've always done.
|
||||
if all(token.ent_iob != 0 for token in gold_doc):
|
||||
for ent in gold_doc.ents:
|
||||
gold_ent = (ent.label_, ent.start, ent.end - 1)
|
||||
gold_ents.add(gold_ent)
|
||||
gold_per_ents[ent.label_].add((ent.label_, ent.start, ent.end - 1))
|
||||
cand_per_ents = {ent_label: set() for ent_label in ent_labels}
|
||||
for ent in example.get_aligned_spans_x2y(doc.ents):
|
||||
cand_ents.add((ent.label_, ent.start, ent.end - 1))
|
||||
cand_per_ents[ent.label_].add((ent.label_, ent.start, ent.end - 1))
|
||||
# Scores per ent
|
||||
for k, v in self.ner_per_ents.items():
|
||||
if k in cand_per_ents:
|
||||
v.score_set(cand_per_ents[k], gold_per_ents[k])
|
||||
# Score for all ents
|
||||
self.ner.score_set(cand_ents, gold_ents)
|
||||
self.tags.score_set(cand_tags, gold_tags)
|
||||
self.pos.score_set(cand_pos, gold_pos)
|
||||
self.morphs.score_set(cand_morphs, gold_morphs)
|
||||
for field in self.morphs_per_feat:
|
||||
self.morphs_per_feat[field].score_set(
|
||||
cand_morphs_per_feat.get(field, set()),
|
||||
gold_morphs_per_feat.get(field, set()),
|
||||
)
|
||||
self.sent_starts.score_set(cand_sent_starts, gold_sent_starts)
|
||||
self.labelled.score_set(cand_deps, gold_deps)
|
||||
for dep in self.labelled_per_dep:
|
||||
self.labelled_per_dep[dep].score_set(
|
||||
cand_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set())
|
||||
)
|
||||
self.unlabelled.score_set(
|
||||
set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps)
|
||||
)
|
||||
if (
|
||||
len(gold_doc.cats) > 0
|
||||
and set(self.textcat_f_per_cat)
|
||||
== set(self.textcat_auc_per_cat)
|
||||
== set(gold_doc.cats)
|
||||
and set(gold_doc.cats) == set(doc.cats)
|
||||
):
|
||||
goldcat = max(gold_doc.cats, key=gold_doc.cats.get)
|
||||
candcat = max(doc.cats, key=doc.cats.get)
|
||||
if self.textcat_positive_label:
|
||||
self.textcat.score_set(
|
||||
set([self.textcat_positive_label]) & set([candcat]),
|
||||
set([self.textcat_positive_label]) & set([goldcat]),
|
||||
)
|
||||
for label in set(gold_doc.cats):
|
||||
self.textcat_auc_per_cat[label].score_set(
|
||||
doc.cats[label], gold_doc.cats[label]
|
||||
elif len(auc_per_type) > 0:
|
||||
model_labels = set(auc_per_type)
|
||||
eval_labels = set(gold_values)
|
||||
raise ValueError(
|
||||
Errors.E162.format(
|
||||
model_labels=model_labels, eval_labels=eval_labels
|
||||
)
|
||||
)
|
||||
self.textcat_f_per_cat[label].score_set(
|
||||
set([label]) & set([candcat]), set([label]) & set([goldcat])
|
||||
results = {
|
||||
attr + "_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
|
||||
attr + "_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
|
||||
}
|
||||
if len(labels) == 2 and not multi_label and positive_label:
|
||||
results[attr + "_p"] = score.precision
|
||||
results[attr + "_r"] = score.recall
|
||||
results[attr + "_f"] = score.fscore
|
||||
elif not multi_label:
|
||||
results[attr + "_macro_f"] = sum(
|
||||
[score.fscore for label, score in f_per_type.items()]
|
||||
) / (len(f_per_type) + 1e-100)
|
||||
else:
|
||||
results[attr + "_macro_auc"] = max(
|
||||
sum([score.score for label, score in auc_per_type.items()])
|
||||
/ (len(auc_per_type) + 1e-100),
|
||||
-1,
|
||||
)
|
||||
return results
|
||||
|
||||
@staticmethod
|
||||
def score_deps(
|
||||
examples,
|
||||
attr,
|
||||
getter=getattr,
|
||||
head_attr="head",
|
||||
head_getter=getattr,
|
||||
ignore_labels=tuple(),
|
||||
**cfg
|
||||
):
|
||||
"""Returns the UAS, LAS, and LAS per type scores for dependency
|
||||
parses.
|
||||
|
||||
examples (Iterable[Example]): Examples to score
|
||||
attr (str): The attribute containing the dependency label.
|
||||
getter (callable): Defaults to getattr. If provided,
|
||||
getter(token, attr) should return the value of the attribute for an
|
||||
individual token.
|
||||
head_attr (str): The attribute containing the head token. Defaults to
|
||||
'head'.
|
||||
head_getter (callable): Defaults to getattr. If provided,
|
||||
head_getter(token, attr) should return the value of the head for an
|
||||
individual token.
|
||||
ignore_labels (Tuple): Labels to ignore while scoring (e.g., punct).
|
||||
RETURNS (dict): A dictionary containing the scores:
|
||||
attr_uas, attr_las, and attr_las_per_type.
|
||||
"""
|
||||
unlabelled = PRFScore()
|
||||
labelled = PRFScore()
|
||||
labelled_per_dep = dict()
|
||||
for example in examples:
|
||||
gold_doc = example.reference
|
||||
pred_doc = example.predicted
|
||||
align = example.alignment
|
||||
gold_deps = set()
|
||||
gold_deps_per_dep = {}
|
||||
for gold_i, token in enumerate(gold_doc):
|
||||
dep = getter(token, attr)
|
||||
head = head_getter(token, head_attr)
|
||||
if dep not in ignore_labels:
|
||||
gold_deps.add((gold_i, head.i, dep))
|
||||
if dep not in labelled_per_dep:
|
||||
labelled_per_dep[dep] = PRFScore()
|
||||
if dep not in gold_deps_per_dep:
|
||||
gold_deps_per_dep[dep] = set()
|
||||
gold_deps_per_dep[dep].add((gold_i, head.i, dep))
|
||||
pred_deps = set()
|
||||
pred_deps_per_dep = {}
|
||||
for token in pred_doc:
|
||||
if token.orth_.isspace():
|
||||
continue
|
||||
if align.x2y.lengths[token.i] != 1:
|
||||
gold_i = None
|
||||
else:
|
||||
gold_i = align.x2y[token.i].dataXd[0, 0]
|
||||
dep = getter(token, attr)
|
||||
head = head_getter(token, head_attr)
|
||||
if dep not in ignore_labels and token.orth_.strip():
|
||||
if align.x2y.lengths[head.i] == 1:
|
||||
gold_head = align.x2y[head.i].dataXd[0, 0]
|
||||
else:
|
||||
gold_head = None
|
||||
# None is indistinct, so we can't just add it to the set
|
||||
# Multiple (None, None) deps are possible
|
||||
if gold_i is None or gold_head is None:
|
||||
unlabelled.fp += 1
|
||||
labelled.fp += 1
|
||||
else:
|
||||
pred_deps.add((gold_i, gold_head, dep))
|
||||
if dep not in labelled_per_dep:
|
||||
labelled_per_dep[dep] = PRFScore()
|
||||
if dep not in pred_deps_per_dep:
|
||||
pred_deps_per_dep[dep] = set()
|
||||
pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
|
||||
labelled.score_set(pred_deps, gold_deps)
|
||||
for dep in labelled_per_dep:
|
||||
labelled_per_dep[dep].score_set(
|
||||
pred_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set())
|
||||
)
|
||||
elif len(self.textcat_f_per_cat) > 0:
|
||||
model_labels = set(self.textcat_f_per_cat)
|
||||
eval_labels = set(gold_doc.cats)
|
||||
raise ValueError(
|
||||
Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels)
|
||||
unlabelled.score_set(
|
||||
set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps)
|
||||
)
|
||||
elif len(self.textcat_auc_per_cat) > 0:
|
||||
model_labels = set(self.textcat_auc_per_cat)
|
||||
eval_labels = set(gold_doc.cats)
|
||||
raise ValueError(
|
||||
Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels)
|
||||
)
|
||||
if verbose:
|
||||
gold_words = gold_doc.words
|
||||
for w_id, h_id, dep in cand_deps - gold_deps:
|
||||
print("F", gold_words[w_id], dep, gold_words[h_id])
|
||||
for w_id, h_id, dep in gold_deps - cand_deps:
|
||||
print("M", gold_words[w_id], dep, gold_words[h_id])
|
||||
return {
|
||||
attr + "_uas": unlabelled.fscore,
|
||||
attr + "_las": labelled.fscore,
|
||||
attr
|
||||
+ "_las_per_type": {k: v.to_dict() for k, v in labelled_per_dep.items()},
|
||||
}
|
||||
|
||||
|
||||
#############################################################################
|
||||
|
|
|
@ -85,6 +85,8 @@ def test_overfitting_IO():
|
|||
fix_random_seed(0)
|
||||
nlp = English()
|
||||
textcat = nlp.add_pipe("textcat")
|
||||
# Set exclusive labels
|
||||
textcat.model.attrs["multi_label"] = False
|
||||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
|
@ -114,6 +116,10 @@ def test_overfitting_IO():
|
|||
assert cats2["POSITIVE"] > 0.9
|
||||
assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
|
||||
|
||||
# Test scoring
|
||||
scores = nlp.evaluate(train_examples, component_cfg={"scorer": {"positive_label": "POSITIVE"}})
|
||||
assert scores["cats_f"] == 1.0
|
||||
|
||||
|
||||
# fmt: off
|
||||
@pytest.mark.parametrize(
|
||||
|
|
|
@ -7,6 +7,7 @@ from spacy.scorer import Scorer, ROCAUCScore
|
|||
from spacy.scorer import _roc_auc_score, _roc_curve
|
||||
from .util import get_doc
|
||||
from spacy.lang.en import English
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
||||
test_las_apple = [
|
||||
|
@ -77,13 +78,61 @@ def tagged_doc():
|
|||
doc[i].tag_ = tags[i]
|
||||
doc[i].pos_ = pos[i]
|
||||
doc[i].morph_ = morphs[i]
|
||||
if i > 0:
|
||||
doc[i].is_sent_start = False
|
||||
doc.is_tagged = True
|
||||
return doc
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sented_doc():
|
||||
text = "One sentence. Two sentences. Three sentences."
|
||||
nlp = English()
|
||||
doc = nlp(text)
|
||||
for i in range(len(doc)):
|
||||
if i % 3 == 0:
|
||||
doc[i].is_sent_start = True
|
||||
else:
|
||||
doc[i].is_sent_start = False
|
||||
return doc
|
||||
|
||||
|
||||
def test_tokenization(sented_doc):
|
||||
scorer = Scorer()
|
||||
gold = {"sent_starts": [t.sent_start for t in sented_doc]}
|
||||
example = Example.from_dict(sented_doc, gold)
|
||||
scores = scorer.score([example])
|
||||
assert scores["token_acc"] == 1.0
|
||||
|
||||
nlp = English()
|
||||
example.predicted = Doc(nlp.vocab, words=["One", "sentence.", "Two", "sentences.", "Three", "sentences."], spaces=[True, True, True, True, True, False])
|
||||
example.predicted[1].is_sent_start = False
|
||||
scores = scorer.score([example])
|
||||
assert scores["token_acc"] == approx(0.66666666)
|
||||
assert scores["token_p"] == 0.5
|
||||
assert scores["token_r"] == approx(0.33333333)
|
||||
assert scores["token_f"] == 0.4
|
||||
|
||||
|
||||
def test_sents(sented_doc):
|
||||
scorer = Scorer()
|
||||
gold = {"sent_starts": [t.sent_start for t in sented_doc]}
|
||||
example = Example.from_dict(sented_doc, gold)
|
||||
scores = scorer.score([example])
|
||||
assert scores["sents_f"] == 1.0
|
||||
|
||||
# One sentence start is moved
|
||||
gold["sent_starts"][3] = 0
|
||||
gold["sent_starts"][4] = 1
|
||||
example = Example.from_dict(sented_doc, gold)
|
||||
scores = scorer.score([example])
|
||||
assert scores["sents_f"] == approx(0.3333333)
|
||||
|
||||
|
||||
def test_las_per_type(en_vocab):
|
||||
# Gold and Doc are identical
|
||||
scorer = Scorer()
|
||||
examples = []
|
||||
for input_, annot in test_las_apple:
|
||||
doc = get_doc(
|
||||
en_vocab,
|
||||
|
@ -93,20 +142,21 @@ def test_las_per_type(en_vocab):
|
|||
)
|
||||
gold = {"heads": annot["heads"], "deps": annot["deps"]}
|
||||
example = Example.from_dict(doc, gold)
|
||||
scorer.score(example)
|
||||
results = scorer.scores
|
||||
examples.append(example)
|
||||
results = scorer.score(examples)
|
||||
|
||||
assert results["uas"] == 100
|
||||
assert results["las"] == 100
|
||||
assert results["las_per_type"]["nsubj"]["p"] == 100
|
||||
assert results["las_per_type"]["nsubj"]["r"] == 100
|
||||
assert results["las_per_type"]["nsubj"]["f"] == 100
|
||||
assert results["las_per_type"]["compound"]["p"] == 100
|
||||
assert results["las_per_type"]["compound"]["r"] == 100
|
||||
assert results["las_per_type"]["compound"]["f"] == 100
|
||||
assert results["dep_uas"] == 1.0
|
||||
assert results["dep_las"] == 1.0
|
||||
assert results["dep_las_per_type"]["nsubj"]["p"] == 1.0
|
||||
assert results["dep_las_per_type"]["nsubj"]["r"] == 1.0
|
||||
assert results["dep_las_per_type"]["nsubj"]["f"] == 1.0
|
||||
assert results["dep_las_per_type"]["compound"]["p"] == 1.0
|
||||
assert results["dep_las_per_type"]["compound"]["r"] == 1.0
|
||||
assert results["dep_las_per_type"]["compound"]["f"] == 1.0
|
||||
|
||||
# One dep is incorrect in Doc
|
||||
scorer = Scorer()
|
||||
examples = []
|
||||
for input_, annot in test_las_apple:
|
||||
doc = get_doc(
|
||||
en_vocab,
|
||||
|
@ -117,22 +167,23 @@ def test_las_per_type(en_vocab):
|
|||
gold = {"heads": annot["heads"], "deps": annot["deps"]}
|
||||
doc[0].dep_ = "compound"
|
||||
example = Example.from_dict(doc, gold)
|
||||
scorer.score(example)
|
||||
results = scorer.scores
|
||||
examples.append(example)
|
||||
results = scorer.score(examples)
|
||||
|
||||
assert results["uas"] == 100
|
||||
assert_almost_equal(results["las"], 90.9090909)
|
||||
assert results["las_per_type"]["nsubj"]["p"] == 0
|
||||
assert results["las_per_type"]["nsubj"]["r"] == 0
|
||||
assert results["las_per_type"]["nsubj"]["f"] == 0
|
||||
assert_almost_equal(results["las_per_type"]["compound"]["p"], 66.6666666)
|
||||
assert results["las_per_type"]["compound"]["r"] == 100
|
||||
assert results["las_per_type"]["compound"]["f"] == 80
|
||||
assert results["dep_uas"] == 1.0
|
||||
assert_almost_equal(results["dep_las"], 0.9090909)
|
||||
assert results["dep_las_per_type"]["nsubj"]["p"] == 0
|
||||
assert results["dep_las_per_type"]["nsubj"]["r"] == 0
|
||||
assert results["dep_las_per_type"]["nsubj"]["f"] == 0
|
||||
assert_almost_equal(results["dep_las_per_type"]["compound"]["p"], 0.666666666)
|
||||
assert results["dep_las_per_type"]["compound"]["r"] == 1.0
|
||||
assert results["dep_las_per_type"]["compound"]["f"] == 0.8
|
||||
|
||||
|
||||
def test_ner_per_type(en_vocab):
|
||||
# Gold and Doc are identical
|
||||
scorer = Scorer()
|
||||
examples = []
|
||||
for input_, annot in test_ner_cardinal:
|
||||
doc = get_doc(
|
||||
en_vocab,
|
||||
|
@ -140,20 +191,24 @@ def test_ner_per_type(en_vocab):
|
|||
ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]],
|
||||
)
|
||||
entities = biluo_tags_from_offsets(doc, annot["entities"])
|
||||
ex = Example.from_dict(doc, {"entities": entities})
|
||||
scorer.score(ex)
|
||||
results = scorer.scores
|
||||
example = Example.from_dict(doc, {"entities": entities})
|
||||
# a hack for sentence boundaries
|
||||
example.predicted[1].is_sent_start = False
|
||||
example.reference[1].is_sent_start = False
|
||||
examples.append(example)
|
||||
results = scorer.score(examples)
|
||||
|
||||
assert results["ents_p"] == 100
|
||||
assert results["ents_f"] == 100
|
||||
assert results["ents_r"] == 100
|
||||
assert results["ents_per_type"]["CARDINAL"]["p"] == 100
|
||||
assert results["ents_per_type"]["CARDINAL"]["f"] == 100
|
||||
assert results["ents_per_type"]["CARDINAL"]["r"] == 100
|
||||
assert results["ents_p"] == 1.0
|
||||
assert results["ents_r"] == 1.0
|
||||
assert results["ents_f"] == 1.0
|
||||
assert results["ents_per_type"]["CARDINAL"]["p"] == 1.0
|
||||
assert results["ents_per_type"]["CARDINAL"]["r"] == 1.0
|
||||
assert results["ents_per_type"]["CARDINAL"]["f"] == 1.0
|
||||
|
||||
# Doc has one missing and one extra entity
|
||||
# Entity type MONEY is not present in Doc
|
||||
scorer = Scorer()
|
||||
examples = []
|
||||
for input_, annot in test_ner_apple:
|
||||
doc = get_doc(
|
||||
en_vocab,
|
||||
|
@ -161,25 +216,28 @@ def test_ner_per_type(en_vocab):
|
|||
ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]],
|
||||
)
|
||||
entities = biluo_tags_from_offsets(doc, annot["entities"])
|
||||
ex = Example.from_dict(doc, {"entities": entities})
|
||||
scorer.score(ex)
|
||||
results = scorer.scores
|
||||
example = Example.from_dict(doc, {"entities": entities})
|
||||
# a hack for sentence boundaries
|
||||
example.predicted[1].is_sent_start = False
|
||||
example.reference[1].is_sent_start = False
|
||||
examples.append(example)
|
||||
results = scorer.score(examples)
|
||||
|
||||
assert results["ents_p"] == approx(66.66666)
|
||||
assert results["ents_r"] == approx(66.66666)
|
||||
assert results["ents_f"] == approx(66.66666)
|
||||
assert results["ents_p"] == approx(0.6666666)
|
||||
assert results["ents_r"] == approx(0.6666666)
|
||||
assert results["ents_f"] == approx(0.6666666)
|
||||
assert "GPE" in results["ents_per_type"]
|
||||
assert "MONEY" in results["ents_per_type"]
|
||||
assert "ORG" in results["ents_per_type"]
|
||||
assert results["ents_per_type"]["GPE"]["p"] == 100
|
||||
assert results["ents_per_type"]["GPE"]["r"] == 100
|
||||
assert results["ents_per_type"]["GPE"]["f"] == 100
|
||||
assert results["ents_per_type"]["GPE"]["p"] == 1.0
|
||||
assert results["ents_per_type"]["GPE"]["r"] == 1.0
|
||||
assert results["ents_per_type"]["GPE"]["f"] == 1.0
|
||||
assert results["ents_per_type"]["MONEY"]["p"] == 0
|
||||
assert results["ents_per_type"]["MONEY"]["r"] == 0
|
||||
assert results["ents_per_type"]["MONEY"]["f"] == 0
|
||||
assert results["ents_per_type"]["ORG"]["p"] == 50
|
||||
assert results["ents_per_type"]["ORG"]["r"] == 100
|
||||
assert results["ents_per_type"]["ORG"]["f"] == approx(66.66666)
|
||||
assert results["ents_per_type"]["ORG"]["p"] == 0.5
|
||||
assert results["ents_per_type"]["ORG"]["r"] == 1.0
|
||||
assert results["ents_per_type"]["ORG"]["f"] == approx(0.6666666)
|
||||
|
||||
|
||||
def test_tag_score(tagged_doc):
|
||||
|
@ -189,17 +247,17 @@ def test_tag_score(tagged_doc):
|
|||
"tags": [t.tag_ for t in tagged_doc],
|
||||
"pos": [t.pos_ for t in tagged_doc],
|
||||
"morphs": [t.morph_ for t in tagged_doc],
|
||||
"sent_starts": [1 if t.is_sent_start else -1 for t in tagged_doc],
|
||||
}
|
||||
example = Example.from_dict(tagged_doc, gold)
|
||||
scorer.score(example)
|
||||
results = scorer.scores
|
||||
results = scorer.score([example])
|
||||
|
||||
assert results["tags_acc"] == 100
|
||||
assert results["pos_acc"] == 100
|
||||
assert results["morphs_acc"] == 100
|
||||
assert results["morphs_per_type"]["NounType"]["f"] == 100
|
||||
assert results["tag_acc"] == 1.0
|
||||
assert results["pos_acc"] == 1.0
|
||||
assert results["morph_acc"] == 1.0
|
||||
assert results["morph_per_feat"]["NounType"].fscore == 1.0
|
||||
|
||||
# Gold and Doc are identical
|
||||
# Gold annotation is modified
|
||||
scorer = Scorer()
|
||||
tags = [t.tag_ for t in tagged_doc]
|
||||
tags[0] = "NN"
|
||||
|
@ -208,16 +266,21 @@ def test_tag_score(tagged_doc):
|
|||
morphs = [t.morph_ for t in tagged_doc]
|
||||
morphs[1] = "Number=sing"
|
||||
morphs[2] = "Number=plur"
|
||||
gold = {"tags": tags, "pos": pos, "morphs": morphs}
|
||||
gold = {
|
||||
"tags": tags,
|
||||
"pos": pos,
|
||||
"morphs": morphs,
|
||||
"sent_starts": gold["sent_starts"],
|
||||
}
|
||||
example = Example.from_dict(tagged_doc, gold)
|
||||
scorer.score(example)
|
||||
results = scorer.scores
|
||||
results = scorer.score([example])
|
||||
|
||||
assert results["tags_acc"] == 90
|
||||
assert results["pos_acc"] == 90
|
||||
assert results["morphs_acc"] == approx(80)
|
||||
assert results["morphs_per_type"]["Poss"]["f"] == 0.0
|
||||
assert results["morphs_per_type"]["Number"]["f"] == approx(72.727272)
|
||||
assert results["tag_acc"] == 0.9
|
||||
assert results["pos_acc"] == 0.9
|
||||
assert results["morph_acc"] == approx(0.8)
|
||||
assert results["morph_per_feat"]["NounType"].fscore == 1.0
|
||||
assert results["morph_per_feat"]["Poss"].fscore == 0.0
|
||||
assert results["morph_per_feat"]["Number"].fscore == approx(0.72727272)
|
||||
|
||||
|
||||
def test_roc_auc_score():
|
||||
|
|
|
@ -24,6 +24,7 @@ from . import util
 from .util import registry
 from .attrs import intify_attrs
 from .symbols import ORTH
+from .scorer import Scorer


 @registry.tokenizers("spacy.Tokenizer.v1")
@ -743,6 +744,9 @@ cdef class Tokenizer:
         tokens.extend(reversed(suffixes))
         return tokens

+    def score(self, examples, **kwargs):
+        return Scorer.score_tokenization(examples)
+
     def to_disk(self, path, **kwargs):
         """Save the current state to a directory.

@ -108,8 +108,8 @@ Evaluate a model's pipeline components.
 > #### Example
 >
 > ```python
-> scorer = nlp.evaluate(examples, verbose=True)
-> print(scorer.scores)
+> scores = nlp.evaluate(examples, verbose=True)
+> print(scores)
 > ```

 | Name | Type | Description |
@ -119,7 +119,7 @@ Evaluate a model's pipeline components.
 | `batch_size` | int | The batch size to use. |
 | `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. |
 | `component_cfg` <Tag variant="new">2.1</Tag> | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. |
-| **RETURNS** | Scorer | The scorer containing the evaluation scores. |
+| **RETURNS** | `Dict[str, Union[float, Dict]]` | A dictionary of evaluation scores. |
||||
|
||||
## Language.begin_training {#begin_training tag="method"}
|
||||
|
||||
|
|
|
@@ -5,9 +5,12 @@ tag: class
 source: spacy/scorer.py
 ---

-The `Scorer` computes and stores evaluation scores. It's typically created by
+The `Scorer` computes evaluation scores. It's typically created by
 [`Language.evaluate`](/api/language#evaluate).

+In addition, the `Scorer` provides a number of evaluation methods for
+evaluating `Token` and `Doc` attributes.
+
 ## Scorer.\_\_init\_\_ {#init tag="method"}

 Create a new `Scorer`.
@@ -17,46 +20,114 @@ Create a new `Scorer`.
 > ```python
 > from spacy.scorer import Scorer
 >
+> # default scoring pipeline
 > scorer = Scorer()
+>
+> # provided scoring pipeline
+> nlp = spacy.load("en_core_web_sm")
+> scorer = Scorer(nlp)
 > ```

 | Name | Type | Description |
 | ------------ | -------- | ------------------------------------------------------------ |
-| `eval_punct` | bool | Evaluate the dependency attachments to and from punctuation. |
+| `nlp` | Language | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. |
 | **RETURNS** | `Scorer` | The newly created object. |
 ## Scorer.score {#score tag="method"}

-Update the evaluation scores from a single [`Example`](/api/example) object.
+Calculate the scores for a list of [`Example`](/api/example) objects using the
+scoring methods provided by the components in the pipeline.
+
+The returned `Dict` contains the scores provided by the individual pipeline
+components. For the scoring methods provided by the `Scorer` and used by the
+core pipeline components, the individual score names start with the `Token` or
+`Doc` attribute being scored: `token_acc`, `token_p/r/f`, `sents_p/r/f`,
+`tag_acc`, `pos_acc`, `morph_acc`, `morph_per_feat`, `lemma_acc`, `dep_uas`,
+`dep_las`, `dep_las_per_type`, `ents_p/r/f`, `ents_per_type`,
+`textcat_macro_auc`, `textcat_macro_f`.

 > #### Example
 >
 > ```python
 > scorer = Scorer()
-> scorer.score(example)
+> scorer.score(examples)
 > ```

-| Name | Type | Description |
-| -------------- | --------- | --------------------------------------------------------------------------------------------------------------------- |
-| `example` | `Example` | The `Example` object holding both the predictions and the correct gold-standard annotations. |
-| `verbose` | bool | Print debugging information. |
-| `punct_labels` | tuple | Dependency labels for punctuation. Used to evaluate dependency attachments to punctuation if `eval_punct` is `True`. |
+| Name | Type | Description |
+| ----------- | ------------------- | ---------------------------------------------------------------------------------------------- |
+| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
+| **RETURNS** | `Dict` | A dictionary of scores. |
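A hedged end-to-end sketch of the new call pattern, mirroring the updated test above: the annotations are set by hand to stand in for real pipeline predictions, and the import paths are assumptions for this development snapshot.

```python
# Sketch: score one hand-annotated Example with the default scoring pipeline.
from spacy.lang.en import English
from spacy.gold import Example   # import path assumed
from spacy.scorer import Scorer

nlp = English()
predicted = nlp.make_doc("Cats sleep")
predicted[0].is_sent_start = True
predicted[0].tag_, predicted[0].pos_ = "NNS", "NOUN"    # pretend predictions
predicted[1].tag_, predicted[1].pos_ = "VBP", "VERB"
predicted[0].morph_ = "Number=Plur"                     # morph_ assumed writable here
predicted[1].morph_ = "Number=Sing"

gold = {
    "tags": ["NNS", "VBZ"],
    "pos": ["NOUN", "VERB"],
    "morphs": ["Number=Plur", "Number=Sing"],
    "sent_starts": [1, -1],
}
example = Example.from_dict(predicted, gold)

scores = Scorer().score([example])
print(scores["tag_acc"], scores["pos_acc"])  # 0.5 1.0 -- plain fractions, no longer * 100
```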
+## Scorer.score_tokenization {#score_tokenization tag="staticmethod"}

-## Properties
+Scores the tokenization:
+
+* `token_acc`: # correct tokens / # gold tokens
+* `token_p/r/f`: PRF for token character spans

+| Name | Type | Description |
+| ----------- | ------------------- | ---------------------------------------------------------------------------------------------- |
+| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
+| **RETURNS** | `Dict` | A dictionary containing the scores `token_acc/p/r/f`. |
+## Scorer.score_token_attr {#score_token_attr tag="staticmethod"}

+Scores a single token attribute.

+| Name | Type | Description |
+| ----------- | ------------------- | ---------------------------------------------------------------------------------------------- |
+| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
+| `attr` | `str` | The attribute to score. |
+| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
+| **RETURNS** | `Dict` | A dictionary containing the score `attr_acc`. |
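A minimal sketch of calling the static method directly (doc contents and import path assumed); a custom `getter` could normalize values before comparison.

```python
# Sketch: accuracy for one token attribute on a single hand-built Example.
from spacy.lang.en import English
from spacy.gold import Example   # import path assumed
from spacy.scorer import Scorer

nlp = English()
predicted = nlp.make_doc("They flew home")
for token, pos in zip(predicted, ["PRON", "VERB", "NOUN"]):   # pretend predictions
    token.pos_ = pos

example = Example.from_dict(predicted, {"pos": ["PRON", "VERB", "ADV"]})
scores = Scorer.score_token_attr([example], "pos")
print(scores["pos_acc"])   # 2 of 3 tokens correct
```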
+## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod"}

+Scores a single token attribute per feature, for token attributes in UFEATS format.

+| Name | Type | Description |
+| ----------- | ------------------- | ---------------------------------------------------------------------------------------------- |
+| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
+| `attr` | `str` | The attribute to score. |
+| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
+| **RETURNS** | `Dict` | A dictionary containing the per-feature PRF scores under the key `attr_per_feat`. |
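A sketch of the per-feature breakdown for UFEATS-style values such as `Number=Plur|Case=Nom`; the returned entries are PRF objects, matching the `.fscore` accesses in the updated test (contents and import path assumed).

```python
# Sketch: per-feature PRF for a UFEATS-style attribute.
from spacy.lang.en import English
from spacy.gold import Example   # import path assumed
from spacy.scorer import Scorer

nlp = English()
predicted = nlp.make_doc("cats sleep")
predicted[0].morph_ = "Number=Plur"   # pretend predictions; morph_ assumed writable
predicted[1].morph_ = "Number=Sing"

example = Example.from_dict(predicted, {"morphs": ["Number=Plur", "Number=Plur"]})
scores = Scorer.score_token_attr_per_feat([example], "morph")
print(scores["morph_per_feat"]["Number"].fscore)   # one PRF entry per feature name
```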
+## Scorer.score_spans {#score_spans tag="staticmethod"}

+Returns PRF scores for labeled or unlabeled spans.

+| Name | Type | Description |
+| ----------- | ------------------- | ---------------------------------------------------------------------------------------------- |
+| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
+| `attr` | `str` | The attribute to score. |
+| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. |
+| **RETURNS** | `Dict` | A dictionary containing the PRF scores under the keys `attr_p/r/f` and the per-type PRF scores under `attr_per_type`. |
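A sketch for entity spans: `attr` names the `Doc` attribute holding the spans (`ents` here), and the per-type results are keyed by span label (texts, offsets, and import path assumed).

```python
# Sketch: PRF over Doc.ents, plus per-label scores.
from spacy.lang.en import English
from spacy.gold import Example   # import path assumed
from spacy.scorer import Scorer
from spacy.tokens import Span

nlp = English()
predicted = nlp.make_doc("Apple opened an office in Berlin")
predicted.ents = [Span(predicted, 0, 1, label="ORG")]   # pretend prediction: "Apple"
gold = {"entities": [(0, 5, "ORG"), (26, 32, "GPE")]}   # gold entities as character offsets

example = Example.from_dict(predicted, gold)
scores = Scorer.score_spans([example], "ents")
print(scores["ents_f"], scores["ents_per_type"]["GPE"].fscore)
```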
+## Scorer.score_deps {#score_deps tag="staticmethod"}

+Calculate the UAS, LAS, and LAS per type scores for dependency parses.

+| Name | Type | Description |
+| --------------- | ------------------- | ---------------------------------------------------------------------------------------------- |
+| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
+| `attr` | `str` | The attribute containing the dependency label. |
+| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
+| `head_attr` | `str` | The attribute containing the head token. |
+| `head_getter` | `callable` | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. |
+| `ignore_labels` | `Tuple` | Labels to ignore while scoring (e.g., `punct`). |
+| **RETURNS** | `Dict` | A dictionary containing the scores: `attr_uas`, `attr_las`, and `attr_las_per_type`. |
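A hedged sketch wiring up the parameters above for a two-token parse: the `getter` lower-cases the dependency label and `punct` attachments are ignored (annotations and import path assumed).

```python
# Sketch: UAS/LAS from head and dependency-label annotations.
from spacy.lang.en import English
from spacy.gold import Example   # import path assumed
from spacy.scorer import Scorer

nlp = English()
predicted = nlp.make_doc("She sleeps")            # unparsed: each head defaults to the token itself
gold = {"heads": [1, 1], "deps": ["nsubj", "ROOT"]}

example = Example.from_dict(predicted, gold)
scores = Scorer.score_deps(
    [example],
    "dep",
    getter=lambda t, attr: t.dep_.lower(),        # normalize label case
    head_attr="head",
    ignore_labels=("punct",),
)
print(scores["dep_uas"], scores["dep_las"])       # unlabelled vs labelled attachment scores
```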
+## Scorer.score_cats {#score_cats tag="staticmethod"}

+Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict
+containing scores for each label, like `Doc.cats`.

+| Name | Type | Description |
+| ---------------- | ------------------- | ---------------------------------------------------------------------------------------------- |
+| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
+| `attr` | `str` | The attribute to score. |
+| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. |
+| `labels` | `Iterable[str]` | The set of possible labels. Defaults to `[]`. |
+| `multi_label` | `bool` | Whether the attribute allows multiple labels. Defaults to `True`. |
+| `positive_label` | `str` | The positive label for a binary task with exclusive classes. Defaults to `None`. |
+| **RETURNS** | `Dict` | A dictionary containing the scores: 1) for binary exclusive with positive label: `attr_p/r/f`; 2) for 3+ exclusive classes, macro-averaged fscore: `attr_macro_f`; 3) for multilabel, macro-averaged AUC: `attr_macro_auc`; 4) for all: `attr_f_per_type`, `attr_auc_per_type`. |
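A sketch of the binary, mutually exclusive case with a `positive_label`; with `attr="cats"` the returned keys become `cats_p/r/f` (texts, labels, and import path assumed).

```python
# Sketch: doc-level category scores over Doc.cats.
from spacy.lang.en import English
from spacy.gold import Example   # import path assumed
from spacy.scorer import Scorer

nlp = English()
examples = []
for text, gold_cats in [
    ("This is great", {"POSITIVE": 1.0, "NEGATIVE": 0.0}),
    ("This is terrible", {"POSITIVE": 0.0, "NEGATIVE": 1.0}),
]:
    predicted = nlp.make_doc(text)
    predicted.cats = {"POSITIVE": 0.8, "NEGATIVE": 0.2}   # pretend predictions
    examples.append(Example.from_dict(predicted, {"cats": gold_cats}))

scores = Scorer.score_cats(
    examples,
    "cats",
    labels=["POSITIVE", "NEGATIVE"],
    multi_label=False,
    positive_label="POSITIVE",
)
print(scores["cats_p"], scores["cats_r"], scores["cats_f"])
```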
-| Name | Type | Description |
-| ---------------------------------------------------- | ----- | ---------------------------------------------------------------------------------------- |
-| `token_acc` | float | Tokenization accuracy. |
-| `tags_acc` | float | Part-of-speech tag accuracy (fine grained tags, i.e. `Token.tag`). |
-| `uas` | float | Unlabelled dependency score. |
-| `las` | float | Labelled dependency score. |
-| `ents_p` | float | Named entity accuracy (precision). |
-| `ents_r` | float | Named entity accuracy (recall). |
-| `ents_f` | float | Named entity accuracy (F-score). |
-| `ents_per_type` <Tag variant="new">2.1.5</Tag> | dict | Scores per entity label. Keyed by label, mapped to a dict of `p`, `r` and `f` scores. |
-| `textcat_f` <Tag variant="new">3.0</Tag> | float | F-score on positive label for binary classification, macro-averaged F-score otherwise. |
-| `textcat_auc` <Tag variant="new">3.0</Tag> | float | Macro-averaged AUC ROC score for multilabel classification (`-1` if undefined). |
-| `textcats_f_per_cat` <Tag variant="new">3.0</Tag> | dict | F-scores per textcat label, keyed by label. |
-| `textcats_auc_per_cat` <Tag variant="new">3.0</Tag> | dict | ROC AUC scores per textcat label, keyed by label. |
-| `las_per_type` <Tag variant="new">2.2.3</Tag> | dict | Labelled dependency scores, keyed by label. |
-| `scores` | dict | All scores, keyed by type. |