Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

  Refactor the `Scorer` to improve flexibility for arbitrary pipeline components.

  * Individual pipeline components provide their own `evaluate` methods that score a list of `Example`s and return a dictionary of scores
  * `Scorer` is initialized either:
    * with a provided pipeline containing components to be scored
    * with a default pipeline containing the built-in statistical components (senter, tagger, morphologizer, parser, ner)
  * `Scorer.score` evaluates a list of `Example`s and returns a dictionary of scores referring to the scores provided by the components in the pipeline

  Significant differences:

  * `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc` and the new `morph_acc`, `pos_acc`, and `lemma_acc`
  * Scoring is no longer cumulative: `Scorer.score` scores a list of examples rather than a single example and does not retain any state about previously scored examples
  * PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

  * Generalized static scoring methods are added to `Scorer`
    * Methods require an attribute (either on Token or Doc) that is used to key the returned scores

  Naming differences:

  * `uas`, `las`, and `las_per_type` in the scores dict are renamed to `dep_uas`, `dep_las`, and `dep_las_per_type`

  Scoring differences:

  * `Doc.sents` is now scored as spans rather than on sentence-initial token positions so that `Doc.sents` and `Doc.ents` can be scored with the same method (this lowers scores since a single incorrect sentence start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring

* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

  Reset the Example alignment if either doc is set in case the tokenization has changed.
* Add PRF tokenization scoring for tokens as spans

  Add PRF scores for tokens as character spans. The scores are:

  * token_acc: # correct tokens / # gold tokens
  * token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
2025-07-18 20:22:25 +03:00 · 2020-07-25 12:53:02 +02:00 · 2020-07-25 12:53:02 +02:00 · 2bcceb80c4
commit 2bcceb80c4
parent 656574a01a
20 changed files with 2233 additions and 496 deletions
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@ -68,41 +68,43 @@ def evaluate(
    nlp = util.load_model(model)
    dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc))
    begin = timer()
-    scorer = nlp.evaluate(dev_dataset, verbose=False)
+    scores = nlp.evaluate(dev_dataset, verbose=False)
    end = timer()
    nwords = sum(len(ex.predicted) for ex in dev_dataset)
-    results = {
-        "Time": f"{end - begin:.2f} s",
-        "Words": nwords,
-        "Words/s": f"{nwords / (end - begin):.0f}",
-        "TOK": f"{scorer.token_acc:.2f}",
-        "TAG": f"{scorer.tags_acc:.2f}",
-        "POS": f"{scorer.pos_acc:.2f}",
-        "MORPH": f"{scorer.morphs_acc:.2f}",
-        "UAS": f"{scorer.uas:.2f}",
-        "LAS": f"{scorer.las:.2f}",
-        "NER P": f"{scorer.ents_p:.2f}",
-        "NER R": f"{scorer.ents_r:.2f}",
-        "NER F": f"{scorer.ents_f:.2f}",
-        "Textcat AUC": f"{scorer.textcat_auc:.2f}",
-        "Textcat F": f"{scorer.textcat_f:.2f}",
-        "Sent P": f"{scorer.sent_p:.2f}",
-        "Sent R": f"{scorer.sent_r:.2f}",
-        "Sent F": f"{scorer.sent_f:.2f}",
+    metrics = {
+        "TOK": "token_acc",
+        "TAG": "tag_acc",
+        "POS": "pos_acc",
+        "MORPH": "morph_acc",
+        "LEMMA": "lemma_acc",
+        "UAS": "dep_uas",
+        "LAS": "dep_las",
+        "NER P": "ents_p",
+        "NER R": "ents_r",
+        "NER F": "ents_f",
+        "Textcat AUC": 'textcat_macro_auc',
+        "Textcat F": 'textcat_macro_f',
+        "Sent P": 'sents_p',
+        "Sent R": 'sents_r',
+        "Sent F": 'sents_f',
    }
+    results = {}
+    for metric, key in metrics.items():
+        if key in scores:
+            results[metric] = f"{scores[key]*100:.2f}"
    data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()}

    msg.table(results, title="Results")

-    if scorer.ents_per_type:
-        data["ents_per_type"] = scorer.ents_per_type
-        print_ents_per_type(msg, scorer.ents_per_type)
-    if scorer.textcats_f_per_cat:
-        data["textcats_f_per_cat"] = scorer.textcats_f_per_cat
-        print_textcats_f_per_cat(msg, scorer.textcats_f_per_cat)
-    if scorer.textcats_auc_per_cat:
-        data["textcats_auc_per_cat"] = scorer.textcats_auc_per_cat
-        print_textcats_auc_per_cat(msg, scorer.textcats_auc_per_cat)
+    if "ents_per_type" in scores:
+        if scores["ents_per_type"]:
+            print_ents_per_type(msg, scores["ents_per_type"])
+    if "textcat_f_per_cat" in scores:
+        if scores["textcat_f_per_cat"]:
+            print_textcats_f_per_cat(msg, scores["textcat_f_per_cat"])
+    if "textcat_auc_per_cat" in scores:
+        if scores["textcat_auc_per_cat"]:
+            print_textcats_auc_per_cat(msg, scores["textcat_auc_per_cat"])

    if displacy_path:
        factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
@ -148,7 +150,7 @@ def render_parses(

 def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None:
    data = [
-        (k, f"{v['p']:.2f}", f"{v['r']:.2f}", f"{v['f']:.2f}")
+        (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}")
        for k, v in scores.items()
    ]
    msg.table(
@ -161,7 +163,7 @@ def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> No

 def print_textcats_f_per_cat(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None:
    data = [
-        (k, f"{v['p']:.2f}", f"{v['r']:.2f}", f"{v['f']:.2f}")
+        (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}")
        for k, v in scores.items()
    ]
    msg.table(
@ -176,7 +178,7 @@ def print_textcats_auc_per_cat(
    msg: Printer, scores: Dict[str, Dict[str, float]]
 ) -> None:
    msg.table(
-        [(k, f"{v['roc_auc_score']:.2f}") for k, v in scores.items()],
+        [(k, f"{v:.2f}") for k, v in scores.items()],
        header=("", "ROC AUC"),
        aligns=("l", "r"),
        title="Textcat ROC AUC (per label)",
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -179,6 +179,7 @@ def train(
                progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
    except Exception as e:
        if output_path is not None:
+            raise e
            msg.warn(
                f"Aborting and saving the final best model. "
                f"Encountered exception: {str(e)}",
@ -259,12 +260,11 @@ def create_evaluation_callback(
        start_time = timer()
        if optimizer.averages:
            with nlp.use_params(optimizer.averages):
-                scorer = nlp.evaluate(dev_examples, batch_size=batch_size)
+                scores = nlp.evaluate(dev_examples, batch_size=batch_size)
        else:
-            scorer = nlp.evaluate(dev_examples, batch_size=batch_size)
+            scores = nlp.evaluate(dev_examples, batch_size=batch_size)
        end_time = timer()
        wps = n_words / (end_time - start_time)
-        scores = scorer.scores
        # Calculate a weighted sum based on score_weights for the main score
        weights = cfg["score_weights"]
        try:
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@ -40,8 +40,8 @@ seed = 0
 accumulate_gradient = 1
 use_pytorch_for_gpu_memory = false
 # Control how scores are printed and checkpoints are evaluated.
-scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
-score_weights = {"tags_acc": 0.2, "las": 0.4, "ents_f": 0.4}
+scores = ["speed", "tag_acc", "dep_uas", "dep_las", "ents_f"]
+score_weights = {"tag_acc": 0.2, "dep_las": 0.4, "ents_f": 0.4}
 # These settings are invalid for the transformer models.
 init_tok2vec = null
 discard_oversize = false
--- a/spacy/gold/example.pyx
+++ b/spacy/gold/example.pyx
@ -45,6 +45,7 @@ cdef class Example:

        def __set__(self, doc):
            self.x = doc
+            self._alignment = None

    property reference:
        def __get__(self):
@ -52,6 +53,7 @@ cdef class Example:

        def __set__(self, doc):
            self.y = doc
+            self._alignment = None

    def copy(self):
        return Example(
--- a/spacy/language.py
+++ b/spacy/language.py
@ -1011,10 +1011,13 @@ class Language:
                name="language", method="evaluate", types=wrong_types
            )
            raise TypeError(err)
-        if scorer is None:
-            scorer = Scorer(pipeline=self.pipeline)
        if component_cfg is None:
            component_cfg = {}
+        if scorer is None:
+            kwargs = component_cfg.get("scorer", {})
+            kwargs.setdefault("verbose", verbose)
+            kwargs.setdefault("nlp", self)
+            scorer = Scorer(**kwargs)
        docs = list(eg.predicted for eg in examples)
        for name, pipe in self.pipeline:
            kwargs = component_cfg.get(name, {})
@ -1027,10 +1030,7 @@ class Language:
            if verbose:
                print(doc)
            eg.predicted = doc
-            kwargs = component_cfg.get("scorer", {})
-            kwargs.setdefault("verbose", verbose)
-            scorer.score(eg, **kwargs)
-        return scorer
+        return scorer.score(examples)

    @contextmanager
    def use_params(self, params: dict, **cfg):
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@ -8,6 +8,7 @@ from ..syntax.arc_eager cimport ArcEager
 from .functions import merge_subtokens
 from ..language import Language
 from ..syntax import nonproj
+from ..scorer import Scorer


 default_model_config = """
@ -102,3 +103,14 @@ cdef class DependencyParser(Parser):
                    label = label.split("||")[1]
                labels.add(label)
        return tuple(sorted(labels))
+
+    def score(self, examples, **kwargs):
+        def dep_getter(token, attr):
+            dep = getattr(token, attr)
+            dep = token.vocab.strings.as_string(dep).lower()
+            return dep
+        results = {}
+        results.update(Scorer.score_spans(examples, "sents", **kwargs))
+        results.update(Scorer.score_deps(examples, "dep", getter=dep_getter,
+            ignore_labels=("p", "punct"), **kwargs))
+        return results
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@ -14,6 +14,7 @@ from ..errors import Errors
 from .pipe import deserialize_config
 from .tagger import Tagger
 from .. import util
+from ..scorer import Scorer


 default_model_config = """
@ -162,6 +163,14 @@ class Morphologizer(Tagger):
            raise ValueError("nan value when computing loss")
        return float(loss), d_scores

+    def score(self, examples, **kwargs):
+        results = {}
+        results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
+        results.update(Scorer.score_token_attr(examples, "morph", **kwargs))
+        results.update(Scorer.score_token_attr_per_feat(examples,
+            "morph", **kwargs))
+        return results
+
    def to_bytes(self, exclude=tuple()):
        serialize = {}
        serialize["model"] = self.model.to_bytes
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@ -6,6 +6,7 @@ from ..syntax.nn_parser cimport Parser
 from ..syntax.ner cimport BiluoPushDown

 from ..language import Language
+from ..scorer import Scorer


 default_model_config = """
@ -88,3 +89,6 @@ cdef class EntityRecognizer(Parser):
        labels = set(move.split("-")[1] for move in self.move_names
                     if move[0] in ("B", "I", "L", "U"))
        return tuple(sorted(labels))
+
+    def score(self, examples, **kwargs):
+        return Scorer.score_spans(examples, "ents", **kwargs)
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@ -117,6 +117,9 @@ class Pipe:
        with self.model.use_params(params):
            yield

+    def score(self, examples, **kwargs):
+        return {}
+
    def to_bytes(self, exclude=tuple()):
        """Serialize the pipe to a bytestring.

--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
--- a/spacy/pipeline/sentencizer.pyx
+++ b/spacy/pipeline/sentencizer.pyx
@ -6,6 +6,7 @@ from ..tokens.doc cimport Doc

 from .pipe import Pipe
 from ..language import Language
+from ..scorer import Scorer
 from .. import util


@ -130,6 +131,9 @@ class Sentencizer(Pipe):
                    else:
                        doc.c[j].sent_start = -1

+    def score(self, examples, **kwargs):
+        return Scorer.score_spans(examples, "sents", **kwargs)
+
    def to_bytes(self, exclude=tuple()):
        """Serialize the sentencizer to a bytestring.

--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@ -8,6 +8,7 @@ from .pipe import deserialize_config
 from .tagger import Tagger
 from ..language import Language
 from ..errors import Errors
+from ..scorer import Scorer
 from .. import util


@ -104,6 +105,9 @@ class SentenceRecognizer(Tagger):
    def add_label(self, label, values=None):
        raise NotImplementedError

+    def score(self, examples, **kwargs):
+        return Scorer.score_spans(examples, "sents", **kwargs)
+
    def to_bytes(self, exclude=tuple()):
        serialize = {}
        serialize["model"] = self.model.to_bytes
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@ -14,6 +14,7 @@ from ..language import Language
 from ..attrs import POS, ID
 from ..parts_of_speech import X
 from ..errors import Errors, TempErrors, Warnings
+from ..scorer import Scorer
 from .. import util


@ -250,6 +251,13 @@ class Tagger(Pipe):
        with self.model.use_params(params):
            yield

+    def score(self, examples, **kwargs):
+        scores = {}
+        scores.update(Scorer.score_token_attr(examples, "tag", **kwargs))
+        scores.update(Scorer.score_token_attr(examples, "pos", **kwargs))
+        scores.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
+        return scores
+
    def to_bytes(self, exclude=tuple()):
        serialize = {}
        serialize["model"] = self.model.to_bytes
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@ -6,6 +6,7 @@ from .pipe import Pipe
 from ..language import Language
 from ..gold import Example
 from ..errors import Errors
+from ..scorer import Scorer
 from .. import util
 from ..tokens import Doc
 from ..vocab import Vocab
@ -250,3 +251,9 @@ class TextCategorizer(Pipe):
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd
+
+    def score(self, examples, positive_label=None, **kwargs):
+        return Scorer.score_cats(examples, "cats", labels=self.labels,
+            multi_label=self.model.attrs["multi_label"],
+            positive_label=positive_label, **kwargs
+        )
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@ -1,6 +1,8 @@
 import numpy as np

 from .errors import Errors
+from .util import get_lang_class
+from .morphology import Morphology


 class PRFScore:
@ -32,6 +34,9 @@ class PRFScore:
        r = self.recall
        return 2 * ((p * r) / (p + r + 1e-100))

+    def to_dict(self):
+        return {"p": self.precision, "r": self.recall, "f": self.fscore}
+

 class ROCAUCScore:
    """
@ -65,391 +70,405 @@ class ROCAUCScore:
 class Scorer:
    """Compute evaluation scores."""

-    def __init__(self, eval_punct=False, pipeline=None):
+    def __init__(self, nlp=None, **cfg):
        """Initialize the Scorer.
-
-        eval_punct (bool): Evaluate the dependency attachments to and from
-            punctuation.
        RETURNS (Scorer): The newly created object.

        DOCS: https://spacy.io/api/scorer#init
        """
-        self.tokens = PRFScore()
-        self.sbd = PRFScore()
-        self.unlabelled = PRFScore()
-        self.labelled = PRFScore()
-        self.labelled_per_dep = dict()
-        self.tags = PRFScore()
-        self.pos = PRFScore()
-        self.morphs = PRFScore()
-        self.morphs_per_feat = dict()
-        self.sent_starts = PRFScore()
-        self.ner = PRFScore()
-        self.ner_per_ents = dict()
-        self.eval_punct = eval_punct
-        self.textcat = PRFScore()
-        self.textcat_f_per_cat = dict()
-        self.textcat_auc_per_cat = dict()
-        self.textcat_positive_label = None
-        self.textcat_multilabel = False
+        self.nlp = nlp
+        self.cfg = cfg

-        if pipeline:
-            for name, component in pipeline:
-                if name == "textcat":
-                    self.textcat_multilabel = component.model.attrs["multi_label"]
-                    self.textcat_positive_label = component.cfg.get(
-                        "positive_label", None
-                    )
-                    for label in component.cfg.get("labels", []):
-                        self.textcat_auc_per_cat[label] = ROCAUCScore()
-                        self.textcat_f_per_cat[label] = PRFScore()
+        if not nlp:
+            # create a default pipeline
+            nlp = get_lang_class("xx")()
+            nlp.add_pipe("senter")
+            nlp.add_pipe("tagger")
+            nlp.add_pipe("morphologizer")
+            nlp.add_pipe("parser")
+            nlp.add_pipe("ner")
+            nlp.add_pipe("textcat")
+            self.nlp = nlp

-    @property
-    def tags_acc(self):
-        """RETURNS (float): Part-of-speech tag accuracy (fine grained tags,
-            i.e. `Token.tag`).
-        """
-        return self.tags.fscore * 100
-
-    @property
-    def pos_acc(self):
-        """RETURNS (float): Part-of-speech tag accuracy (coarse grained pos,
-            i.e. `Token.pos`).
-        """
-        return self.pos.fscore * 100
-
-    @property
-    def morphs_acc(self):
-        """RETURNS (float): Morph tag accuracy (morphological features,
-           i.e. `Token.morph`).
-       """
-        return self.morphs.fscore * 100
-
-    @property
-    def morphs_per_type(self):
-        """RETURNS (dict): Scores per dependency label.
-       """
-        return {
-            k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
-            for k, v in self.morphs_per_feat.items()
-        }
-
-    @property
-    def sent_p(self):
-        """RETURNS (float): F-score for identification of sentence starts.
-            i.e. `Token.is_sent_start`).
-        """
-        return self.sent_starts.precision * 100
-
-    @property
-    def sent_r(self):
-        """RETURNS (float): F-score for identification of sentence starts.
-            i.e. `Token.is_sent_start`).
-        """
-        return self.sent_starts.recall * 100
-
-    @property
-    def sent_f(self):
-        """RETURNS (float): F-score for identification of sentence starts.
-            i.e. `Token.is_sent_start`).
-        """
-        return self.sent_starts.fscore * 100
-
-    @property
-    def token_acc(self):
-        """RETURNS (float): Tokenization accuracy."""
-        return self.tokens.precision * 100
-
-    @property
-    def uas(self):
-        """RETURNS (float): Unlabelled dependency score."""
-        return self.unlabelled.fscore * 100
-
-    @property
-    def las(self):
-        """RETURNS (float): Labelled dependency score."""
-        return self.labelled.fscore * 100
-
-    @property
-    def las_per_type(self):
-        """RETURNS (dict): Scores per dependency label.
-        """
-        return {
-            k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
-            for k, v in self.labelled_per_dep.items()
-        }
-
-    @property
-    def ents_p(self):
-        """RETURNS (float): Named entity accuracy (precision)."""
-        return self.ner.precision * 100
-
-    @property
-    def ents_r(self):
-        """RETURNS (float): Named entity accuracy (recall)."""
-        return self.ner.recall * 100
-
-    @property
-    def ents_f(self):
-        """RETURNS (float): Named entity accuracy (F-score)."""
-        return self.ner.fscore * 100
-
-    @property
-    def ents_per_type(self):
-        """RETURNS (dict): Scores per entity label.
-        """
-        return {
-            k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
-            for k, v in self.ner_per_ents.items()
-        }
-
-    @property
-    def textcat_f(self):
-        """RETURNS (float): f-score on positive label for binary classification,
-        macro-averaged f-score for multilabel classification
-        """
-        if not self.textcat_multilabel:
-            if self.textcat_positive_label:
-                # binary classification
-                return self.textcat.fscore * 100
-        # multi-class and/or multi-label
-        return (
-            sum([score.fscore for label, score in self.textcat_f_per_cat.items()])
-            / (len(self.textcat_f_per_cat) + 1e-100)
-            * 100
-        )
-
-    @property
-    def textcat_auc(self):
-        """RETURNS (float): macro-averaged AUC ROC score for multilabel classification (-1 if undefined)
-        """
-        return max(
-            sum([score.score for label, score in self.textcat_auc_per_cat.items()])
-            / (len(self.textcat_auc_per_cat) + 1e-100),
-            -1,
-        )
-
-    @property
-    def textcats_auc_per_cat(self):
-        """RETURNS (dict): AUC ROC Scores per textcat label.
-        """
-        return {
-            k: {"roc_auc_score": max(v.score, -1)}
-            for k, v in self.textcat_auc_per_cat.items()
-        }
-
-    @property
-    def textcats_f_per_cat(self):
-        """RETURNS (dict): F-scores per textcat label.
-        """
-        return {
-            k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
-            for k, v in self.textcat_f_per_cat.items()
-        }
-
-    @property
-    def scores(self):
-        """RETURNS (dict): All scores mapped by key.
-        """
-        return {
-            "uas": self.uas,
-            "las": self.las,
-            "las_per_type": self.las_per_type,
-            "ents_p": self.ents_p,
-            "ents_r": self.ents_r,
-            "ents_f": self.ents_f,
-            "ents_per_type": self.ents_per_type,
-            "tags_acc": self.tags_acc,
-            "pos_acc": self.pos_acc,
-            "morphs_acc": self.morphs_acc,
-            "morphs_per_type": self.morphs_per_type,
-            "sent_p": self.sent_p,
-            "sent_r": self.sent_r,
-            "sent_f": self.sent_f,
-            "token_acc": self.token_acc,
-            "textcat_f": self.textcat_f,
-            "textcat_auc": self.textcat_auc,
-            "textcats_f_per_cat": self.textcats_f_per_cat,
-            "textcats_auc_per_cat": self.textcats_auc_per_cat,
-        }
-
-    def score(self, example, verbose=False, punct_labels=("p", "punct")):
-        """Update the evaluation scores from a single Example.
-
-        example (Example): The predicted annotations + correct annotations.
-        verbose (bool): Print debugging information.
-        punct_labels (tuple): Dependency labels for punctuation. Used to
-            evaluate dependency attachments to punctuation if `eval_punct` is
-            `True`.
+    def score(self, examples):
+        """Evaluate a list of Examples.

+        examples (Iterable[Example]): The predicted annotations + correct annotations.
+        RETURNS (Dict): A dictionary of scores.
        DOCS: https://spacy.io/api/scorer#score
        """
-        doc = example.predicted
-        gold_doc = example.reference
-        align = example.alignment
-        gold_deps = set()
-        gold_deps_per_dep = {}
-        gold_tags = set()
-        gold_pos = set()
-        gold_morphs = set()
-        gold_morphs_per_feat = {}
-        gold_sent_starts = set()
-        for gold_i, token in enumerate(gold_doc):
-            gold_tags.add((gold_i, token.tag_))
-            gold_pos.add((gold_i, token.pos_))
-            gold_morphs.add((gold_i, token.morph_))
-            if token.morph_:
-                for feat in token.morph_.split("|"):
-                    field, values = feat.split("=")
-                    if field not in self.morphs_per_feat:
-                        self.morphs_per_feat[field] = PRFScore()
-                    if field not in gold_morphs_per_feat:
-                        gold_morphs_per_feat[field] = set()
-                    gold_morphs_per_feat[field].add((gold_i, feat))
-            if token.sent_start:
-                gold_sent_starts.add(gold_i)
-            dep = token.dep_.lower()
-            if dep not in punct_labels:
-                gold_deps.add((gold_i, token.head.i, dep))
-                if dep not in self.labelled_per_dep:
-                    self.labelled_per_dep[dep] = PRFScore()
-                if dep not in gold_deps_per_dep:
-                    gold_deps_per_dep[dep] = set()
-                gold_deps_per_dep[dep].add((gold_i, token.head.i, dep))
-        cand_deps = set()
-        cand_deps_per_dep = {}
-        cand_tags = set()
-        cand_pos = set()
-        cand_morphs = set()
-        cand_morphs_per_feat = {}
-        cand_sent_starts = set()
-        for token in doc:
-            if token.orth_.isspace():
+        scores = {}
+
+        if hasattr(self.nlp.tokenizer, "score"):
+            scores.update(self.nlp.tokenizer.score(examples, **self.cfg))
+        for name, component in self.nlp.pipeline:
+            if hasattr(component, "score"):
+                scores.update(component.score(examples, **self.cfg))
+
+        return scores
+
+    @staticmethod
+    def score_tokenization(examples, **cfg):
+        """Returns accuracy and PRF scores for tokenization.
+
+        * token_acc: # correct tokens / # gold tokens
+        * token_p/r/f: PRF for token character spans
+
+        examples (Iterable[Example]): Examples to score
+        RETURNS (dict): A dictionary containing the scores token_acc/p/r/f.
+        """
+        acc_score = PRFScore()
+        prf_score = PRFScore()
+        for example in examples:
+            gold_doc = example.reference
+            pred_doc = example.predicted
+            align = example.alignment
+            gold_spans = set()
+            pred_spans = set()
+            for token in gold_doc:
+                if token.orth_.isspace():
+                    continue
+                gold_spans.add((token.idx, token.idx + len(token)))
+            for token in pred_doc:
+                if token.orth_.isspace():
+                    continue
+                pred_spans.add((token.idx, token.idx + len(token)))
+                if align.x2y.lengths[token.i] != 1:
+                    acc_score.fp += 1
+                else:
+                    acc_score.tp += 1
+            prf_score.score_set(pred_spans, gold_spans)
+        return {
+            "token_acc": acc_score.fscore,
+            "token_p": prf_score.precision,
+            "token_r": prf_score.recall,
+            "token_f": prf_score.fscore,
+        }
+
+    @staticmethod
+    def score_token_attr(examples, attr, getter=getattr, **cfg):
+        """Returns an accuracy score for a token-level attribute.
+
+        examples (Iterable[Example]): Examples to score
+        attr (str): The attribute to score.
+        getter (callable): Defaults to getattr. If provided,
+            getter(token, attr) should return the value of the attribute for an
+            individual token.
+        RETURNS (dict): A dictionary containing the accuracy score under the
+            key attr_acc.
+        """
+        tag_score = PRFScore()
+        for example in examples:
+            gold_doc = example.reference
+            pred_doc = example.predicted
+            align = example.alignment
+            gold_tags = set()
+            for gold_i, token in enumerate(gold_doc):
+                gold_tags.add((gold_i, getter(token, attr)))
+            pred_tags = set()
+            for token in pred_doc:
+                if token.orth_.isspace():
+                    continue
+                if align.x2y.lengths[token.i] == 1:
+                    gold_i = align.x2y[token.i].dataXd[0, 0]
+                    pred_tags.add((gold_i, getter(token, attr)))
+            tag_score.score_set(pred_tags, gold_tags)
+        return {
+            attr + "_acc": tag_score.fscore,
+        }
+
+    @staticmethod
+    def score_token_attr_per_feat(examples, attr, getter=getattr, **cfg):
+        """Return PRF scores per feat for a token attribute in UFEATS format.
+
+        examples (Iterable[Example]): Examples to score
+        attr (str): The attribute to score.
+        getter (callable): Defaults to getattr. If provided,
+            getter(token, attr) should return the value of the attribute for an
+            individual token.
+        RETURNS (dict): A dictionary containing the per-feat PRF scores under
+            the key attr_per_feat.
+        """
+        per_feat = {}
+        for example in examples:
+            pred_doc = example.predicted
+            gold_doc = example.reference
+            align = example.alignment
+            gold_per_feat = {}
+            for gold_i, token in enumerate(gold_doc):
+                morph = str(getter(token, attr))
+                if morph:
+                    for feat in morph.split(Morphology.FEATURE_SEP):
+                        field, values = feat.split(Morphology.FIELD_SEP)
+                        if field not in per_feat:
+                            per_feat[field] = PRFScore()
+                        if field not in gold_per_feat:
+                            gold_per_feat[field] = set()
+                        gold_per_feat[field].add((gold_i, feat))
+            pred_per_feat = {}
+            for token in pred_doc:
+                if token.orth_.isspace():
+                    continue
+                if align.x2y.lengths[token.i] == 1:
+                    gold_i = align.x2y[token.i].dataXd[0, 0]
+                    morph = str(getter(token, attr))
+                    if morph:
+                        for feat in morph.split("|"):
+                            field, values = feat.split("=")
+                            if field not in per_feat:
+                                per_feat[field] = PRFScore()
+                            if field not in pred_per_feat:
+                                pred_per_feat[field] = set()
+                            pred_per_feat[field].add((gold_i, feat))
+            for field in per_feat:
+                per_feat[field].score_set(
+                    pred_per_feat.get(field, set()), gold_per_feat.get(field, set()),
+                )
+        return {
+            attr + "_per_feat": per_feat,
+        }
+
+    @staticmethod
+    def score_spans(examples, attr, getter=getattr, **cfg):
+        """Returns PRF scores for labeled spans.
+
+        examples (Iterable[Example]): Examples to score
+        attr (str): The attribute to score.
+        getter (callable): Defaults to getattr. If provided,
+            getter(doc, attr) should return the spans for the individual doc.
+        RETURNS (dict): A dictionary containing the PRF scores under the
+            keys attr_p/r/f and the per-type PRF scores under attr_per_type.
+        """
+        score = PRFScore()
+        score_per_type = dict()
+        for example in examples:
+            pred_doc = example.predicted
+            gold_doc = example.reference
+            # Find all labels in gold and doc
+            labels = set(
+                [k.label_ for k in getter(gold_doc, attr)]
+                + [k.label_ for k in getter(pred_doc, attr)]
+            )
+            # Set up all labels for per type scoring and prepare gold per type
+            gold_per_type = {label: set() for label in labels}
+            for label in labels:
+                if label not in score_per_type:
+                    score_per_type[label] = PRFScore()
+            # Collect gold and predicted spans, for all and per type
+            gold_spans = set()
+            pred_spans = set()
+
+            # Special case for ents:
+            # If we have missing values in the gold, we can't easily tell
+            # whether our NER predictions are true.
+            # It seems bad but it's what we've always done.
+            if attr == "ents" and not all(token.ent_iob != 0 for token in gold_doc):
                continue
-            if align.x2y.lengths[token.i] != 1:
-                self.tokens.fp += 1
-                gold_i = None
-            else:
-                gold_i = align.x2y[token.i].dataXd[0, 0]
-                self.tokens.tp += 1
-                cand_tags.add((gold_i, token.tag_))
-                cand_pos.add((gold_i, token.pos_))
-                cand_morphs.add((gold_i, token.morph_))
-                if token.morph_:
-                    for feat in token.morph_.split("|"):
-                        field, values = feat.split("=")
-                        if field not in self.morphs_per_feat:
-                            self.morphs_per_feat[field] = PRFScore()
-                        if field not in cand_morphs_per_feat:
-                            cand_morphs_per_feat[field] = set()
-                        cand_morphs_per_feat[field].add((gold_i, feat))
-                if token.is_sent_start:
-                    cand_sent_starts.add(gold_i)
-            if token.dep_.lower() not in punct_labels and token.orth_.strip():
-                if align.x2y.lengths[token.head.i] == 1:
-                    gold_head = align.x2y[token.head.i].dataXd[0, 0]
-                else:
-                    gold_head = None
-                # None is indistinct, so we can't just add it to the set
-                # Multiple (None, None) deps are possible
-                if gold_i is None or gold_head is None:
-                    self.unlabelled.fp += 1
-                    self.labelled.fp += 1
-                else:
-                    cand_deps.add((gold_i, gold_head, token.dep_.lower()))
-                    if token.dep_.lower() not in self.labelled_per_dep:
-                        self.labelled_per_dep[token.dep_.lower()] = PRFScore()
-                    if token.dep_.lower() not in cand_deps_per_dep:
-                        cand_deps_per_dep[token.dep_.lower()] = set()
-                    cand_deps_per_dep[token.dep_.lower()].add(
-                        (gold_i, gold_head, token.dep_.lower())
+
+            for span in getter(gold_doc, attr):
+                gold_span = (span.label_, span.start, span.end - 1)
+                gold_spans.add(gold_span)
+                gold_per_type[span.label_].add((span.label_, span.start, span.end - 1))
+            pred_per_type = {label: set() for label in labels}
+            for span in example.get_aligned_spans_x2y(getter(pred_doc, attr)):
+                pred_spans.add((span.label_, span.start, span.end - 1))
+                pred_per_type[span.label_].add((span.label_, span.start, span.end - 1))
+            # Scores per label
+            for k, v in score_per_type.items():
+                if k in pred_per_type:
+                    v.score_set(pred_per_type[k], gold_per_type[k])
+            # Score for all labels
+            score.score_set(pred_spans, gold_spans)
+        results = {
+            attr + "_p": score.precision,
+            attr + "_r": score.recall,
+            attr + "_f": score.fscore,
+            attr + "_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
+        }
+        return results
+
+    @staticmethod
+    def score_cats(
+        examples,
+        attr,
+        getter=getattr,
+        labels=[],
+        multi_label=True,
+        positive_label=None,
+        **cfg
+    ):
+        """Returns PRF and ROC AUC scores for a doc-level attribute with a
+        dict with scores for each label like Doc.cats.
+
+        examples (Iterable[Example]): Examples to score
+        attr (str): The attribute to score.
+        getter (callable): Defaults to getattr. If provided,
+            getter(doc, attr) should return the values for the individual doc.
+        labels (Iterable[str]): The set of possible labels. Defaults to [].
+        multi_label (bool): Whether the attribute allows multiple labels.
+            Defaults to True.
+        positive_label (str): The positive label for a binary task with
+            exclusive classes. Defaults to None.
+        RETURNS (dict): A dictionary containing the scores:
+            for binary exclusive with positive label: attr_p/r/f,
+            for 3+ exclusive classes, macro-averaged fscore: attr_macro_f,
+            for multilabel, macro-averaged AUC: attr_macro_auc,
+            for all: attr_f_per_type, attr_auc_per_type
+        """
+        score = PRFScore()
+        f_per_type = dict()
+        auc_per_type = dict()
+        for label in labels:
+            f_per_type[label] = PRFScore()
+            auc_per_type[label] = ROCAUCScore()
+        for example in examples:
+            gold_doc = example.reference
+            pred_doc = example.predicted
+            gold_values = getter(gold_doc, attr)
+            pred_values = getter(pred_doc, attr)
+            if (
+                len(gold_values) > 0
+                and set(f_per_type) == set(auc_per_type) == set(gold_values)
+                and set(gold_values) == set(pred_values)
+            ):
+                gold_val = max(gold_values, key=gold_values.get)
+                pred_val = max(pred_values, key=pred_values.get)
+                if positive_label:
+                    score.score_set(
+                        set([positive_label]) & set([pred_val]),
+                        set([positive_label]) & set([gold_val]),
+                    )
+                for label in set(gold_values):
+                    auc_per_type[label].score_set(
+                        pred_values[label], gold_values[label]
+                    )
+                    f_per_type[label].score_set(
+                        set([label]) & set([pred_val]), set([label]) & set([gold_val])
+                    )
+            elif len(f_per_type) > 0:
+                model_labels = set(f_per_type)
+                eval_labels = set(gold_values)
+                raise ValueError(
+                    Errors.E162.format(
+                        model_labels=model_labels, eval_labels=eval_labels
                    )
-        # Find all NER labels in gold and doc
-        ent_labels = set(
-            [k.label_ for k in gold_doc.ents] + [k.label_ for k in doc.ents]
-        )
-        # Set up all labels for per type scoring and prepare gold per type
-        gold_per_ents = {ent_label: set() for ent_label in ent_labels}
-        for ent_label in ent_labels:
-            if ent_label not in self.ner_per_ents:
-                self.ner_per_ents[ent_label] = PRFScore()
-        # Find all candidate labels, for all and per type
-        gold_ents = set()
-        cand_ents = set()
-        # If we have missing values in the gold, we can't easily tell whether
-        # our NER predictions are true.
-        # It seems bad but it's what we've always done.
-        if all(token.ent_iob != 0 for token in gold_doc):
-            for ent in gold_doc.ents:
-                gold_ent = (ent.label_, ent.start, ent.end - 1)
-                gold_ents.add(gold_ent)
-                gold_per_ents[ent.label_].add((ent.label_, ent.start, ent.end - 1))
-            cand_per_ents = {ent_label: set() for ent_label in ent_labels}
-            for ent in example.get_aligned_spans_x2y(doc.ents):
-                cand_ents.add((ent.label_, ent.start, ent.end - 1))
-                cand_per_ents[ent.label_].add((ent.label_, ent.start, ent.end - 1))
-            # Scores per ent
-            for k, v in self.ner_per_ents.items():
-                if k in cand_per_ents:
-                    v.score_set(cand_per_ents[k], gold_per_ents[k])
-            # Score for all ents
-            self.ner.score_set(cand_ents, gold_ents)
-        self.tags.score_set(cand_tags, gold_tags)
-        self.pos.score_set(cand_pos, gold_pos)
-        self.morphs.score_set(cand_morphs, gold_morphs)
-        for field in self.morphs_per_feat:
-            self.morphs_per_feat[field].score_set(
-                cand_morphs_per_feat.get(field, set()),
-                gold_morphs_per_feat.get(field, set()),
-            )
-        self.sent_starts.score_set(cand_sent_starts, gold_sent_starts)
-        self.labelled.score_set(cand_deps, gold_deps)
-        for dep in self.labelled_per_dep:
-            self.labelled_per_dep[dep].score_set(
-                cand_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set())
-            )
-        self.unlabelled.score_set(
-            set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps)
-        )
-        if (
-            len(gold_doc.cats) > 0
-            and set(self.textcat_f_per_cat)
-            == set(self.textcat_auc_per_cat)
-            == set(gold_doc.cats)
-            and set(gold_doc.cats) == set(doc.cats)
-        ):
-            goldcat = max(gold_doc.cats, key=gold_doc.cats.get)
-            candcat = max(doc.cats, key=doc.cats.get)
-            if self.textcat_positive_label:
-                self.textcat.score_set(
-                    set([self.textcat_positive_label]) & set([candcat]),
-                    set([self.textcat_positive_label]) & set([goldcat]),
                )
-            for label in set(gold_doc.cats):
-                self.textcat_auc_per_cat[label].score_set(
-                    doc.cats[label], gold_doc.cats[label]
+            elif len(auc_per_type) > 0:
+                model_labels = set(auc_per_type)
+                eval_labels = set(gold_values)
+                raise ValueError(
+                    Errors.E162.format(
+                        model_labels=model_labels, eval_labels=eval_labels
+                    )
                )
-                self.textcat_f_per_cat[label].score_set(
-                    set([label]) & set([candcat]), set([label]) & set([goldcat])
+        results = {
+            attr + "_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
+            attr + "_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
+        }
+        if len(labels) == 2 and not multi_label and positive_label:
+            results[attr + "_p"] = score.precision
+            results[attr + "_r"] = score.recall
+            results[attr + "_f"] = score.fscore
+        elif not multi_label:
+            results[attr + "_macro_f"] = sum(
+                [score.fscore for label, score in f_per_type.items()]
+            ) / (len(f_per_type) + 1e-100)
+        else:
+            results[attr + "_macro_auc"] = max(
+                sum([score.score for label, score in auc_per_type.items()])
+                / (len(auc_per_type) + 1e-100),
+                -1,
+            )
+        return results
+
+    @staticmethod
+    def score_deps(
+        examples,
+        attr,
+        getter=getattr,
+        head_attr="head",
+        head_getter=getattr,
+        ignore_labels=tuple(),
+        **cfg
+    ):
+        """Returns the UAS, LAS, and LAS per type scores for dependency
+        parses.
+
+        examples (Iterable[Example]): Examples to score
+        attr (str): The attribute containing the dependency label.
+        getter (callable): Defaults to getattr. If provided,
+            getter(token, attr) should return the value of the attribute for an
+            individual token.
+        head_attr (str): The attribute containing the head token. Defaults to
+            'head'.
+        head_getter (callable): Defaults to getattr. If provided,
+            head_getter(token, attr) should return the value of the head for an
+            individual token.
+        ignore_labels (Tuple): Labels to ignore while scoring (e.g., punct).
+        RETURNS (dict): A dictionary containing the scores:
+            attr_uas, attr_las, and attr_las_per_type.
+        """
+        unlabelled = PRFScore()
+        labelled = PRFScore()
+        labelled_per_dep = dict()
+        for example in examples:
+            gold_doc = example.reference
+            pred_doc = example.predicted
+            align = example.alignment
+            gold_deps = set()
+            gold_deps_per_dep = {}
+            for gold_i, token in enumerate(gold_doc):
+                dep = getter(token, attr)
+                head = head_getter(token, head_attr)
+                if dep not in ignore_labels:
+                    gold_deps.add((gold_i, head.i, dep))
+                    if dep not in labelled_per_dep:
+                        labelled_per_dep[dep] = PRFScore()
+                    if dep not in gold_deps_per_dep:
+                        gold_deps_per_dep[dep] = set()
+                    gold_deps_per_dep[dep].add((gold_i, head.i, dep))
+            pred_deps = set()
+            pred_deps_per_dep = {}
+            for token in pred_doc:
+                if token.orth_.isspace():
+                    continue
+                if align.x2y.lengths[token.i] != 1:
+                    gold_i = None
+                else:
+                    gold_i = align.x2y[token.i].dataXd[0, 0]
+                dep = getter(token, attr)
+                head = head_getter(token, head_attr)
+                if dep not in ignore_labels and token.orth_.strip():
+                    if align.x2y.lengths[head.i] == 1:
+                        gold_head = align.x2y[head.i].dataXd[0, 0]
+                    else:
+                        gold_head = None
+                    # None is indistinct, so we can't just add it to the set
+                    # Multiple (None, None) deps are possible
+                    if gold_i is None or gold_head is None:
+                        unlabelled.fp += 1
+                        labelled.fp += 1
+                    else:
+                        pred_deps.add((gold_i, gold_head, dep))
+                        if dep not in labelled_per_dep:
+                            labelled_per_dep[dep] = PRFScore()
+                        if dep not in pred_deps_per_dep:
+                            pred_deps_per_dep[dep] = set()
+                        pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
+            labelled.score_set(pred_deps, gold_deps)
+            for dep in labelled_per_dep:
+                labelled_per_dep[dep].score_set(
+                    pred_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set())
                )
-        elif len(self.textcat_f_per_cat) > 0:
-            model_labels = set(self.textcat_f_per_cat)
-            eval_labels = set(gold_doc.cats)
-            raise ValueError(
-                Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels)
+            unlabelled.score_set(
+                set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps)
            )
-        elif len(self.textcat_auc_per_cat) > 0:
-            model_labels = set(self.textcat_auc_per_cat)
-            eval_labels = set(gold_doc.cats)
-            raise ValueError(
-                Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels)
-            )
-        if verbose:
-            gold_words = gold_doc.words
-            for w_id, h_id, dep in cand_deps - gold_deps:
-                print("F", gold_words[w_id], dep, gold_words[h_id])
-            for w_id, h_id, dep in gold_deps - cand_deps:
-                print("M", gold_words[w_id], dep, gold_words[h_id])
+        return {
+            attr + "_uas": unlabelled.fscore,
+            attr + "_las": labelled.fscore,
+            attr
+            + "_las_per_type": {k: v.to_dict() for k, v in labelled_per_dep.items()},
+        }


 #############################################################################
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@ -85,6 +85,8 @@ def test_overfitting_IO():
    fix_random_seed(0)
    nlp = English()
    textcat = nlp.add_pipe("textcat")
+    # Set exclusive labels
+    textcat.model.attrs["multi_label"] = False
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@ -114,6 +116,10 @@ def test_overfitting_IO():
        assert cats2["POSITIVE"] > 0.9
        assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)

+    # Test scoring
+    scores = nlp.evaluate(train_examples, component_cfg={"scorer": {"positive_label": "POSITIVE"}})
+    assert scores["cats_f"] == 1.0
+

 # fmt: off
@pytest.mark.parametrize(
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@ -7,6 +7,7 @@ from spacy.scorer import Scorer, ROCAUCScore
 from spacy.scorer import _roc_auc_score, _roc_curve
 from .util import get_doc
 from spacy.lang.en import English
+from spacy.tokens import Doc


 test_las_apple = [
@ -77,13 +78,61 @@ def tagged_doc():
        doc[i].tag_ = tags[i]
        doc[i].pos_ = pos[i]
        doc[i].morph_ = morphs[i]
+        if i > 0:
+            doc[i].is_sent_start = False
    doc.is_tagged = True
    return doc


+@pytest.fixture
+def sented_doc():
+    text = "One sentence. Two sentences. Three sentences."
+    nlp = English()
+    doc = nlp(text)
+    for i in range(len(doc)):
+        if i % 3 == 0:
+            doc[i].is_sent_start = True
+        else:
+            doc[i].is_sent_start = False
+    return doc
+
+
+def test_tokenization(sented_doc):
+    scorer = Scorer()
+    gold = {"sent_starts": [t.sent_start for t in sented_doc]}
+    example = Example.from_dict(sented_doc, gold)
+    scores = scorer.score([example])
+    assert scores["token_acc"] == 1.0
+
+    nlp = English()
+    example.predicted = Doc(nlp.vocab, words=["One", "sentence.", "Two", "sentences.", "Three", "sentences."], spaces=[True, True, True, True, True, False])
+    example.predicted[1].is_sent_start = False
+    scores = scorer.score([example])
+    assert scores["token_acc"] == approx(0.66666666)
+    assert scores["token_p"] == 0.5
+    assert scores["token_r"] == approx(0.33333333)
+    assert scores["token_f"] == 0.4
+
+
+def test_sents(sented_doc):
+    scorer = Scorer()
+    gold = {"sent_starts": [t.sent_start for t in sented_doc]}
+    example = Example.from_dict(sented_doc, gold)
+    scores = scorer.score([example])
+    assert scores["sents_f"] == 1.0
+
+    # One sentence start is moved
+    gold["sent_starts"][3] = 0
+    gold["sent_starts"][4] = 1
+    example = Example.from_dict(sented_doc, gold)
+    scores = scorer.score([example])
+    assert scores["sents_f"] == approx(0.3333333)
+
+
 def test_las_per_type(en_vocab):
    # Gold and Doc are identical
    scorer = Scorer()
+    examples = []
    for input_, annot in test_las_apple:
        doc = get_doc(
            en_vocab,
@ -93,20 +142,21 @@ def test_las_per_type(en_vocab):
        )
        gold = {"heads": annot["heads"], "deps": annot["deps"]}
        example = Example.from_dict(doc, gold)
-        scorer.score(example)
-    results = scorer.scores
+        examples.append(example)
+    results = scorer.score(examples)

-    assert results["uas"] == 100
-    assert results["las"] == 100
-    assert results["las_per_type"]["nsubj"]["p"] == 100
-    assert results["las_per_type"]["nsubj"]["r"] == 100
-    assert results["las_per_type"]["nsubj"]["f"] == 100
-    assert results["las_per_type"]["compound"]["p"] == 100
-    assert results["las_per_type"]["compound"]["r"] == 100
-    assert results["las_per_type"]["compound"]["f"] == 100
+    assert results["dep_uas"] == 1.0
+    assert results["dep_las"] == 1.0
+    assert results["dep_las_per_type"]["nsubj"]["p"] == 1.0
+    assert results["dep_las_per_type"]["nsubj"]["r"] == 1.0
+    assert results["dep_las_per_type"]["nsubj"]["f"] == 1.0
+    assert results["dep_las_per_type"]["compound"]["p"] == 1.0
+    assert results["dep_las_per_type"]["compound"]["r"] == 1.0
+    assert results["dep_las_per_type"]["compound"]["f"] == 1.0

    # One dep is incorrect in Doc
    scorer = Scorer()
+    examples = []
    for input_, annot in test_las_apple:
        doc = get_doc(
            en_vocab,
@ -117,22 +167,23 @@ def test_las_per_type(en_vocab):
        gold = {"heads": annot["heads"], "deps": annot["deps"]}
        doc[0].dep_ = "compound"
        example = Example.from_dict(doc, gold)
-        scorer.score(example)
-    results = scorer.scores
+        examples.append(example)
+    results = scorer.score(examples)

-    assert results["uas"] == 100
-    assert_almost_equal(results["las"], 90.9090909)
-    assert results["las_per_type"]["nsubj"]["p"] == 0
-    assert results["las_per_type"]["nsubj"]["r"] == 0
-    assert results["las_per_type"]["nsubj"]["f"] == 0
-    assert_almost_equal(results["las_per_type"]["compound"]["p"], 66.6666666)
-    assert results["las_per_type"]["compound"]["r"] == 100
-    assert results["las_per_type"]["compound"]["f"] == 80
+    assert results["dep_uas"] == 1.0
+    assert_almost_equal(results["dep_las"], 0.9090909)
+    assert results["dep_las_per_type"]["nsubj"]["p"] == 0
+    assert results["dep_las_per_type"]["nsubj"]["r"] == 0
+    assert results["dep_las_per_type"]["nsubj"]["f"] == 0
+    assert_almost_equal(results["dep_las_per_type"]["compound"]["p"], 0.666666666)
+    assert results["dep_las_per_type"]["compound"]["r"] == 1.0
+    assert results["dep_las_per_type"]["compound"]["f"] == 0.8


 def test_ner_per_type(en_vocab):
    # Gold and Doc are identical
    scorer = Scorer()
+    examples = []
    for input_, annot in test_ner_cardinal:
        doc = get_doc(
            en_vocab,
@ -140,20 +191,24 @@ def test_ner_per_type(en_vocab):
            ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]],
        )
        entities = biluo_tags_from_offsets(doc, annot["entities"])
-        ex = Example.from_dict(doc, {"entities": entities})
-        scorer.score(ex)
-    results = scorer.scores
+        example = Example.from_dict(doc, {"entities": entities})
+        # a hack for sentence boundaries
+        example.predicted[1].is_sent_start = False
+        example.reference[1].is_sent_start = False
+        examples.append(example)
+    results = scorer.score(examples)

-    assert results["ents_p"] == 100
-    assert results["ents_f"] == 100
-    assert results["ents_r"] == 100
-    assert results["ents_per_type"]["CARDINAL"]["p"] == 100
-    assert results["ents_per_type"]["CARDINAL"]["f"] == 100
-    assert results["ents_per_type"]["CARDINAL"]["r"] == 100
+    assert results["ents_p"] == 1.0
+    assert results["ents_r"] == 1.0
+    assert results["ents_f"] == 1.0
+    assert results["ents_per_type"]["CARDINAL"]["p"] == 1.0
+    assert results["ents_per_type"]["CARDINAL"]["r"] == 1.0
+    assert results["ents_per_type"]["CARDINAL"]["f"] == 1.0

    # Doc has one missing and one extra entity
    # Entity type MONEY is not present in Doc
    scorer = Scorer()
+    examples = []
    for input_, annot in test_ner_apple:
        doc = get_doc(
            en_vocab,
@ -161,25 +216,28 @@ def test_ner_per_type(en_vocab):
            ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]],
        )
        entities = biluo_tags_from_offsets(doc, annot["entities"])
-        ex = Example.from_dict(doc, {"entities": entities})
-        scorer.score(ex)
-    results = scorer.scores
+        example = Example.from_dict(doc, {"entities": entities})
+        # a hack for sentence boundaries
+        example.predicted[1].is_sent_start = False
+        example.reference[1].is_sent_start = False
+        examples.append(example)
+    results = scorer.score(examples)

-    assert results["ents_p"] == approx(66.66666)
-    assert results["ents_r"] == approx(66.66666)
-    assert results["ents_f"] == approx(66.66666)
+    assert results["ents_p"] == approx(0.6666666)
+    assert results["ents_r"] == approx(0.6666666)
+    assert results["ents_f"] == approx(0.6666666)
    assert "GPE" in results["ents_per_type"]
    assert "MONEY" in results["ents_per_type"]
    assert "ORG" in results["ents_per_type"]
-    assert results["ents_per_type"]["GPE"]["p"] == 100
-    assert results["ents_per_type"]["GPE"]["r"] == 100
-    assert results["ents_per_type"]["GPE"]["f"] == 100
+    assert results["ents_per_type"]["GPE"]["p"] == 1.0
+    assert results["ents_per_type"]["GPE"]["r"] == 1.0
+    assert results["ents_per_type"]["GPE"]["f"] == 1.0
    assert results["ents_per_type"]["MONEY"]["p"] == 0
    assert results["ents_per_type"]["MONEY"]["r"] == 0
    assert results["ents_per_type"]["MONEY"]["f"] == 0
-    assert results["ents_per_type"]["ORG"]["p"] == 50
-    assert results["ents_per_type"]["ORG"]["r"] == 100
-    assert results["ents_per_type"]["ORG"]["f"] == approx(66.66666)
+    assert results["ents_per_type"]["ORG"]["p"] == 0.5
+    assert results["ents_per_type"]["ORG"]["r"] == 1.0
+    assert results["ents_per_type"]["ORG"]["f"] == approx(0.6666666)


 def test_tag_score(tagged_doc):
@ -189,17 +247,17 @@ def test_tag_score(tagged_doc):
        "tags": [t.tag_ for t in tagged_doc],
        "pos": [t.pos_ for t in tagged_doc],
        "morphs": [t.morph_ for t in tagged_doc],
+        "sent_starts": [1 if t.is_sent_start else -1 for t in tagged_doc],
    }
    example = Example.from_dict(tagged_doc, gold)
-    scorer.score(example)
-    results = scorer.scores
+    results = scorer.score([example])

-    assert results["tags_acc"] == 100
-    assert results["pos_acc"] == 100
-    assert results["morphs_acc"] == 100
-    assert results["morphs_per_type"]["NounType"]["f"] == 100
+    assert results["tag_acc"] == 1.0
+    assert results["pos_acc"] == 1.0
+    assert results["morph_acc"] == 1.0
+    assert results["morph_per_feat"]["NounType"].fscore == 1.0

-    # Gold and Doc are identical
+    # Gold annotation is modified
    scorer = Scorer()
    tags = [t.tag_ for t in tagged_doc]
    tags[0] = "NN"
@ -208,16 +266,21 @@ def test_tag_score(tagged_doc):
    morphs = [t.morph_ for t in tagged_doc]
    morphs[1] = "Number=sing"
    morphs[2] = "Number=plur"
-    gold = {"tags": tags, "pos": pos, "morphs": morphs}
+    gold = {
+        "tags": tags,
+        "pos": pos,
+        "morphs": morphs,
+        "sent_starts": gold["sent_starts"],
+    }
    example = Example.from_dict(tagged_doc, gold)
-    scorer.score(example)
-    results = scorer.scores
+    results = scorer.score([example])

-    assert results["tags_acc"] == 90
-    assert results["pos_acc"] == 90
-    assert results["morphs_acc"] == approx(80)
-    assert results["morphs_per_type"]["Poss"]["f"] == 0.0
-    assert results["morphs_per_type"]["Number"]["f"] == approx(72.727272)
+    assert results["tag_acc"] == 0.9
+    assert results["pos_acc"] == 0.9
+    assert results["morph_acc"] == approx(0.8)
+    assert results["morph_per_feat"]["NounType"].fscore == 1.0
+    assert results["morph_per_feat"]["Poss"].fscore == 0.0
+    assert results["morph_per_feat"]["Number"].fscore == approx(0.72727272)


 def test_roc_auc_score():
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@ -24,6 +24,7 @@ from . import util
 from .util import registry
 from .attrs import intify_attrs
 from .symbols import ORTH
+from .scorer import Scorer


@registry.tokenizers("spacy.Tokenizer.v1")
@ -743,6 +744,9 @@ cdef class Tokenizer:
            tokens.extend(reversed(suffixes))
        return tokens

+    def score(self, examples, **kwargs):
+        return Scorer.score_tokenization(examples)
+
    def to_disk(self, path, **kwargs):
        """Save the current state to a directory.

--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@ -108,8 +108,8 @@ Evaluate a model's pipeline components.
 > #### Example
 >
 > ```python
-> scorer = nlp.evaluate(examples, verbose=True)
-> print(scorer.scores)
+> scores = nlp.evaluate(examples, verbose=True)
+> print(scores)
 > ```

 | Name                                         | Type                | Description                                                                           |
@ -119,7 +119,7 @@ Evaluate a model's pipeline components.
 | `batch_size`                                 | int                 | The batch size to use.                                                                |
 | `scorer`                                     | `Scorer`            | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. |
 | `component_cfg` <Tag variant="new">2.1</Tag> | `Dict[str, Dict]`   | Config parameters for specific pipeline components, keyed by component name.          |
-| **RETURNS**                                  | Scorer              | The scorer containing the evaluation scores.                                          |
+| **RETURNS**                                  | `Dict[str, Union[float, Dict]]` | A dictionary of evaluation scores.                                        |

 ## Language.begin_training {#begin_training tag="method"}

--- a/website/docs/api/scorer.md
+++ b/website/docs/api/scorer.md
@ -5,9 +5,12 @@ tag: class
 source: spacy/scorer.py
 ---

-The `Scorer` computes and stores evaluation scores. It's typically created by
+The `Scorer` computes evaluation scores. It's typically created by
 [`Language.evaluate`](/api/language#evaluate).

+In addition, the `Scorer` provides a number of evaluation methods for
+evaluating `Token` and `Doc` attributes.
+
 ## Scorer.\_\_init\_\_ {#init tag="method"}

 Create a new `Scorer`.
@ -17,46 +20,114 @@ Create a new `Scorer`.
 > ```python
 > from spacy.scorer import Scorer
 >
+> # default scoring pipeline
 > scorer = Scorer()
+>
+> # provided scoring pipeline
+> nlp = spacy.load("en_core_web_sm")
+> scorer = Scorer(nlp)
 > ```

 | Name         | Type     | Description                                                  |
 | ------------ | -------- | ------------------------------------------------------------ |
-| `eval_punct` | bool     | Evaluate the dependency attachments to and from punctuation. |
+| `nlp`  | `Language`     | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`.  |
 | **RETURNS**  | `Scorer` | The newly created object.                                    |

 ## Scorer.score {#score tag="method"}

-Update the evaluation scores from a single [`Example`](/api/example) object.
+Calculate the scores for a list of [`Example`](/api/example) objects using the
+scoring methods provided by the components in the pipeline.

+The returned `Dict` contains the scores provided by the individual pipeline
+components. For the scoring methods provided by the `Scorer` and used by the
+core pipeline components, the individual score names start with the `Token` or
+`Doc` attribute being scored: `token_acc`, `token_p/r/f`, `sents_p/r/f`,
+`tag_acc`, `pos_acc`, `morph_acc`, `morph_per_feat`, `lemma_acc`, `dep_uas`,
+`dep_las`, `dep_las_per_type`, `ents_p/r/f`, `ents_per_type`,
+`textcat_macro_auc`, `textcat_macro_f`.
+ 
 > #### Example
 >
 > ```python
 > scorer = Scorer()
-> scorer.score(example)
+> scorer.score(examples)
 > ```

-| Name           | Type      | Description                                                                                                          |
-| -------------- | --------- | -------------------------------------------------------------------------------------------------------------------- |
-| `example`      | `Example` | The `Example` object holding both the predictions and the correct gold-standard annotations.                         |
-| `verbose`      | bool      | Print debugging information.                                                                                         |
-| `punct_labels` | tuple     | Dependency labels for punctuation. Used to evaluate dependency attachments to punctuation if `eval_punct` is `True`. |
+| Name        | Type      | Description                                                                                                          |
+| ----------- | --------- | --------------------------------------------------------------------------------------------------------|
+| `examples`  | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
+| **RETURNS** | `Dict`              | A dictionary of scores.                                                                       |
+## Scorer.score_tokenization {#score_tokenization tag="staticmethod"}

-## Properties
+Scores the tokenization:
+
+* `token_acc`: # correct tokens / # gold tokens
+* `token_p/r/f`: PRF for token character spans
+
+| Name        | Type      | Description                                                                                                          |
+| ----------- | --------- | --------------------------------------------------------------------------------------------------------|
+| `examples`  | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
+| **RETURNS** | `Dict`              | A dictionary containing the scores `token_acc/p/r/f`.                                         |
+
+## Scorer.score_token_attr {#score_token_attr tag="staticmethod"}
+
+Scores a single token attribute.
+
+| Name        | Type      | Description                                                                                                          |
+| ----------- | --------- | --------------------------------------------------------------------------------------------------------|
+| `examples`  | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
+| `attr`      | `str`               | The attribute to score.                                                                       |
+| `getter`    | `callable`          | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
+| **RETURNS** | `Dict`              | A dictionary containing the score `attr_acc`.                                                 |
+
+## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod"}
+
+Scores a single token attribute per feature for a token attribute in the Universal Dependencies `FEATS` format.
+
+| Name        | Type      | Description                                                                                                          |
+| ----------- | --------- | --------------------------------------------------------------------------------------------------------|
+| `examples`  | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
+| `attr`      | `str`               | The attribute to score.                                                                       |
+| `getter`    | `callable`          | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
+| **RETURNS** | `Dict`              | A dictionary containing the per-feature PRF scores under the key `attr_per_feat`. |
+
+## Scorer.score_spans {#score_spans tag="staticmethod"}
+
+Returns PRF scores for labeled or unlabeled spans.
+
+| Name        | Type      | Description                                                                                                          |
+| ----------- | --------- | --------------------------------------------------------------------------------------------------------|
+| `examples`  | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
+| `attr`      | `str`               | The attribute to score.                                                                       |
+| `getter`    | `callable`          | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. |
+| **RETURNS** | `Dict`              | A dictionary containing the PRF scores under the keys `attr_p/r/f` and the per-type PRF scores under `attr_per_type`. |
+
+## Scorer.score_deps {#score_deps tag="staticmethod"}
+
+Calculate the UAS, LAS, and LAS per type scores for dependency parses.
+
+| Name        | Type      | Description                                                                                                          |
+| ----------- | --------- | --------------------------------------------------------------------------------------------------------|
+| `examples`  | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
+| `attr`      | `str`               | The attribute containing the dependency label. |
+| `getter`    | `callable`          | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
+| `head_attr` | `str`               | The attribute containing the head token. |
+| `head_getter` | `callable`          | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. |
+| `ignore_labels` | `Tuple` | Labels to ignore while scoring (e.g., `punct`). |
+| **RETURNS** | `Dict`              | A dictionary containing the scores: `attr_uas`, `attr_las`, and `attr_las_per_type`. |
+
+## Scorer.score_cats {#score_cats tag="staticmethod"}
+
+Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict
+containing scores for each label like `Doc.cats`.
+
+| Name        | Type      | Description                                                                                                          |
+| ----------- | --------- | --------------------------------------------------------------------------------------------------------|
+| `examples`  | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
+| `attr`      | `str`               | The attribute to score.                                                                       |
+| `getter`    | `callable`          | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. |
+| `labels`    | `Iterable[str]`     | The set of possible labels. Defaults to `[]`. |
+| `multi_label` | `bool`            | Whether the attribute allows multiple labels. Defaults to `True`. |
+| `positive_label` | `str`          | The positive label for a binary task with exclusive classes. Defaults to `None`. |
+| **RETURNS** | `Dict`              | A dictionary containing the scores: 1) for binary exclusive with positive label: `attr_p/r/f`; 2) for 3+ exclusive classes, macro-averaged F-score: `attr_macro_f`; 3) for multilabel, macro-averaged AUC: `attr_macro_auc`; 4) for all: `attr_f_per_type`, `attr_auc_per_type`. |

-| Name                                                | Type  | Description                                                                            |
-| --------------------------------------------------- | ----- | -------------------------------------------------------------------------------------- |
-| `token_acc`                                         | float | Tokenization accuracy.                                                                 |
-| `tags_acc`                                          | float | Part-of-speech tag accuracy (fine grained tags, i.e. `Token.tag`).                     |
-| `uas`                                               | float | Unlabelled dependency score.                                                           |
-| `las`                                               | float | Labelled dependency score.                                                             |
-| `ents_p`                                            | float | Named entity accuracy (precision).                                                     |
-| `ents_r`                                            | float | Named entity accuracy (recall).                                                        |
-| `ents_f`                                            | float | Named entity accuracy (F-score).                                                       |
-| `ents_per_type` <Tag variant="new">2.1.5</Tag>      | dict  | Scores per entity label. Keyed by label, mapped to a dict of `p`, `r` and `f` scores.  |
-| `textcat_f` <Tag variant="new">3.0</Tag>            | float | F-score on positive label for binary classification, macro-averaged F-score otherwise. |
-| `textcat_auc` <Tag variant="new">3.0</Tag>          | float | Macro-averaged AUC ROC score for multilabel classification (`-1` if undefined).        |
-| `textcats_f_per_cat` <Tag variant="new">3.0</Tag>   | dict  | F-scores per textcat label, keyed by label.                                            |
-| `textcats_auc_per_cat` <Tag variant="new">3.0</Tag> | dict  | ROC AUC scores per textcat label, keyed by label.                                      |
-| `las_per_type` <Tag variant="new">2.2.3</Tag>       | dict  | Labelled dependency scores, keyed by label.                                            |
-| `scores`                                            | dict  | All scores, keyed by type.                                                             |