From 2bcceb80c48cb6934537ee13f40268366a3f0463 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Sat, 25 Jul 2020 12:53:02 +0200
Subject: [PATCH] Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline components.

* Individual pipeline components provide their own `evaluate` methods that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary of scores referring to the scores provided by the components in the pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc` and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of examples rather than a single example and does not retain any state about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
* Methods require an attribute (either on Token or Doc) that is used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to `dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial token positions so that `Doc.sents` and `Doc.ents` can be scored with the same method (this lowers scores since a single incorrect sentence start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring

* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans.
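For orientation, a minimal usage sketch of the refactored scoring API summarized above (illustrative only, not part of the patch; "en_core_web_sm" and the make_dev_examples helper are placeholders for a loaded pipeline and a list of gold-annotated Example objects):

    import spacy
    from spacy.scorer import Scorer

    nlp = spacy.load("en_core_web_sm")       # placeholder pipeline name
    dev_examples = make_dev_examples(nlp)    # hypothetical helper returning a list of Example objects

    # Language.evaluate now returns a plain dict of scores instead of a Scorer object.
    scores = nlp.evaluate(dev_examples)
    print(scores["tag_acc"], scores["dep_uas"], scores["dep_las"])  # values are in 0.0-1.0, no longer multiplied by 100

    # Components expose score(); e.g. the entity recognizer delegates to the static helper:
    ner_scores = Scorer.score_spans(dev_examples, "ents")
    print(ner_scores["ents_p"], ner_scores["ents_r"], ner_scores["ents_f"])
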
The scores are: * token_acc: # correct tokens / # gold tokens * token_p/r/f: PRF for (token.idx, token.idx + len(token)) * Add docstring to Scorer.score_tokenization * Rename component.evaluate() to component.score() * Update Scorer API docs * Update scoring for positive_label in textcat * Fix TextCategorizer.score kwargs * Update Language.evaluate docs * Update score names in default config --- spacy/cli/evaluate.py | 64 +- spacy/cli/train.py | 6 +- spacy/default_config.cfg | 4 +- spacy/gold/example.pyx | 2 + spacy/language.py | 12 +- spacy/pipeline/dep_parser.pyx | 12 + spacy/pipeline/morphologizer.pyx | 9 + spacy/pipeline/ner.pyx | 4 + spacy/pipeline/pipe.pyx | 3 + spacy/pipeline/pipes.pyx | 1519 ++++++++++++++++++++++++++ spacy/pipeline/sentencizer.pyx | 4 + spacy/pipeline/senter.pyx | 4 + spacy/pipeline/tagger.pyx | 8 + spacy/pipeline/textcat.py | 7 + spacy/scorer.py | 757 ++++++------- spacy/tests/pipeline/test_textcat.py | 6 + spacy/tests/test_scorer.py | 175 ++- spacy/tokenizer.pyx | 4 + website/docs/api/language.md | 6 +- website/docs/api/scorer.md | 123 ++- 20 files changed, 2233 insertions(+), 496 deletions(-) create mode 100644 spacy/pipeline/pipes.pyx diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index de2e01818..66b22b131 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -68,41 +68,43 @@ def evaluate( nlp = util.load_model(model) dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc)) begin = timer() - scorer = nlp.evaluate(dev_dataset, verbose=False) + scores = nlp.evaluate(dev_dataset, verbose=False) end = timer() nwords = sum(len(ex.predicted) for ex in dev_dataset) - results = { - "Time": f"{end - begin:.2f} s", - "Words": nwords, - "Words/s": f"{nwords / (end - begin):.0f}", - "TOK": f"{scorer.token_acc:.2f}", - "TAG": f"{scorer.tags_acc:.2f}", - "POS": f"{scorer.pos_acc:.2f}", - "MORPH": f"{scorer.morphs_acc:.2f}", - "UAS": f"{scorer.uas:.2f}", - "LAS": f"{scorer.las:.2f}", - "NER P": f"{scorer.ents_p:.2f}", - "NER R": f"{scorer.ents_r:.2f}", - "NER F": f"{scorer.ents_f:.2f}", - "Textcat AUC": f"{scorer.textcat_auc:.2f}", - "Textcat F": f"{scorer.textcat_f:.2f}", - "Sent P": f"{scorer.sent_p:.2f}", - "Sent R": f"{scorer.sent_r:.2f}", - "Sent F": f"{scorer.sent_f:.2f}", + metrics = { + "TOK": "token_acc", + "TAG": "tag_acc", + "POS": "pos_acc", + "MORPH": "morph_acc", + "LEMMA": "lemma_acc", + "UAS": "dep_uas", + "LAS": "dep_las", + "NER P": "ents_p", + "NER R": "ents_r", + "NER F": "ents_f", + "Textcat AUC": 'textcat_macro_auc', + "Textcat F": 'textcat_macro_f', + "Sent P": 'sents_p', + "Sent R": 'sents_r', + "Sent F": 'sents_f', } + results = {} + for metric, key in metrics.items(): + if key in scores: + results[metric] = f"{scores[key]*100:.2f}" data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()} msg.table(results, title="Results") - if scorer.ents_per_type: - data["ents_per_type"] = scorer.ents_per_type - print_ents_per_type(msg, scorer.ents_per_type) - if scorer.textcats_f_per_cat: - data["textcats_f_per_cat"] = scorer.textcats_f_per_cat - print_textcats_f_per_cat(msg, scorer.textcats_f_per_cat) - if scorer.textcats_auc_per_cat: - data["textcats_auc_per_cat"] = scorer.textcats_auc_per_cat - print_textcats_auc_per_cat(msg, scorer.textcats_auc_per_cat) + if "ents_per_type" in scores: + if scores["ents_per_type"]: + print_ents_per_type(msg, scores["ents_per_type"]) + if "textcat_f_per_cat" in scores: + if scores["textcat_f_per_cat"]: + print_textcats_f_per_cat(msg, scores["textcat_f_per_cat"]) + if 
"textcat_auc_per_cat" in scores: + if scores["textcat_auc_per_cat"]: + print_textcats_auc_per_cat(msg, scores["textcat_auc_per_cat"]) if displacy_path: factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names] @@ -148,7 +150,7 @@ def render_parses( def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None: data = [ - (k, f"{v['p']:.2f}", f"{v['r']:.2f}", f"{v['f']:.2f}") + (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}") for k, v in scores.items() ] msg.table( @@ -161,7 +163,7 @@ def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> No def print_textcats_f_per_cat(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None: data = [ - (k, f"{v['p']:.2f}", f"{v['r']:.2f}", f"{v['f']:.2f}") + (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}") for k, v in scores.items() ] msg.table( @@ -176,7 +178,7 @@ def print_textcats_auc_per_cat( msg: Printer, scores: Dict[str, Dict[str, float]] ) -> None: msg.table( - [(k, f"{v['roc_auc_score']:.2f}") for k, v in scores.items()], + [(k, f"{v:.2f}") for k, v in scores.items()], header=("", "ROC AUC"), aligns=("l", "r"), title="Textcat ROC AUC (per label)", diff --git a/spacy/cli/train.py b/spacy/cli/train.py index c51aac974..f3580ea10 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -179,6 +179,7 @@ def train( progress = tqdm.tqdm(total=training["eval_frequency"], leave=False) except Exception as e: if output_path is not None: + raise e msg.warn( f"Aborting and saving the final best model. " f"Encountered exception: {str(e)}", @@ -259,12 +260,11 @@ def create_evaluation_callback( start_time = timer() if optimizer.averages: with nlp.use_params(optimizer.averages): - scorer = nlp.evaluate(dev_examples, batch_size=batch_size) + scores = nlp.evaluate(dev_examples, batch_size=batch_size) else: - scorer = nlp.evaluate(dev_examples, batch_size=batch_size) + scores = nlp.evaluate(dev_examples, batch_size=batch_size) end_time = timer() wps = n_words / (end_time - start_time) - scores = scorer.scores # Calculate a weighted sum based on score_weights for the main score weights = cfg["score_weights"] try: diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 7e6c7a6ec..db8f03f2a 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -40,8 +40,8 @@ seed = 0 accumulate_gradient = 1 use_pytorch_for_gpu_memory = false # Control how scores are printed and checkpoints are evaluated. -scores = ["speed", "tags_acc", "uas", "las", "ents_f"] -score_weights = {"tags_acc": 0.2, "las": 0.4, "ents_f": 0.4} +scores = ["speed", "tag_acc", "dep_uas", "dep_las", "ents_f"] +score_weights = {"tag_acc": 0.2, "dep_las": 0.4, "ents_f": 0.4} # These settings are invalid for the transformer models. 
init_tok2vec = null discard_oversize = false diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index 355578de3..9101cefce 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -45,6 +45,7 @@ cdef class Example: def __set__(self, doc): self.x = doc + self._alignment = None property reference: def __get__(self): @@ -52,6 +53,7 @@ cdef class Example: def __set__(self, doc): self.y = doc + self._alignment = None def copy(self): return Example( diff --git a/spacy/language.py b/spacy/language.py index 09429a04c..bd816e948 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1011,10 +1011,13 @@ class Language: name="language", method="evaluate", types=wrong_types ) raise TypeError(err) - if scorer is None: - scorer = Scorer(pipeline=self.pipeline) if component_cfg is None: component_cfg = {} + if scorer is None: + kwargs = component_cfg.get("scorer", {}) + kwargs.setdefault("verbose", verbose) + kwargs.setdefault("nlp", self) + scorer = Scorer(**kwargs) docs = list(eg.predicted for eg in examples) for name, pipe in self.pipeline: kwargs = component_cfg.get(name, {}) @@ -1027,10 +1030,7 @@ class Language: if verbose: print(doc) eg.predicted = doc - kwargs = component_cfg.get("scorer", {}) - kwargs.setdefault("verbose", verbose) - scorer.score(eg, **kwargs) - return scorer + return scorer.score(examples) @contextmanager def use_params(self, params: dict, **cfg): diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index 1651119f8..78926a984 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -8,6 +8,7 @@ from ..syntax.arc_eager cimport ArcEager from .functions import merge_subtokens from ..language import Language from ..syntax import nonproj +from ..scorer import Scorer default_model_config = """ @@ -102,3 +103,14 @@ cdef class DependencyParser(Parser): label = label.split("||")[1] labels.add(label) return tuple(sorted(labels)) + + def score(self, examples, **kwargs): + def dep_getter(token, attr): + dep = getattr(token, attr) + dep = token.vocab.strings.as_string(dep).lower() + return dep + results = {} + results.update(Scorer.score_spans(examples, "sents", **kwargs)) + results.update(Scorer.score_deps(examples, "dep", getter=dep_getter, + ignore_labels=("p", "punct"), **kwargs)) + return results diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index a5a54f139..fb80a9d86 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -14,6 +14,7 @@ from ..errors import Errors from .pipe import deserialize_config from .tagger import Tagger from .. 
import util +from ..scorer import Scorer default_model_config = """ @@ -162,6 +163,14 @@ class Morphologizer(Tagger): raise ValueError("nan value when computing loss") return float(loss), d_scores + def score(self, examples, **kwargs): + results = {} + results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) + results.update(Scorer.score_token_attr(examples, "morph", **kwargs)) + results.update(Scorer.score_token_attr_per_feat(examples, + "morph", **kwargs)) + return results + def to_bytes(self, exclude=tuple()): serialize = {} serialize["model"] = self.model.to_bytes diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index ea904f69e..cb2ca89d8 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -6,6 +6,7 @@ from ..syntax.nn_parser cimport Parser from ..syntax.ner cimport BiluoPushDown from ..language import Language +from ..scorer import Scorer default_model_config = """ @@ -88,3 +89,6 @@ cdef class EntityRecognizer(Parser): labels = set(move.split("-")[1] for move in self.move_names if move[0] in ("B", "I", "L", "U")) return tuple(sorted(labels)) + + def score(self, examples, **kwargs): + return Scorer.score_spans(examples, "ents", **kwargs) diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 5fa7d82db..e7702aa59 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -117,6 +117,9 @@ class Pipe: with self.model.use_params(params): yield + def score(self, examples, **kwargs): + return {} + def to_bytes(self, exclude=tuple()): """Serialize the pipe to a bytestring. diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx new file mode 100644 index 000000000..5c71a33e7 --- /dev/null +++ b/spacy/pipeline/pipes.pyx @@ -0,0 +1,1519 @@ +# cython: infer_types=True, profile=True +import numpy +import srsly +import random + +from thinc.api import CosineDistance, to_categorical, get_array_module +from thinc.api import set_dropout_rate, SequenceCategoricalCrossentropy +import warnings + +from ..tokens.doc cimport Doc +from ..syntax.nn_parser cimport Parser +from ..syntax.ner cimport BiluoPushDown +from ..syntax.arc_eager cimport ArcEager +from ..morphology cimport Morphology +from ..vocab cimport Vocab + +from .defaults import default_tagger, default_parser, default_ner, default_textcat +from .defaults import default_nel, default_senter +from .functions import merge_subtokens +from ..language import Language, component +from ..syntax import nonproj +from ..gold.example import Example +from ..attrs import POS, ID +from ..util import link_vectors_to_models, create_default_optimizer +from ..parts_of_speech import X +from ..kb import KnowledgeBase +from ..errors import Errors, TempErrors, Warnings +from .. import util +from ..scorer import Scorer + + +def _load_cfg(path): + if path.exists(): + return srsly.read_json(path) + else: + return {} + + +class Pipe: + """This class is not instantiated directly. Components inherit from it, and + it defines the interface that components should follow to function as + components in a spaCy analysis pipeline. + """ + + name = None + + @classmethod + def from_nlp(cls, nlp, model, **cfg): + return cls(nlp.vocab, model, **cfg) + + def __init__(self, vocab, model, **cfg): + """Create a new pipe instance.""" + raise NotImplementedError + + def __call__(self, Doc doc): + """Apply the pipe to one document. The document is + modified in-place, and returned. + + Both __call__ and pipe should delegate to the `predict()` + and `set_annotations()` methods. 
+ """ + scores = self.predict([doc]) + self.set_annotations([doc], scores) + return doc + + def pipe(self, stream, batch_size=128): + """Apply the pipe to a stream of documents. + + Both __call__ and pipe should delegate to the `predict()` + and `set_annotations()` methods. + """ + for docs in util.minibatch(stream, size=batch_size): + scores = self.predict(docs) + self.set_annotations(docs, scores) + yield from docs + + def predict(self, docs): + """Apply the pipeline's model to a batch of docs, without + modifying them. + """ + raise NotImplementedError + + def set_annotations(self, docs, scores): + """Modify a batch of documents, using pre-computed scores.""" + raise NotImplementedError + + def rehearse(self, examples, sgd=None, losses=None, **config): + pass + + def get_loss(self, examples, scores): + """Find the loss and gradient of loss for the batch of + examples (with embedded docs) and their predicted scores.""" + raise NotImplementedError + + def add_label(self, label): + """Add an output label, to be predicted by the model. + + It's possible to extend pretrained models with new labels, + but care should be taken to avoid the "catastrophic forgetting" + problem. + """ + raise NotImplementedError + + def create_optimizer(self): + return create_default_optimizer() + + def begin_training( + self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs + ): + """Initialize the pipe for training, using data exampes if available. + If no model has been initialized yet, the model is added.""" + self.model.initialize() + if hasattr(self, "vocab"): + link_vectors_to_models(self.vocab) + if sgd is None: + sgd = self.create_optimizer() + return sgd + + def set_output(self, nO): + if self.model.has_dim("nO") is not False: + self.model.set_dim("nO", nO) + if self.model.has_ref("output_layer"): + self.model.get_ref("output_layer").set_dim("nO", nO) + + def get_gradients(self): + """Get non-zero gradients of the model's parameters, as a dictionary + keyed by the parameter ID. The values are (weights, gradients) tuples. + """ + gradients = {} + queue = [self.model] + seen = set() + for node in queue: + if node.id in seen: + continue + seen.add(node.id) + if hasattr(node, "_mem") and node._mem.gradient.any(): + gradients[node.id] = [node._mem.weights, node._mem.gradient] + if hasattr(node, "_layers"): + queue.extend(node._layers) + return gradients + + def use_params(self, params): + """Modify the pipe's model, to use the given parameter values.""" + with self.model.use_params(params): + yield + + def score(self, examples, **kwargs): + return {} + + def to_bytes(self, exclude=tuple()): + """Serialize the pipe to a bytestring. + + exclude (list): String names of serialization fields to exclude. + RETURNS (bytes): The serialized object. 
+ """ + serialize = {} + serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) + serialize["model"] = self.model.to_bytes + if hasattr(self, "vocab"): + serialize["vocab"] = self.vocab.to_bytes + return util.to_bytes(serialize, exclude) + + def from_bytes(self, bytes_data, exclude=tuple()): + """Load the pipe from a bytestring.""" + + def load_model(b): + try: + self.model.from_bytes(b) + except AttributeError: + raise ValueError(Errors.E149) + + deserialize = {} + if hasattr(self, "vocab"): + deserialize["vocab"] = lambda b: self.vocab.from_bytes(b) + deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) + deserialize["model"] = load_model + util.from_bytes(bytes_data, deserialize, exclude) + return self + + def to_disk(self, path, exclude=tuple()): + """Serialize the pipe to disk.""" + serialize = {} + serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) + serialize["vocab"] = lambda p: self.vocab.to_disk(p) + serialize["model"] = lambda p: self.model.to_disk(p) + util.to_disk(path, serialize, exclude) + + def from_disk(self, path, exclude=tuple()): + """Load the pipe from disk.""" + + def load_model(p): + try: + self.model.from_bytes(p.open("rb").read()) + except AttributeError: + raise ValueError(Errors.E149) + + deserialize = {} + deserialize["vocab"] = lambda p: self.vocab.from_disk(p) + deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) + deserialize["model"] = load_model + util.from_disk(path, deserialize, exclude) + return self + + +@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"], default_model=default_tagger) +class Tagger(Pipe): + """Pipeline component for part-of-speech tagging. + + DOCS: https://spacy.io/api/tagger + """ + + def __init__(self, vocab, model, **cfg): + self.vocab = vocab + self.model = model + self._rehearsal_model = None + self.cfg = dict(sorted(cfg.items())) + + @property + def labels(self): + return tuple(self.vocab.morphology.tag_names) + + def __call__(self, doc): + tags = self.predict([doc]) + self.set_annotations([doc], tags) + return doc + + def pipe(self, stream, batch_size=128): + for docs in util.minibatch(stream, size=batch_size): + tag_ids = self.predict(docs) + self.set_annotations(docs, tag_ids) + yield from docs + + def predict(self, docs): + if not any(len(doc) for doc in docs): + # Handle cases where there are no tokens in any docs. 
+ n_labels = len(self.labels) + guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs] + assert len(guesses) == len(docs) + return guesses + scores = self.model.predict(docs) + assert len(scores) == len(docs), (len(scores), len(docs)) + guesses = self._scores2guesses(scores) + assert len(guesses) == len(docs) + return guesses + + def _scores2guesses(self, scores): + guesses = [] + for doc_scores in scores: + doc_guesses = doc_scores.argmax(axis=1) + if not isinstance(doc_guesses, numpy.ndarray): + doc_guesses = doc_guesses.get() + guesses.append(doc_guesses) + return guesses + + def set_annotations(self, docs, batch_tag_ids): + if isinstance(docs, Doc): + docs = [docs] + cdef Doc doc + cdef int idx = 0 + cdef Vocab vocab = self.vocab + assign_morphology = self.cfg.get("set_morphology", True) + for i, doc in enumerate(docs): + doc_tag_ids = batch_tag_ids[i] + if hasattr(doc_tag_ids, "get"): + doc_tag_ids = doc_tag_ids.get() + for j, tag_id in enumerate(doc_tag_ids): + # Don't clobber preset POS tags + if doc.c[j].tag == 0: + if doc.c[j].pos == 0 and assign_morphology: + # Don't clobber preset lemmas + lemma = doc.c[j].lemma + vocab.morphology.assign_tag_id(&doc.c[j], tag_id) + if lemma != 0 and lemma != doc.c[j].lex.orth: + doc.c[j].lemma = lemma + else: + doc.c[j].tag = self.vocab.strings[self.labels[tag_id]] + idx += 1 + doc.is_tagged = True + + def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False): + if losses is None: + losses = {} + losses.setdefault(self.name, 0.0) + + try: + if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples): + # Handle cases where there are no tokens in any docs. + return + except AttributeError: + types = set([type(eg) for eg in examples]) + raise TypeError(Errors.E978.format(name="Tagger", method="update", types=types)) + set_dropout_rate(self.model, drop) + tag_scores, bp_tag_scores = self.model.begin_update( + [eg.predicted for eg in examples]) + for sc in tag_scores: + if self.model.ops.xp.isnan(sc.sum()): + raise ValueError("nan value in scores") + loss, d_tag_scores = self.get_loss(examples, tag_scores) + bp_tag_scores(d_tag_scores) + if sgd not in (None, False): + self.model.finish_update(sgd) + + losses[self.name] += loss + if set_annotations: + docs = [eg.predicted for eg in examples] + self.set_annotations(docs, self._scores2guesses(tag_scores)) + return losses + + def rehearse(self, examples, drop=0., sgd=None, losses=None): + """Perform a 'rehearsal' update, where we try to match the output of + an initial model. + """ + try: + docs = [eg.predicted for eg in examples] + except AttributeError: + types = set([type(eg) for eg in examples]) + raise TypeError(Errors.E978.format(name="Tagger", method="rehearse", types=types)) + if self._rehearsal_model is None: + return + if not any(len(doc) for doc in docs): + # Handle cases where there are no tokens in any docs. 
+ return + set_dropout_rate(self.model, drop) + guesses, backprop = self.model.begin_update(docs) + target = self._rehearsal_model(examples) + gradient = guesses - target + backprop(gradient) + self.model.finish_update(sgd) + if losses is not None: + losses.setdefault(self.name, 0.0) + losses[self.name] += (gradient**2).sum() + + def get_loss(self, examples, scores): + loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False) + truths = [eg.get_aligned("tag", as_string=True) for eg in examples] + d_scores, loss = loss_func(scores, truths) + if self.model.ops.xp.isnan(loss): + raise ValueError("nan value when computing loss") + return float(loss), d_scores + + def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, + **kwargs): + lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] + if not any(table in self.vocab.lookups for table in lemma_tables): + warnings.warn(Warnings.W022) + if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0: + warnings.warn(Warnings.W033.format(model="part-of-speech tagger")) + orig_tag_map = dict(self.vocab.morphology.tag_map) + new_tag_map = {} + for example in get_examples(): + try: + y = example.y + except AttributeError: + raise TypeError(Errors.E978.format(name="Tagger", method="begin_training", types=type(example))) + for token in y: + tag = token.tag_ + if tag in orig_tag_map: + new_tag_map[tag] = orig_tag_map[tag] + else: + new_tag_map[tag] = {POS: X} + + cdef Vocab vocab = self.vocab + if new_tag_map: + if "_SP" in orig_tag_map: + new_tag_map["_SP"] = orig_tag_map["_SP"] + vocab.morphology = Morphology(vocab.strings, new_tag_map, + vocab.morphology.lemmatizer, + exc=vocab.morphology.exc) + self.set_output(len(self.labels)) + doc_sample = [Doc(self.vocab, words=["hello", "world"])] + if pipeline is not None: + for name, component in pipeline: + if component is self: + break + if hasattr(component, "pipe"): + doc_sample = list(component.pipe(doc_sample)) + else: + doc_sample = [component(doc) for doc in doc_sample] + self.model.initialize(X=doc_sample) + # Get batch of example docs, example outputs to call begin_training(). + # This lets the model infer shapes. + link_vectors_to_models(self.vocab) + if sgd is None: + sgd = self.create_optimizer() + return sgd + + def add_label(self, label, values=None): + if not isinstance(label, str): + raise ValueError(Errors.E187) + if label in self.labels: + return 0 + if self.model.has_dim("nO"): + # Here's how the model resizing will work, once the + # neuron-to-tag mapping is no longer controlled by + # the Morphology class, which sorts the tag names. + # The sorting makes adding labels difficult. 
+ # smaller = self.model._layers[-1] + # larger = Softmax(len(self.labels)+1, smaller.nI) + # copy_array(larger.W[:smaller.nO], smaller.W) + # copy_array(larger.b[:smaller.nO], smaller.b) + # self.model._layers[-1] = larger + raise ValueError(TempErrors.T003) + tag_map = dict(self.vocab.morphology.tag_map) + if values is None: + values = {POS: "X"} + tag_map[label] = values + self.vocab.morphology = Morphology( + self.vocab.strings, tag_map=tag_map, + lemmatizer=self.vocab.morphology.lemmatizer, + exc=self.vocab.morphology.exc) + return 1 + + def use_params(self, params): + with self.model.use_params(params): + yield + + def score(self, examples, **kwargs): + scores = {} + scores.update(Scorer.score_token_attr(examples, "tag", **kwargs)) + scores.update(Scorer.score_token_attr(examples, "pos", **kwargs)) + scores.update(Scorer.score_token_attr(examples, "lemma", **kwargs)) + return scores + + def to_bytes(self, exclude=tuple()): + serialize = {} + serialize["model"] = self.model.to_bytes + serialize["vocab"] = self.vocab.to_bytes + serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) + tag_map = dict(sorted(self.vocab.morphology.tag_map.items())) + serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map) + return util.to_bytes(serialize, exclude) + + def from_bytes(self, bytes_data, exclude=tuple()): + def load_model(b): + try: + self.model.from_bytes(b) + except AttributeError: + raise ValueError(Errors.E149) + + def load_tag_map(b): + tag_map = srsly.msgpack_loads(b) + self.vocab.morphology = Morphology( + self.vocab.strings, tag_map=tag_map, + lemmatizer=self.vocab.morphology.lemmatizer, + exc=self.vocab.morphology.exc) + + deserialize = { + "vocab": lambda b: self.vocab.from_bytes(b), + "tag_map": load_tag_map, + "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), + "model": lambda b: load_model(b), + } + util.from_bytes(bytes_data, deserialize, exclude) + return self + + def to_disk(self, path, exclude=tuple()): + tag_map = dict(sorted(self.vocab.morphology.tag_map.items())) + serialize = { + "vocab": lambda p: self.vocab.to_disk(p), + "tag_map": lambda p: srsly.write_msgpack(p, tag_map), + "model": lambda p: self.model.to_disk(p), + "cfg": lambda p: srsly.write_json(p, self.cfg), + } + util.to_disk(path, serialize, exclude) + + def from_disk(self, path, exclude=tuple()): + def load_model(p): + with p.open("rb") as file_: + try: + self.model.from_bytes(file_.read()) + except AttributeError: + raise ValueError(Errors.E149) + + def load_tag_map(p): + tag_map = srsly.read_msgpack(p) + self.vocab.morphology = Morphology( + self.vocab.strings, tag_map=tag_map, + lemmatizer=self.vocab.morphology.lemmatizer, + exc=self.vocab.morphology.exc) + + deserialize = { + "vocab": lambda p: self.vocab.from_disk(p), + "cfg": lambda p: self.cfg.update(_load_cfg(p)), + "tag_map": load_tag_map, + "model": load_model, + } + util.from_disk(path, deserialize, exclude) + return self + + +@component("senter", assigns=["token.is_sent_start"], default_model=default_senter) +class SentenceRecognizer(Tagger): + """Pipeline component for sentence segmentation. 
+ + DOCS: https://spacy.io/api/sentencerecognizer + """ + + def __init__(self, vocab, model, **cfg): + self.vocab = vocab + self.model = model + self._rehearsal_model = None + self.cfg = dict(sorted(cfg.items())) + + @property + def labels(self): + # labels are numbered by index internally, so this matches GoldParse + # and Example where the sentence-initial tag is 1 and other positions + # are 0 + return tuple(["I", "S"]) + + def set_annotations(self, docs, batch_tag_ids): + if isinstance(docs, Doc): + docs = [docs] + cdef Doc doc + for i, doc in enumerate(docs): + doc_tag_ids = batch_tag_ids[i] + if hasattr(doc_tag_ids, "get"): + doc_tag_ids = doc_tag_ids.get() + for j, tag_id in enumerate(doc_tag_ids): + # Don't clobber existing sentence boundaries + if doc.c[j].sent_start == 0: + if tag_id == 1: + doc.c[j].sent_start = 1 + else: + doc.c[j].sent_start = -1 + + def get_loss(self, examples, scores): + labels = self.labels + loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False) + truths = [] + for eg in examples: + eg_truth = [] + for x in eg.get_aligned("sent_start"): + if x == None: + eg_truth.append(None) + elif x == 1: + eg_truth.append(labels[1]) + else: + # anything other than 1: 0, -1, -1 as uint64 + eg_truth.append(labels[0]) + truths.append(eg_truth) + d_scores, loss = loss_func(scores, truths) + if self.model.ops.xp.isnan(loss): + raise ValueError("nan value when computing loss") + return float(loss), d_scores + + def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, + **kwargs): + self.set_output(len(self.labels)) + self.model.initialize() + link_vectors_to_models(self.vocab) + if sgd is None: + sgd = self.create_optimizer() + return sgd + + def add_label(self, label, values=None): + raise NotImplementedError + + def score(self, examples, **kwargs): + return Scorer.score_spans(examples, "sents", **kwargs) + + def to_bytes(self, exclude=tuple()): + serialize = {} + serialize["model"] = self.model.to_bytes + serialize["vocab"] = self.vocab.to_bytes + serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) + return util.to_bytes(serialize, exclude) + + def from_bytes(self, bytes_data, exclude=tuple()): + def load_model(b): + try: + self.model.from_bytes(b) + except AttributeError: + raise ValueError(Errors.E149) + + deserialize = { + "vocab": lambda b: self.vocab.from_bytes(b), + "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), + "model": lambda b: load_model(b), + } + util.from_bytes(bytes_data, deserialize, exclude) + return self + + def to_disk(self, path, exclude=tuple()): + serialize = { + "vocab": lambda p: self.vocab.to_disk(p), + "model": lambda p: p.open("wb").write(self.model.to_bytes()), + "cfg": lambda p: srsly.write_json(p, self.cfg), + } + util.to_disk(path, serialize, exclude) + + def from_disk(self, path, exclude=tuple()): + def load_model(p): + with p.open("rb") as file_: + try: + self.model.from_bytes(file_.read()) + except AttributeError: + raise ValueError(Errors.E149) + + deserialize = { + "vocab": lambda p: self.vocab.from_disk(p), + "cfg": lambda p: self.cfg.update(_load_cfg(p)), + "model": load_model, + } + util.from_disk(path, deserialize, exclude) + return self + + +@component("nn_labeller") +class MultitaskObjective(Tagger): + """Experimental: Assist training of a parser or tagger, by training a + side-objective. 
+ """ + + def __init__(self, vocab, model, **cfg): + self.vocab = vocab + self.model = model + target = cfg["target"] # default: 'dep_tag_offset' + if target == "dep": + self.make_label = self.make_dep + elif target == "tag": + self.make_label = self.make_tag + elif target == "ent": + self.make_label = self.make_ent + elif target == "dep_tag_offset": + self.make_label = self.make_dep_tag_offset + elif target == "ent_tag": + self.make_label = self.make_ent_tag + elif target == "sent_start": + self.make_label = self.make_sent_start + elif hasattr(target, "__call__"): + self.make_label = target + else: + raise ValueError(Errors.E016) + self.cfg = dict(cfg) + + @property + def labels(self): + return self.cfg.setdefault("labels", {}) + + @labels.setter + def labels(self, value): + self.cfg["labels"] = value + + def set_annotations(self, docs, dep_ids): + pass + + def begin_training(self, get_examples=lambda: [], pipeline=None, + sgd=None, **kwargs): + gold_examples = nonproj.preprocess_training_data(get_examples()) + # for raw_text, doc_annot in gold_tuples: + for example in gold_examples: + for token in example.y: + label = self.make_label(token) + if label is not None and label not in self.labels: + self.labels[label] = len(self.labels) + self.model.initialize() + link_vectors_to_models(self.vocab) + if sgd is None: + sgd = self.create_optimizer() + return sgd + + def predict(self, docs): + tokvecs = self.model.get_ref("tok2vec")(docs) + scores = self.model.get_ref("softmax")(tokvecs) + return tokvecs, scores + + def get_loss(self, examples, scores): + cdef int idx = 0 + correct = numpy.zeros((scores.shape[0],), dtype="i") + guesses = scores.argmax(axis=1) + docs = [eg.predicted for eg in examples] + for i, eg in enumerate(examples): + # Handles alignment for tokenization differences + doc_annots = eg.get_aligned() # TODO + for j in range(len(eg.predicted)): + tok_annots = {key: values[j] for key, values in tok_annots.items()} + label = self.make_label(j, tok_annots) + if label is None or label not in self.labels: + correct[idx] = guesses[idx] + else: + correct[idx] = self.labels[label] + idx += 1 + correct = self.model.ops.xp.array(correct, dtype="i") + d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) + loss = (d_scores**2).sum() + return float(loss), d_scores + + @staticmethod + def make_dep(token): + return token.dep_ + + @staticmethod + def make_tag(token): + return token.tag_ + + @staticmethod + def make_ent(token): + if token.ent_iob_ == "O": + return "O" + else: + return token.ent_iob_ + "-" + token.ent_type_ + + @staticmethod + def make_dep_tag_offset(token): + dep = token.dep_ + tag = token.tag_ + offset = token.head.i - token.i + offset = min(offset, 2) + offset = max(offset, -2) + return f"{dep}-{tag}:{offset}" + + @staticmethod + def make_ent_tag(token): + if token.ent_iob_ == "O": + ent = "O" + else: + ent = token.ent_iob_ + "-" + token.ent_type_ + tag = token.tag_ + return f"{tag}-{ent}" + + @staticmethod + def make_sent_start(token): + """A multi-task objective for representing sentence boundaries, + using BILU scheme. 
(O is impossible) + """ + if token.is_sent_start and token.is_sent_end: + return "U-SENT" + elif token.is_sent_start: + return "B-SENT" + else: + return "I-SENT" + + +class ClozeMultitask(Pipe): + def __init__(self, vocab, model, **cfg): + self.vocab = vocab + self.model = model + self.cfg = cfg + self.distance = CosineDistance(ignore_zeros=True, normalize=False) # TODO: in config + + def set_annotations(self, docs, dep_ids): + pass + + def begin_training(self, get_examples=lambda: [], pipeline=None, + sgd=None, **kwargs): + link_vectors_to_models(self.vocab) + self.model.initialize() + X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) + self.model.output_layer.begin_training(X) + if sgd is None: + sgd = self.create_optimizer() + return sgd + + def predict(self, docs): + tokvecs = self.model.get_ref("tok2vec")(docs) + vectors = self.model.get_ref("output_layer")(tokvecs) + return tokvecs, vectors + + def get_loss(self, examples, vectors, prediction): + # The simplest way to implement this would be to vstack the + # token.vector values, but that's a bit inefficient, especially on GPU. + # Instead we fetch the index into the vectors table for each of our tokens, + # and look them up all at once. This prevents data copying. + ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples]) + target = vectors[ids] + gradient = self.distance.get_grad(prediction, target) + loss = self.distance.get_loss(prediction, target) + return loss, gradient + + def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None): + pass + + def rehearse(self, examples, drop=0., sgd=None, losses=None): + if losses is not None and self.name not in losses: + losses[self.name] = 0. + set_dropout_rate(self.model, drop) + try: + predictions, bp_predictions = self.model.begin_update([eg.predicted for eg in examples]) + except AttributeError: + types = set([type(eg) for eg in examples]) + raise TypeError(Errors.E978.format(name="ClozeMultitask", method="rehearse", types=types)) + loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions) + bp_predictions(d_predictions) + if sgd is not None: + self.model.finish_update(sgd) + + if losses is not None: + losses[self.name] += loss + + +@component("textcat", assigns=["doc.cats"], default_model=default_textcat) +class TextCategorizer(Pipe): + """Pipeline component for text classification. + + DOCS: https://spacy.io/api/textcategorizer + """ + def __init__(self, vocab, model, **cfg): + self.vocab = vocab + self.model = model + self._rehearsal_model = None + self.cfg = dict(cfg) + + @property + def labels(self): + return tuple(self.cfg.setdefault("labels", [])) + + def require_labels(self): + """Raise an error if the component's model has no labels defined.""" + if not self.labels: + raise ValueError(Errors.E143.format(name=self.name)) + + @labels.setter + def labels(self, value): + self.cfg["labels"] = tuple(value) + + def pipe(self, stream, batch_size=128): + for docs in util.minibatch(stream, size=batch_size): + scores = self.predict(docs) + self.set_annotations(docs, scores) + yield from docs + + def predict(self, docs): + tensors = [doc.tensor for doc in docs] + + if not any(len(doc) for doc in docs): + # Handle cases where there are no tokens in any docs. 
+ xp = get_array_module(tensors) + scores = xp.zeros((len(docs), len(self.labels))) + return scores + + scores = self.model.predict(docs) + scores = self.model.ops.asarray(scores) + return scores + + def set_annotations(self, docs, scores): + for i, doc in enumerate(docs): + for j, label in enumerate(self.labels): + doc.cats[label] = float(scores[i, j]) + + def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None): + if losses is None: + losses = {} + losses.setdefault(self.name, 0.0) + try: + if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples): + # Handle cases where there are no tokens in any docs. + return losses + except AttributeError: + types = set([type(eg) for eg in examples]) + raise TypeError(Errors.E978.format(name="TextCategorizer", method="update", types=types)) + set_dropout_rate(self.model, drop) + scores, bp_scores = self.model.begin_update( + [eg.predicted for eg in examples] + ) + loss, d_scores = self.get_loss(examples, scores) + bp_scores(d_scores) + if sgd is not None: + self.model.finish_update(sgd) + losses[self.name] += loss + if set_annotations: + docs = [eg.predicted for eg in examples] + self.set_annotations(docs, scores=scores) + return losses + + def rehearse(self, examples, drop=0., sgd=None, losses=None): + if self._rehearsal_model is None: + return + try: + docs = [eg.predicted for eg in examples] + except AttributeError: + types = set([type(eg) for eg in examples]) + raise TypeError(Errors.E978.format(name="TextCategorizer", method="rehearse", types=types)) + if not any(len(doc) for doc in docs): + # Handle cases where there are no tokens in any docs. + return + set_dropout_rate(self.model, drop) + scores, bp_scores = self.model.begin_update(docs) + target = self._rehearsal_model(examples) + gradient = scores - target + bp_scores(gradient) + if sgd is not None: + self.model.finish_update(sgd) + if losses is not None: + losses.setdefault(self.name, 0.0) + losses[self.name] += (gradient**2).sum() + + def _examples_to_truth(self, examples): + truths = numpy.zeros((len(examples), len(self.labels)), dtype="f") + not_missing = numpy.ones((len(examples), len(self.labels)), dtype="f") + for i, eg in enumerate(examples): + for j, label in enumerate(self.labels): + if label in eg.reference.cats: + truths[i, j] = eg.reference.cats[label] + else: + not_missing[i, j] = 0. + truths = self.model.ops.asarray(truths) + return truths, not_missing + + def get_loss(self, examples, scores): + truths, not_missing = self._examples_to_truth(examples) + not_missing = self.model.ops.asarray(not_missing) + d_scores = (scores-truths) / scores.shape[0] + d_scores *= not_missing + mean_square_error = (d_scores**2).sum(axis=1).mean() + return float(mean_square_error), d_scores + + def add_label(self, label): + if not isinstance(label, str): + raise ValueError(Errors.E187) + if label in self.labels: + return 0 + if self.model.has_dim("nO"): + # This functionality was available previously, but was broken. + # The problem is that we resize the last layer, but the last layer + # is actually just an ensemble. We're not resizing the child layers + # - a huge problem. 
+ raise ValueError(Errors.E116) + # smaller = self.model._layers[-1] + # larger = Linear(len(self.labels)+1, smaller.nI) + # copy_array(larger.W[:smaller.nO], smaller.W) + # copy_array(larger.b[:smaller.nO], smaller.b) + # self.model._layers[-1] = larger + self.labels = tuple(list(self.labels) + [label]) + return 1 + + def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): + # TODO: begin_training is not guaranteed to see all data / labels ? + examples = list(get_examples()) + for example in examples: + try: + y = example.y + except AttributeError: + raise TypeError(Errors.E978.format(name="TextCategorizer", method="update", types=type(example))) + for cat in y.cats: + self.add_label(cat) + self.require_labels() + docs = [Doc(Vocab(), words=["hello"])] + truths, _ = self._examples_to_truth(examples) + self.set_output(len(self.labels)) + link_vectors_to_models(self.vocab) + self.model.initialize(X=docs, Y=truths) + if sgd is None: + sgd = self.create_optimizer() + return sgd + + def score(self, examples, **kwargs): + return Scorer.score_cats(examples, "cats", labels=self.labels, + multi_label=self.model.attrs["multi_label"], + positive_label=self.cfg.get("positive_label", None), + **kwargs + ) + + +cdef class DependencyParser(Parser): + """Pipeline component for dependency parsing. + + DOCS: https://spacy.io/api/dependencyparser + """ + # cdef classes can't have decorators, so we're defining this here + name = "parser" + factory = "parser" + assigns = ["token.dep", "token.is_sent_start", "doc.sents"] + requires = [] + TransitionSystem = ArcEager + + @property + def postprocesses(self): + output = [nonproj.deprojectivize] + if self.cfg.get("learn_tokens") is True: + output.append(merge_subtokens) + return tuple(output) + + def add_multitask_objective(self, mt_component): + self._multitasks.append(mt_component) + + def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): + # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ? + for labeller in self._multitasks: + labeller.model.set_dim("nO", len(self.labels)) + if labeller.model.has_ref("output_layer"): + labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels)) + labeller.begin_training(get_examples, pipeline=pipeline, sgd=sgd) + + def __reduce__(self): + return (DependencyParser, (self.vocab, self.model), (self.moves, self.cfg)) + + def __getstate__(self): + return (self.moves, self.cfg) + + def __setstate__(self, state): + moves, config = state + self.moves = moves + self.cfg = config + + @property + def labels(self): + labels = set() + # Get the labels from the model by looking at the available moves + for move in self.move_names: + if "-" in move: + label = move.split("-")[1] + if "||" in label: + label = label.split("||")[1] + labels.add(label) + return tuple(sorted(labels)) + + def score(self, examples, **kwargs): + def dep_getter(token, attr): + dep = getattr(token, attr) + dep = token.vocab.strings.as_string(dep).lower() + return dep + results = {} + results.update(Scorer.score_spans(examples, "sents", **kwargs)) + results.update(Scorer.score_deps(examples, "dep", getter=dep_getter, + ignore_labels=("p", "punct"), **kwargs)) + return results + + +cdef class EntityRecognizer(Parser): + """Pipeline component for named entity recognition. 
+ + DOCS: https://spacy.io/api/entityrecognizer + """ + name = "ner" + factory = "ner" + assigns = ["doc.ents", "token.ent_iob", "token.ent_type"] + requires = [] + TransitionSystem = BiluoPushDown + + def add_multitask_objective(self, mt_component): + self._multitasks.append(mt_component) + + def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): + # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ? + for labeller in self._multitasks: + labeller.model.set_dim("nO", len(self.labels)) + if labeller.model.has_ref("output_layer"): + labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels)) + labeller.begin_training(get_examples, pipeline=pipeline) + + def __reduce__(self): + return (EntityRecognizer, (self.vocab, self.model), (self.moves, self.cfg)) + + def __getstate__(self): + return self.moves, self.cfg + + def __setstate__(self, state): + moves, config = state + self.moves = moves + self.cfg = config + + @property + def labels(self): + # Get the labels from the model by looking at the available moves, e.g. + # B-PERSON, I-PERSON, L-PERSON, U-PERSON + labels = set(move.split("-")[1] for move in self.move_names + if move[0] in ("B", "I", "L", "U")) + return tuple(sorted(labels)) + + def score(self, examples, **kwargs): + return Scorer.score_spans(examples, "ents", **kwargs) + +@component( + "entity_linker", + requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"], + assigns=["token.ent_kb_id"], + default_model=default_nel, +) +class EntityLinker(Pipe): + """Pipeline component for named entity linking. + + DOCS: https://spacy.io/api/entitylinker + """ + NIL = "NIL" # string used to refer to a non-existing link + + def __init__(self, vocab, model, **cfg): + self.vocab = vocab + self.model = model + self.kb = None + self.kb = cfg.get("kb", None) + if self.kb is None: + # create an empty KB that should be filled by calling from_disk + self.kb = KnowledgeBase(vocab=vocab) + else: + del cfg["kb"] # we don't want to duplicate its serialization + if not isinstance(self.kb, KnowledgeBase): + raise ValueError(Errors.E990.format(type=type(self.kb))) + self.cfg = dict(cfg) + self.distance = CosineDistance(normalize=False) + # how many neightbour sentences to take into account + self.n_sents = cfg.get("n_sents", 0) + + def require_kb(self): + # Raise an error if the knowledge base is not initialized. 
+ if len(self.kb) == 0: + raise ValueError(Errors.E139.format(name=self.name)) + + def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): + self.require_kb() + nO = self.kb.entity_vector_length + self.set_output(nO) + self.model.initialize() + if sgd is None: + sgd = self.create_optimizer() + return sgd + + def update(self, examples, *, set_annotations=False, drop=0.0, sgd=None, losses=None): + self.require_kb() + if losses is None: + losses = {} + losses.setdefault(self.name, 0.0) + if not examples: + return losses + sentence_docs = [] + try: + docs = [eg.predicted for eg in examples] + except AttributeError: + types = set([type(eg) for eg in examples]) + raise TypeError(Errors.E978.format(name="EntityLinker", method="update", types=types)) + if set_annotations: + # This seems simpler than other ways to get that exact output -- but + # it does run the model twice :( + predictions = self.model.predict(docs) + + for eg in examples: + sentences = [s for s in eg.predicted.sents] + kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) + for ent in eg.predicted.ents: + kb_id = kb_ids[ent.start] # KB ID of the first token is the same as the whole span + if kb_id: + try: + # find the sentence in the list of sentences. + sent_index = sentences.index(ent.sent) + except AttributeError: + # Catch the exception when ent.sent is None and provide a user-friendly warning + raise RuntimeError(Errors.E030) + # get n previous sentences, if there are any + start_sentence = max(0, sent_index - self.n_sents) + + # get n posterior sentences, or as many < n as there are + end_sentence = min(len(sentences) -1, sent_index + self.n_sents) + + # get token positions + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end + + # append that span as a doc to training + sent_doc = eg.predicted[start_token:end_token].as_doc() + sentence_docs.append(sent_doc) + set_dropout_rate(self.model, drop) + if not sentence_docs: + warnings.warn(Warnings.W093.format(name="Entity Linker")) + return 0.0 + sentence_encodings, bp_context = self.model.begin_update(sentence_docs) + loss, d_scores = self.get_similarity_loss( + sentence_encodings=sentence_encodings, + examples=examples + ) + bp_context(d_scores) + if sgd is not None: + self.model.finish_update(sgd) + + losses[self.name] += loss + if set_annotations: + self.set_annotations(docs, predictions) + return losses + + def get_similarity_loss(self, examples, sentence_encodings): + entity_encodings = [] + for eg in examples: + kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) + for ent in eg.predicted.ents: + kb_id = kb_ids[ent.start] + if kb_id: + entity_encoding = self.kb.get_vector(kb_id) + entity_encodings.append(entity_encoding) + + entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") + + if sentence_encodings.shape != entity_encodings.shape: + raise RuntimeError(Errors.E147.format(method="get_similarity_loss", msg="gold entities do not match up")) + + gradients = self.distance.get_grad(sentence_encodings, entity_encodings) + loss = self.distance.get_loss(sentence_encodings, entity_encodings) + loss = loss / len(entity_encodings) + return loss, gradients + + def __call__(self, doc): + kb_ids = self.predict([doc]) + self.set_annotations([doc], kb_ids) + return doc + + def pipe(self, stream, batch_size=128): + for docs in util.minibatch(stream, size=batch_size): + kb_ids = self.predict(docs) + self.set_annotations(docs, kb_ids) + yield from docs + + def predict(self, docs): + """ Return the 
KB IDs for each entity in each doc, including NIL if there is no prediction """ + self.require_kb() + entity_count = 0 + final_kb_ids = [] + + if not docs: + return final_kb_ids + + if isinstance(docs, Doc): + docs = [docs] + + for i, doc in enumerate(docs): + sentences = [s for s in doc.sents] + + if len(doc) > 0: + # Looping through each sentence and each entity + # This may go wrong if there are entities across sentences - which shouldn't happen normally. + for sent_index, sent in enumerate(sentences): + if sent.ents: + # get n_neightbour sentences, clipped to the length of the document + start_sentence = max(0, sent_index - self.n_sents) + end_sentence = min(len(sentences) -1, sent_index + self.n_sents) + + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end + + sent_doc = doc[start_token:end_token].as_doc() + # currently, the context is the same for each entity in a sentence (should be refined) + sentence_encoding = self.model.predict([sent_doc])[0] + xp = get_array_module(sentence_encoding) + sentence_encoding_t = sentence_encoding.T + sentence_norm = xp.linalg.norm(sentence_encoding_t) + + for ent in sent.ents: + entity_count += 1 + + to_discard = self.cfg.get("labels_discard", []) + if to_discard and ent.label_ in to_discard: + # ignoring this entity - setting to NIL + final_kb_ids.append(self.NIL) + + else: + candidates = self.kb.get_candidates(ent.text) + if not candidates: + # no prediction possible for this entity - setting to NIL + final_kb_ids.append(self.NIL) + + elif len(candidates) == 1: + # shortcut for efficiency reasons: take the 1 candidate + + # TODO: thresholding + final_kb_ids.append(candidates[0].entity_) + + else: + random.shuffle(candidates) + + # this will set all prior probabilities to 0 if they should be excluded from the model + prior_probs = xp.asarray([c.prior_prob for c in candidates]) + if not self.cfg.get("incl_prior", True): + prior_probs = xp.asarray([0.0 for c in candidates]) + scores = prior_probs + + # add in similarity from the context + if self.cfg.get("incl_context", True): + entity_encodings = xp.asarray([c.entity_vector for c in candidates]) + entity_norm = xp.linalg.norm(entity_encodings, axis=1) + + if len(entity_encodings) != len(prior_probs): + raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length")) + + # cosine similarity + sims = xp.dot(entity_encodings, sentence_encoding_t) / (sentence_norm * entity_norm) + if sims.shape != prior_probs.shape: + raise ValueError(Errors.E161) + scores = prior_probs + sims - (prior_probs*sims) + + # TODO: thresholding + best_index = scores.argmax().item() + best_candidate = candidates[best_index] + final_kb_ids.append(best_candidate.entity_) + + if not (len(final_kb_ids) == entity_count): + raise RuntimeError(Errors.E147.format(method="predict", msg="result variables not of equal length")) + + return final_kb_ids + + def set_annotations(self, docs, kb_ids): + count_ents = len([ent for doc in docs for ent in doc.ents]) + if count_ents != len(kb_ids): + raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids))) + + i=0 + for doc in docs: + for ent in doc.ents: + kb_id = kb_ids[i] + i += 1 + for token in ent: + token.ent_kb_id_ = kb_id + + def to_disk(self, path, exclude=tuple()): + serialize = {} + self.cfg["entity_width"] = self.kb.entity_vector_length + serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) + serialize["vocab"] = lambda p: self.vocab.to_disk(p) + serialize["kb"] = lambda p: self.kb.dump(p) + 
serialize["model"] = lambda p: self.model.to_disk(p) + util.to_disk(path, serialize, exclude) + + def from_disk(self, path, exclude=tuple()): + def load_model(p): + try: + self.model.from_bytes(p.open("rb").read()) + except AttributeError: + raise ValueError(Errors.E149) + + def load_kb(p): + self.kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=self.cfg["entity_width"]) + self.kb.load_bulk(p) + + deserialize = {} + deserialize["vocab"] = lambda p: self.vocab.from_disk(p) + deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) + deserialize["kb"] = load_kb + deserialize["model"] = load_model + util.from_disk(path, deserialize, exclude) + return self + + def rehearse(self, examples, sgd=None, losses=None, **config): + raise NotImplementedError + + def add_label(self, label): + raise NotImplementedError + + +@component("sentencizer", assigns=["token.is_sent_start", "doc.sents"]) +class Sentencizer(Pipe): + """Segment the Doc into sentences using a rule-based strategy. + + DOCS: https://spacy.io/api/sentencizer + """ + + default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', + '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄', + '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿', + '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶', + '꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒', + '﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀', + '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼', + '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐', + '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂', + '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', + '。', '。'] + + def __init__(self, punct_chars=None, **kwargs): + """Initialize the sentencizer. + + punct_chars (list): Punctuation characters to split on. Will be + serialized with the nlp object. + RETURNS (Sentencizer): The sentencizer component. + + DOCS: https://spacy.io/api/sentencizer#init + """ + if punct_chars: + self.punct_chars = set(punct_chars) + else: + self.punct_chars = set(self.default_punct_chars) + + @classmethod + def from_nlp(cls, nlp, model=None, **cfg): + return cls(**cfg) + + def begin_training( + self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs + ): + pass + + def __call__(self, doc): + """Apply the sentencizer to a Doc and set Token.is_sent_start. + + example (Doc or Example): The document to process. + RETURNS (Doc or Example): The processed Doc or Example. + + DOCS: https://spacy.io/api/sentencizer#call + """ + start = 0 + seen_period = False + for i, token in enumerate(doc): + is_in_punct_chars = token.text in self.punct_chars + token.is_sent_start = i == 0 + if seen_period and not token.is_punct and not is_in_punct_chars: + doc[start].is_sent_start = True + start = token.i + seen_period = False + elif is_in_punct_chars: + seen_period = True + if start < len(doc): + doc[start].is_sent_start = True + return doc + + def pipe(self, stream, batch_size=128): + for docs in util.minibatch(stream, size=batch_size): + predictions = self.predict(docs) + self.set_annotations(docs, predictions) + yield from docs + + def predict(self, docs): + """Apply the pipeline's model to a batch of docs, without + modifying them. + """ + if not any(len(doc) for doc in docs): + # Handle cases where there are no tokens in any docs. 
+ guesses = [[] for doc in docs] + return guesses + guesses = [] + for doc in docs: + doc_guesses = [False] * len(doc) + if len(doc) > 0: + start = 0 + seen_period = False + doc_guesses[0] = True + for i, token in enumerate(doc): + is_in_punct_chars = token.text in self.punct_chars + if seen_period and not token.is_punct and not is_in_punct_chars: + doc_guesses[start] = True + start = token.i + seen_period = False + elif is_in_punct_chars: + seen_period = True + if start < len(doc): + doc_guesses[start] = True + guesses.append(doc_guesses) + return guesses + + def set_annotations(self, docs, batch_tag_ids): + if isinstance(docs, Doc): + docs = [docs] + cdef Doc doc + cdef int idx = 0 + for i, doc in enumerate(docs): + doc_tag_ids = batch_tag_ids[i] + for j, tag_id in enumerate(doc_tag_ids): + # Don't clobber existing sentence boundaries + if doc.c[j].sent_start == 0: + if tag_id: + doc.c[j].sent_start = 1 + else: + doc.c[j].sent_start = -1 + + def score(self, examples, **kwargs): + return Scorer.score_spans(examples, "sents", **kwargs) + + def to_bytes(self, **kwargs): + """Serialize the sentencizer to a bytestring. + + RETURNS (bytes): The serialized object. + + DOCS: https://spacy.io/api/sentencizer#to_bytes + """ + return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)}) + + def from_bytes(self, bytes_data, **kwargs): + """Load the sentencizer from a bytestring. + + bytes_data (bytes): The data to load. + returns (Sentencizer): The loaded object. + + DOCS: https://spacy.io/api/sentencizer#from_bytes + """ + cfg = srsly.msgpack_loads(bytes_data) + self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) + return self + + def to_disk(self, path, exclude=tuple(), **kwargs): + """Serialize the sentencizer to disk. + + DOCS: https://spacy.io/api/sentencizer#to_disk + """ + path = util.ensure_path(path) + path = path.with_suffix(".json") + srsly.write_json(path, {"punct_chars": list(self.punct_chars)}) + + + def from_disk(self, path, exclude=tuple(), **kwargs): + """Load the sentencizer from disk. 
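For illustration, a plain-Python sketch of the rule that `Sentencizer.__call__` and `Sentencizer.predict` above apply: the first token always starts a sentence, and any later token starts a new sentence once a sentence-final punctuation character has been seen. This is a simplification (the real component also checks `token.is_punct`), and the function below is a hypothetical helper, not part of the patch:

```python
def sentence_starts(words, punct_chars={".", "!", "?"}):
    # Simplified version of the Sentencizer rule: mark a new sentence start
    # on the first non-punctuation token after a sentence-final character.
    starts = [False] * len(words)
    if not words:
        return starts
    starts[0] = True
    seen_period = False
    for i, word in enumerate(words):
        if seen_period and word not in punct_chars:
            starts[i] = True
            seen_period = False
        elif word in punct_chars:
            seen_period = True
    return starts

print(sentence_starts(["One", "sentence", ".", "Two", "sentences", "."]))
# -> [True, False, False, True, False, False]
```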
+ + DOCS: https://spacy.io/api/sentencizer#from_disk + """ + path = util.ensure_path(path) + path = path.with_suffix(".json") + cfg = srsly.read_json(path) + self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) + return self + + +# Cython classes can't be decorated, so we need to add the factories here +Language.factories["parser"] = lambda nlp, model, **cfg: parser_factory(nlp, model, **cfg) +Language.factories["ner"] = lambda nlp, model, **cfg: ner_factory(nlp, model, **cfg) + +def parser_factory(nlp, model, **cfg): + default_config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + if model is None: + model = default_parser() + warnings.warn(Warnings.W098.format(name="parser")) + for key, value in default_config.items(): + if key not in cfg: + cfg[key] = value + return DependencyParser.from_nlp(nlp, model, **cfg) + +def ner_factory(nlp, model, **cfg): + default_config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + if model is None: + model = default_ner() + warnings.warn(Warnings.W098.format(name="ner")) + for key, value in default_config.items(): + if key not in cfg: + cfg[key] = value + return EntityRecognizer.from_nlp(nlp, model, **cfg) + +__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"] diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index c827ffc5c..70188f856 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -6,6 +6,7 @@ from ..tokens.doc cimport Doc from .pipe import Pipe from ..language import Language +from ..scorer import Scorer from .. import util @@ -130,6 +131,9 @@ class Sentencizer(Pipe): else: doc.c[j].sent_start = -1 + def score(self, examples, **kwargs): + return Scorer.score_spans(examples, "sents", **kwargs) + def to_bytes(self, exclude=tuple()): """Serialize the sentencizer to a bytestring. diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 603b7965e..c065ae72f 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -8,6 +8,7 @@ from .pipe import deserialize_config from .tagger import Tagger from ..language import Language from ..errors import Errors +from ..scorer import Scorer from .. import util @@ -104,6 +105,9 @@ class SentenceRecognizer(Tagger): def add_label(self, label, values=None): raise NotImplementedError + def score(self, examples, **kwargs): + return Scorer.score_spans(examples, "sents", **kwargs) + def to_bytes(self, exclude=tuple()): serialize = {} serialize["model"] = self.model.to_bytes diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index e4250b932..1c4105921 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -14,6 +14,7 @@ from ..language import Language from ..attrs import POS, ID from ..parts_of_speech import X from ..errors import Errors, TempErrors, Warnings +from ..scorer import Scorer from .. 
import util @@ -250,6 +251,13 @@ class Tagger(Pipe): with self.model.use_params(params): yield + def score(self, examples, **kwargs): + scores = {} + scores.update(Scorer.score_token_attr(examples, "tag", **kwargs)) + scores.update(Scorer.score_token_attr(examples, "pos", **kwargs)) + scores.update(Scorer.score_token_attr(examples, "lemma", **kwargs)) + return scores + def to_bytes(self, exclude=tuple()): serialize = {} serialize["model"] = self.model.to_bytes diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index ff79a600a..bc68bb806 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -6,6 +6,7 @@ from .pipe import Pipe from ..language import Language from ..gold import Example from ..errors import Errors +from ..scorer import Scorer from .. import util from ..tokens import Doc from ..vocab import Vocab @@ -250,3 +251,9 @@ class TextCategorizer(Pipe): if sgd is None: sgd = self.create_optimizer() return sgd + + def score(self, examples, positive_label=None, **kwargs): + return Scorer.score_cats(examples, "cats", labels=self.labels, + multi_label=self.model.attrs["multi_label"], + positive_label=positive_label, **kwargs + ) diff --git a/spacy/scorer.py b/spacy/scorer.py index 512f27e07..a95fe70cf 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -1,6 +1,8 @@ import numpy as np from .errors import Errors +from .util import get_lang_class +from .morphology import Morphology class PRFScore: @@ -32,6 +34,9 @@ class PRFScore: r = self.recall return 2 * ((p * r) / (p + r + 1e-100)) + def to_dict(self): + return {"p": self.precision, "r": self.recall, "f": self.fscore} + class ROCAUCScore: """ @@ -65,391 +70,405 @@ class ROCAUCScore: class Scorer: """Compute evaluation scores.""" - def __init__(self, eval_punct=False, pipeline=None): + def __init__(self, nlp=None, **cfg): """Initialize the Scorer. - - eval_punct (bool): Evaluate the dependency attachments to and from - punctuation. RETURNS (Scorer): The newly created object. DOCS: https://spacy.io/api/scorer#init """ - self.tokens = PRFScore() - self.sbd = PRFScore() - self.unlabelled = PRFScore() - self.labelled = PRFScore() - self.labelled_per_dep = dict() - self.tags = PRFScore() - self.pos = PRFScore() - self.morphs = PRFScore() - self.morphs_per_feat = dict() - self.sent_starts = PRFScore() - self.ner = PRFScore() - self.ner_per_ents = dict() - self.eval_punct = eval_punct - self.textcat = PRFScore() - self.textcat_f_per_cat = dict() - self.textcat_auc_per_cat = dict() - self.textcat_positive_label = None - self.textcat_multilabel = False + self.nlp = nlp + self.cfg = cfg - if pipeline: - for name, component in pipeline: - if name == "textcat": - self.textcat_multilabel = component.model.attrs["multi_label"] - self.textcat_positive_label = component.cfg.get( - "positive_label", None - ) - for label in component.cfg.get("labels", []): - self.textcat_auc_per_cat[label] = ROCAUCScore() - self.textcat_f_per_cat[label] = PRFScore() + if not nlp: + # create a default pipeline + nlp = get_lang_class("xx")() + nlp.add_pipe("senter") + nlp.add_pipe("tagger") + nlp.add_pipe("morphologizer") + nlp.add_pipe("parser") + nlp.add_pipe("ner") + nlp.add_pipe("textcat") + self.nlp = nlp - @property - def tags_acc(self): - """RETURNS (float): Part-of-speech tag accuracy (fine grained tags, - i.e. `Token.tag`). - """ - return self.tags.fscore * 100 - - @property - def pos_acc(self): - """RETURNS (float): Part-of-speech tag accuracy (coarse grained pos, - i.e. `Token.pos`). 
- """ - return self.pos.fscore * 100 - - @property - def morphs_acc(self): - """RETURNS (float): Morph tag accuracy (morphological features, - i.e. `Token.morph`). - """ - return self.morphs.fscore * 100 - - @property - def morphs_per_type(self): - """RETURNS (dict): Scores per dependency label. - """ - return { - k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} - for k, v in self.morphs_per_feat.items() - } - - @property - def sent_p(self): - """RETURNS (float): F-score for identification of sentence starts. - i.e. `Token.is_sent_start`). - """ - return self.sent_starts.precision * 100 - - @property - def sent_r(self): - """RETURNS (float): F-score for identification of sentence starts. - i.e. `Token.is_sent_start`). - """ - return self.sent_starts.recall * 100 - - @property - def sent_f(self): - """RETURNS (float): F-score for identification of sentence starts. - i.e. `Token.is_sent_start`). - """ - return self.sent_starts.fscore * 100 - - @property - def token_acc(self): - """RETURNS (float): Tokenization accuracy.""" - return self.tokens.precision * 100 - - @property - def uas(self): - """RETURNS (float): Unlabelled dependency score.""" - return self.unlabelled.fscore * 100 - - @property - def las(self): - """RETURNS (float): Labelled dependency score.""" - return self.labelled.fscore * 100 - - @property - def las_per_type(self): - """RETURNS (dict): Scores per dependency label. - """ - return { - k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} - for k, v in self.labelled_per_dep.items() - } - - @property - def ents_p(self): - """RETURNS (float): Named entity accuracy (precision).""" - return self.ner.precision * 100 - - @property - def ents_r(self): - """RETURNS (float): Named entity accuracy (recall).""" - return self.ner.recall * 100 - - @property - def ents_f(self): - """RETURNS (float): Named entity accuracy (F-score).""" - return self.ner.fscore * 100 - - @property - def ents_per_type(self): - """RETURNS (dict): Scores per entity label. - """ - return { - k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} - for k, v in self.ner_per_ents.items() - } - - @property - def textcat_f(self): - """RETURNS (float): f-score on positive label for binary classification, - macro-averaged f-score for multilabel classification - """ - if not self.textcat_multilabel: - if self.textcat_positive_label: - # binary classification - return self.textcat.fscore * 100 - # multi-class and/or multi-label - return ( - sum([score.fscore for label, score in self.textcat_f_per_cat.items()]) - / (len(self.textcat_f_per_cat) + 1e-100) - * 100 - ) - - @property - def textcat_auc(self): - """RETURNS (float): macro-averaged AUC ROC score for multilabel classification (-1 if undefined) - """ - return max( - sum([score.score for label, score in self.textcat_auc_per_cat.items()]) - / (len(self.textcat_auc_per_cat) + 1e-100), - -1, - ) - - @property - def textcats_auc_per_cat(self): - """RETURNS (dict): AUC ROC Scores per textcat label. - """ - return { - k: {"roc_auc_score": max(v.score, -1)} - for k, v in self.textcat_auc_per_cat.items() - } - - @property - def textcats_f_per_cat(self): - """RETURNS (dict): F-scores per textcat label. - """ - return { - k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} - for k, v in self.textcat_f_per_cat.items() - } - - @property - def scores(self): - """RETURNS (dict): All scores mapped by key. 
- """ - return { - "uas": self.uas, - "las": self.las, - "las_per_type": self.las_per_type, - "ents_p": self.ents_p, - "ents_r": self.ents_r, - "ents_f": self.ents_f, - "ents_per_type": self.ents_per_type, - "tags_acc": self.tags_acc, - "pos_acc": self.pos_acc, - "morphs_acc": self.morphs_acc, - "morphs_per_type": self.morphs_per_type, - "sent_p": self.sent_p, - "sent_r": self.sent_r, - "sent_f": self.sent_f, - "token_acc": self.token_acc, - "textcat_f": self.textcat_f, - "textcat_auc": self.textcat_auc, - "textcats_f_per_cat": self.textcats_f_per_cat, - "textcats_auc_per_cat": self.textcats_auc_per_cat, - } - - def score(self, example, verbose=False, punct_labels=("p", "punct")): - """Update the evaluation scores from a single Example. - - example (Example): The predicted annotations + correct annotations. - verbose (bool): Print debugging information. - punct_labels (tuple): Dependency labels for punctuation. Used to - evaluate dependency attachments to punctuation if `eval_punct` is - `True`. + def score(self, examples): + """Evaluate a list of Examples. + examples (Iterable[Example]): The predicted annotations + correct annotations. + RETURNS (Dict): A dictionary of scores. DOCS: https://spacy.io/api/scorer#score """ - doc = example.predicted - gold_doc = example.reference - align = example.alignment - gold_deps = set() - gold_deps_per_dep = {} - gold_tags = set() - gold_pos = set() - gold_morphs = set() - gold_morphs_per_feat = {} - gold_sent_starts = set() - for gold_i, token in enumerate(gold_doc): - gold_tags.add((gold_i, token.tag_)) - gold_pos.add((gold_i, token.pos_)) - gold_morphs.add((gold_i, token.morph_)) - if token.morph_: - for feat in token.morph_.split("|"): - field, values = feat.split("=") - if field not in self.morphs_per_feat: - self.morphs_per_feat[field] = PRFScore() - if field not in gold_morphs_per_feat: - gold_morphs_per_feat[field] = set() - gold_morphs_per_feat[field].add((gold_i, feat)) - if token.sent_start: - gold_sent_starts.add(gold_i) - dep = token.dep_.lower() - if dep not in punct_labels: - gold_deps.add((gold_i, token.head.i, dep)) - if dep not in self.labelled_per_dep: - self.labelled_per_dep[dep] = PRFScore() - if dep not in gold_deps_per_dep: - gold_deps_per_dep[dep] = set() - gold_deps_per_dep[dep].add((gold_i, token.head.i, dep)) - cand_deps = set() - cand_deps_per_dep = {} - cand_tags = set() - cand_pos = set() - cand_morphs = set() - cand_morphs_per_feat = {} - cand_sent_starts = set() - for token in doc: - if token.orth_.isspace(): + scores = {} + + if hasattr(self.nlp.tokenizer, "score"): + scores.update(self.nlp.tokenizer.score(examples, **self.cfg)) + for name, component in self.nlp.pipeline: + if hasattr(component, "score"): + scores.update(component.score(examples, **self.cfg)) + + return scores + + @staticmethod + def score_tokenization(examples, **cfg): + """Returns accuracy and PRF scores for tokenization. + + * token_acc: # correct tokens / # gold tokens + * token_p/r/f: PRF for token character spans + + examples (Iterable[Example]): Examples to score + RETURNS (dict): A dictionary containing the scores token_acc/p/r/f. 
+ """ + acc_score = PRFScore() + prf_score = PRFScore() + for example in examples: + gold_doc = example.reference + pred_doc = example.predicted + align = example.alignment + gold_spans = set() + pred_spans = set() + for token in gold_doc: + if token.orth_.isspace(): + continue + gold_spans.add((token.idx, token.idx + len(token))) + for token in pred_doc: + if token.orth_.isspace(): + continue + pred_spans.add((token.idx, token.idx + len(token))) + if align.x2y.lengths[token.i] != 1: + acc_score.fp += 1 + else: + acc_score.tp += 1 + prf_score.score_set(pred_spans, gold_spans) + return { + "token_acc": acc_score.fscore, + "token_p": prf_score.precision, + "token_r": prf_score.recall, + "token_f": prf_score.fscore, + } + + @staticmethod + def score_token_attr(examples, attr, getter=getattr, **cfg): + """Returns an accuracy score for a token-level attribute. + + examples (Iterable[Example]): Examples to score + attr (str): The attribute to score. + getter (callable): Defaults to getattr. If provided, + getter(token, attr) should return the value of the attribute for an + individual token. + RETURNS (dict): A dictionary containing the accuracy score under the + key attr_acc. + """ + tag_score = PRFScore() + for example in examples: + gold_doc = example.reference + pred_doc = example.predicted + align = example.alignment + gold_tags = set() + for gold_i, token in enumerate(gold_doc): + gold_tags.add((gold_i, getter(token, attr))) + pred_tags = set() + for token in pred_doc: + if token.orth_.isspace(): + continue + if align.x2y.lengths[token.i] == 1: + gold_i = align.x2y[token.i].dataXd[0, 0] + pred_tags.add((gold_i, getter(token, attr))) + tag_score.score_set(pred_tags, gold_tags) + return { + attr + "_acc": tag_score.fscore, + } + + @staticmethod + def score_token_attr_per_feat(examples, attr, getter=getattr, **cfg): + """Return PRF scores per feat for a token attribute in UFEATS format. + + examples (Iterable[Example]): Examples to score + attr (str): The attribute to score. + getter (callable): Defaults to getattr. If provided, + getter(token, attr) should return the value of the attribute for an + individual token. + RETURNS (dict): A dictionary containing the per-feat PRF scores unders + the key attr_per_feat. + """ + per_feat = {} + for example in examples: + pred_doc = example.predicted + gold_doc = example.reference + align = example.alignment + gold_per_feat = {} + for gold_i, token in enumerate(gold_doc): + morph = str(getter(token, attr)) + if morph: + for feat in morph.split(Morphology.FEATURE_SEP): + field, values = feat.split(Morphology.FIELD_SEP) + if field not in per_feat: + per_feat[field] = PRFScore() + if field not in gold_per_feat: + gold_per_feat[field] = set() + gold_per_feat[field].add((gold_i, feat)) + pred_per_feat = {} + for token in pred_doc: + if token.orth_.isspace(): + continue + if align.x2y.lengths[token.i] == 1: + gold_i = align.x2y[token.i].dataXd[0, 0] + morph = str(getter(token, attr)) + if morph: + for feat in morph.split("|"): + field, values = feat.split("=") + if field not in per_feat: + per_feat[field] = PRFScore() + if field not in pred_per_feat: + pred_per_feat[field] = set() + pred_per_feat[field].add((gold_i, feat)) + for field in per_feat: + per_feat[field].score_set( + pred_per_feat.get(field, set()), gold_per_feat.get(field, set()), + ) + return { + attr + "_per_feat": per_feat, + } + + @staticmethod + def score_spans(examples, attr, getter=getattr, **cfg): + """Returns PRF scores for labeled spans. 
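A hedged usage sketch of the token-level helper added above. The `Example` construction mirrors the tests later in this patch; the sentence and tag values are made up for illustration:

```python
import spacy
from spacy.gold import Example
from spacy.scorer import Scorer

nlp = spacy.blank("en")
doc = nlp("They buy green apples")
# Reference tags are supplied as a dict, as in the tests below
example = Example.from_dict(doc, {"tags": ["PRP", "VBP", "JJ", "NNS"]})

# The blank pipeline predicts no tags, so the accuracy comes out as 0.0
print(Scorer.score_token_attr([example], "tag"))  # -> {'tag_acc': 0.0}
```

The same pattern applies to `"pos"` and `"lemma"`, and `Scorer.score_token_attr_per_feat(examples, "morph")` returns the per-feature breakdown under `morph_per_feat`.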
+ + examples (Iterable[Example]): Examples to score + attr (str): The attribute to score. + getter (callable): Defaults to getattr. If provided, + getter(doc, attr) should return the spans for the individual doc. + RETURNS (dict): A dictionary containing the PRF scores under the + keys attr_p/r/f and the per-type PRF scores under attr_per_type. + """ + score = PRFScore() + score_per_type = dict() + for example in examples: + pred_doc = example.predicted + gold_doc = example.reference + # Find all labels in gold and doc + labels = set( + [k.label_ for k in getter(gold_doc, attr)] + + [k.label_ for k in getter(pred_doc, attr)] + ) + # Set up all labels for per type scoring and prepare gold per type + gold_per_type = {label: set() for label in labels} + for label in labels: + if label not in score_per_type: + score_per_type[label] = PRFScore() + # Find all predidate labels, for all and per type + gold_spans = set() + pred_spans = set() + + # Special case for ents: + # If we have missing values in the gold, we can't easily tell + # whether our NER predictions are true. + # It seems bad but it's what we've always done. + if attr == "ents" and not all(token.ent_iob != 0 for token in gold_doc): continue - if align.x2y.lengths[token.i] != 1: - self.tokens.fp += 1 - gold_i = None - else: - gold_i = align.x2y[token.i].dataXd[0, 0] - self.tokens.tp += 1 - cand_tags.add((gold_i, token.tag_)) - cand_pos.add((gold_i, token.pos_)) - cand_morphs.add((gold_i, token.morph_)) - if token.morph_: - for feat in token.morph_.split("|"): - field, values = feat.split("=") - if field not in self.morphs_per_feat: - self.morphs_per_feat[field] = PRFScore() - if field not in cand_morphs_per_feat: - cand_morphs_per_feat[field] = set() - cand_morphs_per_feat[field].add((gold_i, feat)) - if token.is_sent_start: - cand_sent_starts.add(gold_i) - if token.dep_.lower() not in punct_labels and token.orth_.strip(): - if align.x2y.lengths[token.head.i] == 1: - gold_head = align.x2y[token.head.i].dataXd[0, 0] - else: - gold_head = None - # None is indistinct, so we can't just add it to the set - # Multiple (None, None) deps are possible - if gold_i is None or gold_head is None: - self.unlabelled.fp += 1 - self.labelled.fp += 1 - else: - cand_deps.add((gold_i, gold_head, token.dep_.lower())) - if token.dep_.lower() not in self.labelled_per_dep: - self.labelled_per_dep[token.dep_.lower()] = PRFScore() - if token.dep_.lower() not in cand_deps_per_dep: - cand_deps_per_dep[token.dep_.lower()] = set() - cand_deps_per_dep[token.dep_.lower()].add( - (gold_i, gold_head, token.dep_.lower()) + + for span in getter(gold_doc, attr): + gold_span = (span.label_, span.start, span.end - 1) + gold_spans.add(gold_span) + gold_per_type[span.label_].add((span.label_, span.start, span.end - 1)) + pred_per_type = {label: set() for label in labels} + for span in example.get_aligned_spans_x2y(getter(pred_doc, attr)): + pred_spans.add((span.label_, span.start, span.end - 1)) + pred_per_type[span.label_].add((span.label_, span.start, span.end - 1)) + # Scores per label + for k, v in score_per_type.items(): + if k in pred_per_type: + v.score_set(pred_per_type[k], gold_per_type[k]) + # Score for all labels + score.score_set(pred_spans, gold_spans) + results = { + attr + "_p": score.precision, + attr + "_r": score.recall, + attr + "_f": score.fscore, + attr + "_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, + } + return results + + @staticmethod + def score_cats( + examples, + attr, + getter=getattr, + labels=[], + multi_label=True, + 
positive_label=None, + **cfg + ): + """Returns PRF and ROC AUC scores for a doc-level attribute with a + dict with scores for each label like Doc.cats. + + examples (Iterable[Example]): Examples to score + attr (str): The attribute to score. + getter (callable): Defaults to getattr. If provided, + getter(doc, attr) should return the values for the individual doc. + labels (Iterable[str]): The set of possible labels. Defaults to []. + multi_label (bool): Whether the attribute allows multiple labels. + Defaults to True. + positive_label (str): The positive label for a binary task with + exclusive classes. Defaults to None. + RETURNS (dict): A dictionary containing the scores: + for binary exclusive with positive label: attr_p/r/f, + for 3+ exclusive classes, macro-averaged fscore: attr_macro_f, + for multilabel, macro-averaged AUC: attr_macro_auc, + for all: attr_f_per_type, attr_auc_per_type + """ + score = PRFScore() + f_per_type = dict() + auc_per_type = dict() + for label in labels: + f_per_type[label] = PRFScore() + auc_per_type[label] = ROCAUCScore() + for example in examples: + gold_doc = example.reference + pred_doc = example.predicted + gold_values = getter(gold_doc, attr) + pred_values = getter(pred_doc, attr) + if ( + len(gold_values) > 0 + and set(f_per_type) == set(auc_per_type) == set(gold_values) + and set(gold_values) == set(pred_values) + ): + gold_val = max(gold_values, key=gold_values.get) + pred_val = max(pred_values, key=pred_values.get) + if positive_label: + score.score_set( + set([positive_label]) & set([pred_val]), + set([positive_label]) & set([gold_val]), + ) + for label in set(gold_values): + auc_per_type[label].score_set( + pred_values[label], gold_values[label] + ) + f_per_type[label].score_set( + set([label]) & set([pred_val]), set([label]) & set([gold_val]) + ) + elif len(f_per_type) > 0: + model_labels = set(f_per_type) + eval_labels = set(gold_values) + raise ValueError( + Errors.E162.format( + model_labels=model_labels, eval_labels=eval_labels ) - # Find all NER labels in gold and doc - ent_labels = set( - [k.label_ for k in gold_doc.ents] + [k.label_ for k in doc.ents] - ) - # Set up all labels for per type scoring and prepare gold per type - gold_per_ents = {ent_label: set() for ent_label in ent_labels} - for ent_label in ent_labels: - if ent_label not in self.ner_per_ents: - self.ner_per_ents[ent_label] = PRFScore() - # Find all candidate labels, for all and per type - gold_ents = set() - cand_ents = set() - # If we have missing values in the gold, we can't easily tell whether - # our NER predictions are true. - # It seems bad but it's what we've always done. 
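To make the branching at the end of `score_cats` concrete, here are illustrative calls assuming `examples` is an iterable of `Example` objects carrying `Doc.cats` on both the predicted and reference docs (labels and setup are hypothetical):

```python
from spacy.scorer import Scorer

# Binary, exclusive classes with a positive label -> cats_p / cats_r / cats_f
Scorer.score_cats(examples, "cats", labels=["POSITIVE", "NEGATIVE"],
                  multi_label=False, positive_label="POSITIVE")

# 3+ exclusive classes -> macro-averaged F-score under cats_macro_f
Scorer.score_cats(examples, "cats", labels=["A", "B", "C"], multi_label=False)

# Multilabel -> macro-averaged ROC AUC under cats_macro_auc
Scorer.score_cats(examples, "cats", labels=["A", "B", "C"], multi_label=True)
```

In every case the per-label breakdowns are returned under `cats_f_per_type` and `cats_auc_per_type`.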
- if all(token.ent_iob != 0 for token in gold_doc): - for ent in gold_doc.ents: - gold_ent = (ent.label_, ent.start, ent.end - 1) - gold_ents.add(gold_ent) - gold_per_ents[ent.label_].add((ent.label_, ent.start, ent.end - 1)) - cand_per_ents = {ent_label: set() for ent_label in ent_labels} - for ent in example.get_aligned_spans_x2y(doc.ents): - cand_ents.add((ent.label_, ent.start, ent.end - 1)) - cand_per_ents[ent.label_].add((ent.label_, ent.start, ent.end - 1)) - # Scores per ent - for k, v in self.ner_per_ents.items(): - if k in cand_per_ents: - v.score_set(cand_per_ents[k], gold_per_ents[k]) - # Score for all ents - self.ner.score_set(cand_ents, gold_ents) - self.tags.score_set(cand_tags, gold_tags) - self.pos.score_set(cand_pos, gold_pos) - self.morphs.score_set(cand_morphs, gold_morphs) - for field in self.morphs_per_feat: - self.morphs_per_feat[field].score_set( - cand_morphs_per_feat.get(field, set()), - gold_morphs_per_feat.get(field, set()), - ) - self.sent_starts.score_set(cand_sent_starts, gold_sent_starts) - self.labelled.score_set(cand_deps, gold_deps) - for dep in self.labelled_per_dep: - self.labelled_per_dep[dep].score_set( - cand_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set()) - ) - self.unlabelled.score_set( - set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps) - ) - if ( - len(gold_doc.cats) > 0 - and set(self.textcat_f_per_cat) - == set(self.textcat_auc_per_cat) - == set(gold_doc.cats) - and set(gold_doc.cats) == set(doc.cats) - ): - goldcat = max(gold_doc.cats, key=gold_doc.cats.get) - candcat = max(doc.cats, key=doc.cats.get) - if self.textcat_positive_label: - self.textcat.score_set( - set([self.textcat_positive_label]) & set([candcat]), - set([self.textcat_positive_label]) & set([goldcat]), ) - for label in set(gold_doc.cats): - self.textcat_auc_per_cat[label].score_set( - doc.cats[label], gold_doc.cats[label] + elif len(auc_per_type) > 0: + model_labels = set(auc_per_type) + eval_labels = set(gold_values) + raise ValueError( + Errors.E162.format( + model_labels=model_labels, eval_labels=eval_labels + ) ) - self.textcat_f_per_cat[label].score_set( - set([label]) & set([candcat]), set([label]) & set([goldcat]) + results = { + attr + "_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()}, + attr + "_auc_per_type": {k: v.score for k, v in auc_per_type.items()}, + } + if len(labels) == 2 and not multi_label and positive_label: + results[attr + "_p"] = score.precision + results[attr + "_r"] = score.recall + results[attr + "_f"] = score.fscore + elif not multi_label: + results[attr + "_macro_f"] = sum( + [score.fscore for label, score in f_per_type.items()] + ) / (len(f_per_type) + 1e-100) + else: + results[attr + "_macro_auc"] = max( + sum([score.score for label, score in auc_per_type.items()]) + / (len(auc_per_type) + 1e-100), + -1, + ) + return results + + @staticmethod + def score_deps( + examples, + attr, + getter=getattr, + head_attr="head", + head_getter=getattr, + ignore_labels=tuple(), + **cfg + ): + """Returns the UAS, LAS, and LAS per type scores for dependency + parses. + + examples (Iterable[Example]): Examples to score + attr (str): The attribute containing the dependency label. + getter (callable): Defaults to getattr. If provided, + getter(token, attr) should return the value of the attribute for an + individual token. + head_attr (str): The attribute containing the head token. Defaults to + 'head'. + head_getter (callable): Defaults to getattr. 
If provided, + head_getter(token, attr) should return the value of the head for an + individual token. + ignore_labels (Tuple): Labels to ignore while scoring (e.g., punct). + RETURNS (dict): A dictionary containing the scores: + attr_uas, attr_las, and attr_las_per_type. + """ + unlabelled = PRFScore() + labelled = PRFScore() + labelled_per_dep = dict() + for example in examples: + gold_doc = example.reference + pred_doc = example.predicted + align = example.alignment + gold_deps = set() + gold_deps_per_dep = {} + for gold_i, token in enumerate(gold_doc): + dep = getter(token, attr) + head = head_getter(token, head_attr) + if dep not in ignore_labels: + gold_deps.add((gold_i, head.i, dep)) + if dep not in labelled_per_dep: + labelled_per_dep[dep] = PRFScore() + if dep not in gold_deps_per_dep: + gold_deps_per_dep[dep] = set() + gold_deps_per_dep[dep].add((gold_i, head.i, dep)) + pred_deps = set() + pred_deps_per_dep = {} + for token in pred_doc: + if token.orth_.isspace(): + continue + if align.x2y.lengths[token.i] != 1: + gold_i = None + else: + gold_i = align.x2y[token.i].dataXd[0, 0] + dep = getter(token, attr) + head = head_getter(token, head_attr) + if dep not in ignore_labels and token.orth_.strip(): + if align.x2y.lengths[head.i] == 1: + gold_head = align.x2y[head.i].dataXd[0, 0] + else: + gold_head = None + # None is indistinct, so we can't just add it to the set + # Multiple (None, None) deps are possible + if gold_i is None or gold_head is None: + unlabelled.fp += 1 + labelled.fp += 1 + else: + pred_deps.add((gold_i, gold_head, dep)) + if dep not in labelled_per_dep: + labelled_per_dep[dep] = PRFScore() + if dep not in pred_deps_per_dep: + pred_deps_per_dep[dep] = set() + pred_deps_per_dep[dep].add((gold_i, gold_head, dep)) + labelled.score_set(pred_deps, gold_deps) + for dep in labelled_per_dep: + labelled_per_dep[dep].score_set( + pred_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set()) ) - elif len(self.textcat_f_per_cat) > 0: - model_labels = set(self.textcat_f_per_cat) - eval_labels = set(gold_doc.cats) - raise ValueError( - Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels) + unlabelled.score_set( + set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps) ) - elif len(self.textcat_auc_per_cat) > 0: - model_labels = set(self.textcat_auc_per_cat) - eval_labels = set(gold_doc.cats) - raise ValueError( - Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels) - ) - if verbose: - gold_words = gold_doc.words - for w_id, h_id, dep in cand_deps - gold_deps: - print("F", gold_words[w_id], dep, gold_words[h_id]) - for w_id, h_id, dep in gold_deps - cand_deps: - print("M", gold_words[w_id], dep, gold_words[h_id]) + return { + attr + "_uas": unlabelled.fscore, + attr + "_las": labelled.fscore, + attr + + "_las_per_type": {k: v.to_dict() for k, v in labelled_per_dep.items()}, + } ############################################################################# diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 5eb09a007..ff284873d 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -85,6 +85,8 @@ def test_overfitting_IO(): fix_random_seed(0) nlp = English() textcat = nlp.add_pipe("textcat") + # Set exclusive labels + textcat.model.attrs["multi_label"] = False train_examples = [] for text, annotations in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) @@ -114,6 +116,10 @@ def test_overfitting_IO(): assert 
cats2["POSITIVE"] > 0.9 assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1) + # Test scoring + scores = nlp.evaluate(train_examples, component_cfg={"scorer": {"positive_label": "POSITIVE"}}) + assert scores["cats_f"] == 1.0 + # fmt: off @pytest.mark.parametrize( diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index a6684b706..fea263df5 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -7,6 +7,7 @@ from spacy.scorer import Scorer, ROCAUCScore from spacy.scorer import _roc_auc_score, _roc_curve from .util import get_doc from spacy.lang.en import English +from spacy.tokens import Doc test_las_apple = [ @@ -77,13 +78,61 @@ def tagged_doc(): doc[i].tag_ = tags[i] doc[i].pos_ = pos[i] doc[i].morph_ = morphs[i] + if i > 0: + doc[i].is_sent_start = False doc.is_tagged = True return doc +@pytest.fixture +def sented_doc(): + text = "One sentence. Two sentences. Three sentences." + nlp = English() + doc = nlp(text) + for i in range(len(doc)): + if i % 3 == 0: + doc[i].is_sent_start = True + else: + doc[i].is_sent_start = False + return doc + + +def test_tokenization(sented_doc): + scorer = Scorer() + gold = {"sent_starts": [t.sent_start for t in sented_doc]} + example = Example.from_dict(sented_doc, gold) + scores = scorer.score([example]) + assert scores["token_acc"] == 1.0 + + nlp = English() + example.predicted = Doc(nlp.vocab, words=["One", "sentence.", "Two", "sentences.", "Three", "sentences."], spaces=[True, True, True, True, True, False]) + example.predicted[1].is_sent_start = False + scores = scorer.score([example]) + assert scores["token_acc"] == approx(0.66666666) + assert scores["token_p"] == 0.5 + assert scores["token_r"] == approx(0.33333333) + assert scores["token_f"] == 0.4 + + +def test_sents(sented_doc): + scorer = Scorer() + gold = {"sent_starts": [t.sent_start for t in sented_doc]} + example = Example.from_dict(sented_doc, gold) + scores = scorer.score([example]) + assert scores["sents_f"] == 1.0 + + # One sentence start is moved + gold["sent_starts"][3] = 0 + gold["sent_starts"][4] = 1 + example = Example.from_dict(sented_doc, gold) + scores = scorer.score([example]) + assert scores["sents_f"] == approx(0.3333333) + + def test_las_per_type(en_vocab): # Gold and Doc are identical scorer = Scorer() + examples = [] for input_, annot in test_las_apple: doc = get_doc( en_vocab, @@ -93,20 +142,21 @@ def test_las_per_type(en_vocab): ) gold = {"heads": annot["heads"], "deps": annot["deps"]} example = Example.from_dict(doc, gold) - scorer.score(example) - results = scorer.scores + examples.append(example) + results = scorer.score(examples) - assert results["uas"] == 100 - assert results["las"] == 100 - assert results["las_per_type"]["nsubj"]["p"] == 100 - assert results["las_per_type"]["nsubj"]["r"] == 100 - assert results["las_per_type"]["nsubj"]["f"] == 100 - assert results["las_per_type"]["compound"]["p"] == 100 - assert results["las_per_type"]["compound"]["r"] == 100 - assert results["las_per_type"]["compound"]["f"] == 100 + assert results["dep_uas"] == 1.0 + assert results["dep_las"] == 1.0 + assert results["dep_las_per_type"]["nsubj"]["p"] == 1.0 + assert results["dep_las_per_type"]["nsubj"]["r"] == 1.0 + assert results["dep_las_per_type"]["nsubj"]["f"] == 1.0 + assert results["dep_las_per_type"]["compound"]["p"] == 1.0 + assert results["dep_las_per_type"]["compound"]["r"] == 1.0 + assert results["dep_las_per_type"]["compound"]["f"] == 1.0 # One dep is incorrect in Doc scorer = Scorer() + examples = [] for input_, 
annot in test_las_apple: doc = get_doc( en_vocab, @@ -117,22 +167,23 @@ def test_las_per_type(en_vocab): gold = {"heads": annot["heads"], "deps": annot["deps"]} doc[0].dep_ = "compound" example = Example.from_dict(doc, gold) - scorer.score(example) - results = scorer.scores + examples.append(example) + results = scorer.score(examples) - assert results["uas"] == 100 - assert_almost_equal(results["las"], 90.9090909) - assert results["las_per_type"]["nsubj"]["p"] == 0 - assert results["las_per_type"]["nsubj"]["r"] == 0 - assert results["las_per_type"]["nsubj"]["f"] == 0 - assert_almost_equal(results["las_per_type"]["compound"]["p"], 66.6666666) - assert results["las_per_type"]["compound"]["r"] == 100 - assert results["las_per_type"]["compound"]["f"] == 80 + assert results["dep_uas"] == 1.0 + assert_almost_equal(results["dep_las"], 0.9090909) + assert results["dep_las_per_type"]["nsubj"]["p"] == 0 + assert results["dep_las_per_type"]["nsubj"]["r"] == 0 + assert results["dep_las_per_type"]["nsubj"]["f"] == 0 + assert_almost_equal(results["dep_las_per_type"]["compound"]["p"], 0.666666666) + assert results["dep_las_per_type"]["compound"]["r"] == 1.0 + assert results["dep_las_per_type"]["compound"]["f"] == 0.8 def test_ner_per_type(en_vocab): # Gold and Doc are identical scorer = Scorer() + examples = [] for input_, annot in test_ner_cardinal: doc = get_doc( en_vocab, @@ -140,20 +191,24 @@ def test_ner_per_type(en_vocab): ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]], ) entities = biluo_tags_from_offsets(doc, annot["entities"]) - ex = Example.from_dict(doc, {"entities": entities}) - scorer.score(ex) - results = scorer.scores + example = Example.from_dict(doc, {"entities": entities}) + # a hack for sentence boundaries + example.predicted[1].is_sent_start = False + example.reference[1].is_sent_start = False + examples.append(example) + results = scorer.score(examples) - assert results["ents_p"] == 100 - assert results["ents_f"] == 100 - assert results["ents_r"] == 100 - assert results["ents_per_type"]["CARDINAL"]["p"] == 100 - assert results["ents_per_type"]["CARDINAL"]["f"] == 100 - assert results["ents_per_type"]["CARDINAL"]["r"] == 100 + assert results["ents_p"] == 1.0 + assert results["ents_r"] == 1.0 + assert results["ents_f"] == 1.0 + assert results["ents_per_type"]["CARDINAL"]["p"] == 1.0 + assert results["ents_per_type"]["CARDINAL"]["r"] == 1.0 + assert results["ents_per_type"]["CARDINAL"]["f"] == 1.0 # Doc has one missing and one extra entity # Entity type MONEY is not present in Doc scorer = Scorer() + examples = [] for input_, annot in test_ner_apple: doc = get_doc( en_vocab, @@ -161,25 +216,28 @@ def test_ner_per_type(en_vocab): ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]], ) entities = biluo_tags_from_offsets(doc, annot["entities"]) - ex = Example.from_dict(doc, {"entities": entities}) - scorer.score(ex) - results = scorer.scores + example = Example.from_dict(doc, {"entities": entities}) + # a hack for sentence boundaries + example.predicted[1].is_sent_start = False + example.reference[1].is_sent_start = False + examples.append(example) + results = scorer.score(examples) - assert results["ents_p"] == approx(66.66666) - assert results["ents_r"] == approx(66.66666) - assert results["ents_f"] == approx(66.66666) + assert results["ents_p"] == approx(0.6666666) + assert results["ents_r"] == approx(0.6666666) + assert results["ents_f"] == approx(0.6666666) assert "GPE" in results["ents_per_type"] assert "MONEY" in results["ents_per_type"] assert "ORG" in results["ents_per_type"] - 
assert results["ents_per_type"]["GPE"]["p"] == 100 - assert results["ents_per_type"]["GPE"]["r"] == 100 - assert results["ents_per_type"]["GPE"]["f"] == 100 + assert results["ents_per_type"]["GPE"]["p"] == 1.0 + assert results["ents_per_type"]["GPE"]["r"] == 1.0 + assert results["ents_per_type"]["GPE"]["f"] == 1.0 assert results["ents_per_type"]["MONEY"]["p"] == 0 assert results["ents_per_type"]["MONEY"]["r"] == 0 assert results["ents_per_type"]["MONEY"]["f"] == 0 - assert results["ents_per_type"]["ORG"]["p"] == 50 - assert results["ents_per_type"]["ORG"]["r"] == 100 - assert results["ents_per_type"]["ORG"]["f"] == approx(66.66666) + assert results["ents_per_type"]["ORG"]["p"] == 0.5 + assert results["ents_per_type"]["ORG"]["r"] == 1.0 + assert results["ents_per_type"]["ORG"]["f"] == approx(0.6666666) def test_tag_score(tagged_doc): @@ -189,17 +247,17 @@ def test_tag_score(tagged_doc): "tags": [t.tag_ for t in tagged_doc], "pos": [t.pos_ for t in tagged_doc], "morphs": [t.morph_ for t in tagged_doc], + "sent_starts": [1 if t.is_sent_start else -1 for t in tagged_doc], } example = Example.from_dict(tagged_doc, gold) - scorer.score(example) - results = scorer.scores + results = scorer.score([example]) - assert results["tags_acc"] == 100 - assert results["pos_acc"] == 100 - assert results["morphs_acc"] == 100 - assert results["morphs_per_type"]["NounType"]["f"] == 100 + assert results["tag_acc"] == 1.0 + assert results["pos_acc"] == 1.0 + assert results["morph_acc"] == 1.0 + assert results["morph_per_feat"]["NounType"].fscore == 1.0 - # Gold and Doc are identical + # Gold annotation is modified scorer = Scorer() tags = [t.tag_ for t in tagged_doc] tags[0] = "NN" @@ -208,16 +266,21 @@ def test_tag_score(tagged_doc): morphs = [t.morph_ for t in tagged_doc] morphs[1] = "Number=sing" morphs[2] = "Number=plur" - gold = {"tags": tags, "pos": pos, "morphs": morphs} + gold = { + "tags": tags, + "pos": pos, + "morphs": morphs, + "sent_starts": gold["sent_starts"], + } example = Example.from_dict(tagged_doc, gold) - scorer.score(example) - results = scorer.scores + results = scorer.score([example]) - assert results["tags_acc"] == 90 - assert results["pos_acc"] == 90 - assert results["morphs_acc"] == approx(80) - assert results["morphs_per_type"]["Poss"]["f"] == 0.0 - assert results["morphs_per_type"]["Number"]["f"] == approx(72.727272) + assert results["tag_acc"] == 0.9 + assert results["pos_acc"] == 0.9 + assert results["morph_acc"] == approx(0.8) + assert results["morph_per_feat"]["NounType"].fscore == 1.0 + assert results["morph_per_feat"]["Poss"].fscore == 0.0 + assert results["morph_per_feat"]["Number"].fscore == approx(0.72727272) def test_roc_auc_score(): diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 114d227c8..c84dd8627 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -24,6 +24,7 @@ from . import util from .util import registry from .attrs import intify_attrs from .symbols import ORTH +from .scorer import Scorer @registry.tokenizers("spacy.Tokenizer.v1") @@ -743,6 +744,9 @@ cdef class Tokenizer: tokens.extend(reversed(suffixes)) return tokens + def score(self, examples, **kwargs): + return Scorer.score_tokenization(examples) + def to_disk(self, path, **kwargs): """Save the current state to a directory. diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 3ba93b360..be402532c 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -108,8 +108,8 @@ Evaluate a model's pipeline components. 
> #### Example > > ```python -> scorer = nlp.evaluate(examples, verbose=True) -> print(scorer.scores) +> scores = nlp.evaluate(examples, verbose=True) +> print(scores) > ``` | Name | Type | Description | @@ -119,7 +119,7 @@ Evaluate a model's pipeline components. | `batch_size` | int | The batch size to use. | | `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. | | `component_cfg` 2.1 | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. | -| **RETURNS** | Scorer | The scorer containing the evaluation scores. | +| **RETURNS** | `Dict[str, Union[float, Dict]]` | A dictionary of evaluation scores. | ## Language.begin_training {#begin_training tag="method"} diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index cd720d26c..ef4396e1b 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -5,9 +5,12 @@ tag: class source: spacy/scorer.py --- -The `Scorer` computes and stores evaluation scores. It's typically created by +The `Scorer` computes evaluation scores. It's typically created by [`Language.evaluate`](/api/language#evaluate). +In addition, the `Scorer` provides a number of evaluation methods for +evaluating `Token` and `Doc` attributes. + ## Scorer.\_\_init\_\_ {#init tag="method"} Create a new `Scorer`. @@ -17,46 +20,114 @@ Create a new `Scorer`. > ```python > from spacy.scorer import Scorer > +> # default scoring pipeline > scorer = Scorer() +> +> # provided scoring pipeline +> nlp = spacy.load("en_core_web_sm") +> scorer = Scorer(nlp) > ``` | Name | Type | Description | | ------------ | -------- | ------------------------------------------------------------ | -| `eval_punct` | bool | Evaluate the dependency attachments to and from punctuation. | +| `nlp` | Language | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. | | **RETURNS** | `Scorer` | The newly created object. | ## Scorer.score {#score tag="method"} -Update the evaluation scores from a single [`Example`](/api/example) object. +Calculate the scores for a list of [`Example`](/api/example) objects using the +scoring methods provided by the components in the pipeline. +The returned `Dict` contains the scores provided by the individual pipeline +components. For the scoring methods provided by the `Scorer` and use by the +core pipeline components, the individual score names start with the `Token` or +`Doc` attribute being scored: `token_acc`, `token_p/r/f`, `sents_p/r/f`, +`tag_acc`, `pos_acc`, `morph_acc`, `morph_per_feat`, `lemma_acc`, `dep_uas`, +`dep_las`, `dep_las_per_type`, `ents_p/r/f`, `ents_per_type`, +`textcat_macro_auc`, `textcat_macro_f`. + > #### Example > > ```python > scorer = Scorer() -> scorer.score(example) +> scorer.score(examples) > ``` -| Name | Type | Description | -| -------------- | --------- | -------------------------------------------------------------------------------------------------------------------- | -| `example` | `Example` | The `Example` object holding both the predictions and the correct gold-standard annotations. | -| `verbose` | bool | Print debugging information. | -| `punct_labels` | tuple | Dependency labels for punctuation. Used to evaluate dependency attachments to punctuation if `eval_punct` is `True`. 
| +| Name | Type | Description | +| ----------- | --------- | --------------------------------------------------------------------------------------------------------| +| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | +| **RETURNS** | `Dict` | A dictionary of scores. | +## Scorer.score_tokenization {#score_tokenization tag="staticmethod"} -## Properties +Scores the tokenization: + +* `token_acc`: # correct tokens / # gold tokens +* `token_p/r/f`: PRF for token character spans + +| Name | Type | Description | +| ----------- | --------- | --------------------------------------------------------------------------------------------------------| +| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | +| **RETURNS** | `Dict` | A dictionary containing the scores `token_acc/p/r/f`. | + +## Scorer.score_token_attr {#score_token_attr tag="staticmethod"} + +Scores a single token attribute. + +| Name | Type | Description | +| ----------- | --------- | --------------------------------------------------------------------------------------------------------| +| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | +| `attr` | `str` | The attribute to score. | +| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. | +| **RETURNS** | `Dict` | A dictionary containing the score `attr_acc`. | + +## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod"} + +Scores a single token attribute per feature for a token attribute in UFEATS format. + +| Name | Type | Description | +| ----------- | --------- | --------------------------------------------------------------------------------------------------------| +| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | +| `attr` | `str` | The attribute to score. | +| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. | +| **RETURNS** | `Dict` | A dictionary containing the per-feature PRF scores unders the key `attr_per_feat`. | + +## Scorer.score_spans {#score_spans tag="staticmethod"} + +Returns PRF scores for labeled or unlabeled spans. + +| Name | Type | Description | +| ----------- | --------- | --------------------------------------------------------------------------------------------------------| +| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | +| `attr` | `str` | The attribute to score. | +| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. | +| **RETURNS** | `Dict` | A dictionary containing the PRF scores under the keys `attr_p/r/f` and the per-type PRF scores under `attr_per_type`. | + +## Scorer.score_deps {#score_deps tag="staticmethod"} + +Calculate the UAS, LAS, and LAS per type scores for dependency parses. 
+ +| Name | Type | Description | +| ----------- | --------- | --------------------------------------------------------------------------------------------------------| +| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | +| `attr` | `str` | The attribute containing the dependency label. | +| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. | +| `head_attr` | `str` | The attribute containing the head token. | +| `head_getter` | `callable` | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. | +| `ignore_labels` | `Tuple` | Labels to ignore while scoring (e.g., `punct`). +| **RETURNS** | `Dict` | A dictionary containing the scores: `attr_uas`, `attr_las`, and `attr_las_per_type`. | + +## Scorer.score_cats {#score_cats tag="staticmethod"} + +Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict +containing scores for each label like `Doc.cats`. + +| Name | Type | Description | +| ----------- | --------- | --------------------------------------------------------------------------------------------------------| +| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | +| `attr` | `str` | The attribute to score. | +| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. | +| labels | `Iterable[str]` | The set of possible labels. Defaults to `[]`. | +| multi_label | `bool` | Whether the attribute allows multiple labels. Defaults to `True`. | +| positive_label | `str` | The positive label for a binary task with exclusive classes. Defaults to `None`. | +| **RETURNS** | `Dict` | A dictionary containing the scores: 1) for binary exclusive with positive label: `attr_p/r/f`; 2) for 3+ exclusive classes, macro-averaged fscore: `attr_macro_f`; 3) for multilabel, macro-averaged AUC: `attr_macro_auc`; 4) for all: `attr_f_per_type`, `attr_auc_per_type` | -| Name | Type | Description | -| --------------------------------------------------- | ----- | -------------------------------------------------------------------------------------- | -| `token_acc` | float | Tokenization accuracy. | -| `tags_acc` | float | Part-of-speech tag accuracy (fine grained tags, i.e. `Token.tag`). | -| `uas` | float | Unlabelled dependency score. | -| `las` | float | Labelled dependency score. | -| `ents_p` | float | Named entity accuracy (precision). | -| `ents_r` | float | Named entity accuracy (recall). | -| `ents_f` | float | Named entity accuracy (F-score). | -| `ents_per_type` 2.1.5 | dict | Scores per entity label. Keyed by label, mapped to a dict of `p`, `r` and `f` scores. | -| `textcat_f` 3.0 | float | F-score on positive label for binary classification, macro-averaged F-score otherwise. | -| `textcat_auc` 3.0 | float | Macro-averaged AUC ROC score for multilabel classification (`-1` if undefined). | -| `textcats_f_per_cat` 3.0 | dict | F-scores per textcat label, keyed by label. | -| `textcats_auc_per_cat` 3.0 | dict | ROC AUC scores per textcat label, keyed by label. | -| `las_per_type` 2.2.3 | dict | Labelled dependency scores, keyed by label. | -| `scores` | dict | All scores, keyed by type. |
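Taken together, the refactored API can be exercised roughly as follows. This is a sketch, assuming a trained pipeline such as `en_core_web_sm` is available for this development version and that `dev_data` holds evaluation data; the score keys follow the docs above:

```python
import spacy
from spacy.gold import Example
from spacy.scorer import Scorer

# dev_data is assumed to be a list of (text, annotation-dict) pairs
nlp = spacy.load("en_core_web_sm")
examples = [Example.from_dict(nlp(text), annots) for text, annots in dev_data]

# Score directly: each pipeline component contributes its own scores
scorer = Scorer(nlp)
scores = scorer.score(examples)
print(scores["token_acc"], scores["tag_acc"], scores["dep_uas"], scores["ents_f"])

# Or go through Language.evaluate, which returns the same kind of dictionary
scores = nlp.evaluate(examples)
```

Note that the returned PRF values are in the 0.0 to 1.0 range, no longer multiplied by 100 as in the previous `Scorer`.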