Handle missing reference values in scorer (#6286)
* Handle missing reference values in scorer

  Handle missing values in the reference doc during scoring, wherever an unset state can be detected for the attribute. If no reference docs contain annotation, `None` is returned instead of a score. `spacy evaluate` displays `-` for missing scores and the missing scores are saved as `None`/`null` in the metrics.

  Attributes without unset states:

  * `token.head`: relies on `token.dep` to recognize unset values
  * `doc.cats`: unable to handle missing annotation

  Additional changes:

  * add an optional `has_annotation` check to `score_spans` to replace the `doc.sents` hack
  * update `score_token_attr_per_feat` to handle missing and empty morph representations
  * fix a bug in `Doc.has_annotation` for the normalization of `IS_SENT_START` vs. `SENT_START`

* Fix import

* Update return types
Parent: 5d2cb86c34
Commit: a4b32b9552
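The user-visible effect of the change is that scores for attributes with no reference annotation come back as `None` rather than a misleading `0.0`. A minimal sketch of that behaviour from the caller's side (the toy text and the printed message are illustrative, not part of this commit):

```python
import spacy
from spacy.tokens import Doc
from spacy.training import Example
from spacy.scorer import Scorer

nlp = spacy.blank("en")
pred = nlp("a b c")
# Reference doc with no tag/pos/morph/dep/sentence/entity annotation
ref = Doc(nlp.vocab, words=["a", "b", "c"])

scores = Scorer().score([Example(pred, ref)])
for key, value in scores.items():
    if value is None:
        print(f"{key}: no reference annotation, not scored")
```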
@@ -98,10 +98,13 @@ def evaluate(
         if key in scores:
             if key == "cats_score":
                 metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
-            if key == "speed":
-                results[metric] = f"{scores[key]:.0f}"
+            if isinstance(scores[key], (int, float)):
+                if key == "speed":
+                    results[metric] = f"{scores[key]:.0f}"
+                else:
+                    results[metric] = f"{scores[key]*100:.2f}"
             else:
-                results[metric] = f"{scores[key]*100:.2f}"
+                results[metric] = "-"
             data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]

     msg.table(results, title="Results")
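Stripped of the CLI context, the new display logic amounts to type-checking each score before formatting it. A rough, self-contained equivalent (the `scores` dict here is made up):

```python
scores = {"tag_acc": 0.97, "dep_uas": None, "speed": 15000.0}

results = {}
for key, value in scores.items():
    if isinstance(value, (int, float)):
        # speed is reported as tokens per second, everything else as a percentage
        results[key] = f"{value:.0f}" if key == "speed" else f"{value * 100:.2f}"
    else:
        # missing scores (None) are rendered as a placeholder
        results[key] = "-"

print(results)  # {'tag_acc': '97.00', 'dep_uas': '-', 'speed': '15000'}
```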
@@ -226,6 +226,9 @@ class AttributeRuler(Pipe):

         DOCS: https://nightly.spacy.io/api/tagger#score
         """
+        def morph_key_getter(token, attr):
+            return getattr(token, attr).key
+
         validate_examples(examples, "AttributeRuler.score")
         results = {}
         attrs = set()

@@ -237,7 +240,8 @@ class AttributeRuler(Pipe):
             elif attr == POS:
                 results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
             elif attr == MORPH:
-                results.update(Scorer.score_token_attr(examples, "morph", **kwargs))
+                results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
+                results.update(Scorer.score_token_attr_per_feat(examples, "morph", getter=morph_key_getter, **kwargs))
             elif attr == LEMMA:
                 results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
             return results
@@ -155,13 +155,16 @@ cdef class DependencyParser(Parser):

         DOCS: https://nightly.spacy.io/api/dependencyparser#score
         """
+        def has_sents(doc):
+            return doc.has_annotation("SENT_START")
+
         validate_examples(examples, "DependencyParser.score")
         def dep_getter(token, attr):
             dep = getattr(token, attr)
             dep = token.vocab.strings.as_string(dep).lower()
             return dep
         results = {}
-        results.update(Scorer.score_spans(examples, "sents", **kwargs))
+        results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
         kwargs.setdefault("getter", dep_getter)
         kwargs.setdefault("ignore_labels", ("p", "punct"))
         results.update(Scorer.score_deps(examples, "dep", **kwargs))
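For dependencies there is no separate predicate: a token whose `dep` value is in `missing_values` is treated as unannotated, and a corpus with no dependency annotation at all now scores as `None`. A small sketch of calling the scorer directly (toy two-token docs, not from the commit):

```python
import spacy
from spacy.training import Example
from spacy.scorer import Scorer

nlp = spacy.blank("en")
pred = nlp("a b")
pred[0].head = pred[1]
pred[0].dep_ = "dep"
pred[1].head = pred[1]
pred[1].dep_ = "ROOT"

ref = nlp("a b")  # reference has no dependency annotation at all

print(Scorer.score_deps([Example(pred, ref)], "dep"))
# {'dep_uas': None, 'dep_las': None, 'dep_las_per_type': None}
```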
@@ -10,7 +10,7 @@ from ..errors import Errors
 from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
 from ..tokens import Doc, Span
 from ..matcher import Matcher, PhraseMatcher
-from ..scorer import Scorer
+from ..scorer import get_ner_prf
 from ..training import validate_examples


@@ -340,7 +340,7 @@ class EntityRuler(Pipe):

     def score(self, examples, **kwargs):
         validate_examples(examples, "EntityRuler.score")
-        return Scorer.score_spans(examples, "ents", **kwargs)
+        return get_ner_prf(examples)

     def from_bytes(
         self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
@@ -251,10 +251,13 @@ class Morphologizer(Tagger):

         DOCS: https://nightly.spacy.io/api/morphologizer#score
         """
+        def morph_key_getter(token, attr):
+            return getattr(token, attr).key
+
         validate_examples(examples, "Morphologizer.score")
         results = {}
         results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
-        results.update(Scorer.score_token_attr(examples, "morph", **kwargs))
+        results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
         results.update(Scorer.score_token_attr_per_feat(examples,
-            "morph", **kwargs))
+            "morph", getter=morph_key_getter, **kwargs))
         return results
@@ -122,13 +122,4 @@ cdef class EntityRecognizer(Parser):
         DOCS: https://nightly.spacy.io/api/entityrecognizer#score
         """
         validate_examples(examples, "EntityRecognizer.score")
-        score_per_type = get_ner_prf(examples)
-        totals = PRFScore()
-        for prf in score_per_type.values():
-            totals += prf
-        return {
-            "ents_p": totals.precision,
-            "ents_r": totals.recall,
-            "ents_f": totals.fscore,
-            "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
-        }
+        return get_ner_prf(examples)
@@ -155,8 +155,11 @@ class Sentencizer(Pipe):

         DOCS: https://nightly.spacy.io/api/sentencizer#score
         """
+        def has_sents(doc):
+            return doc.has_annotation("SENT_START")
+
         validate_examples(examples, "Sentencizer.score")
-        results = Scorer.score_spans(examples, "sents", **kwargs)
+        results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
         del results["sents_per_type"]
         return results

@@ -160,7 +160,10 @@ class SentenceRecognizer(Tagger):
         RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
         DOCS: https://nightly.spacy.io/api/sentencerecognizer#score
         """
+        def has_sents(doc):
+            return doc.has_annotation("SENT_START")
+
         validate_examples(examples, "SentenceRecognizer.score")
-        results = Scorer.score_spans(examples, "sents", **kwargs)
+        results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
         del results["sents_per_type"]
         return results
spacy/scorer.py (257 changed lines)
@@ -1,9 +1,9 @@
-from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING
+from typing import Optional, Iterable, Dict, Set, Any, Callable, TYPE_CHECKING
 import numpy as np
 from collections import defaultdict

 from .training import Example
-from .tokens import Token, Doc, Span
+from .tokens import Token, Doc, Span, MorphAnalysis
 from .errors import Errors
 from .util import get_lang_class, SimpleFrozenList
 from .morphology import Morphology
@@ -13,7 +13,8 @@ if TYPE_CHECKING:
     from .language import Language  # noqa: F401


-DEFAULT_PIPELINE = ["senter", "tagger", "morphologizer", "parser", "ner", "textcat"]
+DEFAULT_PIPELINE = ("senter", "tagger", "morphologizer", "parser", "ner", "textcat")
+MISSING_VALUES = frozenset([None, 0, ""])


 class PRFScore:
@@ -24,6 +25,9 @@ class PRFScore:
         self.fp = 0
         self.fn = 0

+    def __len__(self) -> int:
+        return self.tp + self.fp + self.fn
+
     def __iadd__(self, other):
         self.tp += other.tp
         self.fp += other.fp
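`PRFScore.__len__` is what makes the missing-value handling cheap to check: a score object that never accumulated a true positive, false positive, or false negative has length zero, so callers can return `None` instead of a fabricated 0.0. A minimal re-statement of the pattern (not the full spaCy class):

```python
class PRFScore:
    def __init__(self):
        self.tp = 0
        self.fp = 0
        self.fn = 0

    def __len__(self):
        # nothing was ever scored if all counts are zero
        return self.tp + self.fp + self.fn

    @property
    def fscore(self):
        p = self.tp / (self.tp + self.fp + 1e-100)
        r = self.tp / (self.tp + self.fn + 1e-100)
        return 2 * ((p * r) / (p + r + 1e-100))


score = PRFScore()
print(len(score))                                # 0
print(score.fscore if len(score) > 0 else None)  # None instead of 0.0
```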
@@ -94,7 +98,7 @@ class Scorer:
         self,
         nlp: Optional["Language"] = None,
         default_lang: str = "xx",
-        default_pipeline=DEFAULT_PIPELINE,
+        default_pipeline: Iterable[str] = DEFAULT_PIPELINE,
         **cfg,
     ) -> None:
         """Initialize the Scorer.
@@ -126,13 +130,13 @@ class Scorer:
         return scores

     @staticmethod
-    def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, float]:
+    def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, Any]:
         """Returns accuracy and PRF scores for tokenization.
         * token_acc: # correct tokens / # gold tokens
         * token_p/r/f: PRF for token character spans

         examples (Iterable[Example]): Examples to score
-        RETURNS (Dict[str, float]): A dictionary containing the scores
+        RETURNS (Dict[str, Any]): A dictionary containing the scores
             token_acc/p/r/f.

         DOCS: https://nightly.spacy.io/api/scorer#score_tokenization
@@ -142,6 +146,8 @@ class Scorer:
         for example in examples:
             gold_doc = example.reference
             pred_doc = example.predicted
+            if gold_doc.has_unknown_spaces:
+                continue
             align = example.alignment
             gold_spans = set()
             pred_spans = set()
@@ -158,12 +164,20 @@ class Scorer:
             else:
                 acc_score.tp += 1
             prf_score.score_set(pred_spans, gold_spans)
-        return {
-            "token_acc": acc_score.fscore,
-            "token_p": prf_score.precision,
-            "token_r": prf_score.recall,
-            "token_f": prf_score.fscore,
-        }
+        if len(acc_score) > 0:
+            return {
+                "token_acc": acc_score.fscore,
+                "token_p": prf_score.precision,
+                "token_r": prf_score.recall,
+                "token_f": prf_score.fscore,
+            }
+        else:
+            return {
+                "token_acc": None,
+                "token_p": None,
+                "token_r": None,
+                "token_f": None
+            }

     @staticmethod
     def score_token_attr(
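Because reference docs flagged with `has_unknown_spaces` are skipped entirely, a corpus whose gold tokenization has no reliable spacing now yields a `token_acc` of `None`. A hedged sketch of that effect (toy text, using the public training API):

```python
import spacy
from spacy.tokens import Doc
from spacy.training import Example
from spacy.scorer import Scorer

nlp = spacy.blank("en")
pred = nlp("a b c d e")
ref = Doc(nlp.vocab, words=["a", "b", "c", "d", "e"])
ref.has_unknown_spaces = True  # gold tokenization without spacing information

scores = Scorer.score_tokenization([Example(pred, ref)])
print(scores["token_acc"])  # None: the only reference doc was skipped
```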
@@ -171,8 +185,9 @@ class Scorer:
         attr: str,
         *,
         getter: Callable[[Token, str], Any] = getattr,
+        missing_values: Set[Any] = MISSING_VALUES,
         **cfg,
-    ) -> Dict[str, float]:
+    ) -> Dict[str, Any]:
         """Returns an accuracy score for a token-level attribute.

         examples (Iterable[Example]): Examples to score
@@ -180,7 +195,7 @@ class Scorer:
         getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
             getter(token, attr) should return the value of the attribute for an
             individual token.
-        RETURNS (Dict[str, float]): A dictionary containing the accuracy score
+        RETURNS (Dict[str, Any]): A dictionary containing the accuracy score
             under the key attr_acc.

         DOCS: https://nightly.spacy.io/api/scorer#score_token_attr
@@ -191,17 +206,27 @@ class Scorer:
             pred_doc = example.predicted
             align = example.alignment
             gold_tags = set()
+            missing_indices = set()
             for gold_i, token in enumerate(gold_doc):
-                gold_tags.add((gold_i, getter(token, attr)))
+                value = getter(token, attr)
+                if value not in missing_values:
+                    gold_tags.add((gold_i, getter(token, attr)))
+                else:
+                    missing_indices.add(gold_i)
             pred_tags = set()
             for token in pred_doc:
                 if token.orth_.isspace():
                     continue
                 if align.x2y.lengths[token.i] == 1:
                     gold_i = align.x2y[token.i].dataXd[0, 0]
-                    pred_tags.add((gold_i, getter(token, attr)))
+                    if gold_i not in missing_indices:
+                        pred_tags.add((gold_i, getter(token, attr)))
             tag_score.score_set(pred_tags, gold_tags)
-        return {f"{attr}_acc": tag_score.fscore}
+        score_key = f"{attr}_acc"
+        if len(tag_score) == 0:
+            return {score_key: None}
+        else:
+            return {score_key: tag_score.fscore}

     @staticmethod
     def score_token_attr_per_feat(
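With `missing_values`, only reference tokens that actually carry a value count towards the accuracy, and if none of them do the result is `None`. A short sketch of calling the static method directly (toy tags, not from the commit):

```python
import spacy
from spacy.training import Example
from spacy.scorer import Scorer

nlp = spacy.blank("en")
pred = nlp("a b c")
pred[0].tag_ = "A"
pred[1].tag_ = "B"

ref = nlp("a b c")
ref[0].tag_ = "A"  # only the first reference token is annotated

print(Scorer.score_token_attr([Example(pred, ref)], "tag"))
# {'tag_acc': 1.0} -- the unannotated reference tokens are ignored

ref_empty = nlp("a b c")  # no tags in the reference at all
print(Scorer.score_token_attr([Example(pred, ref_empty)], "tag"))
# {'tag_acc': None}
```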
@@ -209,8 +234,9 @@ class Scorer:
         attr: str,
         *,
         getter: Callable[[Token, str], Any] = getattr,
+        missing_values: Set[Any] = MISSING_VALUES,
         **cfg,
-    ):
+    ) -> Dict[str, Any]:
         """Return PRF scores per feat for a token attribute in UFEATS format.

         examples (Iterable[Example]): Examples to score
@@ -218,7 +244,7 @@ class Scorer:
         getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
             getter(token, attr) should return the value of the attribute for an
             individual token.
-        RETURNS (dict): A dictionary containing the per-feat PRF scores unders
+        RETURNS (dict): A dictionary containing the per-feat PRF scores under
             the key attr_per_feat.
         """
         per_feat = {}
@@ -227,9 +253,11 @@ class Scorer:
             gold_doc = example.reference
             align = example.alignment
             gold_per_feat = {}
+            missing_indices = set()
             for gold_i, token in enumerate(gold_doc):
-                morph = str(getter(token, attr))
-                if morph:
+                value = getter(token, attr)
+                morph = gold_doc.vocab.strings[value]
+                if value not in missing_values and morph != Morphology.EMPTY_MORPH:
                     for feat in morph.split(Morphology.FEATURE_SEP):
                         field, values = feat.split(Morphology.FIELD_SEP)
                         if field not in per_feat:
@@ -237,27 +265,35 @@ class Scorer:
                         if field not in gold_per_feat:
                             gold_per_feat[field] = set()
                         gold_per_feat[field].add((gold_i, feat))
+                else:
+                    missing_indices.add(gold_i)
             pred_per_feat = {}
             for token in pred_doc:
                 if token.orth_.isspace():
                     continue
                 if align.x2y.lengths[token.i] == 1:
                     gold_i = align.x2y[token.i].dataXd[0, 0]
-                    morph = str(getter(token, attr))
-                    if morph:
-                        for feat in morph.split("|"):
-                            field, values = feat.split("=")
-                            if field not in per_feat:
-                                per_feat[field] = PRFScore()
-                            if field not in pred_per_feat:
-                                pred_per_feat[field] = set()
-                            pred_per_feat[field].add((gold_i, feat))
+                    if gold_i not in missing_indices:
+                        value = getter(token, attr)
+                        morph = gold_doc.vocab.strings[value]
+                        if value not in missing_values and morph != Morphology.EMPTY_MORPH:
+                            for feat in morph.split(Morphology.FEATURE_SEP):
+                                field, values = feat.split(Morphology.FIELD_SEP)
+                                if field not in per_feat:
+                                    per_feat[field] = PRFScore()
+                                if field not in pred_per_feat:
+                                    pred_per_feat[field] = set()
+                                pred_per_feat[field].add((gold_i, feat))
             for field in per_feat:
                 per_feat[field].score_set(
                     pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
                 )
-        result = {k: v.to_dict() for k, v in per_feat.items()}
-        return {f"{attr}_per_feat": result}
+        score_key = f"{attr}_per_feat"
+        if any([len(v) for v in per_feat.values()]):
+            result = {k: v.to_dict() for k, v in per_feat.items()}
+            return {score_key: result}
+        else:
+            return {score_key: None}

     @staticmethod
     def score_spans(
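The per-feature scorer now resolves the morph value through the string store and treats both missing values and the explicit empty-morph marker as unannotated. A sketch of a direct call, using the same `morph_key_getter` pattern the components above use (feature values are invented):

```python
import spacy
from spacy.training import Example
from spacy.scorer import Scorer


def morph_key_getter(token, attr):
    return getattr(token, attr).key


nlp = spacy.blank("en")
pred = nlp("a b")
pred[0].set_morph("Case=Nom|Number=Sing")

ref = nlp("a b")
ref[0].set_morph("Case=Nom|Number=Plur")  # the second token stays unannotated

scores = Scorer.score_token_attr_per_feat(
    [Example(pred, ref)], "morph", getter=morph_key_getter
)
print(scores["morph_per_feat"]["Case"]["f"])    # 1.0
print(scores["morph_per_feat"]["Number"]["f"])  # 0.0
```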
@@ -265,6 +301,7 @@ class Scorer:
         attr: str,
         *,
         getter: Callable[[Doc, str], Iterable[Span]] = getattr,
+        has_annotation: Optional[Callable[[Doc], bool]] = None,
         **cfg,
     ) -> Dict[str, Any]:
         """Returns PRF scores for labeled spans.
@@ -284,18 +321,10 @@ class Scorer:
         for example in examples:
             pred_doc = example.predicted
             gold_doc = example.reference
-            # TODO
-            # This is a temporary hack to work around the problem that the scorer
-            # fails if you have examples that are not fully annotated for all
-            # the tasks in your pipeline. For instance, you might have a corpus
-            # of NER annotations that does not set sentence boundaries, but the
-            # pipeline includes a parser or senter, and then the score_weights
-            # are used to evaluate that component. When the scorer attempts
-            # to read the sentences from the gold document, it fails.
-            try:
-                list(getter(gold_doc, attr))
-            except ValueError:
-                continue
+            # Option to handle docs without sents
+            if has_annotation is not None:
+                if not has_annotation(gold_doc):
+                    continue
             # Find all labels in gold and doc
             labels = set(
                 [k.label_ for k in getter(gold_doc, attr)]
@@ -323,13 +352,21 @@ class Scorer:
                 v.score_set(pred_per_type[k], gold_per_type[k])
         # Score for all labels
         score.score_set(pred_spans, gold_spans)
-        results = {
-            f"{attr}_p": score.precision,
-            f"{attr}_r": score.recall,
-            f"{attr}_f": score.fscore,
-            f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
-        }
-        return results
+        if len(score) > 0:
+            return {
+                f"{attr}_p": score.precision,
+                f"{attr}_r": score.recall,
+                f"{attr}_f": score.fscore,
+                f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
+            }
+        else:
+            return {
+                f"{attr}_p": None,
+                f"{attr}_r": None,
+                f"{attr}_f": None,
+                f"{attr}_per_type": None,
+            }

     @staticmethod
     def score_cats(
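`score_spans` stays agnostic about what counts as annotated; the caller passes the `has_annotation` predicate. A sketch of the sentence case, mirroring how the sentence components above use it (toy text):

```python
import spacy
from spacy.training import Example
from spacy.scorer import Scorer

nlp = spacy.blank("en")
pred = nlp("One sentence here. Another one here.")
ref = nlp("One sentence here. Another one here.")  # no sentence boundaries set


def has_sents(doc):
    return doc.has_annotation("SENT_START")


scores = Scorer.score_spans([Example(pred, ref)], "sents", has_annotation=has_sents)
print(scores["sents_f"])  # None: the reference doc has no sentence annotation
```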
@@ -390,9 +427,6 @@ class Scorer:
             pred_cats = getter(example.predicted, attr)
             gold_cats = getter(example.reference, attr)

-            # I think the AUC metric is applicable regardless of whether we're
-            # doing multi-label classification? Unsure. If not, move this into
-            # the elif pred_cats and gold_cats block below.
             for label in labels:
                 pred_score = pred_cats.get(label, 0.0)
                 gold_score = gold_cats.get(label, 0.0)
@@ -542,6 +576,7 @@ class Scorer:
         head_attr: str = "head",
         head_getter: Callable[[Token, str], Token] = getattr,
         ignore_labels: Iterable[str] = SimpleFrozenList(),
+        missing_values: Set[Any] = MISSING_VALUES,
         **cfg,
     ) -> Dict[str, Any]:
         """Returns the UAS, LAS, and LAS per type scores for dependency
@@ -566,6 +601,7 @@ class Scorer:
         unlabelled = PRFScore()
         labelled = PRFScore()
         labelled_per_dep = dict()
+        missing_indices = set()
         for example in examples:
             gold_doc = example.reference
             pred_doc = example.predicted
@@ -575,13 +611,16 @@ class Scorer:
             for gold_i, token in enumerate(gold_doc):
                 dep = getter(token, attr)
                 head = head_getter(token, head_attr)
-                if dep not in ignore_labels:
-                    gold_deps.add((gold_i, head.i, dep))
-                    if dep not in labelled_per_dep:
-                        labelled_per_dep[dep] = PRFScore()
-                    if dep not in gold_deps_per_dep:
-                        gold_deps_per_dep[dep] = set()
-                    gold_deps_per_dep[dep].add((gold_i, head.i, dep))
+                if dep not in missing_values:
+                    if dep not in ignore_labels:
+                        gold_deps.add((gold_i, head.i, dep))
+                        if dep not in labelled_per_dep:
+                            labelled_per_dep[dep] = PRFScore()
+                        if dep not in gold_deps_per_dep:
+                            gold_deps_per_dep[dep] = set()
+                        gold_deps_per_dep[dep].add((gold_i, head.i, dep))
+                else:
+                    missing_indices.add(gold_i)
             pred_deps = set()
             pred_deps_per_dep = {}
             for token in pred_doc:
@@ -591,25 +630,26 @@ class Scorer:
                     gold_i = None
                 else:
                     gold_i = align.x2y[token.i].dataXd[0, 0]
-                dep = getter(token, attr)
-                head = head_getter(token, head_attr)
-                if dep not in ignore_labels and token.orth_.strip():
-                    if align.x2y.lengths[head.i] == 1:
-                        gold_head = align.x2y[head.i].dataXd[0, 0]
-                    else:
-                        gold_head = None
-                    # None is indistinct, so we can't just add it to the set
-                    # Multiple (None, None) deps are possible
-                    if gold_i is None or gold_head is None:
-                        unlabelled.fp += 1
-                        labelled.fp += 1
-                    else:
-                        pred_deps.add((gold_i, gold_head, dep))
-                        if dep not in labelled_per_dep:
-                            labelled_per_dep[dep] = PRFScore()
-                        if dep not in pred_deps_per_dep:
-                            pred_deps_per_dep[dep] = set()
-                        pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
+                if gold_i not in missing_indices:
+                    dep = getter(token, attr)
+                    head = head_getter(token, head_attr)
+                    if dep not in ignore_labels and token.orth_.strip():
+                        if align.x2y.lengths[head.i] == 1:
+                            gold_head = align.x2y[head.i].dataXd[0, 0]
+                        else:
+                            gold_head = None
+                        # None is indistinct, so we can't just add it to the set
+                        # Multiple (None, None) deps are possible
+                        if gold_i is None or gold_head is None:
+                            unlabelled.fp += 1
+                            labelled.fp += 1
+                        else:
+                            pred_deps.add((gold_i, gold_head, dep))
+                            if dep not in labelled_per_dep:
+                                labelled_per_dep[dep] = PRFScore()
+                            if dep not in pred_deps_per_dep:
+                                pred_deps_per_dep[dep] = set()
+                            pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
             labelled.score_set(pred_deps, gold_deps)
             for dep in labelled_per_dep:
                 labelled_per_dep[dep].score_set(
@@ -618,29 +658,34 @@ class Scorer:
             unlabelled.score_set(
                 set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps)
             )
-        return {
-            f"{attr}_uas": unlabelled.fscore,
-            f"{attr}_las": labelled.fscore,
-            f"{attr}_las_per_type": {
-                k: v.to_dict() for k, v in labelled_per_dep.items()
-            },
-        }
+        if len(unlabelled) > 0:
+            return {
+                f"{attr}_uas": unlabelled.fscore,
+                f"{attr}_las": labelled.fscore,
+                f"{attr}_las_per_type": {
+                    k: v.to_dict() for k, v in labelled_per_dep.items()
+                },
+            }
+        else:
+            return {
+                f"{attr}_uas": None,
+                f"{attr}_las": None,
+                f"{attr}_las_per_type": None,
+            }


-def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
-    """Compute per-entity PRFScore objects for a sequence of examples. The
-    results are returned as a dictionary keyed by the entity type. You can
-    add the PRFScore objects to get micro-averaged total.
+def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]:
+    """Compute micro-PRF and per-entity PRF scores for a sequence of examples.
     """
-    scores = defaultdict(PRFScore)
+    score_per_type = defaultdict(PRFScore)
     for eg in examples:
         if not eg.y.has_annotation("ENT_IOB"):
             continue
         golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
         align_x2y = eg.alignment.x2y
         for pred_ent in eg.x.ents:
-            if pred_ent.label_ not in scores:
-                scores[pred_ent.label_] = PRFScore()
+            if pred_ent.label_ not in score_per_type:
+                score_per_type[pred_ent.label_] = PRFScore()
             indices = align_x2y[pred_ent.start : pred_ent.end].dataXd.ravel()
             if len(indices):
                 g_span = eg.y[indices[0] : indices[-1] + 1]
@@ -650,13 +695,29 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
                 if all(token.ent_iob != 0 for token in g_span):
                     key = (pred_ent.label_, indices[0], indices[-1] + 1)
                     if key in golds:
-                        scores[pred_ent.label_].tp += 1
+                        score_per_type[pred_ent.label_].tp += 1
                         golds.remove(key)
                     else:
-                        scores[pred_ent.label_].fp += 1
+                        score_per_type[pred_ent.label_].fp += 1
         for label, start, end in golds:
-            scores[label].fn += 1
-    return scores
+            score_per_type[label].fn += 1
+    totals = PRFScore()
+    for prf in score_per_type.values():
+        totals += prf
+    if len(totals) > 0:
+        return {
+            "ents_p": totals.precision,
+            "ents_r": totals.recall,
+            "ents_f": totals.fscore,
+            "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
+        }
+    else:
+        return {
+            "ents_p": None,
+            "ents_r": None,
+            "ents_f": None,
+            "ents_per_type": None,
+        }


 #############################################################################
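`get_ner_prf` now returns the finished score dict (micro-averaged PRF plus per-type scores) instead of raw `PRFScore` objects, which is why `EntityRecognizer.score` and `EntityRuler.score` above collapse to a single call. A sketch of calling it directly (toy entities, not from the commit):

```python
import spacy
from spacy.tokens import Span
from spacy.training import Example
from spacy.scorer import get_ner_prf

nlp = spacy.blank("en")
pred = nlp("Apple is buying a startup")
pred.ents = [Span(pred, 0, 1, label="ORG")]

ref = nlp("Apple is buying a startup")
ref.ents = [Span(ref, 0, 1, label="ORG")]

scores = get_ner_prf([Example(pred, ref)])
print(scores["ents_f"], scores["ents_per_type"]["ORG"]["f"])  # 1.0 1.0
```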
@@ -160,8 +160,8 @@ def test_attributeruler_score(nlp, pattern_dicts):
     scores = nlp.evaluate(dev_examples)
     # "cat" is the only correct lemma
     assert scores["lemma_acc"] == pytest.approx(0.2)
-    # the empty morphs are correct
-    assert scores["morph_acc"] == pytest.approx(0.6)
+    # no morphs are set
+    assert scores["morph_acc"] == None


 def test_attributeruler_rule_order(nlp):
@@ -277,6 +277,62 @@ def test_tag_score(tagged_doc):
     assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272)


+def test_partial_annotation(en_tokenizer):
+    pred_doc = en_tokenizer("a b c d e")
+    pred_doc[0].tag_ = "A"
+    pred_doc[0].pos_ = "X"
+    pred_doc[0].set_morph("Feat=Val")
+    pred_doc[0].dep_ = "dep"
+
+    # unannotated reference
+    ref_doc = en_tokenizer("a b c d e")
+    ref_doc.has_unknown_spaces = True
+    example = Example(pred_doc, ref_doc)
+    scorer = Scorer()
+    scores = scorer.score([example])
+    for key in scores:
+        # cats doesn't have an unset state
+        if key.startswith("cats"):
+            continue
+        assert scores[key] == None
+
+    # partially annotated reference, not overlapping with predicted annotation
+    ref_doc = en_tokenizer("a b c d e")
+    ref_doc.has_unknown_spaces = True
+    ref_doc[1].tag_ = "A"
+    ref_doc[1].pos_ = "X"
+    ref_doc[1].set_morph("Feat=Val")
+    ref_doc[1].dep_ = "dep"
+    example = Example(pred_doc, ref_doc)
+    scorer = Scorer()
+    scores = scorer.score([example])
+    assert scores["token_acc"] == None
+    assert scores["tag_acc"] == 0.0
+    assert scores["pos_acc"] == 0.0
+    assert scores["morph_acc"] == 0.0
+    assert scores["dep_uas"] == 1.0
+    assert scores["dep_las"] == 0.0
+    assert scores["sents_f"] == None
+
+    # partially annotated reference, overlapping with predicted annotation
+    ref_doc = en_tokenizer("a b c d e")
+    ref_doc.has_unknown_spaces = True
+    ref_doc[0].tag_ = "A"
+    ref_doc[0].pos_ = "X"
+    ref_doc[1].set_morph("Feat=Val")
+    ref_doc[1].dep_ = "dep"
+    example = Example(pred_doc, ref_doc)
+    scorer = Scorer()
+    scores = scorer.score([example])
+    assert scores["token_acc"] == None
+    assert scores["tag_acc"] == 1.0
+    assert scores["pos_acc"] == 1.0
+    assert scores["morph_acc"] == 0.0
+    assert scores["dep_uas"] == 1.0
+    assert scores["dep_las"] == 0.0
+    assert scores["sents_f"] == None
+
+
 def test_roc_auc_score():
     # Binary classification, toy tests from scikit-learn test suite
     y_true = [0, 1]
@@ -399,14 +399,13 @@ cdef class Doc:
             return True
         cdef int i
         cdef int range_start = 0
-        if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]:
-            attr = SENT_START
         attr = intify_attr(attr)
         # adjust attributes
         if attr == HEAD:
             # HEAD does not have an unset state, so rely on DEP
             attr = DEP
+        elif attr == self.vocab.strings["IS_SENT_START"]:
+            # as in Matcher, allow IS_SENT_START as an alias of SENT_START
+            attr = SENT_START
         # special cases for sentence boundaries
         if attr == SENT_START:
             if "sents" in self.user_hooks:
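The normalization fix means sentence-boundary annotation is detected the same way regardless of how the attribute is spelled internally; `IS_SENT_START` is folded into `SENT_START` after `intify_attr`, as in the `Matcher`. A small behavioural sketch for the common string form (toy text, not from the commit):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("This is a test. So is this.")

# Freshly tokenized: no sentence boundaries have been annotated yet
print(doc.has_annotation("SENT_START"))  # False

doc[5].is_sent_start = True  # mark "So" as a sentence start
print(doc.has_annotation("SENT_START"))  # True
```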
@@ -683,6 +683,7 @@ The L2 norm of the document's vector representation.
 | `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
 | `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
 | `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
+| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ |
 | `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |

 ## Serialization fields {#serialization-fields}
@@ -68,6 +68,8 @@ Scores the tokenization:
 - `token_p`, `token_r`, `token_f`: precision, recall and F-score for token
   character spans

+Docs with `has_unknown_spaces` are skipped during scoring.
+
 > #### Example
 >
 > ```python
@@ -81,7 +83,8 @@ Scores the tokenization:

 ## Scorer.score_token_attr {#score_token_attr tag="staticmethod" new="3"}

-Scores a single token attribute.
+Scores a single token attribute. Tokens with missing values in the reference doc
+are skipped during scoring.

 > #### Example
 >
@@ -90,20 +93,22 @@ Scores a single token attribute.
 > print(scores["pos_acc"])
 > ```

-| Name           | Description |
-| -------------- | ----------- |
-| `examples`     | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
-| `attr`         | The attribute to score. ~~str~~ |
-| _keyword-only_ | |
-| `getter`       | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
-| **RETURNS**    | A dictionary containing the score `{attr}_acc`. ~~Dict[str, float]~~ |
+| Name             | Description |
+| ---------------- | ----------- |
+| `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
+| `attr`           | The attribute to score. ~~str~~ |
+| _keyword-only_   | |
+| `getter`         | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
+| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ |
+| **RETURNS**      | A dictionary containing the score `{attr}_acc`. ~~Dict[str, float]~~ |

 ## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod" new="3"}

 Scores a single token attribute per feature for a token attribute in the
 Universal Dependencies
 [FEATS](https://universaldependencies.org/format.html#morphological-annotation)
-format.
+format. Tokens with missing values in the reference doc are skipped during
+scoring.

 > #### Example
 >
@@ -112,13 +117,14 @@ format.
 > print(scores["morph_per_feat"])
 > ```

-| Name           | Description |
-| -------------- | ----------- |
-| `examples`     | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
-| `attr`         | The attribute to score. ~~str~~ |
-| _keyword-only_ | |
-| `getter`       | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
-| **RETURNS**    | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~ |
+| Name             | Description |
+| ---------------- | ----------- |
+| `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
+| `attr`           | The attribute to score. ~~str~~ |
+| _keyword-only_   | |
+| `getter`         | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
+| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ |
+| **RETURNS**      | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~ |

 ## Scorer.score_spans {#score_spans tag="staticmethod" new="3"}

@@ -131,17 +137,19 @@ Returns PRF scores for labeled or unlabeled spans.
 > print(scores["ents_f"])
 > ```

-| Name           | Description |
-| -------------- | ----------- |
-| `examples`     | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
-| `attr`         | The attribute to score. ~~str~~ |
-| _keyword-only_ | |
-| `getter`       | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~ |
-| **RETURNS**    | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
+| Name             | Description |
+| ---------------- | ----------- |
+| `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
+| `attr`           | The attribute to score. ~~str~~ |
+| _keyword-only_   | |
+| `getter`         | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~ |
+| `has_annotation` | Defaults to `None`. If provided, `has_annotation(doc)` should return whether a `Doc` has annotation for this `attr`. Docs without annotation are skipped for scoring purposes. ~~str~~ |
+| **RETURNS**      | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |

 ## Scorer.score_deps {#score_deps tag="staticmethod" new="3"}

-Calculate the UAS, LAS, and LAS per type scores for dependency parses.
+Calculate the UAS, LAS, and LAS per type scores for dependency parses. Tokens
+with missing values for the `attr` (typically `dep`) are skipped during scoring.

 > #### Example
 >
|
@ -160,16 +168,17 @@ Calculate the UAS, LAS, and LAS per type scores for dependency parses.
|
|||
> print(scores["dep_uas"], scores["dep_las"])
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
|
||||
| `attr` | The attribute to score. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
|
||||
| `head_attr` | The attribute containing the head token. ~~str~~ |
|
||||
| `head_getter` | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. ~~Callable[[Doc, str], Token]~~ |
|
||||
| `ignore_labels` | Labels to ignore while scoring (e.g. `"punct"`). ~~Iterable[str]~~ |
|
||||
| **RETURNS** | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||
| Name | Description |
|
||||
| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
|
||||
| `attr` | The attribute to score. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
|
||||
| `head_attr` | The attribute containing the head token. ~~str~~ |
|
||||
| `head_getter` | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. ~~Callable[[Doc, str], Token]~~ |
|
||||
| `ignore_labels` | Labels to ignore while scoring (e.g. `"punct"`). ~~Iterable[str]~~ |
|
||||
| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ |
|
||||
| **RETURNS** | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||
|
||||
## Scorer.score_cats {#score_cats tag="staticmethod" new="3"}
|
||||
|
||||
|
|