From a77f50baa43029d3676fdaa6079e0635444de21b Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 23 Nov 2021 15:17:19 +0100
Subject: [PATCH] Allow Scorer.score_spans to handle pred docs with missing
 annotation (#9701)

If a predicted doc is missing annotation according to `has_annotation`,
treat that doc as having no predictions rather than raising an error.

The motivation for this is a combined tokenization+sents scorer for a
component where the sents annotation is optional. To provide a single
scorer in the component factory, the scorer must be able to continue
when the sents annotation is missing because the component is not
annotating sents.
---
 spacy/scorer.py | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/spacy/scorer.py b/spacy/scorer.py
index cfdf34e62..4d596b5e1 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -359,14 +359,15 @@ class Scorer:
             pred_doc = example.predicted
             gold_doc = example.reference
             # Option to handle docs without annotation for this attribute
-            if has_annotation is not None:
-                if not has_annotation(gold_doc):
-                    continue
-            # Find all labels in gold and doc
-            labels = set(
-                [k.label_ for k in getter(gold_doc, attr)]
-                + [k.label_ for k in getter(pred_doc, attr)]
-            )
+            if has_annotation is not None and not has_annotation(gold_doc):
+                continue
+            # Find all labels in gold
+            labels = set([k.label_ for k in getter(gold_doc, attr)])
+            # If labeled, find all labels in pred
+            if has_annotation is None or (
+                has_annotation is not None and has_annotation(pred_doc)
+            ):
+                labels |= set([k.label_ for k in getter(pred_doc, attr)])
             # Set up all labels for per type scoring and prepare gold per type
             gold_per_type: Dict[str, Set] = {label: set() for label in labels}
             for label in labels:
@@ -384,16 +385,19 @@ class Scorer:
                 gold_spans.add(gold_span)
                 gold_per_type[span.label_].add(gold_span)
             pred_per_type: Dict[str, Set] = {label: set() for label in labels}
-            for span in example.get_aligned_spans_x2y(
-                getter(pred_doc, attr), allow_overlap
+            if has_annotation is None or (
+                has_annotation is not None and has_annotation(pred_doc)
             ):
-                pred_span: Tuple
-                if labeled:
-                    pred_span = (span.label_, span.start, span.end - 1)
-                else:
-                    pred_span = (span.start, span.end - 1)
-                pred_spans.add(pred_span)
-                pred_per_type[span.label_].add(pred_span)
+                for span in example.get_aligned_spans_x2y(
+                    getter(pred_doc, attr), allow_overlap
+                ):
+                    pred_span: Tuple
+                    if labeled:
+                        pred_span = (span.label_, span.start, span.end - 1)
+                    else:
+                        pred_span = (span.start, span.end - 1)
+                    pred_spans.add(pred_span)
+                    pred_per_type[span.label_].add(pred_span)
             # Scores per label
             if labeled:
                 for k, v in score_per_type.items():