From a77f50baa43029d3676fdaa6079e0635444de21b Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 23 Nov 2021 15:17:19 +0100 Subject: [PATCH] Allow Scorer.score_spans to handle pred docs with missing annotation (#9701) If the predicted docs are missing annotation according to `has_annotation`, treat the docs as having no predictions rather than raising errors when the annotation is missing. The motivation for this is a combined tokenization+sents scorer for a component where the sents annotation is optional. To provide a single scorer in the component factory, it needs to be possible for the scorer to continue despite missing sents annotation in the case where the component is not annotating sents. --- spacy/scorer.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/spacy/scorer.py b/spacy/scorer.py index cfdf34e62..4d596b5e1 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -359,14 +359,15 @@ class Scorer: pred_doc = example.predicted gold_doc = example.reference # Option to handle docs without annotation for this attribute - if has_annotation is not None: - if not has_annotation(gold_doc): - continue - # Find all labels in gold and doc - labels = set( - [k.label_ for k in getter(gold_doc, attr)] - + [k.label_ for k in getter(pred_doc, attr)] - ) + if has_annotation is not None and not has_annotation(gold_doc): + continue + # Find all labels in gold + labels = set([k.label_ for k in getter(gold_doc, attr)]) + # If labeled, find all labels in pred + if has_annotation is None or ( + has_annotation is not None and has_annotation(pred_doc) + ): + labels |= set([k.label_ for k in getter(pred_doc, attr)]) # Set up all labels for per type scoring and prepare gold per type gold_per_type: Dict[str, Set] = {label: set() for label in labels} for label in labels: @@ -384,16 +385,19 @@ class Scorer: gold_spans.add(gold_span) gold_per_type[span.label_].add(gold_span) pred_per_type: Dict[str, Set] = {label: set() for label in labels} - for span in example.get_aligned_spans_x2y( - getter(pred_doc, attr), allow_overlap + if has_annotation is None or ( + has_annotation is not None and has_annotation(pred_doc) ): - pred_span: Tuple - if labeled: - pred_span = (span.label_, span.start, span.end - 1) - else: - pred_span = (span.start, span.end - 1) - pred_spans.add(pred_span) - pred_per_type[span.label_].add(pred_span) + for span in example.get_aligned_spans_x2y( + getter(pred_doc, attr), allow_overlap + ): + pred_span: Tuple + if labeled: + pred_span = (span.label_, span.start, span.end - 1) + else: + pred_span = (span.start, span.end - 1) + pred_spans.add(pred_span) + pred_per_type[span.label_].add(pred_span) # Scores per label if labeled: for k, v in score_per_type.items():