Allow Scorer.score_spans to handle pred docs with missing annotation (#9701)

If the predicted docs are missing annotation according to
`has_annotation`, treat them as having no predictions rather than
raising an error.

The motivation for this is a combined tokenization+sents scorer for a
component where the sents annotation is optional. To provide a single
scorer in the component factory, the scorer must be able to continue
when the sents annotation is missing because the component is not
annotating sents.
This commit is contained in:
Adriane Boyd 2021-11-23 15:17:19 +01:00 committed by GitHub
parent 36c7047946
commit a77f50baa4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -359,14 +359,15 @@ class Scorer:
pred_doc = example.predicted pred_doc = example.predicted
gold_doc = example.reference gold_doc = example.reference
# Option to handle docs without annotation for this attribute # Option to handle docs without annotation for this attribute
if has_annotation is not None: if has_annotation is not None and not has_annotation(gold_doc):
if not has_annotation(gold_doc): continue
continue # Find all labels in gold
# Find all labels in gold and doc labels = set([k.label_ for k in getter(gold_doc, attr)])
labels = set( # If labeled, find all labels in pred
[k.label_ for k in getter(gold_doc, attr)] if has_annotation is None or (
+ [k.label_ for k in getter(pred_doc, attr)] has_annotation is not None and has_annotation(pred_doc)
) ):
labels |= set([k.label_ for k in getter(pred_doc, attr)])
# Set up all labels for per type scoring and prepare gold per type # Set up all labels for per type scoring and prepare gold per type
gold_per_type: Dict[str, Set] = {label: set() for label in labels} gold_per_type: Dict[str, Set] = {label: set() for label in labels}
for label in labels: for label in labels:
@ -384,16 +385,19 @@ class Scorer:
gold_spans.add(gold_span) gold_spans.add(gold_span)
gold_per_type[span.label_].add(gold_span) gold_per_type[span.label_].add(gold_span)
pred_per_type: Dict[str, Set] = {label: set() for label in labels} pred_per_type: Dict[str, Set] = {label: set() for label in labels}
for span in example.get_aligned_spans_x2y( if has_annotation is None or (
getter(pred_doc, attr), allow_overlap has_annotation is not None and has_annotation(pred_doc)
): ):
pred_span: Tuple for span in example.get_aligned_spans_x2y(
if labeled: getter(pred_doc, attr), allow_overlap
pred_span = (span.label_, span.start, span.end - 1) ):
else: pred_span: Tuple
pred_span = (span.start, span.end - 1) if labeled:
pred_spans.add(pred_span) pred_span = (span.label_, span.start, span.end - 1)
pred_per_type[span.label_].add(pred_span) else:
pred_span = (span.start, span.end - 1)
pred_spans.add(pred_span)
pred_per_type[span.label_].add(pred_span)
# Scores per label # Scores per label
if labeled: if labeled:
for k, v in score_per_type.items(): for k, v in score_per_type.items():