diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py
index 1e11a0417..af40d9b06 100644
--- a/spacy/pipeline/coref.py
+++ b/spacy/pipeline/coref.py
@@ -218,6 +218,13 @@ class CoreferenceResolver(TrainablePipe):
 
         total_loss = 0
         for eg in examples:
+            if eg.x.text != eg.y.text:
+                # TODO assign error number
+                raise ValueError(
+                    """Text, including whitespace, must match between reference and
+                    predicted docs in coref training.
+                    """
+                )
             # TODO check this causes no issues (in practice it runs)
             preds, backprop = self.model.begin_update([eg.predicted])
             score_matrix, mention_idx = preds
@@ -277,7 +284,7 @@ class CoreferenceResolver(TrainablePipe):
                 if span is None:
                     # TODO log more details
                     raise IndexError(Errors.E1043)
-                cc.append( (span.start, span.end) )
+                cc.append((span.start, span.end))
             clusters.append(cc)
 
         span_idxs = create_head_span_idxs(ops, len(example.predicted))
diff --git a/spacy/pipeline/span_predictor.py b/spacy/pipeline/span_predictor.py
index c9343a97e..aee11ba8e 100644
--- a/spacy/pipeline/span_predictor.py
+++ b/spacy/pipeline/span_predictor.py
@@ -178,6 +178,13 @@ class SpanPredictor(TrainablePipe):
 
         total_loss = 0
         for eg in examples:
+            if eg.x.text != eg.y.text:
+                # TODO assign error number
+                raise ValueError(
+                    """Text, including whitespace, must match between reference and
+                    predicted docs in span predictor training.
+                    """
+                )
             span_scores, backprop = self.model.begin_update([eg.predicted])
             # FIXME, this only happens once in the first 1000 docs of OntoNotes
             # and I'm not sure yet why.
diff --git a/spacy/tests/pipeline/test_coref.py b/spacy/tests/pipeline/test_coref.py
index 9a969acdd..7fc4864a3 100644
--- a/spacy/tests/pipeline/test_coref.py
+++ b/spacy/tests/pipeline/test_coref.py
@@ -218,3 +218,20 @@ def test_sentence_map(snlp):
     doc = snlp("I like text. This is text.")
     sm = get_sentence_ids(doc)
     assert sm == [0, 0, 0, 0, 1, 1, 1, 1]
+
+
+@pytest.mark.skipif(not has_torch, reason="Torch not available")
+def test_whitespace_mismatch(nlp):
+    train_examples = []
+    for text, annot in TRAIN_DATA:
+        eg = Example.from_dict(nlp.make_doc(text), annot)
+        eg.predicted = nlp.make_doc(" " + text)
+        train_examples.append(eg)
+
+    nlp.add_pipe("coref", config=CONFIG)
+    optimizer = nlp.initialize()
+    test_text = TRAIN_DATA[0][0]
+    doc = nlp(test_text)
+
+    with pytest.raises(ValueError, match="whitespace"):
+        nlp.update(train_examples, sgd=optimizer)
diff --git a/spacy/tests/pipeline/test_span_predictor.py b/spacy/tests/pipeline/test_span_predictor.py
index 3a3111bd4..a79756d88 100644
--- a/spacy/tests/pipeline/test_span_predictor.py
+++ b/spacy/tests/pipeline/test_span_predictor.py
@@ -106,7 +106,7 @@ def test_overfitting_IO(nlp):
         pred = eg.predicted
         for key, spans in ref.spans.items():
             if key.startswith("coref_head_clusters"):
-                pred.spans[key] = [pred[span.start:span.end] for span in spans]
+                pred.spans[key] = [pred[span.start : span.end] for span in spans]
         train_examples.append(eg)
 
     nlp.add_pipe("span_predictor", config=CONFIG)
@@ -209,3 +209,19 @@
 
     assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs2[0])
     assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs3[0])
+
+@pytest.mark.skipif(not has_torch, reason="Torch not available")
+def test_whitespace_mismatch(nlp):
+    train_examples = []
+    for text, annot in TRAIN_DATA:
+        eg = Example.from_dict(nlp.make_doc(text), annot)
+        eg.predicted = nlp.make_doc(" " + text)
+        train_examples.append(eg)
+
+    nlp.add_pipe("span_predictor", config=CONFIG)
+    optimizer = nlp.initialize()
+    test_text = TRAIN_DATA[0][0]
+    doc = nlp(test_text)
+
+    with pytest.raises(ValueError, match="whitespace"):
+        nlp.update(train_examples, sgd=optimizer)