diff --git a/spacy/tests/pipeline/test_span_predictor.py b/spacy/tests/pipeline/test_span_predictor.py
index 7d7a75279..9281df354 100644
--- a/spacy/tests/pipeline/test_span_predictor.py
+++ b/spacy/tests/pipeline/test_span_predictor.py
@@ -21,22 +21,22 @@ TRAIN_DATA = [
     (
         "John Smith picked up the red ball and he threw it away.",
         {
             "spans": {
                 f"{DEFAULT_CLUSTER_PREFIX}_1": [
-                    (0, 11, "MENTION"),  # John Smith
-                    (38, 41, "MENTION"),  # he
+                    (0, 10, "MENTION"),  # John Smith
+                    (38, 40, "MENTION"),  # he
                 ],
                 f"{DEFAULT_CLUSTER_PREFIX}_2": [
                     (25, 33, "MENTION"),  # red ball
-                    (47, 50, "MENTION"),  # it
+                    (47, 49, "MENTION"),  # it
                 ],
                 f"coref_head_clusters_1": [
-                    (5, 11, "MENTION"),  # Smith
-                    (38, 41, "MENTION"),  # he
+                    (5, 10, "MENTION"),  # Smith
+                    (38, 40, "MENTION"),  # he
                 ],
                 f"coref_head_clusters_2": [
                     (29, 33, "MENTION"),  # red ball
-                    (47, 50, "MENTION"),  # it
+                    (47, 49, "MENTION"),  # it
                 ]
             }
         },
@@ -129,3 +129,69 @@ def test_overfitting_IO(nlp):
     docs3 = [nlp(text) for text in texts]
     assert spans2ints(docs1[0]) == spans2ints(docs2[0])
     assert spans2ints(docs1[0]) == spans2ints(docs3[0])
+
+
+@pytest.mark.skipif(not has_torch, reason="Torch not available")
+def test_tokenization_mismatch(nlp):
+    train_examples = []
+    for text, annot in TRAIN_DATA:
+        eg = Example.from_dict(nlp.make_doc(text), annot)
+        ref = eg.reference
+        # Save the span character offsets before retokenizing.
+        char_spans = {}
+        for key, cluster in ref.spans.items():
+            char_spans[key] = []
+            for span in cluster:
+                char_spans[key].append((span[0].idx, span[-1].idx + len(span[-1])))
+        with ref.retokenize() as retokenizer:
+            # merge "picked up"
+            retokenizer.merge(ref[2:4])
+
+        # Note: this works because it's the same doc and we know the keys.
+        for key, _ in ref.spans.items():
+            spans = char_spans[key]
+            ref.spans[key] = [ref.char_span(*span) for span in spans]
+
+        # Finally, copy the head spans over to the pred.
+        pred = eg.predicted
+        for key, val in ref.spans.items():
+            if key.startswith("coref_head_clusters"):
+                spans = char_spans[key]
+                pred.spans[key] = [pred.char_span(*span) for span in spans]
+
+        train_examples.append(eg)
+
+    nlp.add_pipe("span_predictor", config=CONFIG)
+    optimizer = nlp.initialize()
+    test_text = TRAIN_DATA[0][0]
+    doc = nlp(test_text)
+
+    for i in range(100):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+        doc = nlp(test_text)
+
+    # Test the trained model; this needs a doc with head spans already set.
+    test_doc = train_examples[0].predicted
+    doc = nlp(test_doc)
+
+    # Also test that the results are still the same after IO.
+    with make_tempdir() as tmp_dir:
+        nlp.to_disk(tmp_dir)
+        nlp2 = util.load_model_from_path(tmp_dir)
+        doc2 = nlp2(test_text)
+
+    # Make sure that running pipe twice, or comparing pipe to a direct call,
+    # always amounts to the same predictions.
+    texts = [
+        test_text,
+        "I noticed many friends around me",
+        "They received it. They received the SMS.",
+    ]
+
+    # Keep references to the docs so they don't get garbage collected.
+    docs1 = list(nlp.pipe(texts))
+    docs2 = list(nlp.pipe(texts))
+    docs3 = [nlp(text) for text in texts]
+    assert spans2ints(docs1[0]) == spans2ints(docs2[0])
+    assert spans2ints(docs1[0]) == spans2ints(docs3[0])
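
A note on the two hunks above: the offsets in TRAIN_DATA changed because spaCy character spans are end-exclusive ("John Smith" covers characters 0-9, so its span is (0, 10)), and Doc.char_span returns None for offsets that don't align with token boundaries. The new test also leans on the fact that character offsets stay valid across retokenization while token indices do not. A minimal sketch of both behaviors, not part of the diff, assuming only a blank English pipeline:

import spacy

nlp = spacy.blank("en")
doc = nlp("John Smith picked up the red ball and he threw it away.")

# Character offsets are end-exclusive: "John Smith" covers chars 0-9.
assert doc.char_span(0, 10).text == "John Smith"
# (0, 11) would include the following space, so it doesn't align with
# token boundaries and yields None.
assert doc.char_span(0, 11) is None

# Merging tokens changes token indices, but char offsets remain valid.
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[2:4])  # "picked" + "up" -> one token
assert doc[2].text == "picked up"
assert doc.char_span(0, 10).text == "John Smith"

This is why the test snapshots the character offsets before merging and rebuilds the span groups from them afterwards.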