def test_spans_entity_merge_iob():
    """Check that entity IOB tags stay consistent after merging a span.

    Builds a five-word Doc, marks tokens 0-2 as one entity ('ent-abc') and
    token 3 as another ('ent-d'), verifies the IOB tags, merges the span
    doc[0:1] and re-checks the tags of the first two tokens.
    """
    doc = Doc(Vocab(), words=["a", "b", "c", "d", "e"])

    # Register the entity labels in the string store, in the same order the
    # entities are assigned below.
    abc_label = doc.vocab.strings.add('ent-abc')
    d_label = doc.vocab.strings.add('ent-d')
    doc.ents = [(abc_label, 0, 3), (d_label, 3, 4)]

    # Tags before the merge: a three-token entity followed by a one-token one.
    for index, expected_iob in enumerate(("B", "I", "I", "B")):
        assert doc[index].ent_iob_ == expected_iob

    doc[0:1].merge()

    # Tags after the merge: the first entity must still open with "B" and
    # continue with "I".
    for index, expected_iob in enumerate(("B", "I")):
        assert doc[index].ent_iob_ == expected_iob