Switch Doc.__init__(ents=) to IOB tags (#6173)

* Switch Doc.__init__(ents=) to IOB tags * Fix check for "-" * Allow "" or None as missing IOB tag
2025-08-16 18:14:56 +03:00 · 2020-10-01 16:22:18 +02:00 · 2020-10-01 16:22:18 +02:00 · 73538782a0
commit 73538782a0
parent df98d3ef9f
7 changed files with 119 additions and 15 deletions
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@ -533,5 +533,52 @@ def test_doc_ents_setter():
    assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
    vocab = Vocab()
    ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
+    ents = ["B-HELLO", "I-HELLO", "O", "B-WORLD", "I-WORLD"]
    doc = Doc(vocab, words=words, ents=ents)
    assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
+
+
+def test_doc_init_iob():
+    """Doc.__init__ should validate IOB tags passed as ents and normalize
+    them so that every entity starts with a B tag."""
+    words = ["a", "b", "c", "d", "e"]
+
+    # only O tags: the doc has no entities
+    doc = Doc(Vocab(), words=words, ents=["O"] * len(words))
+    assert doc.ents == ()
+
+    # (tags, expected number of entities)
+    accepted = [
+        (["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-PERSON"], 2),
+        (["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"], 3),
+        # None is missing
+        (["B-PERSON", "I-PERSON", "O", None, "I-GPE"], 2),
+        # empty tag is missing
+        (["", "B-PERSON", "O", "B-PERSON", "I-PERSON"], 2),
+    ]
+    for tags, n_ents in accepted:
+        doc = Doc(Vocab(), words=words, ents=tags)
+        assert len(doc.ents) == n_ents
+
+    rejected = [
+        # invalid IOB
+        ["Q-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"],
+        # no dash
+        ["OPERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"],
+        # no ent type
+        ["O", "B-", "O", "I-PERSON", "I-GPE"],
+        # not strings or None
+        [0, "B-", "O", "I-PERSON", "I-GPE"],
+    ]
+    for tags in rejected:
+        with pytest.raises(ValueError):
+            Doc(Vocab(), words=words, ents=tags)
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@ -201,6 +201,12 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
    heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
    tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
    ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
+    ents = ["O"] * len(heads)
+    ents[0] = "B-PERSON"
+    ents[1] = "I-PERSON"
+    ents[10] = "B-GPE"
+    ents[13] = "B-PERSON"
+    ents[14] = "I-PERSON"
    # fmt: on
    tokens = en_tokenizer(text)
    doc = Doc(
@ -269,7 +275,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
    # if there is a parse, span.root provides default values
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
-    ents = [("ent-de", 3, 5), ("ent-fg", 5, 7)]
+    ents = ["O"] * len(words)
+    ents[3] = "B-ent-de"
+    ents[4] = "I-ent-de"
+    ents[5] = "B-ent-fg"
+    ents[6] = "I-ent-fg"
    deps = ["dep"] * len(words)
    en_vocab.strings.add("ent-de")
    en_vocab.strings.add("ent-fg")
@ -292,7 +302,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
    # check that B is preserved if span[start] is B
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
-    ents = [("ent-de", 3, 5), ("ent-de", 5, 7)]
+    ents = ["O"] * len(words)
+    ents[3] = "B-ent-de"
+    ents[4] = "I-ent-de"
+    ents[5] = "B-ent-de"
+    ents[6] = "I-ent-de"
    deps = ["dep"] * len(words)
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
    with doc.retokenize() as retokenizer:
--- a/spacy/tests/doc/test_to_json.py
+++ b/spacy/tests/doc/test_to_json.py
@ -9,7 +9,7 @@ def doc(en_vocab):
    tags = ["VBP", "NN", "NN"]
    heads = [0, 0, 0]
    deps = ["ROOT", "dobj", "dobj"]
-    ents = [("ORG", 1, 2)]
+    ents = ["O", "B-ORG", "O"]
    return Doc(
        en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
    )
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@ -59,7 +59,7 @@ def test_issue3012(en_vocab):
    words = ["This", "is", "10", "%", "."]
    tags = ["DT", "VBZ", "CD", "NN", "."]
    pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
-    ents = [("PERCENT", 2, 4)]
+    ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"]
    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
    assert doc.has_annotation("TAG")
    expected = ("10", "NUM", "CD", "PERCENT")
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@ -184,7 +184,7 @@ def test_ner_per_type(en_vocab):
        doc = Doc(
            en_vocab,
            words=input_.split(" "),
-            ents=[("CARDINAL", 0, 1), ("CARDINAL", 2, 3)],
+            ents=["B-CARDINAL", "O", "B-CARDINAL"],
        )
        entities = offsets_to_biluo_tags(doc, annot["entities"])
        example = Example.from_dict(doc, {"entities": entities})
@ -209,7 +209,7 @@ def test_ner_per_type(en_vocab):
        doc = Doc(
            en_vocab,
            words=input_.split(" "),
-            ents=[("ORG", 0, 1), ("GPE", 5, 6), ("ORG", 6, 7)],
+            ents=["B-ORG", "O", "O", "O", "O", "B-GPE", "B-ORG", "O", "O", "O"],
        )
        entities = offsets_to_biluo_tags(doc, annot["entities"])
        example = Example.from_dict(doc, {"entities": entities})
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@ -30,7 +30,12 @@ def doc(en_vocab):
    heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
    deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
    lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."]
-    ents = (("PERSON", 0, 2), ("LOC", 5, 7), ("GPE", 8, 9))
+    ents = ["O"] * len(words)
+    ents[0] = "B-PERSON"
+    ents[1] = "I-PERSON"
+    ents[5] = "B-LOC"
+    ents[6] = "I-LOC"
+    ents[8] = "B-GPE"
    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
    # fmt: on
    doc = Doc(
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -213,8 +213,9 @@ cdef class Doc:
        sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
            the same length as words, to assign as token.is_sent_start. Will be
            overridden by heads if heads is provided. Defaults to None.
-        ents (Optional[List[Tuple[Union[str, int], int, int]]]): A list of
-            (label, start, end) tuples to assign as doc.ents. Defaults to None.
+        ents (Optional[List[Optional[str]]]): A list of IOB tags, of the same
+            length as words, to assign as token.ent_iob and token.ent_type.
+            None or "" marks a missing tag. Defaults to None.

        DOCS: https://nightly.spacy.io/api/doc#init
        """
@ -275,16 +276,55 @@ cdef class Doc:
                    sent_starts[i] = -1
                elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]:
                    sent_starts[i] = 0
+        ent_iobs = None
+        ent_types = None
+        if ents is not None:
+            iob_strings = Token.iob_strings()
+            # "" means missing, like None; compare with == (not `is`), on a copy
+            ents = [None if ent == "" else ent for ent in ents]
+            for ent in ents:
+                # reject non-strings up front so the lookahead below never hits one
+                if ent is not None and not isinstance(ent, str):
+                    raise ValueError(Errors.E177.format(tag=ent))
+            # make valid IOB2 out of IOB1 or IOB2
+            for i in range(len(ents) - 1):
+                ent, next_ent = ents[i], ents[i+1]
+                if next_ent is None or not next_ent.startswith("I"):
+                    continue
+                # OI -> OB (a missing tag is treated like O here)
+                if ent is None or ent.startswith("O"):
+                    ents[i+1] = "B" + next_ent[1:]
+                # B-TYPE1 I-TYPE2 or I-TYPE1 I-TYPE2 -> B/I-TYPE1 B-TYPE2
+                elif ent.startswith(("B", "I")) and ent[1:] != next_ent[1:]:
+                    ents[i+1] = "B" + next_ent[1:]
+            ent_iobs = []
+            ent_types = []
+            for ent in ents:
+                if ent is None:
+                    ent_iobs.append(iob_strings.index(""))
+                    ent_types.append("")
+                elif ent == "O":
+                    ent_iobs.append(iob_strings.index(ent))
+                    ent_types.append("")
+                else:
+                    # must be "<IOB>-<TYPE>": one-char prefix, dash, non-empty type
+                    if len(ent) < 3 or ent[1] != "-":
+                        raise ValueError(Errors.E177.format(tag=ent))
+                    ent_iob, ent_type = ent.split("-", 1)
+                    if ent_iob not in iob_strings:
+                        raise ValueError(Errors.E177.format(tag=ent))
+                    ent_iobs.append(iob_strings.index(ent_iob))
+                    ent_types.append(ent_type)
        headings = []
        values = []
-        annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts]
-        possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START]
+        annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts, ent_iobs, ent_types]
+        possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START, ENT_IOB, ENT_TYPE]
        for a, annot in enumerate(annotations):
            if annot is not None:
                if len(annot) != len(words):
                    raise ValueError(Errors.E189)
                headings.append(possible_headings[a])
-                if annot is not heads and annot is not sent_starts:
+                if annot is not heads and annot is not sent_starts and annot is not ent_iobs:
                    values.extend(annot)
        for value in values:
            self.vocab.strings.add(value)
@ -296,7 +336,7 @@ cdef class Doc:
            j = 0
            for annot in annotations:
                if annot:
-                    if annot is heads or annot is sent_starts:
+                    if annot is heads or annot is sent_starts or annot is ent_iobs:
                        for i in range(len(words)):
                            if attrs.ndim == 1:
                                attrs[i] = annot[i]
@ -317,8 +357,6 @@ cdef class Doc:
                                attrs[i, j] = self.vocab.strings[annot[i]]
                    j += 1
            self.from_array(headings, attrs)
-        if ents is not None:
-            self.ents = ents

    @property
    def _(self):