From 73538782a0c3c15d113adec391acc8f7d8b28026 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Thu, 1 Oct 2020 16:22:18 +0200
Subject: [PATCH] Switch Doc.__init__(ents=) to IOB tags (#6173)

* Switch Doc.__init__(ents=) to IOB tags

* Fix check for "-"

* Allow "" or None as missing IOB tag
---
 spacy/tests/doc/test_doc_api.py               | 47 ++++++++++++++++
 spacy/tests/doc/test_retokenize_merge.py      | 18 ++++++-
 spacy/tests/doc/test_to_json.py               |  2 +-
 spacy/tests/regression/test_issue3001-3500.py |  2 +-
 spacy/tests/test_scorer.py                    |  4 +-
 spacy/tests/training/test_training.py         |  7 ++-
 spacy/tokens/doc.pyx                          | 54 ++++++++++++++++---
 7 files changed, 119 insertions(+), 15 deletions(-)

diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index b4b853701..55a1c1ad2 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -533,5 +533,52 @@ def test_doc_ents_setter():
     assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
     vocab = Vocab()
     ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
+    ents = ["B-HELLO", "I-HELLO", "O", "B-WORLD", "I-WORLD"]
     doc = Doc(vocab, words=words, ents=ents)
     assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
+
+
+def test_doc_init_iob():
+    """Test ents validation/normalization in Doc.__init__"""
+    words = ["a", "b", "c", "d", "e"]
+    ents = ["O"] * len(words)
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert doc.ents == ()
+
+    ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-PERSON"]
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert len(doc.ents) == 2
+
+    ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert len(doc.ents) == 3
+
+    # None is missing
+    ents = ["B-PERSON", "I-PERSON", "O", None, "I-GPE"]
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert len(doc.ents) == 2
+
+    # empty tag is missing
+    ents = ["", "B-PERSON", "O", "B-PERSON", "I-PERSON"]
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert len(doc.ents) == 2
+
+    # invalid IOB
+    ents = ["Q-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
+    with pytest.raises(ValueError):
+        doc = Doc(Vocab(), words=words, ents=ents)
+
+    # no dash
+    ents = ["OPERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
+    with pytest.raises(ValueError):
+        doc = Doc(Vocab(), words=words, ents=ents)
+
+    # no ent type
+    ents = ["O", "B-", "O", "I-PERSON", "I-GPE"]
+    with pytest.raises(ValueError):
+        doc = Doc(Vocab(), words=words, ents=ents)
+
+    # not strings or None
+    ents = [0, "B-", "O", "I-PERSON", "I-GPE"]
+    with pytest.raises(ValueError):
+        doc = Doc(Vocab(), words=words, ents=ents)
diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py
index 806c4b46f..ab186b062 100644
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@@ -201,6 +201,12 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
     heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
     tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
     ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
+    ents = ["O"] * len(heads)
+    ents[0] = "B-PERSON"
+    ents[1] = "I-PERSON"
+    ents[10] = "B-GPE"
+    ents[13] = "B-PERSON"
+    ents[14] = "I-PERSON"
     # fmt: on
     tokens = en_tokenizer(text)
     doc = Doc(
@@ -269,7 +275,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
     # if there is a parse, span.root provides default values
     words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
     heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
-    ents = [("ent-de", 3, 5), ("ent-fg", 5, 7)]
+    ents = ["O"] * len(words)
+    ents[3] = "B-ent-de"
+    ents[4] = "I-ent-de"
+    ents[5] = "B-ent-fg"
+    ents[6] = "I-ent-fg"
     deps = ["dep"] * len(words)
     en_vocab.strings.add("ent-de")
     en_vocab.strings.add("ent-fg")
@@ -292,7 +302,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
     # check that B is preserved if span[start] is B
     words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
     heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
-    ents = [("ent-de", 3, 5), ("ent-de", 5, 7)]
+    ents = ["O"] * len(words)
+    ents[3] = "B-ent-de"
+    ents[4] = "I-ent-de"
+    ents[5] = "B-ent-de"
+    ents[6] = "I-ent-de"
     deps = ["dep"] * len(words)
     doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
     with doc.retokenize() as retokenizer:
diff --git a/spacy/tests/doc/test_to_json.py b/spacy/tests/doc/test_to_json.py
index c9bcafcfa..9abe5779d 100644
--- a/spacy/tests/doc/test_to_json.py
+++ b/spacy/tests/doc/test_to_json.py
@@ -9,7 +9,7 @@ def doc(en_vocab):
     tags = ["VBP", "NN", "NN"]
     heads = [0, 0, 0]
     deps = ["ROOT", "dobj", "dobj"]
-    ents = [("ORG", 1, 2)]
+    ents = ["O", "B-ORG", "O"]
     return Doc(
         en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
     )
diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py
index 6fc42e83f..01f58ae77 100644
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@@ -59,7 +59,7 @@ def test_issue3012(en_vocab):
     words = ["This", "is", "10", "%", "."]
     tags = ["DT", "VBZ", "CD", "NN", "."]
     pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
-    ents = [("PERCENT", 2, 4)]
+    ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"]
     doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
     assert doc.has_annotation("TAG")
     expected = ("10", "NUM", "CD", "PERCENT")
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index 89864d579..187aa1b52 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -184,7 +184,7 @@ def test_ner_per_type(en_vocab):
         doc = Doc(
             en_vocab,
             words=input_.split(" "),
-            ents=[("CARDINAL", 0, 1), ("CARDINAL", 2, 3)],
+            ents=["B-CARDINAL", "O", "B-CARDINAL"],
         )
         entities = offsets_to_biluo_tags(doc, annot["entities"])
         example = Example.from_dict(doc, {"entities": entities})
@@ -209,7 +209,7 @@ def test_ner_per_type(en_vocab):
         doc = Doc(
             en_vocab,
             words=input_.split(" "),
-            ents=[("ORG", 0, 1), ("GPE", 5, 6), ("ORG", 6, 7)],
+            ents=["B-ORG", "O", "O", "O", "O", "B-GPE", "B-ORG", "O", "O", "O"],
         )
         entities = offsets_to_biluo_tags(doc, annot["entities"])
         example = Example.from_dict(doc, {"entities": entities})
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index af3fe63c2..28a411e6d 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -30,7 +30,12 @@ def doc(en_vocab):
     heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
     deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
     lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."]
-    ents = (("PERSON", 0, 2), ("LOC", 5, 7), ("GPE", 8, 9))
+    ents = ["O"] * len(words)
+    ents[0] = "B-PERSON"
+    ents[1] = "I-PERSON"
+    ents[5] = "B-LOC"
+    ents[6] = "I-LOC"
+    ents[8] = "B-GPE"
     cats = {"TRAVEL": 1.0, "BAKING": 0.0}
     # fmt: on
     doc = Doc(
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index b4027f87e..29fbb6076 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -213,8 +213,9 @@ cdef class Doc:
         sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
             the same length as words, to assign as token.is_sent_start. Will be
             overridden by heads if heads is provided. Defaults to None.
-        ents (Optional[List[Tuple[Union[str, int], int, int]]]): A list of
-            (label, start, end) tuples to assign as doc.ents. Defaults to None.
+        ents (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as IOB tags to token.ent_iob and
+            token.ent_type. Defaults to None.

         DOCS: https://nightly.spacy.io/api/doc#init
         """
@@ -275,16 +276,55 @@ cdef class Doc:
                     sent_starts[i] = -1
                 elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]:
                     sent_starts[i] = 0
+        ent_iobs = None
+        ent_types = None
+        if ents is not None:
+            iob_strings = Token.iob_strings()
+            # make valid IOB2 out of IOB1 or IOB2
+            for i, ent in enumerate(ents):
+                if ent == "":
+                    ents[i] = None
+                elif ent is not None and not isinstance(ent, str):
+                    raise ValueError(Errors.E177.format(tag=ent))
+                if i < len(ents) - 1:
+                    # OI -> OB
+                    if (ent is None or ent.startswith("O")) and \
+                            (ents[i+1] is not None and ents[i+1].startswith("I")):
+                        ents[i+1] = "B" + ents[i+1][1:]
+                    # B-TYPE1 I-TYPE2 or I-TYPE1 I-TYPE2 -> B/I-TYPE1 B-TYPE2
+                    if ent is not None and ents[i+1] is not None and \
+                            (ent.startswith("B") or ent.startswith("I")) and \
+                            ents[i+1].startswith("I") and \
+                            ent[1:] != ents[i+1][1:]:
+                        ents[i+1] = "B" + ents[i+1][1:]
+            ent_iobs = []
+            ent_types = []
+            for ent in ents:
+                if ent is None:
+                    ent_iobs.append(iob_strings.index(""))
+                    ent_types.append("")
+                elif ent == "O":
+                    ent_iobs.append(iob_strings.index(ent))
+                    ent_types.append("")
+                else:
+                    if len(ent) < 3 or ent[1] != "-":
+                        raise ValueError(Errors.E177.format(tag=ent))
+                    ent_iob, ent_type = ent.split("-", 1)
+                    if ent_iob not in iob_strings:
+                        raise ValueError(Errors.E177.format(tag=ent))
+                    ent_iob = iob_strings.index(ent_iob)
+                    ent_iobs.append(ent_iob)
+                    ent_types.append(ent_type)
         headings = []
         values = []
-        annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts]
-        possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START]
+        annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts, ent_iobs, ent_types]
+        possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START, ENT_IOB, ENT_TYPE]
         for a, annot in enumerate(annotations):
             if annot is not None:
                 if len(annot) != len(words):
                     raise ValueError(Errors.E189)
                 headings.append(possible_headings[a])
-                if annot is not heads and annot is not sent_starts:
+                if annot is not heads and annot is not sent_starts and annot is not ent_iobs:
                     values.extend(annot)
         for value in values:
             self.vocab.strings.add(value)
@@ -296,7 +336,7 @@ cdef class Doc:
             j = 0
             for annot in annotations:
                 if annot:
-                    if annot is heads or annot is sent_starts:
+                    if annot is heads or annot is sent_starts or annot is ent_iobs:
                         for i in range(len(words)):
                             if attrs.ndim == 1:
                                 attrs[i] = annot[i]
@@ -317,8 +357,6 @@ cdef class Doc:
                             attrs[i, j] = self.vocab.strings[annot[i]]
                     j += 1
             self.from_array(headings, attrs)
-        if ents is not None:
-            self.ents = ents
 
     @property
     def _(self):
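
For reference, a quick sketch of the new ents format. The words and labels
below are illustrative and not taken from the diff; the asserted behavior
mirrors the new tests in spacy/tests/doc/test_doc_api.py:

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    # One IOB tag per token; "" or None marks a token whose annotation is
    # missing, as opposed to an explicit "no entity" ("O").
    words = ["Apple", "opened", "an", "office", "in", "San", "Francisco"]
    ents = ["B-ORG", "O", "O", "O", "O", "B-GPE", "I-GPE"]
    doc = Doc(Vocab(), words=words, ents=ents)
    assert [(e.text, e.label_) for e in doc.ents] == [
        ("Apple", "ORG"),
        ("San Francisco", "GPE"),
    ]

    # IOB1-style input is normalized to IOB2: the I-PERSON that follows
    # an O is rewritten to B-PERSON, so two separate entities come out.
    ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-PERSON"]
    doc = Doc(Vocab(), words=["a", "b", "c", "d", "e"], ents=ents)
    assert len(doc.ents) == 2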