Switch Doc.__init__(ents=) to IOB tags (#6173)

* Switch Doc.__init__(ents=) to IOB tags

* Fix check for "-"

* Allow "" or None as missing IOB tag
This commit is contained in:
Adriane Boyd 2020-10-01 16:22:18 +02:00 committed by GitHub
parent df98d3ef9f
commit 73538782a0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 119 additions and 15 deletions

View File

@ -533,5 +533,52 @@ def test_doc_ents_setter():
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"] assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
vocab = Vocab() vocab = Vocab()
ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)] ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
ents = ["B-HELLO", "I-HELLO", "O", "B-WORLD", "I-WORLD"]
doc = Doc(vocab, words=words, ents=ents) doc = Doc(vocab, words=words, ents=ents)
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"] assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
def test_doc_init_iob():
    """Check validation and normalization of IOB tags passed as Doc(ents=...)."""
    words = ["a", "b", "c", "d", "e"]

    # A sequence consisting only of "O" tags yields no entities at all.
    doc = Doc(Vocab(), words=words, ents=["O"] * len(words))
    assert doc.ents == ()

    # Tag sequences that must be accepted, paired with the number of
    # entities the resulting doc is expected to contain.
    accepted_cases = [
        (["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-PERSON"], 2),
        (["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"], 3),
        # None is missing
        (["B-PERSON", "I-PERSON", "O", None, "I-GPE"], 2),
        # empty tag is missing
        (["", "B-PERSON", "O", "B-PERSON", "I-PERSON"], 2),
    ]
    for tags, expected_count in accepted_cases:
        doc = Doc(Vocab(), words=words, ents=tags)
        assert len(doc.ents) == expected_count

    # Tag sequences that must be rejected with a ValueError.
    rejected_cases = [
        # invalid IOB
        ["Q-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"],
        # no dash
        ["OPERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"],
        # no ent type
        ["O", "B-", "O", "I-PERSON", "I-GPE"],
        # not strings or None
        [0, "B-", "O", "I-PERSON", "I-GPE"],
    ]
    for tags in rejected_cases:
        with pytest.raises(ValueError):
            doc = Doc(Vocab(), words=words, ents=tags)

View File

@ -201,6 +201,12 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15] heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"] tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)] ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
ents = ["O"] * len(heads)
ents[0] = "B-PERSON"
ents[1] = "I-PERSON"
ents[10] = "B-GPE"
ents[13] = "B-PERSON"
ents[14] = "I-PERSON"
# fmt: on # fmt: on
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
doc = Doc( doc = Doc(
@ -269,7 +275,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
# if there is a parse, span.root provides default values # if there is a parse, span.root provides default values
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"] words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
heads = [0, 0, 3, 0, 0, 0, 5, 0, 0] heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
ents = [("ent-de", 3, 5), ("ent-fg", 5, 7)] ents = ["O"] * len(words)
ents[3] = "B-ent-de"
ents[4] = "I-ent-de"
ents[5] = "B-ent-fg"
ents[6] = "I-ent-fg"
deps = ["dep"] * len(words) deps = ["dep"] * len(words)
en_vocab.strings.add("ent-de") en_vocab.strings.add("ent-de")
en_vocab.strings.add("ent-fg") en_vocab.strings.add("ent-fg")
@ -292,7 +302,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
# check that B is preserved if span[start] is B # check that B is preserved if span[start] is B
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"] words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
heads = [0, 0, 3, 4, 0, 0, 5, 0, 0] heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
ents = [("ent-de", 3, 5), ("ent-de", 5, 7)] ents = ["O"] * len(words)
ents[3] = "B-ent-de"
ents[4] = "I-ent-de"
ents[5] = "B-ent-de"
ents[6] = "I-ent-de"
deps = ["dep"] * len(words) deps = ["dep"] * len(words)
doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents) doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
with doc.retokenize() as retokenizer: with doc.retokenize() as retokenizer:

View File

@ -9,7 +9,7 @@ def doc(en_vocab):
tags = ["VBP", "NN", "NN"] tags = ["VBP", "NN", "NN"]
heads = [0, 0, 0] heads = [0, 0, 0]
deps = ["ROOT", "dobj", "dobj"] deps = ["ROOT", "dobj", "dobj"]
ents = [("ORG", 1, 2)] ents = ["O", "B-ORG", "O"]
return Doc( return Doc(
en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
) )

View File

@ -59,7 +59,7 @@ def test_issue3012(en_vocab):
words = ["This", "is", "10", "%", "."] words = ["This", "is", "10", "%", "."]
tags = ["DT", "VBZ", "CD", "NN", "."] tags = ["DT", "VBZ", "CD", "NN", "."]
pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"] pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
ents = [("PERCENT", 2, 4)] ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"]
doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents) doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
assert doc.has_annotation("TAG") assert doc.has_annotation("TAG")
expected = ("10", "NUM", "CD", "PERCENT") expected = ("10", "NUM", "CD", "PERCENT")

View File

@ -184,7 +184,7 @@ def test_ner_per_type(en_vocab):
doc = Doc( doc = Doc(
en_vocab, en_vocab,
words=input_.split(" "), words=input_.split(" "),
ents=[("CARDINAL", 0, 1), ("CARDINAL", 2, 3)], ents=["B-CARDINAL", "O", "B-CARDINAL"],
) )
entities = offsets_to_biluo_tags(doc, annot["entities"]) entities = offsets_to_biluo_tags(doc, annot["entities"])
example = Example.from_dict(doc, {"entities": entities}) example = Example.from_dict(doc, {"entities": entities})
@ -209,7 +209,7 @@ def test_ner_per_type(en_vocab):
doc = Doc( doc = Doc(
en_vocab, en_vocab,
words=input_.split(" "), words=input_.split(" "),
ents=[("ORG", 0, 1), ("GPE", 5, 6), ("ORG", 6, 7)], ents=["B-ORG", "O", "O", "O", "O", "B-GPE", "B-ORG", "O", "O", "O"],
) )
entities = offsets_to_biluo_tags(doc, annot["entities"]) entities = offsets_to_biluo_tags(doc, annot["entities"])
example = Example.from_dict(doc, {"entities": entities}) example = Example.from_dict(doc, {"entities": entities})

View File

@ -30,7 +30,12 @@ def doc(en_vocab):
heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5] heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"] deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."] lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."]
ents = (("PERSON", 0, 2), ("LOC", 5, 7), ("GPE", 8, 9)) ents = ["O"] * len(words)
ents[0] = "B-PERSON"
ents[1] = "I-PERSON"
ents[5] = "B-LOC"
ents[6] = "I-LOC"
ents[8] = "B-GPE"
cats = {"TRAVEL": 1.0, "BAKING": 0.0} cats = {"TRAVEL": 1.0, "BAKING": 0.0}
# fmt: on # fmt: on
doc = Doc( doc = Doc(

View File

@ -213,8 +213,9 @@ cdef class Doc:
sent_starts (Optional[List[Union[bool, None]]]): A list of values, of sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
the same length as words, to assign as token.is_sent_start. Will be the same length as words, to assign as token.is_sent_start. Will be
overridden by heads if heads is provided. Defaults to None. overridden by heads if heads is provided. Defaults to None.
ents (Optional[List[Tuple[Union[str, int], int, int]]]): A list of ents (Optional[List[str]]): A list of unicode strings, of the same
(label, start, end) tuples to assign as doc.ents. Defaults to None. length as words, as IOB tags to assign as token.ent_iob and
token.ent_type. Defaults to None.
DOCS: https://nightly.spacy.io/api/doc#init DOCS: https://nightly.spacy.io/api/doc#init
""" """
@ -275,16 +276,55 @@ cdef class Doc:
sent_starts[i] = -1 sent_starts[i] = -1
elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]: elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]:
sent_starts[i] = 0 sent_starts[i] = 0
# Convert the optional per-token IOB tags in `ents` into two parallel lists:
# `ent_iobs` (index of the IOB prefix within Token.iob_strings()) and
# `ent_types` (entity label string, "" when there is none). Both stay None
# when no `ents` were provided.
ent_iobs = None
ent_types = None
if ents is not None:
    iob_strings = Token.iob_strings()
    # Work on a normalized copy so the caller's sequence is never modified.
    # An empty string is treated exactly like None (missing annotation);
    # compare by value, not identity, since identity of equal strings is
    # not guaranteed.
    ents = [None if ent == "" else ent for ent in ents]
    # Validate the element types up front so the look-ahead below can call
    # string methods on the following tag without raising AttributeError.
    for ent in ents:
        if ent is not None and not isinstance(ent, str):
            raise ValueError(Errors.E177.format(tag=ent))
    # make valid IOB2 out of IOB1 or IOB2
    for i in range(len(ents) - 1):
        # Re-read ents[i]: it may have been rewritten by the previous step.
        ent = ents[i]
        next_ent = ents[i + 1]
        # Only a following "I" tag can need to be turned into "B".
        if next_ent is None or not next_ent.startswith("I"):
            continue
        if ent is None or ent.startswith("O"):
            # OI -> OB (a missing tag is handled like "O" here)
            ents[i + 1] = "B" + next_ent[1:]
        elif ent.startswith(("B", "I")) and ent[1:] != next_ent[1:]:
            # B-TYPE1 I-TYPE2 or I-TYPE1 I-TYPE2 -> B/I-TYPE1 B-TYPE2
            ents[i + 1] = "B" + next_ent[1:]
    ent_iobs = []
    ent_types = []
    for ent in ents:
        if ent is None:
            # Missing annotation: stored under the "" entry of iob_strings.
            ent_iobs.append(iob_strings.index(""))
            ent_types.append("")
        elif ent == "O":
            ent_iobs.append(iob_strings.index(ent))
            ent_types.append("")
        else:
            # Anything else must look like "<prefix>-<type>" with a
            # one-character prefix and a non-empty type.
            if len(ent) < 3 or ent[1] != "-":
                raise ValueError(Errors.E177.format(tag=ent))
            ent_iob, ent_type = ent.split("-", 1)
            if ent_iob not in iob_strings:
                raise ValueError(Errors.E177.format(tag=ent))
            ent_iobs.append(iob_strings.index(ent_iob))
            ent_types.append(ent_type)
headings = [] headings = []
values = [] values = []
annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts] annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts, ent_iobs, ent_types]
possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START] possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START, ENT_IOB, ENT_TYPE]
for a, annot in enumerate(annotations): for a, annot in enumerate(annotations):
if annot is not None: if annot is not None:
if len(annot) != len(words): if len(annot) != len(words):
raise ValueError(Errors.E189) raise ValueError(Errors.E189)
headings.append(possible_headings[a]) headings.append(possible_headings[a])
if annot is not heads and annot is not sent_starts: if annot is not heads and annot is not sent_starts and annot is not ent_iobs:
values.extend(annot) values.extend(annot)
for value in values: for value in values:
self.vocab.strings.add(value) self.vocab.strings.add(value)
@ -296,7 +336,7 @@ cdef class Doc:
j = 0 j = 0
for annot in annotations: for annot in annotations:
if annot: if annot:
if annot is heads or annot is sent_starts: if annot is heads or annot is sent_starts or annot is ent_iobs:
for i in range(len(words)): for i in range(len(words)):
if attrs.ndim == 1: if attrs.ndim == 1:
attrs[i] = annot[i] attrs[i] = annot[i]
@ -317,8 +357,6 @@ cdef class Doc:
attrs[i, j] = self.vocab.strings[annot[i]] attrs[i, j] = self.vocab.strings[annot[i]]
j += 1 j += 1
self.from_array(headings, attrs) self.from_array(headings, attrs)
if ents is not None:
self.ents = ents
@property @property
def _(self): def _(self):