mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
Switch Doc.__init__(ents=) to IOB tags (#6173)
* Switch Doc.__init__(ents=) to IOB tags
* Fix check for "-"
* Allow "" or None as missing IOB tag
This commit is contained in:
parent
df98d3ef9f
commit
73538782a0
|
@ -533,5 +533,52 @@ def test_doc_ents_setter():
|
||||||
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
|
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
|
||||||
vocab = Vocab()
|
vocab = Vocab()
|
||||||
ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
|
ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
|
||||||
|
ents = ["B-HELLO", "I-HELLO", "O", "B-WORLD", "I-WORLD"]
|
||||||
doc = Doc(vocab, words=words, ents=ents)
|
doc = Doc(vocab, words=words, ents=ents)
|
||||||
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
|
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_init_iob():
    """Check how Doc.__init__ validates and normalizes IOB tags given as ents."""
    words = ["a", "b", "c", "d", "e"]

    # Only "O" tags: the doc carries no entities at all.
    doc = Doc(Vocab(), words=words, ents=["O"] * len(words))
    assert doc.ents == ()

    # (tags, expected number of entities) for inputs that must be accepted.
    accepted = [
        # an I tag right after O opens a new entity
        (["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-PERSON"], 2),
        # an I tag whose type differs from the previous tag opens a new entity
        (["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"], 3),
        # None is missing
        (["B-PERSON", "I-PERSON", "O", None, "I-GPE"], 2),
        # empty tag is missing
        (["", "B-PERSON", "O", "B-PERSON", "I-PERSON"], 2),
    ]
    for tags, n_ents in accepted:
        doc = Doc(Vocab(), words=words, ents=tags)
        assert len(doc.ents) == n_ents

    # Inputs that must be rejected with a ValueError.
    rejected = [
        # invalid IOB
        ["Q-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"],
        # no dash
        ["OPERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"],
        # no ent type
        ["O", "B-", "O", "I-PERSON", "I-GPE"],
        # not strings or None
        [0, "B-", "O", "I-PERSON", "I-GPE"],
    ]
    for tags in rejected:
        with pytest.raises(ValueError):
            Doc(Vocab(), words=words, ents=tags)
|
||||||
|
|
|
@ -201,6 +201,12 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
|
||||||
heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
|
heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
|
||||||
tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
|
tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
|
||||||
ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
|
ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
|
||||||
|
ents = ["O"] * len(heads)
|
||||||
|
ents[0] = "B-PERSON"
|
||||||
|
ents[1] = "I-PERSON"
|
||||||
|
ents[10] = "B-GPE"
|
||||||
|
ents[13] = "B-PERSON"
|
||||||
|
ents[14] = "I-PERSON"
|
||||||
# fmt: on
|
# fmt: on
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = Doc(
|
doc = Doc(
|
||||||
|
@ -269,7 +275,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
|
||||||
# if there is a parse, span.root provides default values
|
# if there is a parse, span.root provides default values
|
||||||
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
|
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
|
||||||
heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
|
heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
|
||||||
ents = [("ent-de", 3, 5), ("ent-fg", 5, 7)]
|
ents = ["O"] * len(words)
|
||||||
|
ents[3] = "B-ent-de"
|
||||||
|
ents[4] = "I-ent-de"
|
||||||
|
ents[5] = "B-ent-fg"
|
||||||
|
ents[6] = "I-ent-fg"
|
||||||
deps = ["dep"] * len(words)
|
deps = ["dep"] * len(words)
|
||||||
en_vocab.strings.add("ent-de")
|
en_vocab.strings.add("ent-de")
|
||||||
en_vocab.strings.add("ent-fg")
|
en_vocab.strings.add("ent-fg")
|
||||||
|
@ -292,7 +302,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
|
||||||
# check that B is preserved if span[start] is B
|
# check that B is preserved if span[start] is B
|
||||||
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
|
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
|
||||||
heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
|
heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
|
||||||
ents = [("ent-de", 3, 5), ("ent-de", 5, 7)]
|
ents = ["O"] * len(words)
|
||||||
|
ents[3] = "B-ent-de"
|
||||||
|
ents[4] = "I-ent-de"
|
||||||
|
ents[5] = "B-ent-de"
|
||||||
|
ents[6] = "I-ent-de"
|
||||||
deps = ["dep"] * len(words)
|
deps = ["dep"] * len(words)
|
||||||
doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
|
doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
|
|
|
@ -9,7 +9,7 @@ def doc(en_vocab):
|
||||||
tags = ["VBP", "NN", "NN"]
|
tags = ["VBP", "NN", "NN"]
|
||||||
heads = [0, 0, 0]
|
heads = [0, 0, 0]
|
||||||
deps = ["ROOT", "dobj", "dobj"]
|
deps = ["ROOT", "dobj", "dobj"]
|
||||||
ents = [("ORG", 1, 2)]
|
ents = ["O", "B-ORG", "O"]
|
||||||
return Doc(
|
return Doc(
|
||||||
en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
|
en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
|
||||||
)
|
)
|
||||||
|
|
|
@ -59,7 +59,7 @@ def test_issue3012(en_vocab):
|
||||||
words = ["This", "is", "10", "%", "."]
|
words = ["This", "is", "10", "%", "."]
|
||||||
tags = ["DT", "VBZ", "CD", "NN", "."]
|
tags = ["DT", "VBZ", "CD", "NN", "."]
|
||||||
pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
|
pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
|
||||||
ents = [("PERCENT", 2, 4)]
|
ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"]
|
||||||
doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
|
doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
|
||||||
assert doc.has_annotation("TAG")
|
assert doc.has_annotation("TAG")
|
||||||
expected = ("10", "NUM", "CD", "PERCENT")
|
expected = ("10", "NUM", "CD", "PERCENT")
|
||||||
|
|
|
@ -184,7 +184,7 @@ def test_ner_per_type(en_vocab):
|
||||||
doc = Doc(
|
doc = Doc(
|
||||||
en_vocab,
|
en_vocab,
|
||||||
words=input_.split(" "),
|
words=input_.split(" "),
|
||||||
ents=[("CARDINAL", 0, 1), ("CARDINAL", 2, 3)],
|
ents=["B-CARDINAL", "O", "B-CARDINAL"],
|
||||||
)
|
)
|
||||||
entities = offsets_to_biluo_tags(doc, annot["entities"])
|
entities = offsets_to_biluo_tags(doc, annot["entities"])
|
||||||
example = Example.from_dict(doc, {"entities": entities})
|
example = Example.from_dict(doc, {"entities": entities})
|
||||||
|
@ -209,7 +209,7 @@ def test_ner_per_type(en_vocab):
|
||||||
doc = Doc(
|
doc = Doc(
|
||||||
en_vocab,
|
en_vocab,
|
||||||
words=input_.split(" "),
|
words=input_.split(" "),
|
||||||
ents=[("ORG", 0, 1), ("GPE", 5, 6), ("ORG", 6, 7)],
|
ents=["B-ORG", "O", "O", "O", "O", "B-GPE", "B-ORG", "O", "O", "O"],
|
||||||
)
|
)
|
||||||
entities = offsets_to_biluo_tags(doc, annot["entities"])
|
entities = offsets_to_biluo_tags(doc, annot["entities"])
|
||||||
example = Example.from_dict(doc, {"entities": entities})
|
example = Example.from_dict(doc, {"entities": entities})
|
||||||
|
|
|
@ -30,7 +30,12 @@ def doc(en_vocab):
|
||||||
heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
|
heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
|
||||||
deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
|
deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
|
||||||
lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."]
|
lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."]
|
||||||
ents = (("PERSON", 0, 2), ("LOC", 5, 7), ("GPE", 8, 9))
|
ents = ["O"] * len(words)
|
||||||
|
ents[0] = "B-PERSON"
|
||||||
|
ents[1] = "I-PERSON"
|
||||||
|
ents[5] = "B-LOC"
|
||||||
|
ents[6] = "I-LOC"
|
||||||
|
ents[8] = "B-GPE"
|
||||||
cats = {"TRAVEL": 1.0, "BAKING": 0.0}
|
cats = {"TRAVEL": 1.0, "BAKING": 0.0}
|
||||||
# fmt: on
|
# fmt: on
|
||||||
doc = Doc(
|
doc = Doc(
|
||||||
|
|
|
@ -213,8 +213,9 @@ cdef class Doc:
|
||||||
sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
|
sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
|
||||||
the same length as words, to assign as token.is_sent_start. Will be
|
the same length as words, to assign as token.is_sent_start. Will be
|
||||||
overridden by heads if heads is provided. Defaults to None.
|
overridden by heads if heads is provided. Defaults to None.
|
||||||
ents (Optional[List[Tuple[Union[str, int], int, int]]]): A list of
|
ents (Optional[List[Union[str, None]]]): A list of unicode strings, of the same
|
||||||
(label, start, end) tuples to assign as doc.ents. Defaults to None.
|
length as words, as IOB tags to assign as token.ent_iob and
|
||||||
|
token.ent_type. None or "" marks a missing tag. Defaults to None.
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/doc#init
|
DOCS: https://nightly.spacy.io/api/doc#init
|
||||||
"""
|
"""
|
||||||
|
@ -275,16 +276,55 @@ cdef class Doc:
|
||||||
sent_starts[i] = -1
|
sent_starts[i] = -1
|
||||||
elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]:
|
elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]:
|
||||||
sent_starts[i] = 0
|
sent_starts[i] = 0
|
||||||
|
# Per-token entity annotation derived from the IOB tags in `ents`. Both lists
# stay None when no tags were passed, so the annotation loop skips them.
ent_iobs = None
ent_types = None
if ents is not None:
    iob_strings = Token.iob_strings()
    # Work on a copy so the caller's list is not modified by normalization.
    ents = list(ents)
    # Validate every tag before normalizing: a tag that is neither a string
    # nor None always raises E177, wherever it sits in the list. The empty
    # string is folded into None here so both "missing" spellings are
    # treated identically by the passes below.
    for i, ent in enumerate(ents):
        if ent is None:
            continue
        if not isinstance(ent, str):
            raise ValueError(Errors.E177.format(tag=ent))
        if ent == "":
            ents[i] = None
    # make valid IOB2 out of IOB1 or IOB2
    for i in range(1, len(ents)):
        prev_tag = ents[i - 1]
        tag = ents[i]
        if tag is None or not tag.startswith("I"):
            continue
        # OI -> OB (a missing tag counts like "O" here)
        follows_outside = prev_tag is None or prev_tag.startswith("O")
        # B-TYPE1 I-TYPE2 or I-TYPE1 I-TYPE2 -> B/I-TYPE1 B-TYPE2
        changes_type = (
            prev_tag is not None
            and prev_tag.startswith(("B", "I"))
            and prev_tag[1:] != tag[1:]
        )
        if follows_outside or changes_type:
            ents[i] = "B" + tag[1:]
    # Split each normalized tag into an index into iob_strings and its
    # entity type string.
    ent_iobs = []
    ent_types = []
    for ent in ents:
        if ent is None:
            # missing annotation
            ent_iobs.append(iob_strings.index(""))
            ent_types.append("")
        elif ent == "O":
            ent_iobs.append(iob_strings.index(ent))
            ent_types.append("")
        else:
            # Anything else must look like "<IOB>-<TYPE>" with a non-empty type.
            if len(ent) < 3 or ent[1] != "-":
                raise ValueError(Errors.E177.format(tag=ent))
            ent_iob, ent_type = ent.split("-", 1)
            if ent_iob not in iob_strings:
                raise ValueError(Errors.E177.format(tag=ent))
            ent_iobs.append(iob_strings.index(ent_iob))
            ent_types.append(ent_type)
|
||||||
headings = []
|
headings = []
|
||||||
values = []
|
values = []
|
||||||
annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts]
|
annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts, ent_iobs, ent_types]
|
||||||
possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START]
|
possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START, ENT_IOB, ENT_TYPE]
|
||||||
for a, annot in enumerate(annotations):
|
for a, annot in enumerate(annotations):
|
||||||
if annot is not None:
|
if annot is not None:
|
||||||
if len(annot) != len(words):
|
if len(annot) != len(words):
|
||||||
raise ValueError(Errors.E189)
|
raise ValueError(Errors.E189)
|
||||||
headings.append(possible_headings[a])
|
headings.append(possible_headings[a])
|
||||||
if annot is not heads and annot is not sent_starts:
|
if annot is not heads and annot is not sent_starts and annot is not ent_iobs:
|
||||||
values.extend(annot)
|
values.extend(annot)
|
||||||
for value in values:
|
for value in values:
|
||||||
self.vocab.strings.add(value)
|
self.vocab.strings.add(value)
|
||||||
|
@ -296,7 +336,7 @@ cdef class Doc:
|
||||||
j = 0
|
j = 0
|
||||||
for annot in annotations:
|
for annot in annotations:
|
||||||
if annot:
|
if annot:
|
||||||
if annot is heads or annot is sent_starts:
|
if annot is heads or annot is sent_starts or annot is ent_iobs:
|
||||||
for i in range(len(words)):
|
for i in range(len(words)):
|
||||||
if attrs.ndim == 1:
|
if attrs.ndim == 1:
|
||||||
attrs[i] = annot[i]
|
attrs[i] = annot[i]
|
||||||
|
@ -317,8 +357,6 @@ cdef class Doc:
|
||||||
attrs[i, j] = self.vocab.strings[annot[i]]
|
attrs[i, j] = self.vocab.strings[annot[i]]
|
||||||
j += 1
|
j += 1
|
||||||
self.from_array(headings, attrs)
|
self.from_array(headings, attrs)
|
||||||
if ents is not None:
|
|
||||||
self.ents = ents
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def _(self):
|
def _(self):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user