mirror of
https://github.com/explosion/spaCy.git
synced 2025-06-03 12:43:15 +03:00
Merge pull request #2159 from explosion/feature/fix-merged-entity-iob (resolves #1554, resolves #1752)
💫 Fix token.ent_iob after doc.merge(), and ensure consistency in doc.ents
This commit is contained in:
commit
0de599b16b
|
@ -18,7 +18,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
|
||||||
assert [w.ent_iob_ for w in doc] == (['O'] * len(doc))
|
assert [w.ent_iob_ for w in doc] == (['O'] * len(doc))
|
||||||
|
|
||||||
doc.ents = [(doc.vocab.strings['ANIMAL'], 3, 4)]
|
doc.ents = [(doc.vocab.strings['ANIMAL'], 3, 4)]
|
||||||
assert [w.ent_iob_ for w in doc] == ['O', 'O', 'O', 'B']
|
assert [w.ent_iob_ for w in doc] == ['', '', '', 'B']
|
||||||
|
|
||||||
doc.ents = [(doc.vocab.strings['WORD'], 0, 2)]
|
doc.ents = [(doc.vocab.strings['WORD'], 0, 2)]
|
||||||
assert [w.ent_iob_ for w in doc] == ['B', 'I', 'O', 'O']
|
assert [w.ent_iob_ for w in doc] == ['B', 'I', '', '']
|
||||||
|
|
|
@ -2,6 +2,8 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..util import get_doc
|
from ..util import get_doc
|
||||||
|
from ...vocab import Vocab
|
||||||
|
from ...tokens import Doc
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
@ -95,6 +97,21 @@ def test_spans_entity_merge(en_tokenizer):
|
||||||
assert len(doc) == 15
|
assert len(doc) == 15
|
||||||
|
|
||||||
|
|
||||||
|
def test_spans_entity_merge_iob():
|
||||||
|
# Test entity IOB stays consistent after merging
|
||||||
|
words = ["a", "b", "c", "d", "e"]
|
||||||
|
doc = Doc(Vocab(), words=words)
|
||||||
|
doc.ents = [(doc.vocab.strings.add('ent-abc'), 0, 3),
|
||||||
|
(doc.vocab.strings.add('ent-d'), 3, 4)]
|
||||||
|
assert doc[0].ent_iob_ == "B"
|
||||||
|
assert doc[1].ent_iob_ == "I"
|
||||||
|
assert doc[2].ent_iob_ == "I"
|
||||||
|
assert doc[3].ent_iob_ == "B"
|
||||||
|
doc[0:1].merge()
|
||||||
|
assert doc[0].ent_iob_ == "B"
|
||||||
|
assert doc[1].ent_iob_ == "I"
|
||||||
|
|
||||||
|
|
||||||
def test_spans_sentence_update_after_merge(en_tokenizer):
|
def test_spans_sentence_update_after_merge(en_tokenizer):
|
||||||
text = "Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale."
|
text = "Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale."
|
||||||
heads = [1, 1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2, -7]
|
heads = [1, 1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2, -7]
|
||||||
|
|
|
@ -421,7 +421,12 @@ cdef class Doc:
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
token = &self.c[i]
|
token = &self.c[i]
|
||||||
if token.ent_iob == 1:
|
if token.ent_iob == 1:
|
||||||
assert start != -1
|
if start == -1:
|
||||||
|
seq = ['%s|%s' % (t.text, t.ent_iob_) for t in self[i-5:i+5]]
|
||||||
|
raise ValueError(
|
||||||
|
"token.ent_iob values make invalid sequence: "
|
||||||
|
"I without B\n"
|
||||||
|
"{seq}".format(seq=' '.join(seq)))
|
||||||
elif token.ent_iob == 2 or token.ent_iob == 0:
|
elif token.ent_iob == 2 or token.ent_iob == 0:
|
||||||
if start != -1:
|
if start != -1:
|
||||||
output.append(Span(self, start, i, label=label))
|
output.append(Span(self, start, i, label=label))
|
||||||
|
@ -446,10 +451,7 @@ cdef class Doc:
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
self.c[i].ent_type = 0
|
self.c[i].ent_type = 0
|
||||||
# At this point we don't know whether the NER has run over the
|
self.c[i].ent_iob = 0 # Means missing.
|
||||||
# Doc. If the ent_iob is missing, leave it missing.
|
|
||||||
if self.c[i].ent_iob != 0:
|
|
||||||
self.c[i].ent_iob = 2 # Means O. Non-O are set from ents.
|
|
||||||
cdef attr_t ent_type
|
cdef attr_t ent_type
|
||||||
cdef int start, end
|
cdef int start, end
|
||||||
for ent_info in ents:
|
for ent_info in ents:
|
||||||
|
@ -947,6 +949,13 @@ cdef class Doc:
|
||||||
self.vocab.morphology.assign_tag(token, attr_value)
|
self.vocab.morphology.assign_tag(token, attr_value)
|
||||||
else:
|
else:
|
||||||
Token.set_struct_attr(token, attr_name, attr_value)
|
Token.set_struct_attr(token, attr_name, attr_value)
|
||||||
|
# Make sure ent_iob remains consistent
|
||||||
|
if self.c[end].ent_iob == 1 and token.ent_iob in (0, 2):
|
||||||
|
if token.ent_type == self.c[end].ent_type:
|
||||||
|
token.ent_iob = 3
|
||||||
|
else:
|
||||||
|
# If they're not the same entity type, let them be two entities
|
||||||
|
self.c[end].ent_iob = 3
|
||||||
# Begin by setting all the head indices to absolute token positions
|
# Begin by setting all the head indices to absolute token positions
|
||||||
# This is easier to work with for now than the offsets
|
# This is easier to work with for now than the offsets
|
||||||
# Before thinking of something simpler, beware the case where a
|
# Before thinking of something simpler, beware the case where a
|
||||||
|
|
Loading…
Reference in New Issue
Block a user