Improve GoldParse NER alignment (#5335)

Improve GoldParse NER alignment by including all cases where the start
and end of the NER span can be aligned, regardless of internal
tokenization differences.

To do this, convert BILUO tags to character offsets, check start/end
alignment with `doc.char_span()`, and assign the BILUO tags for the
aligned spans. Alignment for `O/-` tags is handled through the
one-to-one and multi alignments.
This commit is contained in:
adrianeboyd 2020-04-23 16:58:23 +02:00 committed by GitHub
parent 481574cbc8
commit 84e06f9fb7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 133 additions and 27 deletions

View File

@ -110,6 +110,8 @@ class Warnings(object):
W028 = ("Doc.from_array was called with a vector of type '{type}', " W028 = ("Doc.from_array was called with a vector of type '{type}', "
"but is expecting one of type 'uint64' instead. This may result " "but is expecting one of type 'uint64' instead. This may result "
"in problems with the vocab further on in the pipeline.") "in problems with the vocab further on in the pipeline.")
W029 = ("Unable to align tokens with entities from character offsets. "
"Discarding entity annotation for the text: {text}.")
@add_codes @add_codes

View File

@ -648,6 +648,9 @@ cdef class GoldParse:
# if self.lenght > 0, this is modified latter. # if self.lenght > 0, this is modified latter.
self.orig_annot = [] self.orig_annot = []
# temporary doc for aligning entity annotation
entdoc = None
# avoid allocating memory if the doc does not contain any tokens # avoid allocating memory if the doc does not contain any tokens
if self.length > 0: if self.length > 0:
if words is None: if words is None:
@ -670,7 +673,25 @@ cdef class GoldParse:
entities = [(ent if ent is not None else "-") for ent in entities] entities = [(ent if ent is not None else "-") for ent in entities]
if not isinstance(entities[0], basestring): if not isinstance(entities[0], basestring):
# Assume we have entities specified by character offset. # Assume we have entities specified by character offset.
entities = biluo_tags_from_offsets(doc, entities) # Create a temporary Doc corresponding to provided words
# (to preserve gold tokenization) and text (to preserve
# character offsets).
entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
entdoc_entities = biluo_tags_from_offsets(entdoc, entities)
# There may be some additional whitespace tokens in the
# temporary doc, so check that the annotations align with
# the provided words while building a list of BILUO labels.
entities = []
words_offset = 0
for i in range(len(entdoc_words)):
if words[i + words_offset] == entdoc_words[i]:
entities.append(entdoc_entities[i])
else:
words_offset -= 1
if len(entities) != len(words):
user_warning(Warnings.W029.format(text=doc.text))
entities = ["-" for _ in words]
# These are filled by the tagger/parser/entity recogniser # These are filled by the tagger/parser/entity recogniser
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int)) self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
@ -697,7 +718,8 @@ cdef class GoldParse:
# If we under-segment, we'll have one predicted word that covers a # If we under-segment, we'll have one predicted word that covers a
# sequence of gold words. # sequence of gold words.
# If we "mis-segment", we'll have a sequence of predicted words covering # If we "mis-segment", we'll have a sequence of predicted words covering
# a sequence of gold words. That's many-to-many -- we don't do that. # a sequence of gold words. That's many-to-many -- we don't do that
# except for NER spans where the start and end can be aligned.
cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words) cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j] self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
@ -720,7 +742,6 @@ cdef class GoldParse:
self.tags[i] = tags[i2j_multi[i]] self.tags[i] = tags[i2j_multi[i]]
self.morphology[i] = morphology[i2j_multi[i]] self.morphology[i] = morphology[i2j_multi[i]]
is_last = i2j_multi[i] != i2j_multi.get(i+1) is_last = i2j_multi[i] != i2j_multi.get(i+1)
is_first = i2j_multi[i] != i2j_multi.get(i-1)
# Set next word in multi-token span as head, until last # Set next word in multi-token span as head, until last
if not is_last: if not is_last:
self.heads[i] = i+1 self.heads[i] = i+1
@ -730,30 +751,10 @@ cdef class GoldParse:
if head_i: if head_i:
self.heads[i] = self.gold_to_cand[head_i] self.heads[i] = self.gold_to_cand[head_i]
self.labels[i] = deps[i2j_multi[i]] self.labels[i] = deps[i2j_multi[i]]
# Now set NER...This is annoying because if we've split
# got an entity word split into two, we need to adjust the
# BILUO tags. We can't have BB or LL etc.
# Case 1: O -- easy.
ner_tag = entities[i2j_multi[i]] ner_tag = entities[i2j_multi[i]]
if ner_tag == "O": # Assign O/- for many-to-one O/- NER tags
self.ner[i] = "O" if ner_tag in ("O", "-"):
# Case 2: U. This has to become a B I* L sequence. self.ner[i] = ner_tag
elif ner_tag.startswith("U-"):
if is_first:
self.ner[i] = ner_tag.replace("U-", "B-", 1)
elif is_last:
self.ner[i] = ner_tag.replace("U-", "L-", 1)
else:
self.ner[i] = ner_tag.replace("U-", "I-", 1)
# Case 3: L. If not last, change to I.
elif ner_tag.startswith("L-"):
if is_last:
self.ner[i] = ner_tag
else:
self.ner[i] = ner_tag.replace("L-", "I-", 1)
# Case 4: I. Stays correct
elif ner_tag.startswith("I-"):
self.ner[i] = ner_tag
else: else:
self.words[i] = words[gold_i] self.words[i] = words[gold_i]
self.tags[i] = tags[gold_i] self.tags[i] = tags[gold_i]
@ -764,6 +765,39 @@ cdef class GoldParse:
self.heads[i] = self.gold_to_cand[heads[gold_i]] self.heads[i] = self.gold_to_cand[heads[gold_i]]
self.labels[i] = deps[gold_i] self.labels[i] = deps[gold_i]
self.ner[i] = entities[gold_i] self.ner[i] = entities[gold_i]
# Assign O/- for one-to-many O/- NER tags
for j, cand_j in enumerate(self.gold_to_cand):
if cand_j is None:
if j in j2i_multi:
i = j2i_multi[j]
ner_tag = entities[j]
if ner_tag in ("O", "-"):
self.ner[i] = ner_tag
# If there is entity annotation and some tokens remain unaligned,
# align all entities at the character level to account for all
# possible token misalignments within the entity spans
if any([e not in ("O", "-") for e in entities]) and None in self.ner:
# If the temporary entdoc wasn't created above, initialize it
if not entdoc:
entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
# Get offsets based on gold words and BILUO entities
entdoc_offsets = offsets_from_biluo_tags(entdoc, entities)
aligned_offsets = []
aligned_spans = []
# Filter offsets to identify those that align with doc tokens
for offset in entdoc_offsets:
span = doc.char_span(offset[0], offset[1])
if span and not span.text.isspace():
aligned_offsets.append(offset)
aligned_spans.append(span)
# Convert back to BILUO for doc tokens and assign NER for all
# aligned spans
biluo_tags = biluo_tags_from_offsets(doc, aligned_offsets, missing=None)
for span in aligned_spans:
for i in range(span.start, span.end):
self.ner[i] = biluo_tags[i]
# Prevent whitespace that isn't within entities from being tagged as # Prevent whitespace that isn't within entities from being tagged as
# an entity. # an entity.

View File

@ -6,6 +6,7 @@ from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo
from spacy.gold import GoldCorpus, docs_to_json, align from spacy.gold import GoldCorpus, docs_to_json, align
from spacy.lang.en import English from spacy.lang.en import English
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.util import get_words_and_spaces
from .util import make_tempdir from .util import make_tempdir
import pytest import pytest
import srsly import srsly
@ -59,6 +60,75 @@ def test_gold_biluo_misalign(en_vocab):
assert tags == ["O", "O", "O", "-", "-", "-"] assert tags == ["O", "O", "O", "-", "-", "-"]
def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
# one-to-many
words = ["I", "flew to", "San Francisco Valley", "."]
spaces = [True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces)
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
gp = GoldParse(
doc,
words=["I", "flew", "to", "San", "Francisco", "Valley", "."],
entities=entities,
)
assert gp.ner == ["O", "O", "U-LOC", "O"]
# many-to-one
words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
spaces = [True, True, True, True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces)
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
gp = GoldParse(
doc, words=["I", "flew to", "San Francisco Valley", "."], entities=entities
)
assert gp.ner == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
# misaligned
words = ["I flew", "to", "San Francisco", "Valley", "."]
spaces = [True, True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces)
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
gp = GoldParse(
doc, words=["I", "flew to", "San", "Francisco Valley", "."], entities=entities,
)
assert gp.ner == ["O", "O", "B-LOC", "L-LOC", "O"]
# additional whitespace tokens in GoldParse words
words, spaces = get_words_and_spaces(
["I", "flew", "to", "San Francisco", "Valley", "."],
"I flew to San Francisco Valley.",
)
doc = Doc(en_vocab, words=words, spaces=spaces)
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
gp = GoldParse(
doc,
words=["I", "flew", " ", "to", "San Francisco Valley", "."],
entities=entities,
)
assert gp.ner == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]
# from issue #4791
data = (
"I'll return the ₹54 amount",
{
"words": ["I", "'ll", "return", "the", "", "54", "amount",],
"entities": [(16, 19, "MONEY")],
},
)
gp = GoldParse(en_tokenizer(data[0]), **data[1])
assert gp.ner == ["O", "O", "O", "O", "U-MONEY", "O"]
data = (
"I'll return the $54 amount",
{
"words": ["I", "'ll", "return", "the", "$", "54", "amount",],
"entities": [(16, 19, "MONEY")],
},
)
gp = GoldParse(en_tokenizer(data[0]), **data[1])
assert gp.ner == ["O", "O", "O", "O", "B-MONEY", "L-MONEY", "O"]
def test_roundtrip_offsets_biluo_conversion(en_tokenizer): def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
text = "I flew to Silicon Valley via London." text = "I flew to Silicon Valley via London."
biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]

View File

@ -758,7 +758,7 @@ def get_serialization_exclude(serializers, exclude, kwargs):
def get_words_and_spaces(words, text): def get_words_and_spaces(words, text):
if "".join("".join(words).split())!= "".join(text.split()): if "".join("".join(words).split()) != "".join(text.split()):
raise ValueError(Errors.E194.format(text=text, words=words)) raise ValueError(Errors.E194.format(text=text, words=words))
text_words = [] text_words = []
text_spaces = [] text_spaces = []