diff --git a/spacy/errors.py b/spacy/errors.py index b1cdb89ec..e52241be1 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -110,6 +110,8 @@ class Warnings(object): W028 = ("Doc.from_array was called with a vector of type '{type}', " "but is expecting one of type 'uint64' instead. This may result " "in problems with the vocab further on in the pipeline.") + W029 = ("Unable to align tokens with entities from character offsets. " + "Discarding entity annotation for the text: {text}.") @add_codes diff --git a/spacy/gold.pyx b/spacy/gold.pyx index a41f06898..8b61de683 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -648,6 +648,9 @@ cdef class GoldParse: # if self.lenght > 0, this is modified latter. self.orig_annot = [] + # temporary doc for aligning entity annotation + entdoc = None + # avoid allocating memory if the doc does not contain any tokens if self.length > 0: if words is None: @@ -670,7 +673,25 @@ cdef class GoldParse: entities = [(ent if ent is not None else "-") for ent in entities] if not isinstance(entities[0], basestring): # Assume we have entities specified by character offset. - entities = biluo_tags_from_offsets(doc, entities) + # Create a temporary Doc corresponding to provided words + # (to preserve gold tokenization) and text (to preserve + # character offsets). + entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text) + entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces) + entdoc_entities = biluo_tags_from_offsets(entdoc, entities) + # There may be some additional whitespace tokens in the + # temporary doc, so check that the annotations align with + # the provided words while building a list of BILUO labels. + entities = [] + words_offset = 0 + for i in range(len(entdoc_words)): + if words[i + words_offset] == entdoc_words[i]: + entities.append(entdoc_entities[i]) + else: + words_offset -= 1 + if len(entities) != len(words): + user_warning(Warnings.W029.format(text=doc.text)) + entities = ["-" for _ in words] # These are filled by the tagger/parser/entity recogniser self.c.tags = self.mem.alloc(len(doc), sizeof(int)) @@ -697,7 +718,8 @@ cdef class GoldParse: # If we under-segment, we'll have one predicted word that covers a # sequence of gold words. # If we "mis-segment", we'll have a sequence of predicted words covering - # a sequence of gold words. That's many-to-many -- we don't do that. + # a sequence of gold words. That's many-to-many -- we don't do that + # except for NER spans where the start and end can be aligned. cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words) self.cand_to_gold = [(j if j >= 0 else None) for j in i2j] @@ -720,7 +742,6 @@ cdef class GoldParse: self.tags[i] = tags[i2j_multi[i]] self.morphology[i] = morphology[i2j_multi[i]] is_last = i2j_multi[i] != i2j_multi.get(i+1) - is_first = i2j_multi[i] != i2j_multi.get(i-1) # Set next word in multi-token span as head, until last if not is_last: self.heads[i] = i+1 @@ -730,30 +751,10 @@ cdef class GoldParse: if head_i: self.heads[i] = self.gold_to_cand[head_i] self.labels[i] = deps[i2j_multi[i]] - # Now set NER...This is annoying because if we've split - # got an entity word split into two, we need to adjust the - # BILUO tags. We can't have BB or LL etc. - # Case 1: O -- easy. ner_tag = entities[i2j_multi[i]] - if ner_tag == "O": - self.ner[i] = "O" - # Case 2: U. This has to become a B I* L sequence. - elif ner_tag.startswith("U-"): - if is_first: - self.ner[i] = ner_tag.replace("U-", "B-", 1) - elif is_last: - self.ner[i] = ner_tag.replace("U-", "L-", 1) - else: - self.ner[i] = ner_tag.replace("U-", "I-", 1) - # Case 3: L. If not last, change to I. - elif ner_tag.startswith("L-"): - if is_last: - self.ner[i] = ner_tag - else: - self.ner[i] = ner_tag.replace("L-", "I-", 1) - # Case 4: I. Stays correct - elif ner_tag.startswith("I-"): - self.ner[i] = ner_tag + # Assign O/- for many-to-one O/- NER tags + if ner_tag in ("O", "-"): + self.ner[i] = ner_tag else: self.words[i] = words[gold_i] self.tags[i] = tags[gold_i] @@ -764,6 +765,39 @@ cdef class GoldParse: self.heads[i] = self.gold_to_cand[heads[gold_i]] self.labels[i] = deps[gold_i] self.ner[i] = entities[gold_i] + # Assign O/- for one-to-many O/- NER tags + for j, cand_j in enumerate(self.gold_to_cand): + if cand_j is None: + if j in j2i_multi: + i = j2i_multi[j] + ner_tag = entities[j] + if ner_tag in ("O", "-"): + self.ner[i] = ner_tag + + # If there is entity annotation and some tokens remain unaligned, + # align all entities at the character level to account for all + # possible token misalignments within the entity spans + if any([e not in ("O", "-") for e in entities]) and None in self.ner: + # If the temporary entdoc wasn't created above, initialize it + if not entdoc: + entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text) + entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces) + # Get offsets based on gold words and BILUO entities + entdoc_offsets = offsets_from_biluo_tags(entdoc, entities) + aligned_offsets = [] + aligned_spans = [] + # Filter offsets to identify those that align with doc tokens + for offset in entdoc_offsets: + span = doc.char_span(offset[0], offset[1]) + if span and not span.text.isspace(): + aligned_offsets.append(offset) + aligned_spans.append(span) + # Convert back to BILUO for doc tokens and assign NER for all + # aligned spans + biluo_tags = biluo_tags_from_offsets(doc, aligned_offsets, missing=None) + for span in aligned_spans: + for i in range(span.start, span.end): + self.ner[i] = biluo_tags[i] # Prevent whitespace that isn't within entities from being tagged as # an entity. diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index b546e079b..fc9e624eb 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -6,6 +6,7 @@ from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo from spacy.gold import GoldCorpus, docs_to_json, align from spacy.lang.en import English from spacy.tokens import Doc +from spacy.util import get_words_and_spaces from .util import make_tempdir import pytest import srsly @@ -59,6 +60,75 @@ def test_gold_biluo_misalign(en_vocab): assert tags == ["O", "O", "O", "-", "-", "-"] +def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer): + # one-to-many + words = ["I", "flew to", "San Francisco Valley", "."] + spaces = [True, True, False, False] + doc = Doc(en_vocab, words=words, spaces=spaces) + entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] + gp = GoldParse( + doc, + words=["I", "flew", "to", "San", "Francisco", "Valley", "."], + entities=entities, + ) + assert gp.ner == ["O", "O", "U-LOC", "O"] + + # many-to-one + words = ["I", "flew", "to", "San", "Francisco", "Valley", "."] + spaces = [True, True, True, True, True, False, False] + doc = Doc(en_vocab, words=words, spaces=spaces) + entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] + gp = GoldParse( + doc, words=["I", "flew to", "San Francisco Valley", "."], entities=entities + ) + assert gp.ner == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] + + # misaligned + words = ["I flew", "to", "San Francisco", "Valley", "."] + spaces = [True, True, True, False, False] + doc = Doc(en_vocab, words=words, spaces=spaces) + entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] + gp = GoldParse( + doc, words=["I", "flew to", "San", "Francisco Valley", "."], entities=entities, + ) + assert gp.ner == ["O", "O", "B-LOC", "L-LOC", "O"] + + # additional whitespace tokens in GoldParse words + words, spaces = get_words_and_spaces( + ["I", "flew", "to", "San Francisco", "Valley", "."], + "I flew to San Francisco Valley.", + ) + doc = Doc(en_vocab, words=words, spaces=spaces) + entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] + gp = GoldParse( + doc, + words=["I", "flew", " ", "to", "San Francisco Valley", "."], + entities=entities, + ) + assert gp.ner == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"] + + # from issue #4791 + data = ( + "I'll return the ₹54 amount", + { + "words": ["I", "'ll", "return", "the", "₹", "54", "amount",], + "entities": [(16, 19, "MONEY")], + }, + ) + gp = GoldParse(en_tokenizer(data[0]), **data[1]) + assert gp.ner == ["O", "O", "O", "O", "U-MONEY", "O"] + + data = ( + "I'll return the $54 amount", + { + "words": ["I", "'ll", "return", "the", "$", "54", "amount",], + "entities": [(16, 19, "MONEY")], + }, + ) + gp = GoldParse(en_tokenizer(data[0]), **data[1]) + assert gp.ner == ["O", "O", "O", "O", "B-MONEY", "L-MONEY", "O"] + + def test_roundtrip_offsets_biluo_conversion(en_tokenizer): text = "I flew to Silicon Valley via London." biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] diff --git a/spacy/util.py b/spacy/util.py index 1c627af46..a5e27a210 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -758,7 +758,7 @@ def get_serialization_exclude(serializers, exclude, kwargs): def get_words_and_spaces(words, text): - if "".join("".join(words).split())!= "".join(text.split()): + if "".join("".join(words).split()) != "".join(text.split()): raise ValueError(Errors.E194.format(text=text, words=words)) text_words = [] text_spaces = []