Modify retokenizer to use span root attributes (#4219)

* Modify retokenizer to use span root attributes

* tag/pos/morph are set to root tag/pos/morph

* lemma and norm are reset and fall back to the merged orth (not ideal,
but better than the orth of the first token)

* Also handle individual merge case

* Add test

* Attempt to handle ent_iob and ent_type in merges

* Fix check for whether B-ENT should become I-ENT

* Move IOB consistency check to after attrs

Move all IOB consistency checks to after the attrs are set, and simplify
them to a single pass over the entire document that changes I to B at the
beginning of the document or wherever the previous token's entity type
differs.

* Move IOB consistency check for single merge

Move the IOB consistency check to after the token array is compressed in
the single-merge case.

* Update spacy/tokens/_retokenize.pyx

Co-Authored-By: Matthew Honnibal <honnibal+gh@gmail.com>

* Remove single vs. multiple merge distinction

Remove original single-instance `_merge()` and use `_bulk_merge()` (now
renamed `_merge()`) for all merges.

* Add out-of-bounds check to the previous-entity check
Commit aec755d3a3 (parent 53a9ca45c9), authored by adrianeboyd on 2019-09-08 13:04:49 +02:00, committed by Ines Montani.
2 changed files with 130 additions and 110 deletions
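To illustrate the merged defaults described above, here is a minimal usage sketch based on the spaCy v2-style API exercised in the tests below (the words and the "GPE" label are made up for the example): when no attrs are passed, the merged token takes tag/pos/morph and the entity annotation from the span root, and lemma/norm fall back to the merged orth.

from spacy.vocab import Vocab
from spacy.tokens import Doc, Span

doc = Doc(Vocab(), words=["New", "York", "is", "nice"])
doc.ents = [Span(doc, 0, 2, label="GPE")]
with doc.retokenize() as retokenizer:
    # No attrs supplied: defaults come from the span root (the first token
    # when there is no parse); lemma and norm fall back to the merged orth.
    retokenizer.merge(doc[0:2])
assert doc[0].text == "New York"
assert doc[0].lemma_ == "New York"
assert doc[0].ent_iob_ == "B"
assert doc[0].ent_type_ == "GPE"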


@@ -99,6 +99,41 @@ def test_doc_retokenize_spans_merge_tokens(en_tokenizer):
     assert doc[0].ent_type_ == "GPE"


+def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
+    text = "The players start."
+    heads = [1, 1, 0, -1]
+    tokens = en_tokenizer(text)
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=["DT", "NN", "VBZ", "."], pos=["DET", "NOUN", "VERB", "PUNCT"], heads=heads)
+    assert len(doc) == 4
+    assert doc[0].text == "The"
+    assert doc[0].tag_ == "DT"
+    assert doc[0].pos_ == "DET"
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[0:2])
+    assert len(doc) == 3
+    assert doc[0].text == "The players"
+    assert doc[0].tag_ == "NN"
+    assert doc[0].pos_ == "NOUN"
+    assert doc[0].lemma_ == "The players"
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=["DT", "NN", "VBZ", "."], pos=["DET", "NOUN", "VERB", "PUNCT"], heads=heads)
+    assert len(doc) == 4
+    assert doc[0].text == "The"
+    assert doc[0].tag_ == "DT"
+    assert doc[0].pos_ == "DET"
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[0:2])
+        retokenizer.merge(doc[2:4])
+    assert len(doc) == 2
+    assert doc[0].text == "The players"
+    assert doc[0].tag_ == "NN"
+    assert doc[0].pos_ == "NOUN"
+    assert doc[0].lemma_ == "The players"
+    assert doc[1].text == "start ."
+    assert doc[1].tag_ == "VBZ"
+    assert doc[1].pos_ == "VERB"
+    assert doc[1].lemma_ == "start ."
+
+
 def test_doc_retokenize_spans_merge_heads(en_tokenizer):
     text = "I found a pilates class near work."
     heads = [1, 0, 2, 1, -3, -1, -1, -6]
@@ -182,7 +217,7 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
     assert len(doc) == 15


-def test_doc_retokenize_spans_entity_merge_iob():
+def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
     # Test entity IOB stays consistent after merging
     words = ["a", "b", "c", "d", "e"]
     doc = Doc(Vocab(), words=words)
@@ -195,10 +230,23 @@ def test_doc_retokenize_spans_entity_merge_iob():
     assert doc[2].ent_iob_ == "I"
     assert doc[3].ent_iob_ == "B"
     with doc.retokenize() as retokenizer:
-        retokenizer.merge(doc[0:1])
+        retokenizer.merge(doc[0:2])
+    assert len(doc) == len(words) - 1
     assert doc[0].ent_iob_ == "B"
     assert doc[1].ent_iob_ == "I"
+    # Test that IOB stays consistent with provided IOB
+    words = ["a", "b", "c", "d", "e"]
+    doc = Doc(Vocab(), words=words)
+    with doc.retokenize() as retokenizer:
+        attrs = {"ent_type": "ent-abc", "ent_iob": 1}
+        retokenizer.merge(doc[0:3], attrs=attrs)
+        retokenizer.merge(doc[3:5], attrs=attrs)
+    assert doc[0].ent_iob_ == "B"
+    assert doc[1].ent_iob_ == "I"
+    # if no parse/heads, the first word in the span is the root and provides
+    # default values
     words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
     doc = Doc(Vocab(), words=words)
     doc.ents = [
@@ -215,7 +263,53 @@ def test_doc_retokenize_spans_entity_merge_iob():
         retokenizer.merge(doc[7:9])
     assert len(doc) == 6
     assert doc[3].ent_iob_ == "B"
-    assert doc[4].ent_iob_ == "I"
+    assert doc[3].ent_type_ == "ent-de"
+    assert doc[4].ent_iob_ == "B"
+    assert doc[4].ent_type_ == "ent-fg"
+
+    # if there is a parse, span.root provides default values
+    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
+    heads = [0, -1, 1, -3, -4, -5, -1, -7, -8]
+    ents = [
+        (3, 5, "ent-de"),
+        (5, 7, "ent-fg"),
+    ]
+    deps = ["dep"] * len(words)
+    en_vocab.strings.add("ent-de")
+    en_vocab.strings.add("ent-fg")
+    en_vocab.strings.add("dep")
+    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
+    assert doc[2:4].root == doc[3]  # root of 'c d' is d
+    assert doc[4:6].root == doc[4]  # root of 'e f' is e
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[2:4])
+        retokenizer.merge(doc[4:6])
+        retokenizer.merge(doc[7:9])
+    assert len(doc) == 6
+    assert doc[2].ent_iob_ == "B"
+    assert doc[2].ent_type_ == "ent-de"
+    assert doc[3].ent_iob_ == "I"
+    assert doc[3].ent_type_ == "ent-de"
+    assert doc[4].ent_iob_ == "B"
+    assert doc[4].ent_type_ == "ent-fg"
+
+    # check that B is preserved if span[start] is B
+    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
+    heads = [0, -1, 1, 1, -4, -5, -1, -7, -8]
+    ents = [
+        (3, 5, "ent-de"),
+        (5, 7, "ent-de"),
+    ]
+    deps = ["dep"] * len(words)
+    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[3:5])
+        retokenizer.merge(doc[5:7])
+    assert len(doc) == 7
+    assert doc[3].ent_iob_ == "B"
+    assert doc[3].ent_type_ == "ent-de"
+    assert doc[4].ent_iob_ == "B"
+    assert doc[4].ent_type_ == "ent-de"


 def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):

spacy/tokens/_retokenize.pyx

@@ -109,13 +109,8 @@ cdef class Retokenizer:
     def __exit__(self, *args):
         # Do the actual merging here
-        if len(self.merges) > 1:
-            _bulk_merge(self.doc, self.merges)
-        elif len(self.merges) == 1:
-            (span, attrs) = self.merges[0]
-            start = span.start
-            end = span.end
-            _merge(self.doc, start, end, attrs)
+        if len(self.merges) >= 1:
+            _merge(self.doc, self.merges)
         # Iterate in order, to keep things simple.
         for start_char, orths, heads, attrs in sorted(self.splits):
             # Resolve token index
@@ -140,95 +135,7 @@ cdef class Retokenizer:
             _split(self.doc, token_index, orths, head_indices, attrs)


-def _merge(Doc doc, int start, int end, attributes):
-    """Retokenize the document, such that the span at
-    `doc.text[start_idx : end_idx]` is merged into a single token. If
-    `start_idx` and `end_idx `do not mark start and end token boundaries,
-    the document remains unchanged.
-    start_idx (int): Character index of the start of the slice to merge.
-    end_idx (int): Character index after the end of the slice to merge.
-    **attributes: Attributes to assign to the merged token. By default,
-        attributes are inherited from the syntactic root of the span.
-    RETURNS (Token): The newly merged token, or `None` if the start and end
-        indices did not fall at token boundaries.
-    """
-    cdef Span span = doc[start:end]
-    cdef int start_char = span.start_char
-    cdef int end_char = span.end_char
-    # Resize the doc.tensor, if it's set. Let the last row for each token stand
-    # for the merged region. To do this, we create a boolean array indicating
-    # whether the row is to be deleted, then use numpy.delete
-    if doc.tensor is not None and doc.tensor.size != 0:
-        doc.tensor = _resize_tensor(doc.tensor, [(start, end)])
-    # Get LexemeC for newly merged token
-    new_orth = ''.join([t.text_with_ws for t in span])
-    if span[-1].whitespace_:
-        new_orth = new_orth[:-len(span[-1].whitespace_)]
-    cdef const LexemeC* lex = doc.vocab.get(doc.mem, new_orth)
-    # House the new merged token where it starts
-    cdef TokenC* token = &doc.c[start]
-    token.spacy = doc.c[end-1].spacy
-    for attr_name, attr_value in attributes.items():
-        if attr_name == "_":  # Set extension attributes
-            for ext_attr_key, ext_attr_value in attr_value.items():
-                doc[start]._.set(ext_attr_key, ext_attr_value)
-        elif attr_name == TAG:
-            doc.vocab.morphology.assign_tag(token, attr_value)
-        else:
-            # Set attributes on both token and lexeme to take care of token
-            # attribute vs. lexical attribute without having to enumerate them.
-            # If an attribute name is not valid, set_struct_attr will ignore it.
-            Token.set_struct_attr(token, attr_name, attr_value)
-            Lexeme.set_struct_attr(<LexemeC*>lex, attr_name, attr_value)
-    # Make sure ent_iob remains consistent
-    if doc.c[end].ent_iob == 1 and token.ent_iob in (0, 2):
-        if token.ent_type == doc.c[end].ent_type:
-            token.ent_iob = 3
-        else:
-            # If they're not the same entity type, let them be two entities
-            doc.c[end].ent_iob = 3
-    # Begin by setting all the head indices to absolute token positions
-    # This is easier to work with for now than the offsets
-    # Before thinking of something simpler, beware the case where a
-    # dependency bridges over the entity. Here the alignment of the
-    # tokens changes.
-    span_root = span.root.i
-    token.dep = span.root.dep
-    # We update token.lex after keeping span root and dep, since
-    # setting token.lex will change span.start and span.end properties
-    # as it modifies the character offsets in the doc
-    token.lex = lex
-    for i in range(doc.length):
-        doc.c[i].head += i
-    # Set the head of the merged token, and its dep relation, from the Span
-    token.head = doc.c[span_root].head
-    # Adjust deps before shrinking tokens
-    # Tokens which point into the merged token should now point to it
-    # Subtract the offset from all tokens which point to >= end
-    offset = (end - start) - 1
-    for i in range(doc.length):
-        head_idx = doc.c[i].head
-        if start <= head_idx < end:
-            doc.c[i].head = start
-        elif head_idx >= end:
-            doc.c[i].head -= offset
-    # Now compress the token array
-    for i in range(end, doc.length):
-        doc.c[i - offset] = doc.c[i]
-    for i in range(doc.length - offset, doc.length):
-        memset(&doc.c[i], 0, sizeof(TokenC))
-        doc.c[i].lex = &EMPTY_LEXEME
-    doc.length -= offset
-    for i in range(doc.length):
-        # ...And, set heads back to a relative position
-        doc.c[i].head -= i
-    # Set the left/right children, left/right edges
-    set_children_from_heads(doc.c, doc.length)
-    # Return the merged Python object
-    return doc[start]
-
-
-def _bulk_merge(Doc doc, merges):
+def _merge(Doc doc, merges):
     """Retokenize the document, such that the spans described in 'merges'
     are merged into a single token. This method assumes that the merges
     are in the same order at which they appear in the doc, and that merges
@@ -256,6 +163,26 @@ def _bulk_merge(Doc doc, merges):
         spans.append(span)
         # House the new merged token where it starts
         token = &doc.c[start]
+        # Initially set attributes to attributes of span root
+        token.tag = doc.c[span.root.i].tag
+        token.pos = doc.c[span.root.i].pos
+        token.morph = doc.c[span.root.i].morph
+        token.ent_iob = doc.c[span.root.i].ent_iob
+        token.ent_type = doc.c[span.root.i].ent_type
+        merged_iob = token.ent_iob
+        # If span root is part of an entity, merged token is B-ENT
+        if token.ent_iob in (1, 3):
+            merged_iob = 3
+            # If start token is I-ENT and previous token is of the same
+            # type, then I-ENT (could check I-ENT from start to span root)
+            if doc.c[start].ent_iob == 1 and start > 0 \
+                    and doc.c[start].ent_type == token.ent_type \
+                    and doc.c[start - 1].ent_type == token.ent_type:
+                merged_iob = 1
+        token.ent_iob = merged_iob
+        # Unset attributes that don't match new token
+        token.lemma = 0
+        token.norm = 0
         tokens[merge_index] = token
     # Resize the doc.tensor, if it's set. Let the last row for each token stand
     # for the merged region. To do this, we create a boolean array indicating
@@ -351,17 +278,7 @@ def _bulk_merge(Doc doc, merges):
     # Set the left/right children, left/right edges
     set_children_from_heads(doc.c, doc.length)
     # Make sure ent_iob remains consistent
-    for (span, _) in merges:
-        if(span.end < len(offsets)):
-            # If it's not the last span
-            token_after_span_position = offsets[span.end]
-            if doc.c[token_after_span_position].ent_iob == 1\
-            and doc.c[token_after_span_position - 1].ent_iob in (0, 2):
-                if doc.c[token_after_span_position - 1].ent_type == doc.c[token_after_span_position].ent_type:
-                    doc.c[token_after_span_position - 1].ent_iob = 3
-                else:
-                    # If they're not the same entity type, let them be two entities
-                    doc.c[token_after_span_position].ent_iob = 3
+    make_iob_consistent(doc.c, doc.length)
     # Return the merged Python object
     return doc[spans[0].start]
@@ -480,3 +397,12 @@ def _validate_extensions(extensions):
             raise ValueError(Errors.E118.format(attr=key))
         if not is_writable_attr(extension):
             raise ValueError(Errors.E119.format(attr=key))
+
+
+cdef make_iob_consistent(TokenC* tokens, int length):
+    cdef int i
+    if tokens[0].ent_iob == 1:
+        tokens[0].ent_iob = 3
+    for i in range(1, length):
+        if tokens[i].ent_iob == 1 and tokens[i - 1].ent_type != tokens[i].ent_type:
+            tokens[i].ent_iob = 3
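For reference, a rough pure-Python sketch of the rule this new helper enforces (illustrative only, not the compiled code; spaCy stores IOB as integers internally, with 1 = I, 2 = O, 3 = B): a token may not be inside an entity at the start of the document or directly after a token with a different entity type, so such I tokens become B.

def make_iob_consistent(ent_iobs, ent_types):
    # The first token of the document can never continue an entity.
    if ent_iobs and ent_iobs[0] == 1:
        ent_iobs[0] = 3
    # I is only valid when the previous token has the same entity type.
    for i in range(1, len(ent_iobs)):
        if ent_iobs[i] == 1 and ent_types[i - 1] != ent_types[i]:
            ent_iobs[i] = 3
    return ent_iobs

For example, ent_iobs = [1, 1, 1] with ent_types = ["ORG", "ORG", "PER"] becomes [3, 1, 3].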