Add support for splitting one token into several (resolves #2838) (#3253)

* Add support for splitting one token into several (resolves #2838)

* Improve error message for token splitting

* Make retokenizer.split() tests use a Token object

Change retokenizer.split() to use a Token object, instead of an index.

* Pass Token into retokenize.split()

Tweak retokenize.split() API so that we pass the `Token` object, not the index.

* Fix token.idx in retokenize.split()

* Test that token.idx is correct after split

* Fix token.idx for split tokens

* Fix retokenize.split()

* Fix retokenize.split

* Fix retokenize.split() test
This commit is contained in:
Grivaz 2019-02-14 09:27:13 -05:00 committed by Matthew Honnibal
parent 11d6b874db
commit 39815513e2
3 changed files with 233 additions and 4 deletions

View File

@ -253,6 +253,9 @@ class Errors(object):
E098 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A token" E098 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A token"
" can only be part of one entity, so make sure the entities you're " " can only be part of one entity, so make sure the entities you're "
"setting don't overlap.") "setting don't overlap.")
E099 = ("The newly split token can only have one root (head = 0).")
E100 = ("The newly split token needs to have a root (head = 0)")
E101 = ("All subtokens must have associated heads")
@add_codes @add_codes
class TempErrors(object): class TempErrors(object):

View File

@ -0,0 +1,114 @@
# coding: utf-8
from __future__ import unicode_literals
from ..util import get_doc
from ...vocab import Vocab
from ...tokens import Doc
from ...tokens import Span
import pytest
def test_doc_split(en_tokenizer):
    """Split the first token in two and check texts, heads and offsets."""
    tokens = en_tokenizer("LosAngeles start.")
    words = [t.text for t in tokens]
    doc = get_doc(tokens.vocab, words, heads=[1, 1, 0])

    # State before retokenization: three tokens, 19 characters of text.
    assert len(doc) == 3
    assert len(str(doc)) == 19
    assert doc[0].head.text == 'start'
    assert doc[1].head.text == '.'

    subtoken_attrs = {'tag':'NNP', 'lemma':'Los Angeles', 'ent_type':'GPE'}
    with doc.retokenize() as retokenizer:
        retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], attrs=subtoken_attrs)

    # One extra token, and the remaining tokens are shifted by one.
    assert len(doc) == 4
    assert [t.text for t in doc] == ['Los', 'Angeles', 'start', '.']
    # "Los" attaches to "Angeles", which takes over the original head.
    assert [t.head.text for t in doc] == ['Angeles', 'start', '.', '.']
    # Character offsets of the two pieces of the split token.
    assert doc[0].idx == 0
    assert doc[1].idx == 3
    # Splitting must not change the length of the document text.
    assert len(str(doc)) == 19
def test_split_dependencies(en_tokenizer):
    """Dependency labels passed to split() land on the matching subtokens."""
    tokens = en_tokenizer("LosAngeles start.")
    doc = get_doc(tokens.vocab, [t.text for t in tokens])
    # Register the labels in the string store; split() is given their IDs.
    labels = [doc.vocab.strings.add(name) for name in ('amod', 'subject')]
    with doc.retokenize() as retokenizer:
        retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], labels)
    first_dep, second_dep = labels
    assert doc[0].dep == first_dep
    assert doc[1].dep == second_dep
def test_split_heads_error(en_tokenizer):
    """Every malformed `heads` list must make the retokenizer raise ValueError."""
    tokens = en_tokenizer("LosAngeles start.")
    doc = get_doc(tokens.vocab, [t.text for t in tokens])
    invalid_heads = (
        [0],        # Not enough heads
        [1, 1, 0],  # Too many heads
        [1, 1],     # No token head
        [0, 0],     # Several token heads
    )
    # The same doc is reused for all cases, in the order listed above.
    for bad_heads in invalid_heads:
        with pytest.raises(ValueError):
            with doc.retokenize() as retokenizer:
                retokenizer.split(doc[0], ["Los", "Angeles"], bad_heads)
def test_spans_entity_merge_iob():
    """Entity IOB tags stay consistent after splitting a token of an entity."""
    doc = Doc(Vocab(), words=["abc", "d", "e"])
    label = doc.vocab.strings.add('ent-abcd')
    # One entity covering the first two tokens.
    doc.ents = [(label, 0, 2)]
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"
    with doc.retokenize() as retokenizer:
        retokenizer.split(doc[0], ["a", "b", "c"], [1, 1, 0])
    # Only the first piece begins the entity; the other pieces and the
    # former second token are all inside it.
    assert [doc[i].ent_iob_ for i in range(4)] == ["B", "I", "I", "I"]
def test_spans_sentence_update_after_merge(en_tokenizer):
    """Sentence spans grow by one token when a token inside them is split."""
    text = "StewartLee is a stand up comedian. He lives in England and loves JoePasquale."
    # One head offset and one dependency label per token (15 tokens).
    heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2]
    deps = ['nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr',
            'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj',
            'compound', 'punct']
    tokens = en_tokenizer(text)
    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
    sent1, sent2 = list(doc.sents)
    init_len = len(sent1)
    init_len2 = len(sent2)
    with doc.retokenize() as retokenizer:
        retokenizer.split(doc[0], ["Stewart", "Lee"], [1, 0])
        # "JoePasquale" is token 13 (dep 'compound'); token 14 is the final
        # full stop (dep 'punct'), which must not be the one that is split.
        retokenizer.split(doc[13], ["Joe", "Pasquale"], [1, 0])
    sent1, sent2 = list(doc.sents)
    # Each sentence had exactly one of its tokens split in two.
    assert len(sent1) == init_len + 1
    assert len(sent2) == init_len2 + 1

View File

@ -43,12 +43,12 @@ cdef class Retokenizer:
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
self.merges.append((span, attrs)) self.merges.append((span, attrs))
def split(self, Token token, orths, attrs=SimpleFrozenDict()): def split(self, Token token, orths, heads, deps=[], attrs=SimpleFrozenDict()):
"""Mark a Token for splitting, into the specified orths. The attrs """Mark a Token for splitting, into the specified orths. The attrs
will be applied to each subtoken. will be applied to each subtoken.
""" """
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
self.splits.append((token.start_char, orths, attrs)) self.splits.append((token.i, orths, heads, deps, attrs))
def __enter__(self): def __enter__(self):
self.merges = [] self.merges = []
@ -65,8 +65,12 @@ cdef class Retokenizer:
end = span.end end = span.end
_merge(self.doc, start, end, attrs) _merge(self.doc, start, end, attrs)
for start_char, orths, attrs in self.splits: offset = 0
raise NotImplementedError # Iterate in order, to keep the offset simple.
for token_index, orths, heads, deps, attrs in sorted(self.splits):
_split(self.doc, token_index + offset, orths, heads, deps, attrs)
# Adjust for the previous tokens
offset += len(orths)-1
def _merge(Doc doc, int start, int end, attributes): def _merge(Doc doc, int start, int end, attributes):
"""Retokenize the document, such that the span at """Retokenize the document, such that the span at
@ -279,3 +283,111 @@ def _bulk_merge(Doc doc, merges):
# Return the merged Python object # Return the merged Python object
return doc[spans[0].start] return doc[spans[0].start]
def _split(Doc doc, int token_index, orths, heads, deps, attrs):
    """Retokenize the document in place, such that the token at
    `doc[token_index]` is replaced by one subtoken per entry of `orths`.

    doc (Doc): The document to modify.
    token_index (int): Index of the token to split.
    orths (list): Verbatim text of each subtoken to create, in order.
    heads (list): One relative head offset per subtoken. Exactly one entry
        must be 0: that subtoken takes over the head of the original token
        and becomes the head of every token that was attached to it.
    deps (list): Dependency labels, assigned to the subtokens by position.
        The length is not validated against `orths`; may be empty.
    attrs (dict): Attributes applied to every subtoken.
    RAISES (ValueError): If `heads` and `orths` differ in length (E101), if
        more than one head is 0 (E099), or if no head is 0 (E100).
    """
    cdef int nb_subtokens = len(orths)
    cdef const LexemeC* lex
    cdef TokenC* token
    # Copy of the token being split, taken before doc.c is modified.
    cdef TokenC orig_token = doc.c[token_index]
    # Number of tokens before the split; doc.length is updated further down.
    cdef int orig_length = doc.length
    cdef int idx_offset = 0

    if len(heads) != nb_subtokens:
        raise ValueError(Errors.E101)
    # Find the single subtoken marked as root (head == 0).
    token_head_index = -1
    for index, head in enumerate(heads):
        if head == 0:
            if token_head_index != -1:
                # More than one root among the subtokens.
                raise ValueError(Errors.E099)
            token_head_index = index
    if token_head_index == -1:
        # No root among the subtokens.
        raise ValueError(Errors.E100)

    # First, make the dependencies absolute, so that all of them can be
    # adjusted before the new tokens are created.
    for i in range(orig_length):
        doc.c[i].head += i
    # Adjust dependencies so they refer to the token indices after the split.
    offset = nb_subtokens - 1
    for i in range(orig_length):
        head_idx = doc.c[i].head
        if head_idx == token_index:
            # Heads are absolute here, so the root subtoken is addressed by
            # its position in the document, not its position in `orths`.
            doc.c[i].head = token_index + token_head_index
        elif head_idx > token_index:
            doc.c[i].head += offset
    # Absolute head of the original token, already adjusted by the loop above.
    new_token_head = doc.c[token_index].head

    # Grow doc.c until it is big enough for all new tokens. The request is
    # based on max_length so that every iteration asks for more capacity
    # than is currently available and the loop can terminate.
    while orig_length + nb_subtokens - 1 >= doc.max_length:
        doc._realloc(doc.max_length * 2)

    # Move tokens after the split to create space for the new tokens. Only
    # the tokens that existed before the split are shifted, last one first,
    # so nothing past the new end of the document is read or written.
    doc.length = orig_length + nb_subtokens - 1
    for token_to_move in range(orig_length - 1, token_index, -1):
        doc.c[token_to_move + nb_subtokens - 1] = doc.c[token_to_move]

    # Host the tokens in the newly created space.
    # NOTE(review): only the fields assigned below are written; any other
    # field of a slot keeps the value that slot held before — confirm this
    # is the intended "inheritance" for subtokens after the first.
    for i, orth in enumerate(orths):
        token = &doc.c[token_index + i]
        lex = doc.vocab.get(doc.mem, orth)
        token.lex = lex
        # Update the character offset of the subtokens; the first subtoken
        # keeps the offset already stored at token_index.
        if i != 0:
            token.idx = orig_token.idx + idx_offset
        idx_offset += len(orth)
        # Set token.spacy to False for all non-last split tokens, and
        # to orig_token.spacy for the last token.
        if i < nb_subtokens - 1:
            token.spacy = False
        else:
            token.spacy = orig_token.spacy
        # Apply attrs to each subtoken.
        for attr_name, attr_value in attrs.items():
            if attr_name == TAG:
                doc.vocab.morphology.assign_tag(token, attr_value)
            else:
                Token.set_struct_attr(token, attr_name, attr_value)
        # Make IOB consistent: when the original token had ent_iob 3, only
        # the first subtoken keeps 3 and the following ones get 1.
        if orig_token.ent_iob == 3:
            if i == 0:
                token.ent_iob = 3
            else:
                token.ent_iob = 1
        else:
            # In all other cases subtokens inherit iob from orig_token.
            token.ent_iob = orig_token.ent_iob
        # Use the head of the new token everywhere. This will be partially
        # overwritten later on.
        token.head = new_token_head

    # Transform the dependencies into relative ones again.
    for i in range(doc.length):
        doc.c[i].head -= i

    # Assign the requested relative heads to the non-root subtokens; the
    # root subtoken already carries the head of the original token.
    for i, head in enumerate(heads):
        if head != 0:
            doc.c[token_index + i].head = head

    for i, dep in enumerate(deps):
        doc[token_index + i].dep = dep

    # Set children from head.
    set_children_from_heads(doc.c, doc.length)