From 39815513e29157cca9b5949e97211a98d8132f08 Mon Sep 17 00:00:00 2001 From: Grivaz <33332500+grivaz@users.noreply.github.com> Date: Thu, 14 Feb 2019 09:27:13 -0500 Subject: [PATCH] Add split one token into several (resolves #2838) (#3253) * Add split one token into several (resolves #2838) * Improve error message for token splitting * Make retokenizer.split() tests use a Token object Change retokenizer.split() to use a Token object, instead of an index. * Pass Token into retokenize.split() Tweak retokenize.split() API so that we pass the `Token` object, not the index. * Fix token.idx in retokenize.split() * Test that token.idx is correct after split * Fix token.idx for split tokens * Fix retokenize.split() * Fix retokenize.split * Fix retokenize.split() test --- spacy/errors.py | 3 + spacy/tests/doc/test_doc_spilt.py | 114 ++++++++++++++++++++++++++++ spacy/tokens/_retokenize.pyx | 120 +++++++++++++++++++++++++++++- 3 files changed, 233 insertions(+), 4 deletions(-) create mode 100644 spacy/tests/doc/test_doc_spilt.py diff --git a/spacy/errors.py b/spacy/errors.py index 138de0f57..00204d8e3 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -253,6 +253,9 @@ class Errors(object): E098 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. 
A token" " can only be part of one entity, so make sure the entities you're " "setting don't overlap.") + E099 = ("The newly split token can only have one root (head = 0).") + E100 = ("The newly split token needs to have a root (head = 0)") + E101 = ("All subtokens must have associated heads") @add_codes class TempErrors(object): diff --git a/spacy/tests/doc/test_doc_spilt.py b/spacy/tests/doc/test_doc_spilt.py new file mode 100644 index 000000000..827fd565e --- /dev/null +++ b/spacy/tests/doc/test_doc_spilt.py @@ -0,0 +1,114 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ..util import get_doc +from ...vocab import Vocab +from ...tokens import Doc +from ...tokens import Span + +import pytest + + +def test_doc_split(en_tokenizer): + text = "LosAngeles start." + heads = [1, 1, 0] + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads) + + assert len(doc) == 3 + assert len(str(doc)) == 19 + assert doc[0].head.text == 'start' + assert doc[1].head.text == '.' + + with doc.retokenize() as retokenizer: + retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], attrs={'tag':'NNP', 'lemma':'Los Angeles', 'ent_type':'GPE'}) + + assert len(doc) == 4 + assert doc[0].text == 'Los' + assert doc[0].head.text == 'Angeles' + assert doc[0].idx == 0 + assert doc[1].idx == 3 + + assert doc[1].text == 'Angeles' + assert doc[1].head.text == 'start' + + assert doc[2].text == 'start' + assert doc[2].head.text == '.' + + assert doc[3].text == '.' + assert doc[3].head.text == '.' + + assert len(str(doc)) == 19 + +def test_split_dependencies(en_tokenizer): + text = "LosAngeles start." 
+ tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens]) + dep1 = doc.vocab.strings.add('amod') + dep2 = doc.vocab.strings.add('subject') + with doc.retokenize() as retokenizer: + retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], [dep1, dep2]) + + assert doc[0].dep == dep1 + assert doc[1].dep == dep2 + + + +def test_split_heads_error(en_tokenizer): + text = "LosAngeles start." + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens]) + #Not enough heads + with pytest.raises(ValueError): + with doc.retokenize() as retokenizer: + retokenizer.split(doc[0], ["Los", "Angeles"], [0]) + + #Too many heads + with pytest.raises(ValueError): + with doc.retokenize() as retokenizer: + retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1, 0]) + + #No token head + with pytest.raises(ValueError): + with doc.retokenize() as retokenizer: + retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1]) + + #Several token heads + with pytest.raises(ValueError): + with doc.retokenize() as retokenizer: + retokenizer.split(doc[0], ["Los", "Angeles"], [0, 0]) + + +def test_spans_entity_merge_iob(): + # Test entity IOB stays consistent after merging + words = ["abc", "d", "e"] + doc = Doc(Vocab(), words=words) + doc.ents = [(doc.vocab.strings.add('ent-abcd'), 0, 2)] + assert doc[0].ent_iob_ == "B" + assert doc[1].ent_iob_ == "I" + + with doc.retokenize() as retokenizer: + retokenizer.split(doc[0], ["a", "b", "c"], [1, 1, 0]) + assert doc[0].ent_iob_ == "B" + assert doc[1].ent_iob_ == "I" + assert doc[2].ent_iob_ == "I" + assert doc[3].ent_iob_ == "I" + +def test_spans_sentence_update_after_merge(en_tokenizer): + text = "StewartLee is a stand up comedian. He lives in England and loves JoePasquale." 
+ heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2] + deps = ['nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr', + 'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj', + 'compound', 'punct'] + + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps) + sent1, sent2 = list(doc.sents) + init_len = len(sent1) + init_len2 = len(sent2) + with doc.retokenize() as retokenizer: + retokenizer.split(doc[0], ["Stewart", "Lee"], [1, 0]) + retokenizer.split(doc[14], ["Joe", "Pasquale"], [1, 0]) + sent1, sent2 = list(doc.sents) + assert len(sent1) == init_len + 1 + assert len(sent2) == init_len2 + 1 diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index 60ed63ee7..e0dc4bdf4 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -43,12 +43,12 @@ cdef class Retokenizer: attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) self.merges.append((span, attrs)) - def split(self, Token token, orths, attrs=SimpleFrozenDict()): + def split(self, Token token, orths, heads, deps=[], attrs=SimpleFrozenDict()): """Mark a Token for splitting, into the specified orths. The attrs will be applied to each subtoken. """ attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) - self.splits.append((token.start_char, orths, attrs)) + self.splits.append((token.i, orths, heads, deps, attrs)) def __enter__(self): self.merges = [] @@ -65,8 +65,12 @@ cdef class Retokenizer: end = span.end _merge(self.doc, start, end, attrs) - for start_char, orths, attrs in self.splits: - raise NotImplementedError + offset = 0 + # Iterate in order, to keep the offset simple. 
+        for token_index, orths, heads, deps, attrs in sorted(self.splits):
+            _split(self.doc, token_index + offset, orths, heads, deps, attrs)
+            # Adjust for the previous tokens
+            offset += len(orths)-1
 
 def _merge(Doc doc, int start, int end, attributes):
     """Retokenize the document, such that the span at
@@ -279,3 +283,111 @@ def _bulk_merge(Doc doc, merges):
 
     # Return the merged Python object
     return doc[spans[0].start]
+
+
+def _split(Doc doc, int token_index, orths, heads, deps, attrs):
+    """Split the token at `doc[token_index]` in place into one token per
+    entry of `orths`. Raises ValueError (E101, E099, E100) on invalid `heads`.
+    token_index (int): Index of the token to split.
+    orths: Verbatim texts of the subtokens to create, in order.
+    heads: One relative head offset per subtoken; exactly one must be 0.
+    deps: Dependency labels, assigned to the subtokens in order.
+    attrs: Mapping of attributes to values, applied to every subtoken.
+    """
+    cdef int nb_subtokens = len(orths)
+    cdef const LexemeC* lex
+    cdef TokenC* token
+    cdef TokenC orig_token = doc.c[token_index]
+
+    if(len(heads) != nb_subtokens):
+        raise ValueError(Errors.E101)
+    token_head_index = -1
+    for index, head in enumerate(heads):
+        if head == 0:
+            if token_head_index != -1:
+                raise ValueError(Errors.E099)  # more than one root subtoken
+            token_head_index = index
+    if token_head_index == -1:
+        raise ValueError(Errors.E100)  # no root subtoken at all
+
+    # First, make the dependencies absolute, and adjust all possible dependencies before
+    # creating the tokens
+
+    for i in range(doc.length):
+        doc.c[i].head += i
+
+    # Heads are absolute here: re-point at the root subtoken, shift later ones
+    offset = nb_subtokens - 1
+    for i in range(doc.length):
+        head_idx = doc.c[i].head
+        if head_idx == token_index:
+            doc.c[i].head = token_index + token_head_index
+        elif head_idx > token_index:
+            doc.c[i].head += offset
+
+    new_token_head = doc.c[token_index].head
+
+    # Double doc.c max_length if necessary (until big enough for all new tokens)
+    while doc.length + nb_subtokens - 1 >= doc.max_length:
+        doc._realloc(doc.length * 2)
+
+    # Move tokens after the split to create space for the new tokens
+    for token_to_move in range(doc.length - 1, token_index, -1):
+        doc.c[token_to_move + nb_subtokens - 1] = doc.c[token_to_move]
+    doc.length += nb_subtokens - 1  # grow only after shifting the old tail
+
+    # Host the tokens in the newly created space
+    cdef int idx_offset = 0
+    for i, orth in enumerate(orths):
+
+        token = &doc.c[token_index + i]
+        lex = doc.vocab.get(doc.mem, orth)
+        token.lex = lex
+        # Update the character offset of the subtokens
+        if i != 0:
+            token.idx = orig_token.idx + idx_offset
+        idx_offset += len(orth)
+
+        # Set token.spacy to False for all non-last split tokens, and
+        # to orig_token.spacy for the last token
+        if (i < nb_subtokens - 1):
+            token.spacy = False
+        else:
+            token.spacy = orig_token.spacy
+
+        # Apply attrs to each subtoken
+        for attr_name, attr_value in attrs.items():
+            if attr_name == TAG:
+                doc.vocab.morphology.assign_tag(token, attr_value)
+            else:
+                Token.set_struct_attr(token, attr_name, attr_value)
+
+        # Make IOB consistent
+        if (orig_token.ent_iob == 3):
+            if i == 0:
+                token.ent_iob = 3
+            else:
+                token.ent_iob = 1
+        else:
+            # In all other cases subtokens inherit iob from orig_token
+            token.ent_iob = orig_token.ent_iob
+
+        # Use the head of the new token everywhere. This will be partially overwritten later on.
+        token.head = new_token_head
+
+    # Transform the dependencies into relative ones again
+    for i in range(doc.length):
+        doc.c[i].head -= i
+
+    # Assign correct dependencies to the inner token
+    for i, head in enumerate(heads):
+        if head != 0:
+            # the token's head's head is already correct
+            doc.c[token_index + i].head = head
+
+    for i, dep in enumerate(deps):
+        doc[token_index + i].dep = dep
+
+    # set children from head
+    set_children_from_heads(doc.c, doc.length)
+