Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 00:46:28 +03:00)
* Add split one token into several (resolves #2838)
* Improve error message for token splitting
* Make retokenizer.split() tests use a Token object
  Change retokenizer.split() to use a Token object, instead of an index.
* Pass Token into retokenize.split()
  Tweak retokenize.split() API so that we pass the `Token` object, not the index.
* Fix token.idx in retokenize.split()
* Test that token.idx is correct after split
* Fix token.idx for split tokens
* Fix retokenize.split()
* Fix retokenize.split
* Fix retokenize.split() test
parent 11d6b874db
commit 39815513e2
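For orientation, here is a minimal usage sketch of the retokenizer.split() API added in this commit, pieced together from the tests below (the bare Doc(Vocab(), words=...) construction mirrors test_spans_entity_merge_iob; behaviour of this API may differ in later spaCy versions):

    # Minimal sketch, based on the tests added in this commit.
    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    doc = Doc(Vocab(), words=["LosAngeles", "start", "."])
    with doc.retokenize() as retokenizer:
        # orths are the new surface forms; heads are offsets *within* the split,
        # and the subtoken marked 0 attaches to the original token's head.
        retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0])

    assert [t.text for t in doc] == ["Los", "Angeles", "start", "."]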
@@ -253,6 +253,9 @@ class Errors(object):
     E098 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A token"
             " can only be part of one entity, so make sure the entities you're "
             "setting don't overlap.")
+    E099 = ("The newly split token can only have one root (head = 0).")
+    E100 = ("The newly split token needs to have a root (head = 0)")
+    E101 = ("All subtokens must have associated heads")

 @add_codes
 class TempErrors(object):
spacy/tests/doc/test_doc_spilt.py (new file, 114 lines)

@@ -0,0 +1,114 @@
# coding: utf-8
from __future__ import unicode_literals

from ..util import get_doc
from ...vocab import Vocab
from ...tokens import Doc
from ...tokens import Span

import pytest


def test_doc_split(en_tokenizer):
    text = "LosAngeles start."
    heads = [1, 1, 0]
    tokens = en_tokenizer(text)
    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)

    assert len(doc) == 3
    assert len(str(doc)) == 19
    assert doc[0].head.text == 'start'
    assert doc[1].head.text == '.'

    with doc.retokenize() as retokenizer:
        retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], attrs={'tag':'NNP', 'lemma':'Los Angeles', 'ent_type':'GPE'})

    assert len(doc) == 4
    assert doc[0].text == 'Los'
    assert doc[0].head.text == 'Angeles'
    assert doc[0].idx == 0
    assert doc[1].idx == 3

    assert doc[1].text == 'Angeles'
    assert doc[1].head.text == 'start'

    assert doc[2].text == 'start'
    assert doc[2].head.text == '.'

    assert doc[3].text == '.'
    assert doc[3].head.text == '.'

    assert len(str(doc)) == 19


def test_split_dependencies(en_tokenizer):
    text = "LosAngeles start."
    tokens = en_tokenizer(text)
    doc = get_doc(tokens.vocab, [t.text for t in tokens])
    dep1 = doc.vocab.strings.add('amod')
    dep2 = doc.vocab.strings.add('subject')
    with doc.retokenize() as retokenizer:
        retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], [dep1, dep2])

    assert doc[0].dep == dep1
    assert doc[1].dep == dep2


def test_split_heads_error(en_tokenizer):
    text = "LosAngeles start."
    tokens = en_tokenizer(text)
    doc = get_doc(tokens.vocab, [t.text for t in tokens])
    # Not enough heads
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.split(doc[0], ["Los", "Angeles"], [0])

    # Too many heads
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1, 0])

    # No token head
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1])

    # Several token heads
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.split(doc[0], ["Los", "Angeles"], [0, 0])


def test_spans_entity_merge_iob():
    # Test entity IOB stays consistent after merging
    words = ["abc", "d", "e"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [(doc.vocab.strings.add('ent-abcd'), 0, 2)]
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"

    with doc.retokenize() as retokenizer:
        retokenizer.split(doc[0], ["a", "b", "c"], [1, 1, 0])
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"
    assert doc[2].ent_iob_ == "I"
    assert doc[3].ent_iob_ == "I"


def test_spans_sentence_update_after_merge(en_tokenizer):
    text = "StewartLee is a stand up comedian. He lives in England and loves JoePasquale."
    heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2]
    deps = ['nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr',
            'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj',
            'compound', 'punct']

    tokens = en_tokenizer(text)
    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
    sent1, sent2 = list(doc.sents)
    init_len = len(sent1)
    init_len2 = len(sent2)
    with doc.retokenize() as retokenizer:
        retokenizer.split(doc[0], ["Stewart", "Lee"], [1, 0])
        retokenizer.split(doc[14], ["Joe", "Pasquale"], [1, 0])
    sent1, sent2 = list(doc.sents)
    assert len(sent1) == init_len + 1
    assert len(sent2) == init_len2 + 1
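A note on the heads lists used by the get_doc test helper above: each entry is a relative offset (head index minus token index), with 0 marking a token that is its own head, i.e. a sentence root. A small illustrative helper, not part of the commit, to make that concrete:

    def absolute_heads(rel_heads):
        # Convert get_doc-style relative head offsets to absolute token indices.
        return [i + offset for i, offset in enumerate(rel_heads)]

    # "LosAngeles start ." with heads=[1, 1, 0]:
    # LosAngeles -> start, start -> ".", and "." is the root.
    assert absolute_heads([1, 1, 0]) == [1, 2, 2]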
@@ -43,12 +43,12 @@ cdef class Retokenizer:
         attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
         self.merges.append((span, attrs))

-    def split(self, Token token, orths, attrs=SimpleFrozenDict()):
+    def split(self, Token token, orths, heads, deps=[], attrs=SimpleFrozenDict()):
         """Mark a Token for splitting, into the specified orths. The attrs
         will be applied to each subtoken.
         """
         attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
-        self.splits.append((token.start_char, orths, attrs))
+        self.splits.append((token.i, orths, heads, deps, attrs))

     def __enter__(self):
         self.merges = []

@@ -65,8 +65,12 @@ cdef class Retokenizer:
             end = span.end
             _merge(self.doc, start, end, attrs)

-        for start_char, orths, attrs in self.splits:
-            raise NotImplementedError
+        offset = 0
+        # Iterate in order, to keep the offset simple.
+        for token_index, orths, heads, deps, attrs in sorted(self.splits):
+            _split(self.doc, token_index + offset, orths, heads, deps, attrs)
+            # Adjust for the previous tokens
+            offset += len(orths) - 1

 def _merge(Doc doc, int start, int end, attributes):
     """Retokenize the document, such that the span at
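The offset bookkeeping in __exit__ is needed because each split inserts len(orths) - 1 extra tokens into the Doc, which shifts the positions of every later queued split. A standalone sketch of the same arithmetic (plain Python, illustrative only, not the Cython code above; the indices mirror test_spans_sentence_update_after_merge):

    # Two splits queued on a 15-token doc, recorded by original token index.
    splits = [(14, ["Joe", "Pasquale"]), (0, ["Stewart", "Lee"])]

    offset = 0
    for token_index, orths in sorted(splits):   # process left to right
        adjusted = token_index + offset         # position after earlier splits grew the doc
        print(adjusted, orths)
        offset += len(orths) - 1

    # Prints: 0 ['Stewart', 'Lee']  then  15 ['Joe', 'Pasquale']
    # doc[14] now sits at position 15 because the first split added one token.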
@@ -279,3 +283,111 @@ def _bulk_merge(Doc doc, merges):
    # Return the merged Python object
    return doc[spans[0].start]


def _split(Doc doc, int token_index, orths, heads, deps, attrs):
    """Retokenize the document, such that the token at
    `doc[token_index]` is split into tokens with the orth 'orths'
    token_index(int): token index of the token to split.
    orths: IDs of the verbatim text content of the tokens to create
    **attributes: Attributes to assign to each of the newly created tokens. By default,
        attributes are inherited from the original token.
    RETURNS (Token): The first newly created token.
    """
    cdef int nb_subtokens = len(orths)
    cdef const LexemeC* lex
    cdef TokenC* token
    cdef TokenC orig_token = doc.c[token_index]

    if(len(heads) != nb_subtokens):
        raise ValueError(Errors.E101)
    token_head_index = -1
    for index, head in enumerate(heads):
        if head == 0:
            if token_head_index != -1:
                raise ValueError(Errors.E098)
            token_head_index = index
    if token_head_index == -1:
        raise ValueError(Errors.E099)

    # First, make the dependencies absolutes, and adjust all possible dependencies before
    # creating the tokens

    for i in range(doc.length):
        doc.c[i].head += i

    # Adjust dependencies
    offset = nb_subtokens - 1
    for i in range(doc.length):
        head_idx = doc.c[i].head
        if head_idx == token_index:
            doc.c[i].head = token_head_index
        elif head_idx > token_index:
            doc.c[i].head += offset

    new_token_head = doc.c[token_index].head

    # Double doc.c max_length if necessary (until big enough for all new tokens)
    while doc.length + nb_subtokens - 1 >= doc.max_length:
        doc._realloc(doc.length * 2)

    # Move tokens after the split to create space for the new tokens
    doc.length = len(doc) + nb_subtokens - 1
    for token_to_move in range(doc.length - 1, token_index, -1):
        doc.c[token_to_move + nb_subtokens - 1] = doc.c[token_to_move]

    # Host the tokens in the newly created space
    cdef int idx_offset = 0
    for i, orth in enumerate(orths):
        token = &doc.c[token_index + i]
        lex = doc.vocab.get(doc.mem, orth)
        token.lex = lex
        # Update the character offset of the subtokens
        if i != 0:
            token.idx = orig_token.idx + idx_offset
        idx_offset += len(orth)

        # Set token.spacy to False for all non-last split tokens, and
        # to origToken.spacy for the last token
        if (i < nb_subtokens - 1):
            token.spacy = False
        else:
            token.spacy = orig_token.spacy

        # Apply attrs to each subtoken
        for attr_name, attr_value in attrs.items():
            if attr_name == TAG:
                doc.vocab.morphology.assign_tag(token, attr_value)
            else:
                Token.set_struct_attr(token, attr_name, attr_value)

        # Make IOB consistent
        if (orig_token.ent_iob == 3):
            if i == 0:
                token.ent_iob = 3
            else:
                token.ent_iob = 1
        else:
            # In all other cases subtokens inherit iob from origToken
            token.ent_iob = orig_token.ent_iob

        # Use the head of the new token everywhere. This will be partially overwritten later on.
        token.head = new_token_head

    # Transform the dependencies into relative ones again
    for i in range(doc.length):
        doc.c[i].head -= i

    # Assign correct dependencies to the inner token
    for i, head in enumerate(heads):
        if head != 0:
            # the token's head's head is already correct
            doc.c[token_index + i].head = head

    for i, dep in enumerate(deps):
        doc[token_index + i].dep = dep

    # set children from head
    set_children_from_heads(doc.c, doc.length)
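The "Make IOB consistent" branch above can be summarised in plain Python (an illustrative sketch only, not part of the commit; 3 and 1 are spaCy's internal codes for "B" and "I"):

    def split_ent_iob(orig_iob, nb_subtokens):
        # If the original token began an entity ("B"), the first subtoken keeps
        # "B" and the rest continue it with "I"; otherwise every subtoken simply
        # inherits the original token's IOB tag.
        B, I = 3, 1
        if orig_iob == B:
            return [B] + [I] * (nb_subtokens - 1)
        return [orig_iob] * nb_subtokens

    # Matches test_spans_entity_merge_iob: a "B" token split into three parts.
    assert split_ent_iob(3, 3) == [3, 1, 1]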