Add split one token into several (resolves #2838) (#3253)

* Add split one token into several (resolves #2838) * Improve error message for token splitting * Make retokenizer.split() tests use a Token object Change retokenizer.split() to use a Token object, instead of an index. * Pass Token into retokenize.split() Tweak retokenize.split() API so that we pass the `Token` object, not the index. * Fix token.idx in retokenize.split() * Test that token.idx is correct after split * Fix token.idx for split tokens * Fix retokenize.split() * Fix retokenize.split * Fix retokenize.split() test
2025-10-27 14:11:04 +03:00 · 2019-02-14 09:27:13 -05:00 · 2019-02-14 09:27:13 -05:00 · 39815513e2
commit 39815513e2
parent 11d6b874db
3 changed files with 233 additions and 4 deletions
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -253,6 +253,9 @@ class Errors(object):
    E098 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A token"
            " can only be part of one entity, so make sure the entities you're "
            "setting don't overlap.")
    E099 = ("The newly split token can only have one root (head = 0).")
    E100 = ("The newly split token needs to have a root (head = 0)")
    E101 = ("All subtokens must have associated heads")
@add_codes
 class TempErrors(object):
--- a/spacy/tests/doc/test_doc_spilt.py
+++ b/spacy/tests/doc/test_doc_spilt.py
@ -0,0 +1,114 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from ..util import get_doc
 from ...vocab import Vocab
 from ...tokens import Doc
 from ...tokens import Span
 import pytest
 def test_doc_split(en_tokenizer):
    """Split the first token in two and check texts, heads, offsets and length."""
    tokenized = en_tokenizer("LosAngeles start.")
    words = [tok.text for tok in tokenized]
    doc = get_doc(tokenized.vocab, words, heads=[1, 1, 0])
    # State before the split.
    assert len(doc) == 3
    assert len(str(doc)) == 19
    assert doc[0].head.text == 'start'
    assert doc[1].head.text == '.'
    subtoken_attrs = {'tag': 'NNP', 'lemma': 'Los Angeles', 'ent_type': 'GPE'}
    with doc.retokenize() as retokenizer:
        retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], attrs=subtoken_attrs)
    assert len(doc) == 4
    # Expected (text, head text) of every token once the split is applied.
    expected = [
        ('Los', 'Angeles'),
        ('Angeles', 'start'),
        ('start', '.'),
        ('.', '.'),
    ]
    for position, (text, head_text) in enumerate(expected):
        assert doc[position].text == text
        assert doc[position].head.text == head_text
    # Character offsets: the second subtoken starts right after "Los".
    assert doc[0].idx == 0
    assert doc[1].idx == 3
    # The rendered document keeps its length.
    assert len(str(doc)) == 19
 def test_split_dependencies(en_tokenizer):
    """Dependency labels passed to split() land on the matching subtokens."""
    tokenized = en_tokenizer("LosAngeles start.")
    doc = get_doc(tokenized.vocab, [tok.text for tok in tokenized])
    strings = doc.vocab.strings
    first_dep = strings.add('amod')
    second_dep = strings.add('subject')
    with doc.retokenize() as retokenizer:
        retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], [first_dep, second_dep])
    assert (doc[0].dep, doc[1].dep) == (first_dep, second_dep)
 def test_split_heads_error(en_tokenizer):
    """Every malformed `heads` list must make the retokenizer raise ValueError."""
    tokenized = en_tokenizer("LosAngeles start.")
    doc = get_doc(tokenized.vocab, [tok.text for tok in tokenized])
    invalid_heads = (
        [0],        # not enough heads
        [1, 1, 0],  # too many heads
        [1, 1],     # no token head
        [0, 0],     # several token heads
    )
    for heads in invalid_heads:
        with pytest.raises(ValueError):
            with doc.retokenize() as retokenizer:
                retokenizer.split(doc[0], ["Los", "Angeles"], heads)
 def test_spans_entity_merge_iob():
    """Entity IOB tags stay consistent after the entity's first token is split."""
    doc = Doc(Vocab(), words=["abc", "d", "e"])
    ent_label = doc.vocab.strings.add('ent-abcd')
    doc.ents = [(ent_label, 0, 2)]
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"
    with doc.retokenize() as retokenizer:
        retokenizer.split(doc[0], ["a", "b", "c"], [1, 1, 0])
    # The first subtoken opens the entity; the next three tokens continue it.
    assert [doc[i].ent_iob_ for i in range(4)] == ["B", "I", "I", "I"]
 def test_spans_sentence_update_after_merge(en_tokenizer):
    """Each of the two sentences grows by one token after one split in each."""
    text = "StewartLee is a stand up comedian. He lives in England and loves JoePasquale."
    heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2]
    deps = [
        'nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr', 'punct',
        'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj', 'compound', 'punct',
    ]
    tokenized = en_tokenizer(text)
    words = [tok.text for tok in tokenized]
    doc = get_doc(tokenized.vocab, words, heads=heads, deps=deps)
    # Unpacking also checks that there are exactly two sentences.
    first, second = doc.sents
    first_len_before = len(first)
    second_len_before = len(second)
    with doc.retokenize() as retokenizer:
        retokenizer.split(doc[0], ["Stewart", "Lee"], [1, 0])
        retokenizer.split(doc[14], ["Joe", "Pasquale"], [1, 0])
    first, second = doc.sents
    assert len(first) == first_len_before + 1
    assert len(second) == second_len_before + 1
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@ -43,12 +43,12 @@ cdef class Retokenizer:
        attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
        self.merges.append((span, attrs))
-    def split(self, Token token, orths, attrs=SimpleFrozenDict()):
+    def split(self, Token token, orths, heads, deps=[], attrs=SimpleFrozenDict()):
        """Mark a Token for splitting, into the specified orths. The attrs
        will be applied to each subtoken.

        Nothing is modified here: the request is only recorded in
        `self.splits`, keyed by the token's index in the doc.

        token (Token): The token to split.
        orths (list): Verbatim text of each subtoken, in order.
        heads (list): One entry per subtoken. `_split` requires exactly one
            entry to be 0 (the subtoken that takes over the original token's
            head) and raises ValueError otherwise; non-zero entries are
            stored as relative head offsets.
        deps (list): Optional dependency labels, assigned in order to the
            subtokens.
        attrs (dict): Attributes to set on every subtoken; converted to
            integer IDs with the doc's string store before being recorded.
        """
        # NOTE(review): `deps=[]` is a shared mutable default. The code
        # visible here only stores and reads it, never mutates it.
        attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
-        self.splits.append((token.start_char, orths, attrs))
+        self.splits.append((token.i, orths, heads, deps, attrs))
    def __enter__(self):
        self.merges = []
@ -65,8 +65,12 @@ cdef class Retokenizer:
            end = span.end
            _merge(self.doc, start, end, attrs)
-        for start_char, orths, attrs in self.splits:
+        offset = 0
-            raise NotImplementedError
+        # Iterate in order, to keep the offset simple.
        for token_index, orths, heads, deps, attrs in sorted(self.splits):
             _split(self.doc, token_index + offset, orths, heads, deps, attrs)
             # Adjust for the previous tokens
             offset += len(orths)-1
 def _merge(Doc doc, int start, int end, attributes):
    """Retokenize the document, such that the span at
@ -279,3 +283,111 @@ def _bulk_merge(Doc doc, merges):
    # Return the merged Python object
    return doc[spans[0].start]
 def _split(Doc doc, int token_index, orths, heads, deps, attrs):
    """Retokenize the document in place, such that the token at
    `doc[token_index]` is replaced by one token per entry of `orths`.

    doc (Doc): The document to modify.
    token_index (int): Index of the token to split.
    orths (list): Verbatim text of each subtoken to create, in order.
    heads (list): One entry per subtoken. Exactly one entry must be 0: it
        marks the subtoken that takes over the head of the original token and
        becomes the head of every token that pointed at the original token.
        Any other value is stored as that subtoken's relative head offset.
    deps (list): Dependency labels, assigned in order to the subtokens. May
        be empty or shorter than `orths`.
    attrs (dict): Attribute IDs mapped to values, applied to every subtoken.
    RAISES (ValueError): If `heads` and `orths` differ in length (E101), if
        more than one head is 0 (E099), or if no head is 0 (E100).
    """
    cdef int nb_subtokens = len(orths)
    cdef const LexemeC* lex
    cdef TokenC* token
    cdef TokenC orig_token = doc.c[token_index]
    # Length before any slot is added: the shifting loop below must only
    # visit tokens that existed before the split.
    cdef int orig_length = doc.length

    # Validate before touching the doc, so a bad request leaves it unchanged.
    if len(heads) != nb_subtokens:
        raise ValueError(Errors.E101)
    token_head_index = -1
    for index, head in enumerate(heads):
        if head == 0:
            if token_head_index != -1:
                # A second root among the subtokens
                raise ValueError(Errors.E099)
            token_head_index = index
    if token_head_index == -1:
        # No root among the subtokens
        raise ValueError(Errors.E100)

    # First, make the dependencies absolute, and adjust all possible
    # dependencies before creating the tokens
    for i in range(doc.length):
        doc.c[i].head += i
    # Adjust dependencies so they refer to post-split indices
    offset = nb_subtokens - 1
    for i in range(doc.length):
        head_idx = doc.c[i].head
        if head_idx == token_index:
            # Re-attach to the root subtoken. Heads are absolute here, so the
            # subtoken's position has to be offset by token_index.
            doc.c[i].head = token_index + token_head_index
        elif head_idx > token_index:
            doc.c[i].head += offset
    new_token_head = doc.c[token_index].head

    # Double doc.c max_length if necessary (until big enough for all new tokens)
    while doc.length + nb_subtokens - 1 >= doc.max_length:
        doc._realloc(doc.length * 2)
    # Move tokens after the split to create space for the new tokens. Iterate
    # from the last *original* token backwards; starting from the enlarged
    # length would copy uninitialised slots past the end of the doc.
    doc.length = orig_length + nb_subtokens - 1
    for token_to_move in range(orig_length - 1, token_index, -1):
        doc.c[token_to_move + nb_subtokens - 1] = doc.c[token_to_move]

    # Host the tokens in the newly created space
    cdef int idx_offset = 0
    for i, orth in enumerate(orths):
        token = &doc.c[token_index + i]
        lex = doc.vocab.get(doc.mem, orth)
        token.lex = lex
        # Update the character offset of the subtokens. The first subtoken
        # sits in the original token's slot and keeps its offset.
        if i != 0:
            token.idx = orig_token.idx + idx_offset
        idx_offset += len(orth)
        # Set token.spacy to False for all non-last split tokens, and
        # to orig_token.spacy for the last token
        if i < nb_subtokens - 1:
            token.spacy = False
        else:
            token.spacy = orig_token.spacy
        # Apply attrs to each subtoken; TAG goes through the morphology
        for attr_name, attr_value in attrs.items():
            if attr_name == TAG:
                doc.vocab.morphology.assign_tag(token, attr_value)
            else:
                Token.set_struct_attr(token, attr_name, attr_value)
        # Make IOB consistent: if the original token had ent_iob 3, only the
        # first subtoken keeps 3 and the others get 1
        if orig_token.ent_iob == 3:
            if i == 0:
                token.ent_iob = 3
            else:
                token.ent_iob = 1
        else:
            # In all other cases subtokens inherit iob from orig_token
            token.ent_iob = orig_token.ent_iob
        # Use the head of the new token everywhere. This will be partially
        # overwritten later on.
        token.head = new_token_head

    # Transform the dependencies into relative ones again
    for i in range(doc.length):
        doc.c[i].head -= i
    # Assign correct dependencies to the inner tokens
    for i, head in enumerate(heads):
        if head != 0:
            # The root subtoken (head == 0) already has the correct head
            doc.c[token_index + i].head = head
    for i, dep in enumerate(deps):
        doc[token_index + i].dep = dep
    # set children from head
    set_children_from_heads(doc.c, doc.length)