From 963ea5e8d0290e2198b6b36fa79b420f3152f639 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Thu, 8 Aug 2019 15:09:44 +0200
Subject: [PATCH] Update lemma and vector information after splitting a token
 (#4097)

* fixing vector and lemma attributes after retokenizer.split

* fixing unit test with mockup tensor

* xp instead of numpy
---
 spacy/tests/regression/test_issue3540.py | 44 ++++++++++++++++++++++++
 spacy/tokens/_retokenize.pyx             | 10 ++++++
 2 files changed, 54 insertions(+)
 create mode 100644 spacy/tests/regression/test_issue3540.py

diff --git a/spacy/tests/regression/test_issue3540.py b/spacy/tests/regression/test_issue3540.py
new file mode 100644
index 000000000..0078b2243
--- /dev/null
+++ b/spacy/tests/regression/test_issue3540.py
@@ -0,0 +1,44 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from spacy.tokens import Doc
+
+import numpy as np
+
+
+def test_issue3540(en_vocab):
+
+    words = ["I", "live", "in", "NewYork", "right", "now"]
+    tensor = np.asarray([[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]], dtype="f")
+    doc = Doc(en_vocab, words=words)
+    doc.tensor = tensor
+
+    gold_text = ["I", "live", "in", "NewYork", "right", "now"]
+    assert [token.text for token in doc] == gold_text
+
+    gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
+    assert [token.lemma_ for token in doc] == gold_lemma
+
+    vectors_1 = [token.vector for token in doc]
+    assert len(vectors_1) == len(doc)
+
+    with doc.retokenize() as retokenizer:
+        heads = [(doc[3], 1), doc[2]]
+        attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
+        retokenizer.split(doc[3], [u"New", u"York"], heads=heads, attrs=attrs)
+
+    gold_text = ["I", "live", "in", "New", "York", "right", "now"]
+    assert [token.text for token in doc] == gold_text
+
+    gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
+    assert [token.lemma_ for token in doc] == gold_lemma
+
+    vectors_2 = [token.vector for token in doc]
+    assert len(vectors_2) == len(doc)
+
+    assert vectors_1[0].tolist() == vectors_2[0].tolist()
+    assert vectors_1[1].tolist() == vectors_2[1].tolist()
+    assert vectors_1[2].tolist() == vectors_2[2].tolist()
+
+    assert vectors_1[4].tolist() == vectors_2[5].tolist()
+    assert vectors_1[5].tolist() == vectors_2[6].tolist()
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index b25a1a697..7da718349 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -404,14 +404,24 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
         doc._realloc(doc.length * 2)
     # Move tokens after the split to create space for the new tokens
     doc.length = len(doc) + nb_subtokens - 1
+    to_process_tensor = (doc.tensor is not None and doc.tensor.size != 0)
+    if to_process_tensor:
+        xp = get_array_module(doc.tensor)
+        doc.tensor = xp.append(doc.tensor, xp.zeros((nb_subtokens, doc.tensor.shape[1]), dtype="float32"), axis=0)
     for token_to_move in range(doc.length - 1, token_index, -1):
         doc.c[token_to_move + nb_subtokens - 1] = doc.c[token_to_move]
+        if to_process_tensor:
+            doc.tensor[token_to_move + nb_subtokens - 1] = doc.tensor[token_to_move]
     # Host the tokens in the newly created space
     cdef int idx_offset = 0
     for i, orth in enumerate(orths):
         token = &doc.c[token_index + i]
         lex = doc.vocab.get(doc.mem, orth)
         token.lex = lex
+        token.lemma = 0  # reset lemma
+        if to_process_tensor:
+            # setting the tensors of the split tokens to array of zeros
+            doc.tensor[token_index + i] = xp.zeros((1, doc.tensor.shape[1]), dtype="float32")
         # Update the character offset of the subtokens
         if i != 0:
             token.idx = orig_token.idx + idx_offset
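
Not part of the patch: below is a minimal sketch of the user-facing behaviour this fix is meant to produce, assuming spaCy 2.1+ with this commit applied and a blank English pipeline. The tensor values are the same mock values used in the regression test above; the printed results in the comments follow from the test's assertions.

import numpy as np
import spacy
from spacy.tokens import Doc

# Build a small Doc with a mock tensor, mirroring the regression test.
nlp = spacy.blank("en")
words = ["I", "live", "in", "NewYork", "right", "now"]
doc = Doc(nlp.vocab, words=words)
doc.tensor = np.asarray(
    [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]], dtype="f"
)

# Split "NewYork" into two subtokens. With the patch applied, doc.tensor gains a
# zeroed row for the extra subtoken and the rows of the following tokens shift
# along with them, so per-token vectors and lemmas stay aligned after the split.
with doc.retokenize() as retokenizer:
    heads = [(doc[3], 1), doc[2]]
    retokenizer.split(doc[3], ["New", "York"], heads=heads)

print([t.text for t in doc])    # ['I', 'live', 'in', 'New', 'York', 'right', 'now']
print(doc.tensor.shape)         # (7, 2): one row per token after the split
print(doc.tensor[5])            # [5.0, 5.1]: "right" keeps its original row
print([t.lemma_ for t in doc])  # lemmas of the split tokens follow the new texts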