From 91f89f9693fe1b2444242a7a514b177aac034af3 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Mon, 11 Nov 2019 16:26:46 +0100
Subject: [PATCH] Fix realloc in retokenizer.split() (#4606)

Always realloc to a size larger than `doc.max_length` in `retokenizer.split()` (or cymem will throw errors).
---
 spacy/tests/doc/test_retokenize_split.py | 15 +++++++++++++++
 spacy/tokens/_retokenize.pyx             |  2 +-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py
index 6c41a59be..d074fddc6 100644
--- a/spacy/tests/doc/test_retokenize_split.py
+++ b/spacy/tests/doc/test_retokenize_split.py
@@ -183,3 +183,18 @@ def test_doc_retokenizer_split_lex_attrs(en_vocab):
         retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
     assert doc[0].is_stop
     assert not doc[1].is_stop
+
+
+def test_doc_retokenizer_realloc(en_vocab):
+    """#4604: realloc correctly when new tokens outnumber original tokens"""
+    text = "Hyperglycemic adverse events following antipsychotic drug administration in the"
+    doc = Doc(en_vocab, words=text.split()[:-1])
+    with doc.retokenize() as retokenizer:
+        token = doc[0]
+        heads = [(token, 0)] * len(token)
+        retokenizer.split(doc[token.i], list(token.text), heads=heads)
+    doc = Doc(en_vocab, words=text.split())
+    with doc.retokenize() as retokenizer:
+        token = doc[0]
+        heads = [(token, 0)] * len(token)
+        retokenizer.split(doc[token.i], list(token.text), heads=heads)
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index 5f890de45..a5d06491a 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -329,7 +329,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
         doc.c[i].head += offset
     # Double doc.c max_length if necessary (until big enough for all new tokens)
     while doc.length + nb_subtokens - 1 >= doc.max_length:
-        doc._realloc(doc.length * 2)
+        doc._realloc(doc.max_length * 2)
     # Move tokens after the split to create space for the new tokens
     doc.length = len(doc) + nb_subtokens -1
     to_process_tensor = (doc.tensor is not None and doc.tensor.size != 0)
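
Note (not part of the patch): a minimal sketch of why doubling from `doc.length` could fail while doubling from `doc.max_length` cannot. The numbers are illustrative assumptions based on the 8-token doc in the new test and a presumed initial capacity of 20 slots; the actual default capacity and padding are spaCy/cymem internals.

```python
# Illustrative numbers only: an 8-token doc whose buffer already has room
# for 20 tokens (capacity >= token count is the normal case).
length = 8         # doc.length: tokens currently in the doc
max_length = 20    # doc.max_length: slots currently allocated (assumed)
nb_subtokens = 13  # splitting "Hyperglycemic" into 13 one-character tokens

needed = length + nb_subtokens - 1  # 20 slots required after the split

# Old behaviour: doc._realloc(doc.length * 2) requests 16 slots, which is
# not larger than the 20 already allocated, so cymem's realloc errors out
# instead of growing the buffer.
old_request = length * 2            # 16

# New behaviour: doubling the current allocation always requests more than
# what is already there, so the loop is guaranteed to make progress.
while needed >= max_length:
    max_length *= 2                  # mirrors doc._realloc(doc.max_length * 2)

print(old_request, max_length)       # 16 40
```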