From 29b058ebdc30104b08e96e57ee17826686674a4e Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 8 Dec 2020 07:25:56 +0100
Subject: [PATCH] Fix token.spacy when retokenizing special cases with affixes (#6475)

Preserve the `token.spacy` value corresponding to the span end token in the
original doc rather than adjusting for the current offset.

* If not modifying in place, the value is read from the original document
(`doc.c` rather than `tokens`).
* If modifying in place, the document has not yet been modified past the
current span start position, so the value at the current span end
position is still valid.
---
 spacy/tests/tokenizer/test_tokenizer.py | 17 +++++++++++++++++
 spacy/tokenizer.pyx                     |  2 +-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index 23c2d5c47..82032b2da 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -2,6 +2,7 @@ import pytest
 from spacy.vocab import Vocab
 from spacy.tokenizer import Tokenizer
 from spacy.util import ensure_path
+from spacy.lang.en import English
 
 
 def test_tokenizer_handles_no_word(tokenizer):
@@ -150,6 +151,22 @@ def test_tokenizer_special_cases_with_affixes(tokenizer):
     ]
 
 
+def test_tokenizer_special_cases_with_affixes_preserve_spacy():
+    tokenizer = English().tokenizer
+    # reset all special cases
+    tokenizer.rules = {}
+
+    # in-place modification (only merges)
+    text = "''a'' "
+    tokenizer.add_special_case("''", [{"ORTH": "''"}])
+    assert tokenizer(text).text == text
+
+    # not in-place (splits and merges)
+    tokenizer.add_special_case("ab", [{"ORTH": "a"}, {"ORTH": "b"}])
+    text = "ab ab ab ''ab ab'' ab'' ''ab"
+    assert tokenizer(text).text == text
+
+
 def test_tokenizer_special_cases_with_period(tokenizer):
     text = "_SPECIAL_."
     tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 17714940d..8d8fac4fd 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -338,7 +338,7 @@ cdef class Tokenizer:
                     # Copy special case tokens into doc and adjust token and
                     # character offsets
                     idx_offset = 0
-                    orig_final_spacy = doc.c[span_end + offset - 1].spacy
+                    orig_final_spacy = doc.c[span_end - 1].spacy
                     orig_idx = doc.c[i].idx
                     for j in range(cached.length):
                         tokens[i + offset + j] = cached.data.tokens[j]