From 5768df4f09e961f33847fb9d3983060e8ab7cf55 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Sun, 20 May 2018 15:13:37 +0200
Subject: [PATCH 1/2] Add SimpleFrozenDict util to use as default function
 argument

---
 spacy/errors.py |  2 ++
 spacy/util.py   | 28 ++++++++++++++++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/spacy/errors.py b/spacy/errors.py
index b60fe690a..a557be2e8 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -245,6 +245,8 @@ class Errors(object):
             "the meta.json. Vector names are required to avoid issue #1660.")
     E093 = ("token.ent_iob values make invalid sequence: I without B\n{seq}")
     E094 = ("Error reading line {line_num} in vectors file {loc}.")
+    E095 = ("Can't write to frozen dictionary. This is likely an internal "
+            "error. Are you writing to a default function argument?")
 
 
 @add_codes
diff --git a/spacy/util.py b/spacy/util.py
index 9000127db..bf3bd6ddb 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -606,3 +606,31 @@ def use_gpu(gpu_id):
 def fix_random_seed(seed=0):
     random.seed(seed)
     numpy.random.seed(seed)
+
+
+class SimpleFrozenDict(dict):
+    """Simplified implementation of a frozen dict, mainly used as default
+    function or method argument (for arguments that should default to empty
+    dictionary). Item assignment and deletion, pop, popitem, clear, setdefault
+    and update all raise NotImplementedError with the E095 message.
+    """
+    def __setitem__(self, key, value):
+        raise NotImplementedError(Errors.E095)
+
+    def __delitem__(self, key):
+        raise NotImplementedError(Errors.E095)
+
+    def pop(self, key, default=None):
+        raise NotImplementedError(Errors.E095)
+
+    def popitem(self):
+        raise NotImplementedError(Errors.E095)
+
+    def clear(self):
+        raise NotImplementedError(Errors.E095)
+
+    def setdefault(self, key, default=None):
+        raise NotImplementedError(Errors.E095)
+
+    def update(self, *args, **kwargs):
+        raise NotImplementedError(Errors.E095)

From b59e3b157f593f5f74981bf9d39deba7bc9a12a4 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Sun, 20 May 2018 15:15:37 +0200
Subject: [PATCH 2/2] Don't require attrs argument in Doc.retokenize and allow
 both ints and unicode (resolves #2304)

---
 spacy/tests/doc/test_doc_api.py | 21 +++++++++++++++++++++
 spacy/tokens/_retokenize.pyx    | 24 ++++++++++++++----------
 2 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 06f6a3d30..d9db0916b 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 from ..util import get_doc
 from ...tokens import Doc
 from ...vocab import Vocab
+from ...attrs import LEMMA
 
 import pytest
 import numpy
@@ -178,6 +179,26 @@ def test_doc_api_merge_hang(en_tokenizer):
     doc.merge(8, 32, tag='', lemma='', ent_type='ORG')
 
 
+def test_doc_api_retokenizer(en_tokenizer):
+    """Merge a span via doc.retokenize() without passing any attrs."""
+    doc = en_tokenizer("WKRO played songs by the beach boys all night")
+    with doc.retokenize() as retok:
+        retok.merge(doc[4:7])
+    assert (len(doc), doc[4].text) == (7, 'the beach boys')
+
+
+def test_doc_api_retokenizer_attrs(en_tokenizer):
+    """Merge with attrs given as int key/str value and str key/int value."""
+    doc = en_tokenizer("WKRO played songs by the beach boys all night")
+    org_id = doc.vocab.strings['ORG']
+    with doc.retokenize() as retok:
+        retok.merge(doc[4:7], attrs={LEMMA: 'boys', 'ENT_TYPE': org_id})
+    assert len(doc) == 7
+    merged = doc[4]
+    assert merged.text == 'the beach boys'
+    assert (merged.lemma_, merged.ent_type_) == ('boys', 'ORG')
+
+
 def test_doc_api_sents_empty_string(en_tokenizer):
     doc = en_tokenizer("")
     doc.is_parsed = True
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index 00f724ed6..b405dd000 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -11,11 +11,13 @@ from .span cimport Span
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..structs cimport LexemeC, TokenC
-from ..attrs cimport *
+from ..attrs cimport TAG
+from ..attrs import intify_attrs
+from ..util import SimpleFrozenDict
 
 
 cdef class Retokenizer:
-    '''Helper class for doc.retokenize() context manager.'''
+    """Helper class for doc.retokenize() context manager."""
     cdef Doc doc
     cdef list merges
     cdef list splits
@@ -24,14 +26,18 @@ cdef class Retokenizer:
         self.merges = []
         self.splits = []
 
-    def merge(self, Span span, attrs=None):
-        '''Mark a span for merging. The attrs will be applied to the resulting
-        token.'''
+    def merge(self, Span span, attrs=SimpleFrozenDict()):
+        """Mark a span for merging. The attrs may use int or string keys and
+        values; they are applied to the resulting token.
+        """
+        attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
         self.merges.append((span.start_char, span.end_char, attrs))
 
-    def split(self, Token token, orths, attrs=None):
-        '''Mark a Token for splitting, into the specified orths. The attrs
-        will be applied to each subtoken.'''
+    def split(self, Token token, orths, attrs=SimpleFrozenDict()):
+        """Mark a Token for splitting, into the specified orths. The attrs may
+        use int or string keys and values; they are applied to each subtoken.
+        """
+        attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
         self.splits.append((token.start_char, orths, attrs))
 
     def __enter__(self):
@@ -125,5 +131,3 @@ def _merge(Doc doc, int start, int end, attributes):
     # Clear the cached Python objects
     # Return the merged Python object
     return doc[start]
-
-