From 216efaf5f53960f80519cfa2c343a1f9efdbf72e Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 31 Aug 2020 09:42:06 +0200
Subject: [PATCH] Restrict tokenizer exceptions to ORTH and NORM

---
 spacy/errors.py                         |  3 +++
 spacy/tests/tokenizer/test_tokenizer.py |  8 +++++++-
 spacy/tokenizer.pyx                     | 12 +++++++++---
 website/docs/usage/v3.md                | 14 ++++++++++++++
 4 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 38c89c479..e53aaef07 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -645,6 +645,9 @@ class Errors:
              "Required tables '{tables}', found '{found}'. If you are not "
              "providing custom lookups, make sure you have the package "
              "spacy-lookups-data installed.")
+    E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for "
+             "'{chunk}'. Tokenizer exceptions are only allowed to specify "
+             "`ORTH` and `NORM`.")
 
 
 @add_codes
diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index b89c0627f..ff31ae8a9 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -105,7 +105,13 @@ def test_tokenizer_add_special_case(tokenizer, text, tokens):
     assert doc[1].text == tokens[1]["orth"]
 
 
-@pytest.mark.parametrize("text,tokens", [("lorem", [{"orth": "lo"}, {"orth": "re"}])])
+@pytest.mark.parametrize(
+    "text,tokens",
+    [
+        ("lorem", [{"orth": "lo"}, {"orth": "re"}]),
+        ("lorem", [{"orth": "lo", "tag": "A"}, {"orth": "rem"}]),
+    ],
+)
 def test_tokenizer_validate_special_case(tokenizer, text, tokens):
     with pytest.raises(ValueError):
         tokenizer.add_special_case(text, tokens)
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 9fda1800b..12c634e61 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -17,7 +17,7 @@ from .strings cimport hash_string
 from .lexeme cimport EMPTY_LEXEME
 
 from .attrs import intify_attrs
-from .symbols import ORTH
+from .symbols import ORTH, NORM
 from .errors import Errors, Warnings
 from . import util
 from .util import registry
@@ -584,9 +584,11 @@ cdef class Tokenizer:
             self.add_special_case(chunk, substrings)
 
     def _validate_special_case(self, chunk, substrings):
-        """Check whether the `ORTH` fields match the string.
+        """Check whether the `ORTH` fields match the string. Check that
+        additional features beyond `ORTH` and `NORM` are not set by the
+        exception.
 
-        string (str): The string to specially tokenize.
+        chunk (str): The string to specially tokenize.
         substrings (iterable): A sequence of dicts, where each dict
             describes a token and its attributes.
         """
@@ -594,6 +596,10 @@ cdef class Tokenizer:
         orth = "".join([spec[ORTH] for spec in attrs])
         if chunk != orth:
             raise ValueError(Errors.E997.format(chunk=chunk, orth=orth, token_attrs=substrings))
+        for substring in attrs:
+            for attr in substring:
+                if attr not in (ORTH, NORM):
+                    raise ValueError(Errors.E1005.format(attr=self.vocab.strings[attr], chunk=chunk))
 
     def add_special_case(self, unicode string, substrings):
         """Add a special-case tokenization rule.
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index d5fea9fee..20b7a139b 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -566,6 +566,20 @@ patterns = [nlp("health care reform"), nlp("healthcare reform")]
 + matcher.add("HEALTH", patterns, on_match=on_match)
 ```
 
+### Migrating attributes in tokenizer exceptions {#migrating-tokenizer-exceptions}
+
+Tokenizer exceptions are now only allowed to set `ORTH` and `NORM` values as
+part of the token attributes. Exceptions for other attributes such as `TAG` and
+`LEMMA` should be moved to an [`AttributeRuler`](/api/attributeruler) component:
+
+```diff
+nlp = spacy.blank("en")
+- nlp.tokenizer.add_special_case("don't", [{"ORTH": "do"}, {"ORTH": "n't", "LEMMA": "not"}])
++ nlp.tokenizer.add_special_case("don't", [{"ORTH": "do"}, {"ORTH": "n't"}])
++ ruler = nlp.add_pipe("attribute_ruler")
++ ruler.add(patterns=[[{"ORTH": "n't"}]], attrs={"LEMMA": "not"})
+```
+
 ### Migrating tag maps and morph rules {#migrating-training-mappings-exceptions}
 
 Instead of defining a `tag_map` and `morph_rules` in the language data, spaCy
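
To make the new restriction concrete, here is a minimal end-to-end sketch (not part of the commit) of how the validation and the `AttributeRuler` migration fit together, assuming spaCy v3.0+ with this patch applied and `pytest` available:

```python
import pytest
import spacy

nlp = spacy.blank("en")

# Still allowed: special cases that only set ORTH and NORM.
nlp.tokenizer.add_special_case(
    "don't", [{"ORTH": "do"}, {"ORTH": "n't", "NORM": "not"}]
)

# Rejected after this change: any other attribute in a special case
# now raises a ValueError carrying the new E1005 message.
with pytest.raises(ValueError):
    nlp.tokenizer.add_special_case(
        "can't", [{"ORTH": "ca"}, {"ORTH": "n't", "LEMMA": "not"}]
    )

# Migration path: set the extra attributes with an AttributeRuler instead.
ruler = nlp.add_pipe("attribute_ruler")
ruler.add(patterns=[[{"ORTH": "n't"}]], attrs={"LEMMA": "not"})

doc = nlp("I don't know")
assert [t.text for t in doc] == ["I", "do", "n't", "know"]
assert doc[2].lemma_ == "not"  # set by the attribute_ruler, not the tokenizer
```

With this split, the tokenizer decides only how text is segmented (`ORTH`) and normalized (`NORM`), while all other token attributes are assigned by pipeline components, so special cases no longer bypass them.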