From 983c88d02ec4e97ce4ff81c6ab408179703f6aaf Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Wed, 6 Nov 2019 12:15:13 +0100
Subject: [PATCH] Expose tokenizer rules as a property

Expose the tokenizer rules property in the same way as the other core
properties. (The cache resetting is overkill, but consistent with
`from_bytes` for now.)

Add tests and update Tokenizer API docs.
---
 .../lang/en/test_customized_tokenizer.py      | 82 ++++++++++++++++++-
 spacy/tokenizer.pyx                           | 13 +++
 website/docs/api/tokenizer.md                 | 32 ++++----
 3 files changed, 111 insertions(+), 16 deletions(-)

diff --git a/spacy/tests/lang/en/test_customized_tokenizer.py b/spacy/tests/lang/en/test_customized_tokenizer.py
index fdac32a90..7f939011f 100644
--- a/spacy/tests/lang/en/test_customized_tokenizer.py
+++ b/spacy/tests/lang/en/test_customized_tokenizer.py
@@ -2,6 +2,7 @@ from __future__ import unicode_literals
 
 import pytest
+import re
 from spacy.lang.en import English
 from spacy.tokenizer import Tokenizer
 from spacy.util import compile_prefix_regex, compile_suffix_regex
@@ -19,13 +20,14 @@ def custom_en_tokenizer(en_vocab):
         r"[\[\]!&:,()\*—–\/-]",
     ]
     infix_re = compile_infix_regex(custom_infixes)
+    token_match_re = re.compile("a-b")
     return Tokenizer(
         en_vocab,
         English.Defaults.tokenizer_exceptions,
         prefix_re.search,
         suffix_re.search,
         infix_re.finditer,
-        token_match=None,
+        token_match=token_match_re.match,
     )
 
 
@@ -74,3 +76,81 @@ def test_en_customized_tokenizer_handles_infixes(custom_en_tokenizer):
         "Megaregion",
         ".",
     ]
+
+
+def test_en_customized_tokenizer_handles_token_match(custom_en_tokenizer):
+    sentence = "The 8 and 10-county definitions a-b not used for the greater Southern California Megaregion."
+    context = [word.text for word in custom_en_tokenizer(sentence)]
+    assert context == [
+        "The",
+        "8",
+        "and",
+        "10",
+        "-",
+        "county",
+        "definitions",
+        "a-b",
+        "not",
+        "used",
+        "for",
+        "the",
+        "greater",
+        "Southern",
+        "California",
+        "Megaregion",
+        ".",
+    ]
+
+
+def test_en_customized_tokenizer_handles_rules(custom_en_tokenizer):
+    sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)"
+    context = [word.text for word in custom_en_tokenizer(sentence)]
+    assert context == [
+        "The",
+        "8",
+        "and",
+        "10",
+        "-",
+        "county",
+        "definitions",
+        "are",
+        "not",
+        "used",
+        "for",
+        "the",
+        "greater",
+        "Southern",
+        "California",
+        "Megaregion",
+        ".",
+        ":)",
+    ]
+
+
+def test_en_customized_tokenizer_handles_rules_property(custom_en_tokenizer):
+    sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)"
:)" + rules = custom_en_tokenizer.rules + del rules[":)"] + custom_en_tokenizer.rules = rules + context = [word.text for word in custom_en_tokenizer(sentence)] + assert context == [ + "The", + "8", + "and", + "10", + "-", + "county", + "definitions", + "are", + "not", + "used", + "for", + "the", + "greater", + "Southern", + "California", + "Megaregion", + ".", + ":", + ")", + ] diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index b39bb1ecb..e25c2bc43 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -93,6 +93,19 @@ cdef class Tokenizer: self._infix_finditer = infix_finditer self._flush_cache() + property rules: + def __get__(self): + return self._rules + + def __set__(self, rules): + self._rules = {} + self._reset_cache([key for key in self._cache]) + self._reset_specials() + self._cache = PreshMap() + self._specials = PreshMap() + if rules is not None: + self._load_special_tokenization(rules) + def __reduce__(self): args = (self.vocab, self._rules, diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index d6ab73f14..1ac94c132 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -34,15 +34,15 @@ the > tokenizer = nlp.Defaults.create_tokenizer(nlp) > ``` -| Name | Type | Description | -| ---------------- | ----------- | ----------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `rules` | dict | Exceptions and special-cases for the tokenizer. | -| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | -| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | -| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | -| `token_match` | callable | A boolean function matching strings to be recognized as tokens. | -| **RETURNS** | `Tokenizer` | The newly constructed object. | +| Name | Type | Description | +| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | A storage container for lexical types. | +| `rules` | dict | Exceptions and special-cases for the tokenizer. | +| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | +| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | +| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | +| `token_match` | callable | A function matching the signature of `re.compile(string).match to find token matches. | +| **RETURNS** | `Tokenizer` | The newly constructed object. | ## Tokenizer.\_\_call\_\_ {#call tag="method"} @@ -198,12 +198,14 @@ it. ## Attributes {#attributes} -| Name | Type | Description | -| ---------------- | ------- | -------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | -| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. | -| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. 
-| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. |
+| Name | Type | Description |
+| ---------------- | ------- | --------------------------------------------------------------------------------------------------------------------------- |
+| `vocab` | `Vocab` | The vocab object of the parent `Doc`. |
+| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. |
+| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. |
+| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. |
+| `token_match` | - | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. |
+| `rules` | dict | A dictionary of tokenizer exceptions and special cases. |
 
 ## Serialization fields {#serialization-fields}
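
---

Usage note: below is a minimal sketch of the new `rules` property in action, mirroring `test_en_customized_tokenizer_handles_rules_property` above. It assumes spaCy v2.x with this patch applied and a plain `English()` pipeline, whose default tokenizer exceptions include the `:)` emoticon used in the tests.

```python
from spacy.lang.en import English

nlp = English()
tokenizer = nlp.tokenizer

# The getter returns the special-case dict: exceptions such as emoticons
# and contractions, keyed by the exact string they match.
rules = tokenizer.rules

# Drop one special case and assign the dict back. The setter rebuilds the
# specials table and flushes the tokenizer cache, so the change takes
# effect on the next call.
del rules[":)"]
tokenizer.rules = rules

# ":)" is no longer kept as a single token; it falls through to the
# regular punctuation rules and is split into ":" and ")".
print([t.text for t in nlp("This is a test. :)")])
```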