Expose tokenizer rules as a property

Expose the tokenizer rules property in the same way as the other core properties. (The cache resetting is overkill, but consistent with `from_bytes` for now.) Add tests and update Tokenizer API docs.
2025-10-24 12:41:23 +03:00 · 2019-11-06 12:15:13 +01:00 · 2019-11-06 12:15:13 +01:00 · 983c88d02e
commit 983c88d02e
parent 3ec231f7e1
3 changed files with 111 additions and 16 deletions
--- a/spacy/tests/lang/en/test_customized_tokenizer.py
+++ b/spacy/tests/lang/en/test_customized_tokenizer.py
@ -2,6 +2,7 @@
 from __future__ import unicode_literals

 import pytest
+import re
 from spacy.lang.en import English
 from spacy.tokenizer import Tokenizer
 from spacy.util import compile_prefix_regex, compile_suffix_regex
@ -19,13 +20,14 @@ def custom_en_tokenizer(en_vocab):
        r"[\[\]!&:,()\*—–\/-]",
    ]
    infix_re = compile_infix_regex(custom_infixes)
+    token_match_re = re.compile("a-b")
    return Tokenizer(
        en_vocab,
        English.Defaults.tokenizer_exceptions,
        prefix_re.search,
        suffix_re.search,
        infix_re.finditer,
-        token_match=None,
+        token_match=token_match_re.match,
    )


@ -74,3 +76,81 @@ def test_en_customized_tokenizer_handles_infixes(custom_en_tokenizer):
        "Megaregion",
        ".",
    ]
+
+
+def test_en_customized_tokenizer_handles_token_match(custom_en_tokenizer):
+    sentence = "The 8 and 10-county definitions a-b not used for the greater Southern California Megaregion."
+    context = [word.text for word in custom_en_tokenizer(sentence)]
+    assert context == [
+        "The",
+        "8",
+        "and",
+        "10",
+        "-",
+        "county",
+        "definitions",
+        "a-b",
+        "not",
+        "used",
+        "for",
+        "the",
+        "greater",
+        "Southern",
+        "California",
+        "Megaregion",
+        ".",
+    ]
+
+
+def test_en_customized_tokenizer_handles_rules(custom_en_tokenizer):
+    sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)"
+    context = [word.text for word in custom_en_tokenizer(sentence)]
+    assert context == [
+        "The",
+        "8",
+        "and",
+        "10",
+        "-",
+        "county",
+        "definitions",
+        "are",
+        "not",
+        "used",
+        "for",
+        "the",
+        "greater",
+        "Southern",
+        "California",
+        "Megaregion",
+        ".",
+        ":)",
+    ]
+
+
+def test_en_customized_tokenizer_handles_rules_property(custom_en_tokenizer):
+    sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)"
+    rules = custom_en_tokenizer.rules
+    del rules[":)"]
+    custom_en_tokenizer.rules = rules
+    context = [word.text for word in custom_en_tokenizer(sentence)]
+    assert context == [
+        "The",
+        "8",
+        "and",
+        "10",
+        "-",
+        "county",
+        "definitions",
+        "are",
+        "not",
+        "used",
+        "for",
+        "the",
+        "greater",
+        "Southern",
+        "California",
+        "Megaregion",
+        ".",
+        ":",
+        ")",
+    ]
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@ -93,6 +93,19 @@ cdef class Tokenizer:
            self._infix_finditer = infix_finditer
            self._flush_cache()

+    property rules:
+        def __get__(self):
+            return self._rules
+
+        def __set__(self, rules):
+            self._rules = {}
+            self._reset_cache([key for key in self._cache])
+            self._reset_specials()
+            self._cache = PreshMap()
+            self._specials = PreshMap()
+            if rules is not None:
+                self._load_special_tokenization(rules)
+
    def __reduce__(self):
        args = (self.vocab,
                self._rules,
--- a/website/docs/api/tokenizer.md
+++ b/website/docs/api/tokenizer.md
@ -35,13 +35,13 @@ the
 > ```

 | Name             | Type        | Description                                                                                                                   |
-| ---------------- | ----------- | ----------------------------------------------------------------------------------- |
+| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- |
 | `vocab`          | `Vocab`     | A storage container for lexical types.                                                                                        |
 | `rules`          | dict        | Exceptions and special-cases for the tokenizer.                                                                               |
 | `prefix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match prefixes.                                           |
 | `suffix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match suffixes.                                           |
 | `infix_finditer` | callable    | A function matching the signature of `re.compile(string).finditer` to find infixes.                                           |
-| `token_match`    | callable    | A boolean function matching strings to be recognized as tokens.                     |
+| `token_match`    | callable    | A function matching the signature of `re.compile(string).match` to find token matches.                                        |
 | **RETURNS**      | `Tokenizer` | The newly constructed object.                                                                                                 |

 ## Tokenizer.\_\_call\_\_ {#call tag="method"}
@ -199,11 +199,13 @@ it.
 ## Attributes {#attributes}

 | Name             | Type    | Description                                                                                                                 |
-| ---------------- | ------- | -------------------------------------------------------------------------------------------------------------------------- |
+| ---------------- | ------- | --------------------------------------------------------------------------------------------------------------------------- |
 | `vocab`          | `Vocab` | The vocab object of the parent `Doc`.                                                                                       |
 | `prefix_search`  | -       | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`.             |
 | `suffix_search`  | -       | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`.               |
 | `infix_finditer` | -       | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects.  |
+| `token_match`    | -       | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. |
+| `rules`          | dict    | A dictionary of tokenizer exceptions and special cases.                                                                     |

 ## Serialization fields {#serialization-fields}