Expose tokenizer rules as a property
Expose the tokenizer rules property in the same way as the other core properties. (The cache resetting is overkill, but consistent with `from_bytes` for now.) Add tests and update Tokenizer API docs.
This commit is contained in:
parent
3ec231f7e1
commit
983c88d02e
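For context, a minimal sketch of how the new `rules` property is meant to be used, mirroring the `test_en_customized_tokenizer_handles_rules_property` test added below (the `English()` pipeline here is illustrative, not part of the commit):

```python
from spacy.lang.en import English

nlp = English()
tokenizer = nlp.tokenizer

# Read the special-case rules, drop the ":)" emoticon rule, and assign
# the dict back. The setter flushes the cache and reloads the rules.
rules = tokenizer.rules
del rules[":)"]
tokenizer.rules = rules

# Without the special case, ":)" falls through to the punctuation rules
# and is expected to split into ":" and ")".
print([t.text for t in tokenizer(":)")])
```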
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals

 import pytest
+import re
 from spacy.lang.en import English
 from spacy.tokenizer import Tokenizer
 from spacy.util import compile_prefix_regex, compile_suffix_regex
@@ -19,13 +20,14 @@ def custom_en_tokenizer(en_vocab):
         r"[\[\]!&:,()\*—–\/-]",
     ]
     infix_re = compile_infix_regex(custom_infixes)
+    token_match_re = re.compile("a-b")
     return Tokenizer(
         en_vocab,
         English.Defaults.tokenizer_exceptions,
         prefix_re.search,
         suffix_re.search,
         infix_re.finditer,
-        token_match=None,
+        token_match=token_match_re.match,
     )
@@ -74,3 +76,81 @@ def test_en_customized_tokenizer_handles_infixes(custom_en_tokenizer):
         "Megaregion",
         ".",
     ]
+
+
+def test_en_customized_tokenizer_handles_token_match(custom_en_tokenizer):
+    sentence = "The 8 and 10-county definitions a-b not used for the greater Southern California Megaregion."
+    context = [word.text for word in custom_en_tokenizer(sentence)]
+    assert context == [
+        "The",
+        "8",
+        "and",
+        "10",
+        "-",
+        "county",
+        "definitions",
+        "a-b",
+        "not",
+        "used",
+        "for",
+        "the",
+        "greater",
+        "Southern",
+        "California",
+        "Megaregion",
+        ".",
+    ]
+
+
+def test_en_customized_tokenizer_handles_rules(custom_en_tokenizer):
+    sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)"
+    context = [word.text for word in custom_en_tokenizer(sentence)]
+    assert context == [
+        "The",
+        "8",
+        "and",
+        "10",
+        "-",
+        "county",
+        "definitions",
+        "are",
+        "not",
+        "used",
+        "for",
+        "the",
+        "greater",
+        "Southern",
+        "California",
+        "Megaregion",
+        ".",
+        ":)",
+    ]
+
+
+def test_en_customized_tokenizer_handles_rules_property(custom_en_tokenizer):
+    sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)"
+    rules = custom_en_tokenizer.rules
+    del rules[":)"]
+    custom_en_tokenizer.rules = rules
+    context = [word.text for word in custom_en_tokenizer(sentence)]
+    assert context == [
+        "The",
+        "8",
+        "and",
+        "10",
+        "-",
+        "county",
+        "definitions",
+        "are",
+        "not",
+        "used",
+        "for",
+        "the",
+        "greater",
+        "Southern",
+        "California",
+        "Megaregion",
+        ".",
+        ":",
+        ")",
+    ]
@@ -93,6 +93,19 @@ cdef class Tokenizer:
             self._infix_finditer = infix_finditer
             self._flush_cache()

+    property rules:
+        def __get__(self):
+            return self._rules
+
+        def __set__(self, rules):
+            self._rules = {}
+            self._reset_cache([key for key in self._cache])
+            self._reset_specials()
+            self._cache = PreshMap()
+            self._specials = PreshMap()
+            if rules is not None:
+                self._load_special_tokenization(rules)
+
     def __reduce__(self):
         args = (self.vocab,
                 self._rules,
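Because the setter rebuilds the special cases from scratch, assigning to `rules` can also add new exceptions in bulk, not just remove them. A sketch under assumptions (the `:pybot:` token and the rule shape follow the existing exception format; `add_special_case` remains the one-off API):

```python
from spacy.attrs import ORTH
from spacy.lang.en import English

nlp = English()
rules = nlp.tokenizer.rules

# Each rule maps a string to a list of token-attribute dicts; here a
# single token whose ORTH is the whole string, so it is never split.
rules[":pybot:"] = [{ORTH: ":pybot:"}]

# Reassigning resets the cache and reloads every special case.
nlp.tokenizer.rules = rules
print([t.text for t in nlp.tokenizer("hello :pybot:")])
```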
@@ -34,15 +34,15 @@ the
 > tokenizer = nlp.Defaults.create_tokenizer(nlp)
 > ```

 | Name             | Type        | Description                                                                             |
-| ---------------- | ----------- | ----------------------------------------------------------------------------------- |
+| ---------------- | ----------- | --------------------------------------------------------------------------------------- |
 | `vocab`          | `Vocab`     | A storage container for lexical types.                                                  |
 | `rules`          | dict        | Exceptions and special-cases for the tokenizer.                                         |
 | `prefix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match prefixes.    |
 | `suffix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match suffixes.    |
 | `infix_finditer` | callable    | A function matching the signature of `re.compile(string).finditer` to find infixes.    |
-| `token_match`    | callable    | A boolean function matching strings to be recognized as tokens.                        |
+| `token_match`    | callable    | A function matching the signature of `re.compile(string).match` to find token matches. |
 | **RETURNS**      | `Tokenizer` | The newly constructed object.                                                           |

 ## Tokenizer.\_\_call\_\_ {#call tag="method"}
@@ -198,12 +198,14 @@ it.

 ## Attributes {#attributes}

 | Name             | Type    | Description                                                                                                                    |
-| ---------------- | ------- | ---------------------------------------------------------------------------------------------------------------------------- |
+| ---------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------ |
 | `vocab`          | `Vocab` | The vocab object of the parent `Doc`.                                                                                          |
 | `prefix_search`  | -       | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`.               |
 | `suffix_search`  | -       | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`.                 |
 | `infix_finditer` | -       | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects.    |
+| `token_match`    | -       | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. |
+| `rules`          | dict    | A dictionary of tokenizer exceptions and special cases.                                                                        |

 ## Serialization fields {#serialization-fields}