From 983c88d02ec4e97ce4ff81c6ab408179703f6aaf Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Wed, 6 Nov 2019 12:15:13 +0100
Subject: [PATCH] Expose tokenizer rules as a property

Expose the tokenizer rules property in the same way as the other core
properties. (The cache resetting is overkill, but consistent with
`from_bytes` for now.)

Add tests and update Tokenizer API docs.
---
 .../lang/en/test_customized_tokenizer.py      | 82 ++++++++++++++++++-
 spacy/tokenizer.pyx                           | 13 +++
 website/docs/api/tokenizer.md                 | 32 ++++----
 3 files changed, 111 insertions(+), 16 deletions(-)

diff --git a/spacy/tests/lang/en/test_customized_tokenizer.py b/spacy/tests/lang/en/test_customized_tokenizer.py
index fdac32a90..7f939011f 100644
--- a/spacy/tests/lang/en/test_customized_tokenizer.py
+++ b/spacy/tests/lang/en/test_customized_tokenizer.py
@@ -2,6 +2,7 @@ from __future__ import unicode_literals
 
 import pytest
+import re
 from spacy.lang.en import English
 from spacy.tokenizer import Tokenizer
 from spacy.util import compile_prefix_regex, compile_suffix_regex
@@ -19,13 +20,14 @@ def custom_en_tokenizer(en_vocab):
         r"[\[\]!&:,()\*—–\/-]",
     ]
     infix_re = compile_infix_regex(custom_infixes)
+    token_match_re = re.compile("a-b")
     return Tokenizer(
         en_vocab,
         English.Defaults.tokenizer_exceptions,
         prefix_re.search,
         suffix_re.search,
         infix_re.finditer,
-        token_match=None,
+        token_match=token_match_re.match,
     )
 
 
@@ -74,3 +76,81 @@ def test_en_customized_tokenizer_handles_infixes(custom_en_tokenizer):
         "Megaregion",
         ".",
     ]
+
+
+def test_en_customized_tokenizer_handles_token_match(custom_en_tokenizer):
+    sentence = "The 8 and 10-county definitions a-b not used for the greater Southern California Megaregion."
+    context = [word.text for word in custom_en_tokenizer(sentence)]
+    assert context == [
+        "The",
+        "8",
+        "and",
+        "10",
+        "-",
+        "county",
+        "definitions",
+        "a-b",
+        "not",
+        "used",
+        "for",
+        "the",
+        "greater",
+        "Southern",
+        "California",
+        "Megaregion",
+        ".",
+    ]
+
+
+def test_en_customized_tokenizer_handles_rules(custom_en_tokenizer):
+    sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)"
+    context = [word.text for word in custom_en_tokenizer(sentence)]
+    assert context == [
+        "The",
+        "8",
+        "and",
+        "10",
+        "-",
+        "county",
+        "definitions",
+        "are",
+        "not",
+        "used",
+        "for",
+        "the",
+        "greater",
+        "Southern",
+        "California",
+        "Megaregion",
+        ".",
+        ":)",
+    ]
+
+
+def test_en_customized_tokenizer_handles_rules_property(custom_en_tokenizer):
+    sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)"
:)" + rules = custom_en_tokenizer.rules + del rules[":)"] + custom_en_tokenizer.rules = rules + context = [word.text for word in custom_en_tokenizer(sentence)] + assert context == [ + "The", + "8", + "and", + "10", + "-", + "county", + "definitions", + "are", + "not", + "used", + "for", + "the", + "greater", + "Southern", + "California", + "Megaregion", + ".", + ":", + ")", + ] diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index b39bb1ecb..e25c2bc43 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -93,6 +93,19 @@ cdef class Tokenizer: self._infix_finditer = infix_finditer self._flush_cache() + property rules: + def __get__(self): + return self._rules + + def __set__(self, rules): + self._rules = {} + self._reset_cache([key for key in self._cache]) + self._reset_specials() + self._cache = PreshMap() + self._specials = PreshMap() + if rules is not None: + self._load_special_tokenization(rules) + def __reduce__(self): args = (self.vocab, self._rules, diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index d6ab73f14..1ac94c132 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -34,15 +34,15 @@ the > tokenizer = nlp.Defaults.create_tokenizer(nlp) > ``` -| Name | Type | Description | -| ---------------- | ----------- | ----------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `rules` | dict | Exceptions and special-cases for the tokenizer. | -| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | -| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | -| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | -| `token_match` | callable | A boolean function matching strings to be recognized as tokens. | -| **RETURNS** | `Tokenizer` | The newly constructed object. | +| Name | Type | Description | +| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | A storage container for lexical types. | +| `rules` | dict | Exceptions and special-cases for the tokenizer. | +| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | +| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | +| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | +| `token_match` | callable | A function matching the signature of `re.compile(string).match to find token matches. | +| **RETURNS** | `Tokenizer` | The newly constructed object. | ## Tokenizer.\_\_call\_\_ {#call tag="method"} @@ -198,12 +198,14 @@ it. ## Attributes {#attributes} -| Name | Type | Description | -| ---------------- | ------- | -------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | -| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. | -| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. 
-| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. |
+| Name | Type | Description |
+| ---------------- | ------- | --------------------------------------------------------------------------------------------------------------------------- |
+| `vocab` | `Vocab` | The vocab object of the parent `Doc`. |
+| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. |
+| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. |
+| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. |
+| `token_match` | - | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. |
+| `rules` | dict | A dictionary of tokenizer exceptions and special cases. |
 
 ## Serialization fields {#serialization-fields}
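
---

Usage note: below is a minimal sketch of the new `rules` property in action, mirroring `test_en_customized_tokenizer_handles_rules_property` above. It assumes spaCy v2.x with this patch applied and a plain `English()` pipeline, whose default tokenizer exceptions include the `:)` emoticon used in the tests.

```python
from spacy.lang.en import English

nlp = English()
tokenizer = nlp.tokenizer

# The getter returns the special-case dict: exceptions such as emoticons
# and contractions, keyed by the exact string they match.
rules = tokenizer.rules

# Drop one special case and assign the dict back. The setter rebuilds the
# specials table and flushes the tokenizer cache, so the change takes
# effect on the next call.
del rules[":)"]
tokenizer.rules = rules

# ":)" is no longer kept as a single token; it falls through to the
# regular punctuation rules and is split into ":" and ")".
print([t.text for t in nlp("This is a test. :)")])
```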