Expose tokenizer rules as a property

Expose the tokenizer rules property in the same way as the other core
properties. (The cache resetting is overkill, but consistent with
`from_bytes` for now.)

Add tests and update Tokenizer API docs.
This commit is contained in:
Adriane Boyd 2019-11-06 12:15:13 +01:00
parent 3ec231f7e1
commit 983c88d02e
3 changed files with 111 additions and 16 deletions

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals
import pytest
import re
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex
@ -19,13 +20,14 @@ def custom_en_tokenizer(en_vocab):
r"[\[\]!&:,()\*—–\/-]",
]
infix_re = compile_infix_regex(custom_infixes)
token_match_re = re.compile("a-b")
return Tokenizer(
en_vocab,
English.Defaults.tokenizer_exceptions,
prefix_re.search,
suffix_re.search,
infix_re.finditer,
token_match=None,
token_match=token_match_re.match,
)
@ -74,3 +76,81 @@ def test_en_customized_tokenizer_handles_infixes(custom_en_tokenizer):
"Megaregion",
".",
]
def test_en_customized_tokenizer_handles_token_match(custom_en_tokenizer):
    """Check that "a-b" stays a single token with the customized tokenizer.

    The same sentence also contains "10-county", which is still expected to
    be split around the hyphen.
    """
    text = "The 8 and 10-county definitions a-b not used for the greater Southern California Megaregion."
    expected = [
        "The",
        "8",
        "and",
        "10",
        "-",
        "county",
        "definitions",
        "a-b",
        "not",
        "used",
        "for",
        "the",
        "greater",
        "Southern",
        "California",
        "Megaregion",
        ".",
    ]
    tokens = [token.text for token in custom_en_tokenizer(text)]
    assert tokens == expected
def test_en_customized_tokenizer_handles_rules(custom_en_tokenizer):
    """Check that the trailing ":)" is emitted as one token.

    Presumably this relies on a special-case rule supplied to the tokenizer;
    the test itself only pins the resulting token texts.
    """
    text = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)"
    expected = [
        "The",
        "8",
        "and",
        "10",
        "-",
        "county",
        "definitions",
        "are",
        "not",
        "used",
        "for",
        "the",
        "greater",
        "Southern",
        "California",
        "Megaregion",
        ".",
        ":)",
    ]
    tokens = [token.text for token in custom_en_tokenizer(text)]
    assert tokens == expected
def test_en_customized_tokenizer_handles_rules_property(custom_en_tokenizer):
    """Check that reassigning ``rules`` without ":)" changes tokenization.

    After the ":)" entry is removed and the rules are written back through
    the property, the emoticon is expected to come out as ":" and ")".
    """
    text = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)"
    # Read the rules, drop the emoticon entry, then assign them back so the
    # property setter runs before the text is tokenized.
    special_cases = custom_en_tokenizer.rules
    del special_cases[":)"]
    custom_en_tokenizer.rules = special_cases
    expected = [
        "The",
        "8",
        "and",
        "10",
        "-",
        "county",
        "definitions",
        "are",
        "not",
        "used",
        "for",
        "the",
        "greater",
        "Southern",
        "California",
        "Megaregion",
        ".",
        ":",
        ")",
    ]
    tokens = [token.text for token in custom_en_tokenizer(text)]
    assert tokens == expected

View File

@ -93,6 +93,19 @@ cdef class Tokenizer:
self._infix_finditer = infix_finditer
self._flush_cache()
# Read/write access to the tokenizer's rules (`self._rules`).
property rules:
def __get__(self):
# Returns the internal rules object itself, not a copy.
return self._rules
def __set__(self, rules):
# Start from an empty rule set; presumably `_load_special_tokenization`
# below repopulates it from `rules` -- confirm against that helper.
self._rules = {}
# Reset the existing cache entries and specials first, then swap in
# fresh maps.
self._reset_cache([key for key in self._cache])
self._reset_specials()
self._cache = PreshMap()
self._specials = PreshMap()
# Assigning None skips loading, leaving `self._rules` empty.
if rules is not None:
self._load_special_tokenization(rules)
def __reduce__(self):
args = (self.vocab,
self._rules,

View File

@ -35,13 +35,13 @@ the
> ```
| Name | Type | Description |
| ---------------- | ----------- | ----------------------------------------------------------------------------------- |
| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | A storage container for lexical types. |
| `rules` | dict | Exceptions and special-cases for the tokenizer. |
| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. |
| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. |
| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. |
| `token_match` | callable | A boolean function matching strings to be recognized as tokens. |
| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. |
| **RETURNS** | `Tokenizer` | The newly constructed object. |
## Tokenizer.\_\_call\_\_ {#call tag="method"}
@ -199,11 +199,13 @@ it.
## Attributes {#attributes}
| Name | Type | Description |
| ---------------- | ------- | -------------------------------------------------------------------------------------------------------------------------- |
| ---------------- | ------- | --------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The vocab object of the parent `Doc`. |
| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. |
| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. |
| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. |
| `token_match` | - | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. |
| `rules` | dict | A dictionary of tokenizer exceptions and special cases. |
## Serialization fields {#serialization-fields}