Add tokenizer option to allow Matcher handling for all rules

Add the tokenizer option `with_faster_rules_heuristics`, which controls
whether the special cases handled by the internal `Matcher` are restricted
to those that contain affixes or a space. If `True` (default), the rules
are filtered this way, prioritizing speed over rare edge cases. If `False`,
all rules are included in the final `Matcher`-based pass over the doc.
This commit is contained in:
Adriane Boyd 2022-03-07 12:13:36 +01:00
parent 6f4f57f317
commit 59eba273bb
5 changed files with 53 additions and 16 deletions

View File

@ -70,6 +70,7 @@ def test_issue4190():
suffix_search=suffix_re.search,
infix_finditer=infix_re.finditer,
token_match=nlp.tokenizer.token_match,
with_faster_rules_heuristics=False,
)
nlp.tokenizer = new_tokenizer
@ -90,6 +91,7 @@ def test_issue4190():
doc_2 = nlp_2(test_string)
result_2 = [token.text for token in doc_2]
assert result_1b == result_2
assert nlp_2.tokenizer.with_faster_rules_heuristics is False
def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):

View File

@ -521,3 +521,20 @@ def test_tokenizer_infix_prefix(en_vocab):
assert tokens == ["±10", "%"]
explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
assert tokens == explain_tokens
@pytest.mark.issue(10086)
def test_issue10086(en_tokenizer):
    """Check a special case that is part of an infix-delimited substring.

    The same text is tokenized twice, toggling
    `with_faster_rules_heuristics` on the tokenizer in between, and the
    resulting token texts are checked for each setting.
    """
    sample = "No--don't see"

    # Heuristics disabled: expect the split tokens "do" and "n't".
    en_tokenizer.with_faster_rules_heuristics = False
    words = [token.text for token in en_tokenizer(sample)]
    assert "n't" in words
    assert "do" in words

    # Heuristics enabled (the default): expect the single token "don't".
    en_tokenizer.with_faster_rules_heuristics = True
    words = [token.text for token in en_tokenizer(sample)]
    assert "don't" in words

View File

@ -23,9 +23,10 @@ cdef class Tokenizer:
cdef object _infix_finditer
cdef object _rules
cdef PhraseMatcher _special_matcher
# TODO next two are unused and should be removed in v4
# TODO convert to bool in v4
cdef int _with_faster_rules_heuristics
# TODO next one is unused and should be removed in v4
# https://github.com/explosion/spaCy/pull/9150
cdef int _unused_int1
cdef int _unused_int2
cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)

View File

@ -34,7 +34,7 @@ cdef class Tokenizer:
"""
def __init__(self, Vocab vocab, rules=None, prefix_search=None,
suffix_search=None, infix_finditer=None, token_match=None,
url_match=None):
url_match=None, with_faster_rules_heuristics=True):
"""Create a `Tokenizer`, to create `Doc` objects given unicode text.
vocab (Vocab): A storage container for lexical types.
@ -43,7 +43,7 @@ cdef class Tokenizer:
`re.compile(string).search` to match prefixes.
suffix_search (callable): A function matching the signature of
`re.compile(string).search` to match suffixes.
`infix_finditer` (callable): A function matching the signature of
infix_finditer (callable): A function matching the signature of
`re.compile(string).finditer` to find infixes.
token_match (callable): A function matching the signature of
`re.compile(string).match`, for matching strings to be
@ -51,6 +51,9 @@ cdef class Tokenizer:
url_match (callable): A function matching the signature of
`re.compile(string).match`, for matching strings to be
recognized as urls.
with_faster_rules_heuristics (bool): Whether to restrict the final
Matcher-based pass for rules to those containing affixes or space.
Defaults to True.
EXAMPLE:
>>> tokenizer = Tokenizer(nlp.vocab)
@ -66,6 +69,7 @@ cdef class Tokenizer:
self.suffix_search = suffix_search
self.infix_finditer = infix_finditer
self.vocab = vocab
self.with_faster_rules_heuristics = with_faster_rules_heuristics
self._rules = {}
self._special_matcher = PhraseMatcher(self.vocab)
self._load_special_cases(rules)
@ -122,6 +126,14 @@ cdef class Tokenizer:
self._specials = PreshMap()
self._load_special_cases(rules)
property with_faster_rules_heuristics:
    """Flag backed by the C int field `_with_faster_rules_heuristics`.

    Reading returns the stored value converted to a Python bool.
    Assigning coerces the new value to bool, stores it, and then calls
    `_reload_special_cases()`.
    """
    def __get__(self):
        # The backing field is an int; expose it as a bool.
        return bool(self._with_faster_rules_heuristics)

    def __set__(self, value):
        self._with_faster_rules_heuristics = bool(value)
        # NOTE(review): presumably needed so existing rules are re-added
        # under the new setting -- confirm against _reload_special_cases.
        self._reload_special_cases()
def __reduce__(self):
args = (self.vocab,
self.rules,
@ -602,7 +614,7 @@ cdef class Tokenizer:
self.mem.free(stale_special)
self._rules[string] = substrings
self._flush_cache()
if self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string) or " " in string:
if not self.with_faster_rules_heuristics or self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string) or " " in string:
self._special_matcher.add(string, None, self._tokenize_affixes(string, False))
def _reload_special_cases(self):
@ -773,7 +785,8 @@ cdef class Tokenizer:
"infix_finditer": lambda: _get_regex_pattern(self.infix_finditer),
"token_match": lambda: _get_regex_pattern(self.token_match),
"url_match": lambda: _get_regex_pattern(self.url_match),
"exceptions": lambda: dict(sorted(self._rules.items()))
"exceptions": lambda: dict(sorted(self._rules.items())),
"with_faster_rules_heuristics": lambda: self.with_faster_rules_heuristics,
}
return util.to_bytes(serializers, exclude)
@ -794,7 +807,8 @@ cdef class Tokenizer:
"infix_finditer": lambda b: data.setdefault("infix_finditer", b),
"token_match": lambda b: data.setdefault("token_match", b),
"url_match": lambda b: data.setdefault("url_match", b),
"exceptions": lambda b: data.setdefault("rules", b)
"exceptions": lambda b: data.setdefault("rules", b),
"with_faster_rules_heuristics": lambda b: data.setdefault("with_faster_rules_heuristics", b),
}
# reset all properties and flush all caches (through rules),
# reset rules first so that _reload_special_cases is trivial/fast as
@ -818,6 +832,8 @@ cdef class Tokenizer:
self.url_match = re.compile(data["url_match"]).match
if "rules" in data and isinstance(data["rules"], dict):
self.rules = data["rules"]
if "with_faster_rules_heuristics" in data:
self.with_faster_rules_heuristics = data["with_faster_rules_heuristics"]
return self

View File

@ -45,7 +45,7 @@ how to construct a custom tokenizer with different tokenization rules, see the
> ```
| Name | Description |
| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| ------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | A storage container for lexical types. ~~Vocab~~ |
| `rules` | Exceptions and special-cases for the tokenizer. ~~Optional[Dict[str, List[Dict[int, str]]]]~~ |
| `prefix_search` | A function matching the signature of `re.compile(string).search` to match prefixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
@ -53,6 +53,7 @@ how to construct a custom tokenizer with different tokenization rules, see the
| `infix_finditer` | A function matching the signature of `re.compile(string).finditer` to find infixes. ~~Optional[Callable[[str], Iterator[Match]]]~~ |
| `token_match` | A function matching the signature of `re.compile(string).match` to find token matches. ~~Optional[Callable[[str], Optional[Match]]]~~ |
| `url_match` | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
| `with_faster_rules_heuristics` | Whether to restrict the final `Matcher`-based pass for rules to those containing affixes or space. Defaults to `True`. ~~bool~~ |
## Tokenizer.\_\_call\_\_ {#call tag="method"}