mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Add tokenizer option to allow Matcher handling for all rules (#10452)
* Add tokenizer option to allow Matcher handling for all rules
Add tokenizer option `with_faster_rules_heuristics` (renamed to
`faster_heuristics` later in this change) that determines whether the
special cases applied by the internal `Matcher` are filtered by whether
they contain affixes or spaces. If `True` (default), the rules are
filtered to prioritize speed over rare edge cases. If `False`, all
rules are included in the final `Matcher`-based pass over the doc.
* Reset all caches when reloading special cases
* Revert "Reset all caches when reloading special cases"
This reverts commit 4ef6bd171d.
* Initialize max_length properly
* Add new tag to API docs
* Rename to faster heuristics
This commit is contained in:
parent
31a5d99efa
commit
3711af74e5
|
@ -70,6 +70,7 @@ def test_issue4190():
|
||||||
suffix_search=suffix_re.search,
|
suffix_search=suffix_re.search,
|
||||||
infix_finditer=infix_re.finditer,
|
infix_finditer=infix_re.finditer,
|
||||||
token_match=nlp.tokenizer.token_match,
|
token_match=nlp.tokenizer.token_match,
|
||||||
|
faster_heuristics=False,
|
||||||
)
|
)
|
||||||
nlp.tokenizer = new_tokenizer
|
nlp.tokenizer = new_tokenizer
|
||||||
|
|
||||||
|
@ -90,6 +91,7 @@ def test_issue4190():
|
||||||
doc_2 = nlp_2(test_string)
|
doc_2 = nlp_2(test_string)
|
||||||
result_2 = [token.text for token in doc_2]
|
result_2 = [token.text for token in doc_2]
|
||||||
assert result_1b == result_2
|
assert result_1b == result_2
|
||||||
|
assert nlp_2.tokenizer.faster_heuristics is False
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
|
def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
|
||||||
|
|
|
@ -523,6 +523,23 @@ def test_tokenizer_infix_prefix(en_vocab):
|
||||||
assert tokens == explain_tokens
|
assert tokens == explain_tokens
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.issue(10086)
def test_issue10086(en_tokenizer):
    """Test special case works when part of infix substring.

    Tokenizes "No--don't see" twice: once with ``faster_heuristics``
    disabled, where "don't" is expected to be split into "do" and "n't",
    and once with it enabled (the default), where "don't" is expected to
    stay a single token.

    en_tokenizer: English tokenizer fixture; its ``faster_heuristics``
        setting is toggled during the test and restored afterwards.
    """
    text = "No--don't see"

    # The fixture object is mutated below. Its scope is not visible from
    # this test, so remember the incoming setting and restore it even when
    # an assertion fails, rather than leaking a modified tokenizer.
    previous_setting = en_tokenizer.faster_heuristics
    try:
        # without heuristics: do n't
        en_tokenizer.faster_heuristics = False
        words = [w.text for w in en_tokenizer(text)]
        assert "n't" in words
        assert "do" in words

        # with (default) heuristics: don't
        en_tokenizer.faster_heuristics = True
        words = [w.text for w in en_tokenizer(text)]
        assert "don't" in words
    finally:
        en_tokenizer.faster_heuristics = previous_setting
|
||||||
|
|
||||||
|
|
||||||
def test_tokenizer_initial_special_case_explain(en_vocab):
|
def test_tokenizer_initial_special_case_explain(en_vocab):
|
||||||
tokenizer = Tokenizer(
|
tokenizer = Tokenizer(
|
||||||
en_vocab,
|
en_vocab,
|
||||||
|
|
|
@ -23,9 +23,10 @@ cdef class Tokenizer:
|
||||||
cdef object _infix_finditer
|
cdef object _infix_finditer
|
||||||
cdef object _rules
|
cdef object _rules
|
||||||
cdef PhraseMatcher _special_matcher
|
cdef PhraseMatcher _special_matcher
|
||||||
# TODO next two are unused and should be removed in v4
|
# TODO convert to bool in v4
|
||||||
|
cdef int _faster_heuristics
|
||||||
|
# TODO next one is unused and should be removed in v4
|
||||||
# https://github.com/explosion/spaCy/pull/9150
|
# https://github.com/explosion/spaCy/pull/9150
|
||||||
cdef int _unused_int1
|
|
||||||
cdef int _unused_int2
|
cdef int _unused_int2
|
||||||
|
|
||||||
cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
|
cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
|
||||||
|
|
|
@ -34,7 +34,7 @@ cdef class Tokenizer:
|
||||||
"""
|
"""
|
||||||
def __init__(self, Vocab vocab, rules=None, prefix_search=None,
|
def __init__(self, Vocab vocab, rules=None, prefix_search=None,
|
||||||
suffix_search=None, infix_finditer=None, token_match=None,
|
suffix_search=None, infix_finditer=None, token_match=None,
|
||||||
url_match=None):
|
url_match=None, faster_heuristics=True):
|
||||||
"""Create a `Tokenizer`, to create `Doc` objects given unicode text.
|
"""Create a `Tokenizer`, to create `Doc` objects given unicode text.
|
||||||
|
|
||||||
vocab (Vocab): A storage container for lexical types.
|
vocab (Vocab): A storage container for lexical types.
|
||||||
|
@ -43,7 +43,7 @@ cdef class Tokenizer:
|
||||||
`re.compile(string).search` to match prefixes.
|
`re.compile(string).search` to match prefixes.
|
||||||
suffix_search (callable): A function matching the signature of
|
suffix_search (callable): A function matching the signature of
|
||||||
`re.compile(string).search` to match suffixes.
|
`re.compile(string).search` to match suffixes.
|
||||||
`infix_finditer` (callable): A function matching the signature of
|
infix_finditer (callable): A function matching the signature of
|
||||||
`re.compile(string).finditer` to find infixes.
|
`re.compile(string).finditer` to find infixes.
|
||||||
token_match (callable): A function matching the signature of
|
token_match (callable): A function matching the signature of
|
||||||
`re.compile(string).match`, for matching strings to be
|
`re.compile(string).match`, for matching strings to be
|
||||||
|
@ -51,6 +51,9 @@ cdef class Tokenizer:
|
||||||
url_match (callable): A function matching the signature of
|
url_match (callable): A function matching the signature of
|
||||||
`re.compile(string).match`, for matching strings to be
|
`re.compile(string).match`, for matching strings to be
|
||||||
recognized as urls.
|
recognized as urls.
|
||||||
|
faster_heuristics (bool): Whether to restrict the final
|
||||||
|
Matcher-based pass for rules to those containing affixes or space.
|
||||||
|
Defaults to True.
|
||||||
|
|
||||||
EXAMPLE:
|
EXAMPLE:
|
||||||
>>> tokenizer = Tokenizer(nlp.vocab)
|
>>> tokenizer = Tokenizer(nlp.vocab)
|
||||||
|
@ -66,6 +69,7 @@ cdef class Tokenizer:
|
||||||
self.suffix_search = suffix_search
|
self.suffix_search = suffix_search
|
||||||
self.infix_finditer = infix_finditer
|
self.infix_finditer = infix_finditer
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
|
self.faster_heuristics = faster_heuristics
|
||||||
self._rules = {}
|
self._rules = {}
|
||||||
self._special_matcher = PhraseMatcher(self.vocab)
|
self._special_matcher = PhraseMatcher(self.vocab)
|
||||||
self._load_special_cases(rules)
|
self._load_special_cases(rules)
|
||||||
|
@ -122,6 +126,14 @@ cdef class Tokenizer:
|
||||||
self._specials = PreshMap()
|
self._specials = PreshMap()
|
||||||
self._load_special_cases(rules)
|
self._load_special_cases(rules)
|
||||||
|
|
||||||
|
property faster_heuristics:
    """Whether the final Matcher-based pass is restricted to rules
    containing affixes or space. Stored internally as a C int and exposed
    as a Python bool.
    """
    def __get__(self):
        # The backing field is a C int; report it as a bool.
        return self._faster_heuristics != 0

    def __set__(self, value):
        # Normalize any truthy/falsy input to 1/0 before storing.
        self._faster_heuristics = 1 if value else 0
        # Special cases are reloaded on every assignment so the rules
        # are re-added under the new setting.
        self._reload_special_cases()
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
args = (self.vocab,
|
args = (self.vocab,
|
||||||
self.rules,
|
self.rules,
|
||||||
|
@ -287,7 +299,7 @@ cdef class Tokenizer:
|
||||||
spans = [doc[match.start:match.end] for match in filtered]
|
spans = [doc[match.start:match.end] for match in filtered]
|
||||||
cdef bint modify_in_place = True
|
cdef bint modify_in_place = True
|
||||||
cdef int curr_length = doc.length
|
cdef int curr_length = doc.length
|
||||||
cdef int max_length
|
cdef int max_length = 0
|
||||||
cdef int span_length_diff = 0
|
cdef int span_length_diff = 0
|
||||||
span_data = {}
|
span_data = {}
|
||||||
for span in spans:
|
for span in spans:
|
||||||
|
@ -602,7 +614,7 @@ cdef class Tokenizer:
|
||||||
self.mem.free(stale_special)
|
self.mem.free(stale_special)
|
||||||
self._rules[string] = substrings
|
self._rules[string] = substrings
|
||||||
self._flush_cache()
|
self._flush_cache()
|
||||||
if self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string) or " " in string:
|
if not self.faster_heuristics or self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string) or " " in string:
|
||||||
self._special_matcher.add(string, None, self._tokenize_affixes(string, False))
|
self._special_matcher.add(string, None, self._tokenize_affixes(string, False))
|
||||||
|
|
||||||
def _reload_special_cases(self):
|
def _reload_special_cases(self):
|
||||||
|
@ -777,7 +789,8 @@ cdef class Tokenizer:
|
||||||
"infix_finditer": lambda: _get_regex_pattern(self.infix_finditer),
|
"infix_finditer": lambda: _get_regex_pattern(self.infix_finditer),
|
||||||
"token_match": lambda: _get_regex_pattern(self.token_match),
|
"token_match": lambda: _get_regex_pattern(self.token_match),
|
||||||
"url_match": lambda: _get_regex_pattern(self.url_match),
|
"url_match": lambda: _get_regex_pattern(self.url_match),
|
||||||
"exceptions": lambda: dict(sorted(self._rules.items()))
|
"exceptions": lambda: dict(sorted(self._rules.items())),
|
||||||
|
"faster_heuristics": lambda: self.faster_heuristics,
|
||||||
}
|
}
|
||||||
return util.to_bytes(serializers, exclude)
|
return util.to_bytes(serializers, exclude)
|
||||||
|
|
||||||
|
@ -798,7 +811,8 @@ cdef class Tokenizer:
|
||||||
"infix_finditer": lambda b: data.setdefault("infix_finditer", b),
|
"infix_finditer": lambda b: data.setdefault("infix_finditer", b),
|
||||||
"token_match": lambda b: data.setdefault("token_match", b),
|
"token_match": lambda b: data.setdefault("token_match", b),
|
||||||
"url_match": lambda b: data.setdefault("url_match", b),
|
"url_match": lambda b: data.setdefault("url_match", b),
|
||||||
"exceptions": lambda b: data.setdefault("rules", b)
|
"exceptions": lambda b: data.setdefault("rules", b),
|
||||||
|
"faster_heuristics": lambda b: data.setdefault("faster_heuristics", b),
|
||||||
}
|
}
|
||||||
# reset all properties and flush all caches (through rules),
|
# reset all properties and flush all caches (through rules),
|
||||||
# reset rules first so that _reload_special_cases is trivial/fast as
|
# reset rules first so that _reload_special_cases is trivial/fast as
|
||||||
|
@ -822,6 +836,8 @@ cdef class Tokenizer:
|
||||||
self.url_match = re.compile(data["url_match"]).match
|
self.url_match = re.compile(data["url_match"]).match
|
||||||
if "rules" in data and isinstance(data["rules"], dict):
|
if "rules" in data and isinstance(data["rules"], dict):
|
||||||
self.rules = data["rules"]
|
self.rules = data["rules"]
|
||||||
|
if "faster_heuristics" in data:
|
||||||
|
self.faster_heuristics = data["faster_heuristics"]
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -44,15 +44,16 @@ how to construct a custom tokenizer with different tokenization rules, see the
|
||||||
> tokenizer = nlp.tokenizer
|
> tokenizer = nlp.tokenizer
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `vocab` | A storage container for lexical types. ~~Vocab~~ |
|
| `vocab` | A storage container for lexical types. ~~Vocab~~ |
|
||||||
| `rules` | Exceptions and special-cases for the tokenizer. ~~Optional[Dict[str, List[Dict[int, str]]]]~~ |
|
| `rules` | Exceptions and special-cases for the tokenizer. ~~Optional[Dict[str, List[Dict[int, str]]]]~~ |
|
||||||
| `prefix_search` | A function matching the signature of `re.compile(string).search` to match prefixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
|
| `prefix_search` | A function matching the signature of `re.compile(string).search` to match prefixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
|
||||||
| `suffix_search` | A function matching the signature of `re.compile(string).search` to match suffixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
|
| `suffix_search` | A function matching the signature of `re.compile(string).search` to match suffixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
|
||||||
| `infix_finditer` | A function matching the signature of `re.compile(string).finditer` to find infixes. ~~Optional[Callable[[str], Iterator[Match]]]~~ |
|
| `infix_finditer` | A function matching the signature of `re.compile(string).finditer` to find infixes. ~~Optional[Callable[[str], Iterator[Match]]]~~ |
|
||||||
| `token_match` | A function matching the signature of `re.compile(string).match` to find token matches. ~~Optional[Callable[[str], Optional[Match]]]~~ |
|
| `token_match` | A function matching the signature of `re.compile(string).match` to find token matches. ~~Optional[Callable[[str], Optional[Match]]]~~ |
|
||||||
| `url_match` | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
|
| `url_match` | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
|
||||||
|
| `faster_heuristics` <Tag variant="new">3.3.0</Tag> | Whether to restrict the final `Matcher`-based pass for rules to those containing affixes or space. Defaults to `True`. ~~bool~~ |
|
||||||
|
|
||||||
## Tokenizer.\_\_call\_\_ {#call tag="method"}
|
## Tokenizer.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user