Mirror of https://github.com/explosion/spaCy.git
Add tokenizer option to allow Matcher handling for all rules

Add tokenizer option `with_faster_rules_heuristics` that determines whether the special cases applied by the internal `Matcher` are filtered by whether they contain affixes or a space. If `True` (the default), the rules are filtered to prioritize speed over rare edge cases. If `False`, all rules are included in the final `Matcher`-based pass over the doc.
parent 6f4f57f317
commit 59eba273bb
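A minimal usage sketch of the new option, not part of the commit itself; it follows the pattern of the `test_issue4190` change below and reuses the regexes of an existing blank pipeline:

```python
import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.blank("en")
# Rebuild the default tokenizer, but disable the heuristics so that every
# special-case rule is handled by the final Matcher-based pass.
nlp.tokenizer = Tokenizer(
    nlp.vocab,
    rules=nlp.tokenizer.rules,
    prefix_search=nlp.tokenizer.prefix_search,
    suffix_search=nlp.tokenizer.suffix_search,
    infix_finditer=nlp.tokenizer.infix_finditer,
    token_match=nlp.tokenizer.token_match,
    url_match=nlp.tokenizer.url_match,
    with_faster_rules_heuristics=False,
)
```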
@@ -70,6 +70,7 @@ def test_issue4190():
             suffix_search=suffix_re.search,
             infix_finditer=infix_re.finditer,
             token_match=nlp.tokenizer.token_match,
+            with_faster_rules_heuristics=False,
         )
         nlp.tokenizer = new_tokenizer

@@ -90,6 +91,7 @@ def test_issue4190():
     doc_2 = nlp_2(test_string)
     result_2 = [token.text for token in doc_2]
     assert result_1b == result_2
+    assert nlp_2.tokenizer.with_faster_rules_heuristics is False


 def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
@@ -521,3 +521,20 @@ def test_tokenizer_infix_prefix(en_vocab):
     assert tokens == ["±10", "%"]
     explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
     assert tokens == explain_tokens
+
+
+@pytest.mark.issue(10086)
+def test_issue10086(en_tokenizer):
+    """Test special case works when part of infix substring."""
+    text = "No--don't see"
+
+    # without heuristics: do n't
+    en_tokenizer.with_faster_rules_heuristics = False
+    doc = en_tokenizer(text)
+    assert "n't" in [w.text for w in doc]
+    assert "do" in [w.text for w in doc]
+
+    # with (default) heuristics: don't
+    en_tokenizer.with_faster_rules_heuristics = True
+    doc = en_tokenizer(text)
+    assert "don't" in [w.text for w in doc]
@@ -23,9 +23,10 @@ cdef class Tokenizer:
     cdef object _infix_finditer
     cdef object _rules
     cdef PhraseMatcher _special_matcher
-    # TODO next two are unused and should be removed in v4
+    # TODO convert to bool in v4
+    cdef int _with_faster_rules_heuristics
+    # TODO next one is unused and should be removed in v4
     # https://github.com/explosion/spaCy/pull/9150
-    cdef int _unused_int1
     cdef int _unused_int2

     cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
@@ -34,7 +34,7 @@ cdef class Tokenizer:
     """
     def __init__(self, Vocab vocab, rules=None, prefix_search=None,
                  suffix_search=None, infix_finditer=None, token_match=None,
-                 url_match=None):
+                 url_match=None, with_faster_rules_heuristics=True):
         """Create a `Tokenizer`, to create `Doc` objects given unicode text.

         vocab (Vocab): A storage container for lexical types.
@@ -43,7 +43,7 @@ cdef class Tokenizer:
             `re.compile(string).search` to match prefixes.
         suffix_search (callable): A function matching the signature of
             `re.compile(string).search` to match suffixes.
-        `infix_finditer` (callable): A function matching the signature of
+        infix_finditer (callable): A function matching the signature of
             `re.compile(string).finditer` to find infixes.
         token_match (callable): A function matching the signature of
             `re.compile(string).match`, for matching strings to be
@@ -51,6 +51,9 @@ cdef class Tokenizer:
         url_match (callable): A function matching the signature of
             `re.compile(string).match`, for matching strings to be
            recognized as urls.
+        with_faster_rules_heuristics (bool): Whether to restrict the final
+            Matcher-based pass for rules to those containing affixes or space.
+            Defaults to True.

         EXAMPLE:
             >>> tokenizer = Tokenizer(nlp.vocab)
@@ -66,6 +69,7 @@ cdef class Tokenizer:
         self.suffix_search = suffix_search
         self.infix_finditer = infix_finditer
         self.vocab = vocab
+        self.with_faster_rules_heuristics = with_faster_rules_heuristics
         self._rules = {}
         self._special_matcher = PhraseMatcher(self.vocab)
         self._load_special_cases(rules)
@@ -122,6 +126,14 @@ cdef class Tokenizer:
             self._specials = PreshMap()
             self._load_special_cases(rules)

+    property with_faster_rules_heuristics:
+        def __get__(self):
+            return bool(self._with_faster_rules_heuristics)
+
+        def __set__(self, with_faster_rules_heuristics):
+            self._with_faster_rules_heuristics = bool(with_faster_rules_heuristics)
+            self._reload_special_cases()
+
     def __reduce__(self):
         args = (self.vocab,
                 self.rules,
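Because the setter above calls `_reload_special_cases()`, the flag can also be flipped on an existing tokenizer at runtime. A small sketch, not from the commit, mirroring `test_issue10086` above and assuming a blank English pipeline:

```python
import spacy

nlp = spacy.blank("en")
# Include all special-case rules in the final Matcher-based pass; the
# property setter rebuilds the special-case matcher accordingly.
nlp.tokenizer.with_faster_rules_heuristics = False
print([t.text for t in nlp.tokenizer("No--don't see")])

# Restore the default, faster behaviour.
nlp.tokenizer.with_faster_rules_heuristics = True
```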
@@ -602,7 +614,7 @@ cdef class Tokenizer:
             self.mem.free(stale_special)
         self._rules[string] = substrings
         self._flush_cache()
-        if self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string) or " " in string:
+        if not self.with_faster_rules_heuristics or self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string) or " " in string:
             self._special_matcher.add(string, None, self._tokenize_affixes(string, False))

     def _reload_special_cases(self):
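The changed condition above is the whole heuristic: a special case only needs the `Matcher` pass if it could interact with affix or whitespace splitting. A hedged stand-alone restatement of that predicate (a hypothetical helper for illustration, not spaCy API):

```python
def needs_matcher_pass(tokenizer, string, with_faster_rules_heuristics=True):
    # With the heuristics disabled, every rule goes to the Matcher pass.
    # Otherwise only rules that contain a prefix, infix, suffix or a space
    # character are added to the special-case matcher.
    return (
        not with_faster_rules_heuristics
        or tokenizer.find_prefix(string)
        or tokenizer.find_infix(string)
        or tokenizer.find_suffix(string)
        or " " in string
    )
```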
@@ -773,7 +785,8 @@ cdef class Tokenizer:
             "infix_finditer": lambda: _get_regex_pattern(self.infix_finditer),
             "token_match": lambda: _get_regex_pattern(self.token_match),
             "url_match": lambda: _get_regex_pattern(self.url_match),
-            "exceptions": lambda: dict(sorted(self._rules.items()))
+            "exceptions": lambda: dict(sorted(self._rules.items())),
+            "with_faster_rules_heuristics": lambda: self.with_faster_rules_heuristics,
         }
         return util.to_bytes(serializers, exclude)

@@ -794,7 +807,8 @@ cdef class Tokenizer:
             "infix_finditer": lambda b: data.setdefault("infix_finditer", b),
             "token_match": lambda b: data.setdefault("token_match", b),
             "url_match": lambda b: data.setdefault("url_match", b),
-            "exceptions": lambda b: data.setdefault("rules", b)
+            "exceptions": lambda b: data.setdefault("rules", b),
+            "with_faster_rules_heuristics": lambda b: data.setdefault("with_faster_rules_heuristics", b),
         }
         # reset all properties and flush all caches (through rules),
         # reset rules first so that _reload_special_cases is trivial/fast as
@@ -818,6 +832,8 @@ cdef class Tokenizer:
             self.url_match = re.compile(data["url_match"]).match
         if "rules" in data and isinstance(data["rules"], dict):
             self.rules = data["rules"]
+        if "with_faster_rules_heuristics" in data:
+            self.with_faster_rules_heuristics = data["with_faster_rules_heuristics"]
         return self

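With the flag added to both serializers, the setting should survive a bytes round trip (compare the `test_issue4190` assertion above). A minimal sketch, not from the commit:

```python
import spacy

nlp_1 = spacy.blank("en")
nlp_1.tokenizer.with_faster_rules_heuristics = False

# Serialize the tokenizer and restore it into a fresh pipeline.
tokenizer_bytes = nlp_1.tokenizer.to_bytes()
nlp_2 = spacy.blank("en")
nlp_2.tokenizer.from_bytes(tokenizer_bytes)
assert nlp_2.tokenizer.with_faster_rules_heuristics is False
```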
@@ -44,15 +44,16 @@ how to construct a custom tokenizer with different tokenization rules, see the
 > tokenizer = nlp.tokenizer
 > ```

 | Name                           | Description                                                                                                                                                                    |
 | ------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `vocab`                        | A storage container for lexical types. ~~Vocab~~                                                                                                                               |
 | `rules`                        | Exceptions and special-cases for the tokenizer. ~~Optional[Dict[str, List[Dict[int, str]]]]~~                                                                                  |
 | `prefix_search`                | A function matching the signature of `re.compile(string).search` to match prefixes. ~~Optional[Callable[[str], Optional[Match]]]~~                                             |
 | `suffix_search`                | A function matching the signature of `re.compile(string).search` to match suffixes. ~~Optional[Callable[[str], Optional[Match]]]~~                                             |
 | `infix_finditer`               | A function matching the signature of `re.compile(string).finditer` to find infixes. ~~Optional[Callable[[str], Iterator[Match]]]~~                                             |
 | `token_match`                  | A function matching the signature of `re.compile(string).match` to find token matches. ~~Optional[Callable[[str], Optional[Match]]]~~                                          |
 | `url_match`                    | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. ~~Optional[Callable[[str], Optional[Match]]]~~  |
+| `with_faster_rules_heuristics` | Whether to restrict the final `Matcher`-based pass for rules to those containing affixes or space. Defaults to `True`. ~~bool~~                                                |

 ## Tokenizer.\_\_call\_\_ {#call tag="method"}